In [31]:
import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset, DatasetDict
from transformers import AutoModelForSequenceClassification
import numpy as np
from sklearn.metrics import f1_score
from huggingface_hub import HfFolder
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
import torch
from transformers import default_data_collator


In [32]:
dataset_folder = "labeled"
datasets = ["06-13-2013audio_08DIR12-2218_labeled.csv", "09-07-2017 audio_6 APCNV-2016-565_labeled.csv", "09-12-2017audio_6 ZA-2017-210-CU-1A_labeled.csv", "5_DIR_2022_5446_labeled.csv", "6_ZA_2022_6776_labeled.csv", "07_CPC_2022_6189_labeled.csv"]

#combine all data into singular dataframe
#TODO: with actual deployment should directly load into dataloader
labeled_df = pd.DataFrame()

for data_file in datasets:
  file_path = dataset_folder + "/" + data_file
  data_file_df = pd.read_csv(file_path)
  labeled_df = pd.concat([labeled_df, data_file_df], ignore_index=True)

labeled_df

Unnamed: 0,speaker,text,label
0,2,"Thank you James, me, congratulations. Thank y...",
1,8,you,
2,2,That was a bad joke. We should go out to lunc...,
3,8,you We should go out. you,
4,2,"We've been waiting all day to hear from you, ...",
...,...,...,...
1082,36,Commissioner Lyshay? Yes. Commissioner Zamora?,
1083,38,you,
1084,3,Yes.,
1085,36,Commissioner Vice President Cho? Yes. And the...,


In [33]:
# relabel labels

# Replace NaN values in 'label' column with 2
labeled_df['label'] = labeled_df['label'].fillna(2)
# Drop the speaker column
labeled_df = labeled_df.drop('speaker', axis=1)
# Drop the index column
# labeled_df = labeled_df.reset_index(drop=True)



#do test train split - make sure the test set is 20/20/40
#-1 is oppose, 1 is support, NaN is neither
#want 0 oppose, 1 support, 2 neither

# Convert NaN to 2 (neither), -1 to 0 (oppose), 1 stays as 1 (support)
labeled_df['label'] = labeled_df['label'].map({-1: 0, 1: 1, 2:2})

# create test and trainh set
test_df = pd.concat([
    labeled_df[labeled_df['label'] == 0].sample(n=8, random_state=42),
    labeled_df[labeled_df['label'] == 1].sample(n=8, random_state=42), 
    labeled_df[labeled_df['label'] == 2].sample(n=30, random_state=42)
])

# train_df = labeled_df.drop(test_df.index)
# Drop test_df indices from labeled_df
temp_df = labeled_df.drop(test_df.index)
# Limit to only 100 samples with label 2, keep all samples with labels 0 and 1
label_2_samples = temp_df[temp_df['label'] == 2].sample(n=70, random_state=42)
other_samples = temp_df[temp_df['label'] != 2]
train_df = pd.concat([other_samples, label_2_samples])


test_dataset = Dataset.from_pandas(test_df)
train_dataset = Dataset.from_pandas(train_df)

split_dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

print("\nTest Dataset:")
print(split_dataset["test"])



Test Dataset:
Dataset({
    features: ['text', 'label', '__index_level_0__'],
    num_rows: 46
})


In [34]:
model_id = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [35]:
def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True, return_tensors="pt")\
    
if "label" in split_dataset["train"].features.keys():
    split_dataset =  split_dataset.rename_column("label", "labels") # to match Trainer
tokenized_dataset = split_dataset.map(tokenize, batched=True, remove_columns=["text", "__index_level_0__"])
 
tokenized_dataset["train"].features.keys()


Map:   0%|          | 0/120 [00:00<?, ? examples/s]

Map:   0%|          | 0/46 [00:00<?, ? examples/s]

dict_keys(['labels', 'input_ids', 'attention_mask'])

In [36]:
# load model
model_id = "answerdotai/ModernBERT-base"
 
labels = tokenized_dataset["train"].features["labels"]
num_labels = 3  # Binary classification based on the data
label2id = {"0": "0", "1": "1", "2":"2"}
id2label = {"0": "0", "1": "1", "2": "2"}

model = AutoModelForSequenceClassification.from_pretrained(
    model_id, num_labels=num_labels, label2id=label2id, id2label=id2label,
)

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    score = f1_score(
            labels, predictions, labels=labels, pos_label=1, average="weighted"
        )
    return {"f1": float(score) if score == 1 else score}

In [None]:
# training args for Mx metal series macbook
# training_args = TrainingArguments(
#     output_dir= "ModernBERT-domain-classifier",
#     per_device_train_batch_size=32,
#     per_device_eval_batch_size=16,
#     learning_rate=5e-5,
#         num_train_epochs=5,
#     bf16=True, # bfloat16 training 
#     optim="adamw_torch_fused", # improved optimizer 
#     # logging & evaluation strategies
#     logging_strategy="steps",
#     logging_steps=100,
#     eval_strategy="epoch",
#     save_strategy="epoch",
#     save_total_limit=2,
#     load_best_model_at_end=True,
#     use_mps_device=True,
#     metric_for_best_model="f1",
#     # push to hub parameters
#     push_to_hub=True,
#     hub_strategy="every_save",
#     hub_token=HfFolder.get_token(),
# )

# training args for Intel silicon series macbook
training_args = TrainingArguments(
    output_dir= "ModernBERT-support-oppose",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=5e-5,
    num_train_epochs=3,
    bf16=False,
    optim="adamw_hf",  # Changed to standard optimizer
    # logging & evaluation strategies
    logging_strategy="steps",
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    # push to hub parameters
    push_to_hub=True,
    hub_strategy="every_save",
    hub_token=HfFolder.get_token(),
)
 
# Define class weights to give more weight to label 0
class_weights = torch.tensor([3.0, 1.0, 1.0])  # Higher weight for class 0
# Define a custom loss function that applies class weights
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        
        # Apply class weights to the loss
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        
        return (loss, outputs) if return_outputs else loss

# trainer instance
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
    data_collator=default_data_collator
)
trainer.train()

In [17]:
#forward inference
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")
model = AutoModelForSequenceClassification.from_pretrained("mpham8/ModernBERT-support-oppose")

classifier = pipeline(
    task="text-classification",
    model=model,
    tokenizer=tokenizer,
    device=0
)



model.safetensors:   0%|          | 0.00/598M [00:00<?, ?B/s]

Device set to use cpu


In [None]:
test_texts = test_df["text"].tolist() 
predictions = classifier(test_texts)
print(predictions)

# Get true labels from test set
truelabels_ls = test_df["label"].tolist()
predlabels_ls = [int(pred["label"][-1]) for pred in predictions]  # Extracts the label number from "LABEL_X"



[{'label': '1', 'score': 0.8757565021514893}, {'label': '2', 'score': 0.48573312163352966}, {'label': '2', 'score': 0.7130175232887268}, {'label': '0', 'score': 0.389382541179657}, {'label': '1', 'score': 0.9823958873748779}, {'label': '1', 'score': 0.8701786398887634}, {'label': '1', 'score': 0.926503598690033}, {'label': '2', 'score': 0.5192477703094482}, {'label': '1', 'score': 0.9823731184005737}, {'label': '1', 'score': 0.9795883893966675}, {'label': '1', 'score': 0.9969384670257568}, {'label': '1', 'score': 0.5419200658798218}, {'label': '1', 'score': 0.9572132229804993}, {'label': '1', 'score': 0.9849019646644592}, {'label': '1', 'score': 0.9743121266365051}, {'label': '1', 'score': 0.6541351079940796}, {'label': '2', 'score': 0.9999268054962158}, {'label': '2', 'score': 0.9995272159576416}, {'label': '2', 'score': 0.999311089515686}, {'label': '2', 'score': 0.9999998807907104}, {'label': '2', 'score': 0.9994910955429077}, {'label': '2', 'score': 0.9800618290901184}, {'label': '

In [None]:
predictions = classifier(test_texts, top_k=None)


# Create a DataFrame to display predictions with probabilities
prediction_results = []

for i, (text, true_label, prediction) in enumerate(zip(test_texts, truelabels_ls, predictions)):
    # Extract the prediction details
    predicted_label = prediction[0]["label"] if isinstance(prediction, list) else int(prediction["label"][-1])
    
    # Create a dictionary with all required information
    result = {
        "comment": text,
        "true_label": true_label,
        "predicted_label": predicted_label,
        "score_0": prediction[0]["score"] if isinstance(prediction, list) else prediction["scores"][0],  # Score for label 0
        "score_1": prediction[1]["score"] if isinstance(prediction, list) else prediction["scores"][1],  # Score for label 1
        "score_2": prediction[2]["score"] if isinstance(prediction, list) else prediction["scores"][2]   # Score for label 2
    }
    prediction_results.append(result)

# Create DataFrame from the results
predictions_df = pd.DataFrame(prediction_results)

# Display the DataFrame
print("\nPrediction Results:")
display(predictions_df)

# Export predictions DataFrame to CSV
predictions_df.to_csv('model_predictions.csv', index=False)
print(f"Predictions exported to 'model_predictions.csv'")




Prediction Results:


Unnamed: 0,comment,true_label,predicted_label,score_0,score_1,score_2
0,Still the Blackstone retired certified Arbor....,0,1,0.875757,0.08899634,0.03524717
1,"thank you, Madam President. My name is Marco...",0,2,0.485733,0.3773045,0.1369624
2,went to you Let's go to four. Good afternoo...,0,2,0.713018,0.2748094,0.01217302
3,"The issue here was the failure to disclose, u...",0,0,0.389383,0.3521663,0.2584512
4,"Hi, my name is Paul Corona. I am a resident o...",0,1,0.982396,0.01436947,0.003234584
5,you Thank you. Thank you very much Presiden...,0,1,0.870179,0.08082499,0.0489964
6,No notices that we are aware of were given to...,0,1,0.926504,0.04496961,0.0285268
7,This is Henry Choo City Planning. So from the...,0,2,0.519248,0.4005715,0.08018072
8,"Hi, I'm a pilot in the pilot's pilot. I've be...",1,1,0.982373,0.01185039,0.00577645
9,"Hi, this is Jane Davins. I'm a can you hear m...",1,1,0.979588,0.0149339,0.005477784


Predictions exported to 'model_predictions.csv'


In [30]:
predictions

[[{'label': '1', 'score': 0.8757565021514893},
  {'label': '2', 'score': 0.08899633586406708},
  {'label': '0', 'score': 0.03524717316031456}],
 [{'label': '2', 'score': 0.48573312163352966},
  {'label': '1', 'score': 0.3773044943809509},
  {'label': '0', 'score': 0.1369624137878418}],
 [{'label': '2', 'score': 0.7130175232887268},
  {'label': '1', 'score': 0.27480944991111755},
  {'label': '0', 'score': 0.012173015624284744}],
 [{'label': '0', 'score': 0.389382541179657},
  {'label': '1', 'score': 0.3521663248538971},
  {'label': '2', 'score': 0.2584511935710907}],
 [{'label': '1', 'score': 0.9823958873748779},
  {'label': '0', 'score': 0.014369473792612553},
  {'label': '2', 'score': 0.003234584117308259}],
 [{'label': '1', 'score': 0.8701786398887634},
  {'label': '0', 'score': 0.08082498610019684},
  {'label': '2', 'score': 0.048996396362781525}],
 [{'label': '1', 'score': 0.926503598690033},
  {'label': '2', 'score': 0.04496961086988449},
  {'label': '0', 'score': 0.02852679602801

In [19]:
accuracy = accuracy_score(truelabels_ls, predlabels_ls)
precision = precision_score(truelabels_ls, predlabels_ls, average='weighted')
recall = recall_score(truelabels_ls, predlabels_ls, average='weighted')

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}") 
print(f"Recall: {recall:.2f}")

report = classification_report(truelabels_ls, predlabels_ls, output_dict=True)
report_df = pd.DataFrame(report).transpose()
report_df = report_df.round(2) 

report_df


Accuracy: 0.85
Precision: 0.88
Recall: 0.85


Unnamed: 0,precision,recall,f1-score,support
0,1.0,0.12,0.22,8.0
1,0.67,1.0,0.8,8.0
2,0.91,1.0,0.95,30.0
accuracy,0.85,0.85,0.85,0.85
macro avg,0.86,0.71,0.66,46.0
weighted avg,0.88,0.85,0.8,46.0


In [20]:
# Get counts of each label in labeled_df
label_counts = labeled_df['label'].value_counts().sort_index()
print("\nLabel distribution in labeled data:")
for label, count in label_counts.items():
    print(f"Label {label}: {count}")



Label distribution in labeled data:
Label 0: 23
Label 1: 43
Label 2: 1021
