In [9]:
import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from huggingface_hub import HfFolder
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from transformers import AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments

In [10]:
# Folder (relative to the notebook) holding the hand-labeled transcript CSVs
dataset_folder = "labeled"
datasets = ["06-13-2013audio_08DIR12-2218_labeled.csv", "09-07-2017 audio_6 APCNV-2016-565_labeled.csv", "09-12-2017audio_6 ZA-2017-210-CU-1A_labeled.csv", "5_DIR_2022_5446_labeled.csv", "6_ZA_2022_6776_labeled.csv", "07_CPC_2022_6189_labeled.csv"]

# Combine all labeled files into a single dataframe.
# Every CSV is read once and concatenated in ONE call; growing a frame with
# pd.concat inside a loop re-copies all previously loaded rows on every pass.
# TODO: with actual deployment should directly load into dataloader
labeled_df = pd.concat(
    [pd.read_csv(dataset_folder + "/" + data_file) for data_file in datasets],
    ignore_index=True,  # one continuous 0..N-1 index across all files
)

labeled_df

Unnamed: 0,speaker,text,label
0,2,"Thank you James, me, congratulations. Thank y...",
1,8,you,
2,2,That was a bad joke. We should go out to lunc...,
3,8,you We should go out. you,
4,2,"We've been waiting all day to hear from you, ...",
...,...,...,...
1082,36,Commissioner Lyshay? Yes. Commissioner Zamora?,
1083,38,you,
1084,3,Yes.,
1085,36,Commissioner Vice President Cho? Yes. And the...,


In [11]:
# Relabel the raw annotations and build the train/test split.
#
# Raw labels:   -1 = oppose, 1 = support, NaN = neither
# Model labels:  0 = oppose, 1 = support, 2   = neither
RAW_TO_MODEL_LABEL = {-1: 0, 1: 1, 2: 2}       # NaN is filled with 2 before mapping
TEST_SAMPLES_PER_LABEL = {0: 8, 1: 8, 2: 30}   # held-out rows per model label
TRAIN_NEITHER_SAMPLES = 70                     # cap on label-2 rows kept for training
SPLIT_SEED = 42

# Compute the new labels WITHOUT touching labeled_df yet, so a bad value cannot
# leave the frame half-converted.
remapped_labels = labeled_df['label'].fillna(2).map(RAW_TO_MODEL_LABEL)
unmapped_mask = remapped_labels.isna()
if unmapped_mask.any():
    # Series.map turns every value missing from the dict into NaN. Such rows
    # would pass the `label != 2` filter below and end up in the training set.
    # This also triggers when the cell is run twice (0 is not a raw label).
    raise ValueError(
        "Unexpected raw label value(s) "
        f"{labeled_df.loc[unmapped_mask, 'label'].unique().tolist()}; "
        "expected only -1, 1 or NaN. If this cell already ran once, reload "
        "labeled_df from the CSV files before running it again."
    )
labeled_df['label'] = remapped_labels.astype('int64')

# Drop the speaker column (not used as a model input)
labeled_df = labeled_df.drop(columns='speaker')

# Create the test set: a fixed number of rows per label, sampled reproducibly
test_df = pd.concat([
    labeled_df[labeled_df['label'] == label_value].sample(n=n_rows, random_state=SPLIT_SEED)
    for label_value, n_rows in TEST_SAMPLES_PER_LABEL.items()
])

# Everything that is not in the test set is a training candidate
temp_df = labeled_df.drop(test_df.index)
# Keep all label-0 / label-1 rows, but only TRAIN_NEITHER_SAMPLES label-2 rows
label_2_samples = temp_df[temp_df['label'] == 2].sample(n=TRAIN_NEITHER_SAMPLES, random_state=SPLIT_SEED)
other_samples = temp_df[temp_df['label'] != 2]
train_df = pd.concat([other_samples, label_2_samples])

# The two splits must never share a row
assert train_df.index.intersection(test_df.index).empty, "train/test overlap"

test_dataset = Dataset.from_pandas(test_df)
train_dataset = Dataset.from_pandas(train_df)

split_dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

print("\nTest Dataset:")
print(split_dataset["test"])



Test Dataset:
Dataset({
    features: ['text', 'label', '__index_level_0__'],
    num_rows: 46
})


In [12]:
# Checkpoint identifier handed to `from_pretrained` below
model_id = "answerdotai/ModernBERT-base"
# Tokenizer loaded for that same checkpoint identifier
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [13]:
def tokenize(batch):
    """Tokenize the ``text`` entries of one batch with the notebook's tokenizer.

    Args:
        batch: mapping with a ``'text'`` key, as supplied by
            ``DatasetDict.map(..., batched=True)``.

    Returns:
        The tokenizer output for ``batch['text']``, requested with padding,
        truncation and PyTorch tensors.
    """
    return tokenizer(batch['text'], padding=True, truncation=True, return_tensors="pt")


# Guarded so the rename only happens while the column is still called "label"
if "label" in split_dataset["train"].features.keys():
    split_dataset = split_dataset.rename_column("label", "labels")  # to match Trainer
# Tokenize both splits and drop the columns the model does not consume
tokenized_dataset = split_dataset.map(tokenize, batched=True, remove_columns=["text", "__index_level_0__"])

tokenized_dataset["train"].features.keys()


Map:   0%|          | 0/120 [00:00<?, ? examples/s]

Map:   0%|          | 0/46 [00:00<?, ? examples/s]

dict_keys(['labels', 'input_ids', 'attention_mask'])

In [14]:
# load model
model_id = "answerdotai/ModernBERT-base"

labels = tokenized_dataset["train"].features["labels"]
num_labels = 3  # three classes: 0 = oppose, 1 = support, 2 = neither
# The config expects integer class ids on the "id" side of both mappings.
# The label NAMES deliberately stay "0"/"1"/"2" so that code which parses the
# predicted label string (taking its trailing digit) keeps working.
label2id = {"0": 0, "1": 1, "2": 2}
id2label = {0: "0", 1: "1", 2: "2"}

model = AutoModelForSequenceClassification.from_pretrained(
    model_id, num_labels=num_labels, label2id=label2id, id2label=id2label,
)

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
def compute_metrics(eval_pred):
    """Compute the support-weighted F1 score for one evaluation pass.

    Args:
        eval_pred: a pair ``(predictions, labels)``. ``predictions`` holds one
            row of per-class scores per example (presumably the model logits)
            and is reduced to class ids with an argmax over axis 1.

    Returns:
        ``{"f1": <float>}`` -- the weighted-average F1 as a plain Python float.
    """
    scores, labels = eval_pred
    predictions = np.argmax(scores, axis=1)
    # Do NOT forward `labels=` here: that parameter is the SET of classes to
    # score, not y_true. Passing the full y_true array repeats every class once
    # per occurrence and distorts the weighted average. `pos_label` is likewise
    # only meaningful for average="binary".
    score = f1_score(labels, predictions, average="weighted")
    return {"f1": float(score)}

In [None]:
# Training configuration for a CPU-only (Intel) MacBook.
#
# The earlier Apple-silicon (M-series) variant differed only in:
#   per_device_train_batch_size=32, per_device_eval_batch_size=16,
#   num_train_epochs=5, bf16=True, optim="adamw_torch_fused",
#   use_mps_device=True
training_args = TrainingArguments(
    output_dir="ModernBERT-domain-classifier",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=5e-5,
    num_train_epochs=3,
    bf16=False,
    # torch.optim.AdamW; "adamw_hf" selects the deprecated in-library AdamW
    optim="adamw_torch",
    # logging & evaluation strategies
    # Log once per epoch, in step with evaluation. The previous step-based
    # setting (every 100 steps) was larger than the whole run
    # (120 train rows / batch 4 * 3 epochs = 90 steps), so no training loss
    # was ever reported.
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    # push to hub parameters
    push_to_hub=True,
    hub_strategy="every_save",
    hub_token=HfFolder.get_token(),
)

# trainer instance
# NOTE(review): the held-out test split doubles as the evaluation set that
# selects the best checkpoint -- confirm this is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)
trainer.train()



Epoch,Training Loss,Validation Loss


In [None]:
# Forward inference with the fine-tuned checkpoint
# Tokenizer comes from the base checkpoint, model weights from the fine-tuned repo.
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")
model = AutoModelForSequenceClassification.from_pretrained("mpham8/ModernBERT-domain-classifier")

# Pipeline device index: 0 = first CUDA GPU, -1 = CPU.
# Hard-coding 0 breaks on machines without a CUDA device, so fall back to CPU.
inference_device = 0 if torch.cuda.is_available() else -1

classifier = pipeline(
    task="text-classification",
    model=model,
    tokenizer=tokenizer,
    device=inference_device,
)

In [None]:
# Classify every held-out text in one call and show the raw pipeline output
test_texts = test_df["text"].tolist()
predictions = classifier(test_texts)
print(predictions)

# Ground-truth labels of the test set, in the same row order as test_texts
truelabels_ls = test_df["label"].tolist()
# Predicted class id = trailing character of the predicted label string
# (e.g. the "X" of "LABEL_X"), converted to int
predlabels_ls = list(map(lambda pred: int(pred["label"][-1]), predictions))


In [None]:
# Headline metrics on the held-out test set (precision/recall are weighted averages)
label_pair = (truelabels_ls, predlabels_ls)
weighted = {"average": 'weighted'}

accuracy = accuracy_score(*label_pair)
precision = precision_score(*label_pair, **weighted)
recall = recall_score(*label_pair, **weighted)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")

# Per-class breakdown as a table: one row per report entry, rounded to 2 decimals
report = classification_report(*label_pair, output_dict=True)
report_df = pd.DataFrame(report).T.round(2)

report_df


In [None]:
# Count the rows of labeled_df per label value, ordered by label
label_counts = labeled_df['label'].value_counts().sort_index()
print("\nLabel distribution in labeled data:")
for label in label_counts.index:
    count = label_counts.loc[label]
    print(f"Label {label}: {count}")
