In [1]:
import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset, DatasetDict
from transformers import AutoModelForSequenceClassification
import numpy as np
from sklearn.metrics import f1_score
from huggingface_hub import HfFolder
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

In [2]:
dataset_folder = "labeled"
datasets = ["06-13-2013audio_08DIR12-2218_labeled.csv", "09-07-2017 audio_6 APCNV-2016-565_labeled.csv", "09-12-2017audio_6 ZA-2017-210-CU-1A_labeled.csv"]

#combine all data into singular dataframe
#TODO: with actual deployment should directly load into dataloader
labeled_df = pd.DataFrame()

for data_file in datasets:
  file_path = dataset_folder + "/" + data_file
  data_file_df = pd.read_csv(file_path)
  labeled_df = pd.concat([labeled_df, data_file_df], ignore_index=True)

labeled_df

Unnamed: 0,speaker,text,label
0,2,"Thank you James, me, congratulations. Thank y...",
1,8,you,
2,2,That was a bad joke. We should go out to lunc...,
3,8,you We should go out. you,
4,2,"We've been waiting all day to hear from you, ...",
...,...,...,...
305,4,you,
306,8,you,
307,4,Thank you.,
308,8,"Commissioner Chun Kim, so we're going to wait...",


In [3]:
# relabel labels

# Replace NaN values in 'label' column with 2
labeled_df['label'] = labeled_df['label'].fillna(2)
# Drop the speaker column
labeled_df = labeled_df.drop('speaker', axis=1)
# Drop the index column
# labeled_df = labeled_df.reset_index(drop=True)



#do test train split - make sure the test set is 20/20/40
#-1 is oppose, 1 is support, NaN is neither
#want 0 oppose, 1 support, 2 neither

# Convert NaN to 2 (neither), -1 to 0 (oppose), 1 stays as 1 (support)
labeled_df['label'] = labeled_df['label'].map({-1: 0, 1: 1, 2:2})

# create test and trainh set
test_df = pd.concat([
    labeled_df[labeled_df['label'] == 0].sample(n=3, random_state=42),
    labeled_df[labeled_df['label'] == 1].sample(n=3, random_state=42), 
    labeled_df[labeled_df['label'] == 2].sample(n=10, random_state=42)
])

train_df = labeled_df.drop(test_df.index)


test_dataset = Dataset.from_pandas(test_df)
train_dataset = Dataset.from_pandas(train_df)

split_dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

print("\nTest Dataset:")
print(split_dataset["test"])



Test Dataset:
Dataset({
    features: ['text', 'label', '__index_level_0__'],
    num_rows: 16
})


In [5]:
model_id = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [7]:
def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True, return_tensors="pt")\
    
if "label" in split_dataset["train"].features.keys():
    split_dataset =  split_dataset.rename_column("label", "labels") # to match Trainer
tokenized_dataset = split_dataset.map(tokenize, batched=True, remove_columns=["text", "__index_level_0__"])
 
tokenized_dataset["train"].features.keys()


Map:   0%|          | 0/294 [00:00<?, ? examples/s]

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

dict_keys(['labels', 'input_ids', 'attention_mask'])

In [8]:
# load model
model_id = "answerdotai/ModernBERT-base"
 
labels = tokenized_dataset["train"].features["labels"]
num_labels = 3  # Binary classification based on the data
label2id = {"0": "0", "1": "1", "2":"2"}
id2label = {"0": "0", "1": "1", "2": "2"}

model = AutoModelForSequenceClassification.from_pretrained(
    model_id, num_labels=num_labels, label2id=label2id, id2label=id2label,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [9]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    score = f1_score(
            labels, predictions, labels=labels, pos_label=1, average="weighted"
        )
    return {"f1": float(score) if score == 1 else score}

In [12]:
# training args for Mx metal series macbook
# training_args = TrainingArguments(
#     output_dir= "ModernBERT-domain-classifier",
#     per_device_train_batch_size=32,
#     per_device_eval_batch_size=16,
#     learning_rate=5e-5,
#         num_train_epochs=5,
#     bf16=True, # bfloat16 training 
#     optim="adamw_torch_fused", # improved optimizer 
#     # logging & evaluation strategies
#     logging_strategy="steps",
#     logging_steps=100,
#     eval_strategy="epoch",
#     save_strategy="epoch",
#     save_total_limit=2,
#     load_best_model_at_end=True,
#     use_mps_device=True,
#     metric_for_best_model="f1",
#     # push to hub parameters
#     push_to_hub=True,
#     hub_strategy="every_save",
#     hub_token=HfFolder.get_token(),
# )

# training args for Intel silicon series macbook
training_args = TrainingArguments(
    output_dir= "ModernBERT-domain-classifier",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    learning_rate=5e-5,
    num_train_epochs=5,
    bf16=False,
    optim="adamw_hf",  # Changed to standard optimizer
    # logging & evaluation strategies
    logging_strategy="steps",
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    # push to hub parameters
    push_to_hub=True,
    hub_strategy="every_save",
    hub_token=HfFolder.get_token(),
)
 
# trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)
trainer.train()



Epoch,Training Loss,Validation Loss,F1
1,No log,1.275486,0.716102
2,No log,0.785093,0.716102
3,No log,1.166787,0.65189
4,No log,0.904066,0.767232
5,No log,0.940097,0.767232


'(MaxRetryError("HTTPSConnectionPool(host='hf-hub-lfs-us-east-1.s3-accelerate.amazonaws.com', port=443): Max retries exceeded with url: /repos/36/af/36af92bf533b205e32a4a4e924649285a3d2a9abb75393b6eb34158ac8fade73/30b4b4dbc49966bbea6192556e98b0eaa796e1655c75e9e5ff6c0f9dba09f27a?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=AKIA2JU7TKAQLC2QXPN7%2F20250210%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250210T235603Z&X-Amz-Expires=86400&X-Amz-Signature=02791d90d2569febe3e8d4e02acfb1ae955da4a6e0d33fbebbff043c36713b60&X-Amz-SignedHeaders=host&partNumber=1&uploadId=4YLKVLZRQCI_n0TNeBOsaAQsnMtwzCBAsJPWYSt2u97d6DrFDebjDzaxZ2lkmh3p0Xw5lYPSYMAsWdtKLa.g3hsA42liMD19NaYbg55uv1sWB4CGNWqwgsS5vSn6GM9F&x-id=UploadPart (Caused by SSLError(SSLError(5, '[SYS] unknown error (_ssl.c:2427)')))"), '(Request ID: bee9238b-3407-43f6-aed6-ad7491e71599)')' thrown while requesting PUT https://hf-hub-lfs-us-east-1.s3-accelerate.amazonaws.com/repos/36/af/36af92bf533b205e32a4a4

TrainOutput(global_step=50, training_loss=0.1247582721710205, metrics={'train_runtime': 20615.5887, 'train_samples_per_second': 0.071, 'train_steps_per_second': 0.002, 'total_flos': 583099284705840.0, 'train_loss': 0.1247582721710205, 'epoch': 5.0})

In [16]:
#forward inference
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")
model = AutoModelForSequenceClassification.from_pretrained("mpham8/ModernBERT-domain-classifier")

classifier = pipeline(
    task="text-classification",
    model=model,
    tokenizer=tokenizer,
    device=0
)

Device set to use cpu


In [21]:
test_texts = test_df["text"].tolist() 
predictions = classifier(test_texts)
print(predictions)

# Get true labels from test set
truelabels_ls = test_df["label"].tolist()
predlabels_ls = [int(pred["label"][-1]) for pred in predictions]  # Extracts the label number from "LABEL_X"


[{'label': '2', 'score': 0.7018373608589172}, {'label': '2', 'score': 0.9893173575401306}, {'label': '2', 'score': 0.62039715051651}, {'label': '1', 'score': 0.870729923248291}, {'label': '1', 'score': 0.543232798576355}, {'label': '2', 'score': 0.5413481593132019}, {'label': '2', 'score': 0.9983381032943726}, {'label': '2', 'score': 0.9999887943267822}, {'label': '2', 'score': 0.999983549118042}, {'label': '2', 'score': 0.9999817609786987}, {'label': '2', 'score': 0.9999958276748657}, {'label': '2', 'score': 0.9988155364990234}, {'label': '2', 'score': 0.9999511241912842}, {'label': '2', 'score': 0.9399980902671814}, {'label': '2', 'score': 0.7595450282096863}, {'label': '2', 'score': 0.9993185997009277}]


In [22]:
accuracy = accuracy_score(truelabels_ls, predlabels_ls)
precision = precision_score(truelabels_ls, predlabels_ls, average='weighted')
recall = recall_score(truelabels_ls, predlabels_ls, average='weighted')

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}") 
print(f"Recall: {recall:.2f}")

report = classification_report(truelabels_ls, predlabels_ls, output_dict=True)
report_df = pd.DataFrame(report).transpose()
report_df = report_df.round(2) 

report_df


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.75
Precision: 0.63
Recall: 0.75


Unnamed: 0,precision,recall,f1-score,support
0,0.0,0.0,0.0,3.0
1,1.0,0.67,0.8,3.0
2,0.71,1.0,0.83,10.0
accuracy,0.75,0.75,0.75,0.75
macro avg,0.57,0.56,0.54,16.0
weighted avg,0.63,0.75,0.67,16.0


In [24]:
# Get counts of each label in labeled_df
label_counts = labeled_df['label'].value_counts().sort_index()
print("\nLabel distribution in labeled data:")
for label, count in label_counts.items():
    print(f"Label {label}: {count}")



Label distribution in labeled data:
Label 0: 5
Label 1: 9
Label 2: 296
