In [None]:
from transformers import AutoTokenizer
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load the pre-split train / validation / test CSVs from the working directory.
train_df = pd.read_csv('train.csv')
val_df = pd.read_csv('val.csv')
test_df = pd.read_csv('test.csv')

# Integer class id assigned to each emotion name found in the `emo_label` column.
label_mapping = {'optimism':0, 'anxiety':1, 'excitement':2, 'disgust':3}


def _encode_labels(df, split_name):
    """Add an integer `label` column to *df*, derived from its `emo_label` column.

    Args:
        df: DataFrame with an `emo_label` column; modified in place.
        split_name: Name of the split, used only in the error message.

    Raises:
        ValueError: If any row's `emo_label` is missing or is not a key of
            `label_mapping`.
    """
    encoded = df['emo_label'].map(label_mapping)
    unmapped = encoded.isna()
    if unmapped.any():
        # Series.map yields NaN for values absent from the dict; without this
        # check the column would silently become float and only fail later.
        unknown = df.loc[unmapped, 'emo_label'].unique().tolist()
        raise ValueError(
            f"{int(unmapped.sum())} row(s) in the {split_name} split have an "
            f"'emo_label' outside label_mapping: {unknown!r}"
        )
    df['label'] = encoded.astype('int64')


for _split_name, _split_df in (('train', train_df), ('validation', val_df), ('test', test_df)):
    _encode_labels(_split_df, _split_name)

# Keep only the text column (`processed`) and the integer label for each split.
dataset = DatasetDict({
    'train': Dataset.from_pandas(train_df[['processed', 'label']]),
    'validation': Dataset.from_pandas(val_df[['processed', 'label']]),
    'test': Dataset.from_pandas(test_df[['processed', 'label']])
})

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_function(examples):
    """Tokenize the `processed` field of a batch with the module-level tokenizer.

    Every sequence is padded to, and truncated at, 128 tokens.

    Args:
        examples: Mapping that provides the texts under the "processed" key.

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    texts = examples["processed"]
    encode_options = {
        "padding": 'max_length',
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(texts, **encode_options)


# Tokenize every split in batches, then expose only the tensors the model
# consumes (token ids, attention mask, label) as torch tensors.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets.set_format("torch", columns=['input_ids', 'attention_mask', 'label'])

# Derive the class count and label names from label_mapping instead of
# hard-coding them, so the model config (and anything saved from it) carries
# the emotion names rather than generic LABEL_n placeholders.
id2label = {class_id: emotion for emotion, class_id in label_mapping.items()}
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label_mapping),
    id2label=id2label,
    label2id=dict(label_mapping),
)


def compute_metrics(pred):
    """Compute accuracy and weighted precision / recall / F1 for one evaluation.

    Args:
        pred: A pair that unpacks into ``(logits, labels)``.
            Assumes logits has the class scores on its last axis and labels
            holds the matching integer class ids -- TODO confirm against caller.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    logits, labels = pred
    # The predicted class is the index of the highest score on the last axis.
    predictions = np.argmax(logits, axis=-1)
    # zero_division=0 scores a class that was never predicted as 0 explicitly,
    # instead of emitting an UndefinedMetricWarning on each evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# Fine-tuning configuration: evaluate and checkpoint once per epoch, and
# reload the checkpoint with the best "f1" when training finishes.
training_args = TrainingArguments(
    output_dir="distilbert-emotion",
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

# Early stopping with a patience of 2.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)
trainer.train()

# Score the trained model on the held-out test split.
results = trainer.evaluate(tokenized_datasets['test'])
print(results)

# Save the model weights and the tokenizer files side by side.
export_dir = 'distilbert-emotion-model'
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)




tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/4527 [00:00<?, ? examples/s]

Map:   0%|          | 0/564 [00:00<?, ? examples/s]

Map:   0%|          | 0/564 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,1.028547,0.570922,0.584674,0.570922,0.564061
2,1.031100,0.963307,0.60461,0.605043,0.60461,0.603296
3,1.031100,1.000646,0.618794,0.633879,0.618794,0.616091
4,0.676600,1.074432,0.592199,0.601279,0.592199,0.59315
5,0.676600,1.175268,0.606383,0.614342,0.606383,0.605834


{'eval_loss': 0.9205456376075745, 'eval_accuracy': 0.6453900709219859, 'eval_precision': 0.65940593604268, 'eval_recall': 0.6453900709219859, 'eval_f1': 0.6427565339204612, 'eval_runtime': 0.5444, 'eval_samples_per_second': 1035.995, 'eval_steps_per_second': 66.127, 'epoch': 5.0}


('distilbert-emotion-model/tokenizer_config.json',
 'distilbert-emotion-model/special_tokens_map.json',
 'distilbert-emotion-model/vocab.txt',
 'distilbert-emotion-model/added_tokens.json',
 'distilbert-emotion-model/tokenizer.json')