# Finetining DistilRoberta MLM with Emotion dialogue 

Dated: 30.04.2024

Contains full script in fine-tuning **Distil Roberta Mask Language Model** on emotion dialogue datasets. 

Method description:
- Special masks: Added special masks to the language model that maps to emotion labels.
- Data transformation process: the dataset was presented to the language model as pair of utterances where the emotion tokens were masked with the special mask token. 



In [1]:
"""


Fine-tuning MLM to basic with specific prompts.
Contains:

"""
import os
import warnings
from datetime import datetime
import torch
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForMaskedLM,
    set_seed,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# Suppress warnings
warnings.filterwarnings("ignore")

device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)

dataset_id = 'li2017dailydialog/daily_dialog'
model_id = "distilbert/distilroberta-base"

set_seed(42)

# load tokenizer and model to fine-tune
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
model = AutoModelForMaskedLM.from_pretrained(model_id, output_attentions=True)

# setup default datacollator for MLM
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.20, return_tensors="pt")

classification_mapping = {
    "act": {
        0: "unknown",
        1: "inform",
        2: "question",
        3: "directive",
        4: "commissive"
    },
    "emotion": {
        0: "neutral",
        1: "anger",
        2: "disgust",
        3: "fear",
        4: "happiness",
        5: "sadness",
        6: "surprise"
    }
}

DIALOG_PROMPT = """User: [Emotion: {emotion_label}] [Action: {action_label}] {dialogue_input} \nAgent: [Emotion: {respond_emotion_label}] [Action: {respond_emotion_label}] {dialogue_response}"""

def preprocess_dataset(data):
    query = DIALOG_PROMPT.format(
        emotion_label=data["emotion"],
        action_label=data["act"],
        dialogue_input=data["dialog"],
        respond_emotion_label=data["response_emote"],
        respond_action_label=data["act"],
        dialogue_response=data["response"]
    )
    # Tokenize the prompt
    tokenized = tokenizer(query, truncation=True, padding="max_length", max_length=256, return_special_tokens_mask=False, return_attention_mask=True, return_tensors="pt")

    return tokenized

def cleaner(df):
    """Cleans the dataset to be put into prompt"""

    df['dial_id'] = df.index.values
    df = df.explode(['dialog', 'act', 'emotion'], ignore_index=True)
    df["act"] = df["act"].map(classification_mapping["act"])
    df["emotion"] = df["emotion"].map(classification_mapping["emotion"])
    df['response'] = df.groupby('dial_id')['dialog'].shift(-1)
    df['response_emote'] = df.groupby('dial_id')['emotion'].shift(-1)
    df["response"].fillna(". "+ tokenizer.pad_token, inplace=True)
    df["response_emote"].fillna("unknown", inplace=True)

    return df

data = load_dataset(dataset_id)
train_data = cleaner(data["train"].to_pandas())
valid_data = cleaner(data["validation"].to_pandas())
test_data = cleaner(data["test"].to_pandas())

train_df = Dataset.from_pandas(train_data)
valid_df = Dataset.from_pandas(valid_data)
test_df = Dataset.from_pandas(test_data)

train_dfs = train_df.map(preprocess_dataset, batched=True, remove_columns=train_df.column_names)
valid_dfs = valid_df.map(preprocess_dataset, batched=True, remove_columns=valid_df.column_names)
test_dfs = test_df.map(preprocess_dataset, batched=True, remove_columns=test_df.column_names)

model.train()

# defining save / load / HF resp paths
new_model_name = "emoDialog-distilroberta-base"
hub_model_id = f"darthfalka/{new_model_name}"

current_date = datetime.now().strftime("%d%m%Y-%H:%M:%S")
default_path = f"./modelHistory/{current_date}"
output_path = f"{default_path}/results"
logs_path = f"{default_path}/logs"
model_local_path = f"{default_path}/model"

training_args = TrainingArguments(
    # directory + hub config
    output_dir=output_path,
    logging_dir=logs_path,
    hub_token=os.getenv("HF_KEY"),
    hub_model_id=hub_model_id,
    push_to_hub=True,
    # local / save defaults e.g. how it was saved
    save_total_limit=3,
    save_strategy="epoch",
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    overwrite_output_dir=True,
    prediction_loss_only=False,
    # conditioning statement for selecting model
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    # batch params
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # training params
    num_train_epochs=5,
    learning_rate=5e-05,
    warmup_steps=500
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dfs,
    eval_dataset=valid_dfs,
    tokenizer=tokenizer,
    data_collator=data_collator,
    # model_init = model ~  only set if you want to create a new instance for every training loop
    # callbacks
    # preprocess_logits_for_metrics
    # compute_metrics=compute_metrics
)

print(f'Model card id set to: {trainer.hub_model_id}')
print(f"Saving to hub as {new_model_name}")

trainer.train()
trainer.save_model(model_local_path)

Some weights of the model checkpoint at distilbert/distilroberta-base were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Map:   0%|          | 0/87170 [00:00<?, ? examples/s]

Map:   0%|          | 0/8069 [00:00<?, ? examples/s]

Map:   0%|          | 0/7740 [00:00<?, ? examples/s]

Model card id set to: darthfalka/emoDialog-distilroberta-base
Saving to hub as emoDialog-distilroberta-base


Epoch,Training Loss,Validation Loss
1,0.2341,0.266738
2,0.23,0.142731
3,0.2319,0.225085
4,0.1783,0.155241
5,0.1782,0.187337


There were missing keys in the checkpoint model loaded: ['lm_head.decoder.weight', 'lm_head.decoder.bias'].


model.safetensors:   0%|          | 0.00/329M [00:00<?, ?B/s]