In [1]:
!pip install -q transformers datasets peft accelerate scikit-learn tqdm

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.3/564.3 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m43.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m94.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m74.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m46.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━

In [2]:
import pandas as pd
eng_dataset = pd.read_csv('/kaggle/input/english-annotated-dataset/en-annotated.tsv', sep='\t', header=None)

In [3]:
print(eng_dataset)

                                                       0        1
0                                                  , ...        1
1                                                      !  1, 4, 7
2      ... And I don't think we need to discuss the T...     8, 1
3                            * So get up out of your bed        1
4      A confession that you hired [PERSON] ... and a...     1, 6
...                                                  ...      ...
17523                   Your opinion might be valuable .        8
17524                                      Your orders .        8
17525              Your ship's been in lots of battles .        8
17526                         Your wine , your Majesty .        8
17527                                            Yours ?        8

[17528 rows x 2 columns]


In [4]:
from sklearn.model_selection import train_test_split

train_val, test = train_test_split(eng_dataset, test_size=0.1, random_state=42)
train, val = train_test_split(train_val, test_size=(1/9), random_state=42)

train = train.rename(columns={0: 'text', 1: 'labels'})
test = test.rename(columns={0: 'text', 1: 'labels'})
val = val.rename(columns={0: "text", 1: "labels"})

In [5]:
print(train.head())

                                                    text labels
2235                                  ♪ For me and you ♪      5
14531  I'd be so happy if that were really true , [PE...      6
13553                                That was the hope .      5
1670                  Don't lie to me , you pepper gut .      3
5884   I know that when there's something wrong , you...   1, 4


In [6]:
print(test.head())

                                                    text labels
3282   But are they telling me , or are they asking me ?      7
3024                                         Forget me .      6
17050                The [PERSON]'s a floating mistake .      8
1039                                   Where did he go ?   2, 7
7068                                That is not true .        1


In [7]:
emotions_mapping = {1: "anger", 2: "anticipation", 3: "disgust", 4: "fear", 5: "joy", 6: "sadness", 7: "surprise", 8: "trust"}

In [8]:
def format_data(row):
  labels = row['labels']
  labels = labels.split(",")
  labels = [int(label) for label in labels]
  emotions = [emotions_mapping[label] for label in labels]

  return {
      "instruction": "Identify the emotion(s) in the following sentence:",
      "input": row['text'],
      "output": emotions
  }


train_data = train.apply(format_data, axis=1).tolist()
test_data = test.apply(format_data, axis=1).tolist()
val_data = val.apply(format_data, axis=1).tolist()


In [9]:
type(val_data)

list

In [10]:
print(train_data[0])

{'instruction': 'Identify the emotion(s) in the following sentence:', 'input': '♪ For me and you ♪', 'output': ['joy']}


In [11]:
print(test_data[0])

{'instruction': 'Identify the emotion(s) in the following sentence:', 'input': 'But are they telling me , or are they asking me ?', 'output': ['surprise']}


In [12]:
from datasets import Dataset
from transformers import AutoTokenizer

# Convert lists to Hugging Face datasets
train_dataset = Dataset.from_list(train_data)
test_dataset = Dataset.from_list(test_data)
val_dataset = Dataset.from_list(val_data)

# Load tokenizer
model_name = "google/mt5-base" 
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(batch):
    # Combine instruction and input
    combined_texts = [f"{inst} {inp}" for inst, inp in zip(batch["instruction"], batch["input"])]

    # Convert output lists to strings (important!)
    output_texts = [", ".join(output) for output in batch["output"]]

    # Tokenize inputs
    model_inputs = tokenizer(
        combined_texts,
        truncation=True,
        padding="max_length",
        max_length=256
    )

    # Tokenize outputs
    labels = tokenizer(
        output_texts,
        truncation=True,
        padding="max_length",
        max_length=64
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply tokenization
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)

tokenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/702 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/14022 [00:00<?, ? examples/s]

Map:   0%|          | 0/1753 [00:00<?, ? examples/s]

Map:   0%|          | 0/1753 [00:00<?, ? examples/s]

In [13]:
print(tokenized_train[0])

{'instruction': 'Identify the emotion(s) in the following sentence:', 'input': '♪ For me and you ♪', 'output': ['joy'], 'input_ids': [48675, 30077, 287, 83802, 312, 263, 271, 281, 287, 259, 9877, 259, 98923, 267, 259, 6990, 1102, 416, 305, 521, 259, 6990, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1

In [14]:
print(tokenized_test[0])

{'instruction': 'Identify the emotion(s) in the following sentence:', 'input': 'But are they telling me , or are they asking me ?', 'output': ['surprise'], 'input_ids': [48675, 30077, 287, 83802, 312, 263, 271, 281, 287, 259, 9877, 259, 98923, 267, 4837, 418, 287, 276, 259, 47612, 416, 259, 261, 631, 418, 287, 276, 259, 65925, 416, 259, 291, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [15]:
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q", "k", "v", "o"],  
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_2_SEQ_LM"
)

model = get_peft_model(model, lora_config)


2025-10-17 12:16:33.511724: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760703393.747349      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760703393.817348      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [16]:
from transformers import DataCollatorForSeq2Seq

# we want to ignore tokenizer pad token in the loss
label_pad_token_id = -100
# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

In [17]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

output_dir = "/kaggle/working/transformer_outputs"


# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
	auto_find_batch_size=True,
    learning_rate=2e-4, 
    num_train_epochs=15,
    # num_train_epochs=1,
    
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    save_total_limit=1,
    
    logging_dir='/kaggle/working/logs',
    logging_strategy="steps",
    logging_steps=500,
    report_to="tensorboard", 
)

# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset =tokenized_val
)

No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [18]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,8.5828,0.179942
2,0.068,0.048352
3,0.057,0.046455
4,0.0513,0.043719
5,0.0502,0.041622
6,0.0503,0.040404
7,0.0462,0.03981
8,0.0448,0.039646
9,0.0441,0.038896
10,0.0431,0.038645




TrainOutput(global_step=13155, training_loss=0.37757559112612626, metrics={'train_runtime': 17328.2632, 'train_samples_per_second': 12.138, 'train_steps_per_second': 0.759, 'total_flos': 1.272410020380672e+17, 'train_loss': 0.37757559112612626, 'epoch': 15.0})

In [19]:
peft_model_id="path_to_trained_model"
trainer.model.save_pretrained(peft_model_id)
tokenizer.save_pretrained(peft_model_id)

('path_to_trained_model/tokenizer_config.json',
 'path_to_trained_model/special_tokens_map.json',
 'path_to_trained_model/spiece.model',
 'path_to_trained_model/added_tokens.json',
 'path_to_trained_model/tokenizer.json')

In [20]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

peft_model_id = "path_to_trained_model"
config = PeftConfig.from_pretrained(peft_model_id)

# load base LLM model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path, device_map={"":0})
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the Lora model
model = PeftModel.from_pretrained(model, peft_model_id, device_map={"":0})
model.eval()

print("Peft model loaded")

model.safetensors:   0%|          | 0.00/2.33G [00:00<?, ?B/s]



Peft model loaded


In [21]:
# use the first sample of the test set
import torch

sample = tokenized_test[20]

input_ids = torch.tensor(sample["input_ids"]).unsqueeze(0).to(model.device)


# with torch.inference_mode():
outputs = model.generate(input_ids=input_ids, max_new_tokens=10, do_sample=True, top_p=0.9)
print(f"input sentence: {sample['input']}\n{'---'* 20}")

print(f"summary:\n{tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]}")

input sentence: Help yourself .
------------------------------------------------------------
summary:
anger


In [22]:
true_labels = [sample['output'] for sample in tokenized_test]
predict_labels = []

for sample in tokenized_test:
  input_ids = torch.tensor(sample['input_ids']).unsqueeze(0).to(model.device)
  with torch.inference_mode():
    outputs = model.generate(input_ids=input_ids, max_new_tokens=10, do_sample=True, top_p=0.9)
    predict_text = tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]
    predict_labels.append(predict_text.split(", "))



In [23]:
print(predict_labels)

[['anticipation', 'fear'], ['anger'], ['anger'], ['surprise'], ['surprise'], ['anger'], ['fear'], ['anticipation'], ['anger'], ['anger'], ['anger', 'anticipation'], ['joy'], ['anger'], ['anger'], ['fear'], ['sadness'], ['anger', 'disgust'], ['fear'], ['anger'], ['disgust'], ['trust'], ['anger'], ['surprise'], ['anger'], ['fear'], ['surprise'], ['sadness'], ['anticipation'], ['anger'], ['surprise'], ['trust'], ['joy'], ['anticipation'], ['joy'], ['anger'], ['sadness'], ['trust'], ['fear'], ['surprise'], ['anticipation'], ['anger', 'disgust'], ['anger'], ['anger', 'anticipation'], ['trust'], ['anticipation'], ['disgust'], ['fear'], ['trust', 'joy', 'surprise'], ['disgust'], ['trust'], ['trust'], ['joy'], ['sadness'], ['joy'], ['sadness', 'surprise'], ['surprise'], ['joy'], ['anticipation', 'joy'], ['trust', 'anticipation'], ['joy'], ['anger'], ['anger'], ['anticipation'], ['sadness'], ['joy'], ['joy'], ['anticipation', 'joy', 'surprise'], ['anticipation', 'surprise'], ['anticipation'], [

In [24]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer(classes=list(emotions_mapping.values()))
true_binarized = mlb.fit_transform(true_labels)
pred_binarized = mlb.transform(predict_labels)



In [25]:
from sklearn.metrics import f1_score, jaccard_score

micro_f1 = f1_score(true_binarized, pred_binarized, average='micro')
jaccard = jaccard_score(true_binarized, pred_binarized, average='samples')

print(f"Micro F1-score: {micro_f1:.4f}")
print(f"Jaccard Index: {jaccard:.4f}")


Micro F1-score: 0.3847
Jaccard Index: 0.3480


In [26]:
from sklearn.metrics import precision_score, recall_score, hamming_loss

macro_f1 = f1_score(true_binarized, pred_binarized, average='macro')
precision = precision_score(true_binarized, pred_binarized, average='micro')
recall = recall_score(true_binarized, pred_binarized, average='micro')
hamming = hamming_loss(true_binarized, pred_binarized)

print(f"Macro F1-score: {macro_f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Hamming Loss: {hamming:.4f}")


Macro F1-score: 0.3795
Precision: 0.3881
Recall: 0.3814
Hamming Loss: 0.1930
