### loading the dataset and normalizing the GENZ key text

In [1]:
# STEP 1: Load Slang Dataset
# ------------------------
from datasets import load_dataset
import re

slang_ds = load_dataset("MLBtrio/genz-slang-dataset")

# Build slang dictionary (Slang -> Description)
slang_dict = {row["Slang"]: row["Description"] for row in slang_ds["train"]}

# Regex-based normalization
def normalize_text(text):
    for slang, meaning in slang_dict.items():
        # \b ensures whole word replacement, case insensitive
        text = re.sub(rf"\b{re.escape(slang)}\b", meaning, text, flags=re.IGNORECASE)
    return text

# Tests
print(normalize_text("Got the job today, big W!"))        # "Got the job today, big win!"
print(normalize_text("ngl that exam was tough"))          # "not gonna lie that exam was tough"
print(normalize_text("TBH I need a break"))               # "to be honest I need a break"
print(normalize_text("She said LOL and walked away"))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

all_slangs.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/1779 [00:00<?, ? examples/s]

Got the job today, big Shorthand for win!
Not going to lie that exam Wild a guess tough
To be honest I need a break
She said Lots of love and walked away


### loading the emotion dataset of the GEN Z

In [2]:
from datasets import load_dataset

# Load GoEmotions (already has labels)
goemo = load_dataset("go_emotions")

# Normalize slang inside GoEmotions text
def normalize_batch(batch):
    batch["text"] = normalize_text(batch["text"])
    return batch

goemo = goemo.map(normalize_batch)


README.md: 0.00B [00:00, ?B/s]

simplified/train-00000-of-00001.parquet:   0%|          | 0.00/2.77M [00:00<?, ?B/s]

simplified/validation-00000-of-00001.par(…):   0%|          | 0.00/350k [00:00<?, ?B/s]

simplified/test-00000-of-00001.parquet:   0%|          | 0.00/347k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/43410 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5426 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5427 [00:00<?, ? examples/s]

Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

### using the roberta model for the text classification

In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "cardiffnlp/twitter-roberta-base-emotion"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Reinitialize classifier head for 28 labels
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=28,
    ignore_mismatched_sizes=True
)


config.json:   0%|          | 0.00/768 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-emotion and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([4, 768]) in the checkpoint and torch.Size([28, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([4]) in the checkpoint and torch.Size([28]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### creatingt the token

In [4]:
def tokenize(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True)

goemo = goemo.map(tokenize, batched=True)
goemo.set_format("torch", columns=["input_ids", "attention_mask", "labels"])


Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

In [6]:
pip install --upgrade transformers


Collecting transformers
  Downloading transformers-4.55.3-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.55.3-py3-none-any.whl (11.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m133.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.55.2
    Uninstalling transformers-4.55.2:
      Successfully uninstalled transformers-4.55.2
Successfully installed transformers-4.55.3


In [9]:
import transformers
print(transformers.__version__)


4.55.2


In [14]:
print(goemo["train"][0])


{'labels': tensor([27]), 'input_ids': tensor([   0, 2387, 5548,  689,   16,  932,   38,  399,   75,   33,    7, 7142,
        2185,    4,    2]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])}


In [22]:
# First: remove torch formatting
goemo = goemo.with_format("python")

def fix_labels(example):
    lbl = example["labels"]

    if isinstance(lbl, list):        # e.g. [27]
        example["labels"] = int(lbl[0])
    elif isinstance(lbl, int):       # already int
        example["labels"] = lbl
    else:                            # fallback
        example["labels"] = int(lbl)

    return example

goemo_fixed = goemo.map(fix_labels)

print(goemo_fixed["train"][0]["labels"], type(goemo_fixed["train"][0]["labels"]))


Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

27 <class 'int'>


In [18]:
print(goemo_fixed["train"][0]["labels"])
print(type(goemo_fixed["train"][0]["labels"]))


tensor(27)
<class 'torch.Tensor'>


In [28]:
def tokenize(batch):
    return tokenizer(
        batch["text"],                 # or the correct field in your dataset
        padding="max_length",          # pad to max length
        truncation=True,               # cut off texts longer than 512
        max_length=512                 # explicitly set limit
    )

goemo_tokenized = goemo_fixed.map(tokenize, batched=True)


Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

### retraining the model

In [32]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    save_total_limit=2,
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,     # increase for better results
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    save_strategy="epoch",  # save every epoch
    eval_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset = goemo_tokenized["train"].shuffle(seed=42).select(range(2000)),
    eval_dataset  = goemo_tokenized["validation"].shuffle(seed=42).select(range(500)),

    tokenizer=tokenizer,
)


trainer.train()


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.3323,1.492295
2,1.1962,1.561254
3,1.0001,1.563723
4,0.808,1.615648
5,0.7246,1.62284


TrainOutput(global_step=315, training_loss=0.9831909028310625, metrics={'train_runtime': 1247.4633, 'train_samples_per_second': 8.016, 'train_steps_per_second': 0.253, 'total_flos': 2631724769280000.0, 'train_loss': 0.9831909028310625, 'epoch': 5.0})

In [33]:
# GoEmotions labels (28 emotions)
goemo_labels = [
    "admiration", "amusement", "anger", "annoyance", "approval", "caring",
    "confusion", "curiosity", "desire", "disappointment", "disapproval",
    "disgust", "embarrassment", "excitement", "fear", "gratitude",
    "grief", "joy", "love", "nervousness", "optimism", "pride",
    "realization", "relief", "remorse", "sadness", "surprise", "neutral"
]

# Update model config
model.config.id2label = {i: label for i, label in enumerate(goemo_labels)}
model.config.label2id = {label: i for i, label in enumerate(goemo_labels)}

# Reload pipeline with new mapping
from transformers import pipeline
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

print(classifier("ngl that exam was tough"))
print(classifier("tbh I need a break"))
print(classifier("Got the job today, big W!"))


Device set to use cuda:0


[{'label': 'neutral', 'score': 0.45014968514442444}]
[{'label': 'desire', 'score': 0.3443542718887329}]
[{'label': 'excitement', 'score': 0.3062312602996826}]


In [36]:
save_dir = "/content/genz_emotion_model_2"   # absolute path in Colab
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


('/content/genz_emotion_model_2/tokenizer_config.json',
 '/content/genz_emotion_model_2/special_tokens_map.json',
 '/content/genz_emotion_model_2/vocab.json',
 '/content/genz_emotion_model_2/merges.txt',
 '/content/genz_emotion_model_2/added_tokens.json',
 '/content/genz_emotion_model_2/tokenizer.json')

In [37]:
from google.colab import drive
drive.mount('/content/drive')

save_dir = "/content/drive/MyDrive/genz_emotion_model"

model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


Mounted at /content/drive


('/content/drive/MyDrive/genz_emotion_model/tokenizer_config.json',
 '/content/drive/MyDrive/genz_emotion_model/special_tokens_map.json',
 '/content/drive/MyDrive/genz_emotion_model/vocab.json',
 '/content/drive/MyDrive/genz_emotion_model/merges.txt',
 '/content/drive/MyDrive/genz_emotion_model/added_tokens.json',
 '/content/drive/MyDrive/genz_emotion_model/tokenizer.json')