In [None]:
!pip install datasets transformers torch scikit-learn


In [None]:
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments


In [None]:
from datasets import load_dataset,Dataset


In [None]:

# Load the cleaned GoEmotions dataset through `datasets.load_dataset`.
dataset = load_dataset("Keyurjotaniya007/go-emotions-cleaned")

# Materialise the train and test splits as pandas DataFrames
# (only these two splits are converted here).
train_df, test_df = (
    pd.DataFrame(dataset[split_name]) for split_name in ('train', 'test')
)

In [None]:
# Preview the first five rows of the training frame.
train_df.head(n=5)

In [None]:
import torch

# Report whether PyTorch can see a CUDA GPU, and which one.
if not torch.cuda.is_available():
    print("GPU is NOT available. 😭 Make sure you have selected the GPU accelerator in the notebook settings.")
else:
    print("GPU is available! 🔥")
    print(f"Device name: {torch.cuda.get_device_name(0)}")


In [1]:
!pip install transformers accelerate -U



In [2]:
import numpy as np
import pandas as pd
import torch
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, Dataset


2025-10-20 20:06:26.046178: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760990786.068970     302 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760990786.075854     302 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [25]:
# Fraction of the training split kept for this run, and the seed that makes
# the subsample reproducible. Change them here only: the progress message
# below is derived from SAMPLE_FRACTION so it cannot drift out of sync.
SAMPLE_FRACTION = 0.4
SAMPLE_SEED = 42

print("Loading dataset...")
dataset = load_dataset("Keyurjotaniya007/go-emotions-cleaned")
df = pd.DataFrame(dataset['train'])

# Work on a random subset of the training split to keep fine-tuning time down.
df_sample = df.sample(frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED)
print(
    f"Using {len(df_sample)} rows for training and validation "
    f"({SAMPLE_FRACTION:.0%} of original data)."
)

Loading dataset...
Using 74812 rows for training and validation (40% of original data).


In [26]:

def clean_text(text):
    """Normalise a raw text string.

    Steps, in order: lowercase, strip every character listed in
    ``string.punctuation``, remove digit runs, then collapse any whitespace
    run to a single space and trim both ends.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string (possibly empty).
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    lowered = text.lower()
    without_punct = lowered.translate(punctuation_table)
    without_digits = re.sub(r'\d+', '', without_punct)
    # Removing punctuation/digits can leave doubled or dangling spaces behind.
    return re.sub(r'\s+', ' ', without_digits).strip()

print("Cleaning text data...")
# Normalise every row of the 'text' column with clean_text, element by element.
df_sample['text'] = df_sample['text'].map(clean_text)

Cleaning text data...


In [27]:

# Emotion class names; a name's position in this list is its integer id.
# NOTE(review): assumes this order matches the dataset's integer 'label'
# encoding — confirm against the dataset card.
labels = [
    "admiration", "amusement", "anger", "annoyance",
    "approval", "caring", "confusion", "curiosity",
    "desire", "disappointment", "disapproval", "disgust",
    "embarrassment", "excitement", "fear", "gratitude",
    "grief", "joy", "love", "nervousness",
    "optimism", "pride", "realization", "relief",
    "remorse", "sadness", "surprise", "neutral",
]

# Two-way lookup tables between integer ids and class names.
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}

# Hold out 20% of the sampled rows for validation, stratified on the 'label'
# column, with a fixed seed so the split is repeatable.
train_df, val_df = train_test_split(
    df_sample,
    stratify=df_sample['label'],
    random_state=42,
    test_size=0.2,
)

# Convert both frames to `datasets.Dataset`, dropping the shuffled pandas
# index first so each Dataset starts from a clean 0..n-1 index.
train_dataset, val_dataset = (
    Dataset.from_pandas(frame.reset_index(drop=True))
    for frame in (train_df, val_df)
)

In [28]:
# Checkpoint used for both the tokenizer and the classifier.
model_name = "bert-base-uncased"
print(f"Loading model and tokenizer: {model_name}")

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Sequence-classification model with one output per emotion label; the
# id/name maps are handed to from_pretrained alongside the label count.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    label2id=label2id,
    id2label=id2label,
    num_labels=len(labels),
)


Loading model and tokenizer: bert-base-uncased


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
def tokenize_function(examples):
    """Tokenize the 'text' field of a batch with the notebook-level tokenizer.

    Sequences are truncated and padded to a fixed length of 128.

    Args:
        examples: Mapping with a 'text' entry holding the inputs to encode.

    Returns:
        Whatever the tokenizer call returns for that batch.
    """
    return tokenizer(
        examples['text'],
        max_length=128,
        truncation=True,
        padding="max_length",
    )

print("Tokenizing datasets...")
# Run the tokenizer over both splits in batched mode (train first, then val).
train_tokenized, val_tokenized = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset)
)

Tokenizing datasets...


Map:   0%|          | 0/59849 [00:00<?, ? examples/s]

Map:   0%|          | 0/14963 [00:00<?, ? examples/s]

In [30]:

def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 from an evaluation prediction pair.

    Args:
        eval_pred: A pair that unpacks into ``(logits, true_labels)``.
            Assumes logits carry the class dimension on the last axis —
            TODO confirm for models that return extra outputs.

    Returns:
        dict with keys ``"accuracy"`` and ``"f1_weighted"``.
    """
    scores, references = eval_pred
    # Predicted class = index of the highest score along the last axis.
    predicted_ids = np.argmax(scores, axis=-1)
    return {
        "accuracy": accuracy_score(references, predicted_ids),
        "f1_weighted": f1_score(references, predicted_ids, average="weighted"),
    }

In [31]:
# Fine-tuning hyper-parameters: 3 epochs, batch size 32 for train and eval,
# loss logged every 100 steps, one checkpoint saved per epoch under ./results,
# and no external experiment tracker.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # fp16 mixed precision needs a CUDA device. Enabling it unconditionally
    # makes this cell fail on a CPU-only runtime, so only request it when a
    # GPU is actually visible to PyTorch.
    fp16=torch.cuda.is_available(),
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="epoch",
    report_to="none"
)


# Wire the model, hyper-parameters, tokenized splits and metric function
# into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    # `tokenizer=` is deprecated on Trainer in favour of `processing_class`
    # and is slated for removal; this notebook installs transformers with an
    # unpinned `-U`, so use the supported keyword.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)





  trainer = Trainer(


In [32]:
# Run the fine-tuning loop configured on `trainer`; the value returned by
# train() is not kept.
print("Starting training... 🚀")
trainer.train()
print("Training finished! ✅")


# Evaluate with the trainer's configured eval dataset and print the
# resulting metrics dict.
print("\nEvaluating the final model on the validation set...")
eval_results = trainer.evaluate()
print(f"Final Evaluation Results: {eval_results}")

Starting training... 🚀


Step,Training Loss
100,2.8807
200,2.5928
300,2.4372
400,2.316
500,2.2424
600,2.1819
700,2.1579
800,2.1585
900,2.1406
1000,2.1106


Training finished! ✅

Evaluating the final model on the validation set...


Final Evaluation Results: {'eval_loss': 1.99428391456604, 'eval_accuracy': 0.4119494753725857, 'eval_f1_weighted': 0.38941764902890247, 'eval_runtime': 53.7453, 'eval_samples_per_second': 278.406, 'eval_steps_per_second': 8.708, 'epoch': 3.0}
