In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
from datasets import load_dataset
emotions = load_dataset("emotion")

In [None]:
emotions

In [None]:
train_dataset = emotions['train']

In [None]:
train_dataset

In [None]:
train_dataset[0]

In [None]:
train_dataset[2323]

In [None]:
train_dataset.features

In [None]:
emotions.set_format(type='pandas')

In [None]:
df = emotions['train'][:]

In [None]:
df.head()

In [None]:
df.shape

In [None]:
def label_int2str(row):
    """Translate an integer class id into its label name.

    Despite the parameter name, the value received is a single entry of a
    label column (this function is handed to ``Series.apply``), not a full
    row. The lookup goes through the ``label`` feature of the ``train``
    split of the module-level ``emotions`` dataset.
    """
    label_feature = emotions["train"].features["label"]
    return label_feature.int2str(row)

In [None]:
df["label_name"] = df["label"].apply(label_int2str)
df.head()

In [None]:
df["label_name"].value_counts(ascending=True).plot.barh()
plt.title("Frequency of Classes")
plt.show()

In [None]:
df["Words Per Tweet"] = df["text"].str.split().apply(len)
df.boxplot("Words Per Tweet", by="label_name", grid=False,
 showfliers=False, color="black")
plt.suptitle("")
plt.xlabel("")
plt.show()

In [None]:
emotions.reset_format()

In [None]:
# Character-level tokenization: every character of the string becomes one token.
t = "Tokenizing text is a core task of NLP."
tokenized_text = [char for char in t]
print(tokenized_text)

In [None]:
# Vocabulary: each distinct token gets an integer id, assigned in sorted order.
vocabulary = sorted(set(tokenized_text))
token2idx = dict(zip(vocabulary, range(len(vocabulary))))
print(token2idx)

In [None]:
input_idx = [token2idx[token] for token in tokenized_text]
print(input_idx)

In [None]:
# Toy categorical table: three names, each paired with an integer label id.
names = ["Bumblebee", "Optimus Prime", "Megatron"]
label_ids = [0, 1, 2]
categorical_df = pd.DataFrame({"Name": names, "Label ID": label_ids})
categorical_df

In [None]:
#one hot
pd.get_dummies(categorical_df["Name"])

In [None]:
import torch
import torch.nn.functional as F

In [None]:
input_ids = torch.tensor(input_idx)
one_hot_encoded = F.one_hot(input_ids, num_classes=len(token2idx))
one_hot_encoded.shape

In [None]:
one_hot_encoded

In [None]:
#Word Tokenization
tokenized_text = t.split()
print(tokenized_text)

In [None]:
from transformers import BertTokenizer # for BERT tokenization
from transformers import AutoTokenizer # for other models, e.g. DistilBERT

In [None]:
model_ckpt = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [None]:
encoder_text = tokenizer(t)
print(encoder_text)

In [None]:
tokens = tokenizer.convert_ids_to_tokens(encoder_text.input_ids)
print(tokens)

In [None]:
print(tokenizer.convert_tokens_to_string(tokens))

In [None]:
tokenizer.vocab_size

In [None]:
tokenizer.model_max_length

In [None]:
tokenizer.model_input_names

In [None]:
# whole dataset
def tokenize(batch):
  """Run the module-level ``tokenizer`` over the ``"text"`` entry of ``batch``.

  Padding and truncation are both switched on in the tokenizer call, and
  the tokenizer's output is returned unchanged.
  """
  texts = batch["text"]
  return tokenizer(texts, padding=True, truncation=True)

In [None]:
print(tokenize(emotions["train"][:2]))

In [None]:
emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)

In [None]:
emotions_encoded

In [None]:
print(emotions_encoded["train"].column_names)

### Transformers as Feature Extractors

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

In [None]:
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt).to(device)

In [None]:
# Encode a text
text = "ROS 2 is a communication system that allows for seamless communication between the various parts of a robot."
inputs = tokenizer(text, return_tensors="pt").to(device)
print(inputs)

In [None]:
with torch.no_grad():
  outputs = model(**inputs)

In [None]:
# CLS token representation (embedding vector)
cls_embedding = outputs.last_hidden_state[:,0,:]
print(cls_embedding.shape)

In [None]:
outputs.last_hidden_state[:,0].size()

In [None]:
def extract_hidden_states(batch):
  """Return the position-0 last hidden state for every example in ``batch``.

  Only the entries of ``batch`` whose key is listed in
  ``tokenizer.model_input_names`` are forwarded to the module-level
  ``model``. The result is a dict with the single key ``"hidden_state"``
  holding a NumPy array.
  """
  # Keep just the model inputs and move them onto the target device.
  model_inputs = {}
  for name, tensor in batch.items():
    if name in tokenizer.model_input_names:
      model_inputs[name] = tensor.to(device)

  # Forward pass without gradient tracking.
  with torch.no_grad():
    hidden = model(**model_inputs).last_hidden_state

  # Index 0 along the second axis; moved to CPU before converting to NumPy.
  first_position = hidden[:, 0].cpu().numpy()
  return {"hidden_state": first_position}

In [None]:
emotions_encoded.set_format('torch',columns =
                            ["input_ids", "attention_mask", "label"])

In [None]:
emotions_hidden = emotions_encoded.map(extract_hidden_states,batched=True)

In [None]:
emotions_hidden["train"].column_names

In [None]:
import numpy as np
X_train = np.array(emotions_hidden["train"]["hidden_state"])
X_valid = np.array(emotions_hidden["validation"]["hidden_state"])
y_train = np.array(emotions_hidden["train"]["label"])
y_valid = np.array(emotions_hidden["validation"]["label"])
X_train.shape, X_valid.shape

In [None]:
from umap import UMAP
from sklearn.preprocessing import MinMaxScaler
# Scale features to [0,1] range
X_scaled = MinMaxScaler().fit_transform(X_train)
# Initialize and fit UMAP. UMAP is stochastic, so pin `random_state` to get
# the same 2D projection on every Restart & Run All.
mapper = UMAP(n_components=2, metric="cosine", random_state=42).fit(X_scaled)
# Create a DataFrame of 2D embeddings
df_emb = pd.DataFrame(mapper.embedding_, columns=["X", "Y"])
df_emb["label"] = y_train # Add the label column from y_train

In [None]:
# One hexbin density panel per class, laid out on a 2x3 grid.
fig, axes = plt.subplots(2, 3, figsize=(7,5))
axes = axes.flatten()
cmaps = ["Greys", "Blues", "Oranges", "Reds", "Purples", "Greens"]
labels = emotions["train"].features["label"].names
for i, (label, cmap) in enumerate(zip(labels, cmaps)):
  # Rows of the 2D embedding that belong to class id `i`
  df_emb_sub = df_emb.query(f"label == {i}")
  axes[i].hexbin(df_emb_sub["X"], df_emb_sub["Y"], cmap=cmap,
                 gridsize=20, linewidths=(0,))
  axes[i].set_title(label)

# Keep the panel titles clear of neighbouring tick labels, then render the
# figure explicitly like the other plotting cells do.
plt.tight_layout()
plt.show()


In [None]:
from sklearn.linear_model import LogisticRegression
# We increase `max_iter` to guarantee convergence
lr_clf = LogisticRegression(max_iter=3000)
lr_clf.fit(X_train, y_train)
lr_clf.score(X_valid, y_valid)

In [None]:
from sklearn.dummy import DummyClassifier
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train, y_train)
dummy_clf.score(X_valid, y_valid)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
def plot_confusion_matrix(y_preds, y_true, labels):
    """Draw a confusion matrix normalized over the true labels.

    Args:
        y_preds: predicted class ids.
        y_true: reference class ids.
        labels: display labels passed to ``ConfusionMatrixDisplay``.
    """
    # normalize="true" is forwarded to sklearn's confusion_matrix.
    normalized = confusion_matrix(y_true, y_preds, normalize="true")
    _, ax = plt.subplots(figsize=(6, 6))
    ConfusionMatrixDisplay(
        confusion_matrix=normalized, display_labels=labels
    ).plot(cmap="Blues", values_format=".2f", ax=ax, colorbar=False)
    plt.title("Normalized confusion matrix")
    plt.show()
y_preds = lr_clf.predict(X_valid)
plot_confusion_matrix(y_preds, y_valid, labels)

## Fine-Tuning Transformers

In [None]:
# Unlike the plain AutoModel used in the feature-based approach, this class has a
# classification head on top of the pretrained model outputs, which can be
# easily trained together with the base model.
from transformers import AutoModelForSequenceClassification
# Read the class count from the dataset's label feature instead of hard-coding
# it, so the head size always matches the data.
num_labels = emotions["train"].features["label"].num_classes
model = (AutoModelForSequenceClassification.from_pretrained(
    model_ckpt, num_labels=num_labels).to(device))

In [None]:
from sklearn.metrics import accuracy_score, f1_score
def compute_metrics(pred):
  """Compute accuracy and weighted F1 from a prediction object.

  Reads ``pred.label_ids`` as the reference labels and uses the argmax over
  the last axis of ``pred.predictions`` as the predicted class.

  Returns:
      dict with the keys ``"accuracy"`` and ``"f1"``.
  """
  y_true = pred.label_ids
  y_hat = pred.predictions.argmax(-1)
  return {
      "accuracy": accuracy_score(y_true, y_hat),
      "f1": f1_score(y_true, y_hat, average="weighted"),
  }

In [None]:
from huggingface_hub import notebook_login
notebook_login()

In [None]:
from transformers import Trainer, TrainingArguments

batch_size = 64
# Number of full batches that fit into the training split.
logging_steps = len(emotions_encoded["train"]) // batch_size
model_name = f"{model_ckpt}-finetuned-emotion"
training_args = TrainingArguments(
    output_dir=model_name,
    # Optimisation settings
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluation and logging
    eval_strategy="epoch",
    logging_steps=logging_steps,
    log_level="error",
    disable_tqdm=False,
    # Publishing
    push_to_hub=True,
)

In [None]:
from transformers import Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=emotions_encoded["train"],
    eval_dataset=emotions_encoded["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
# The trailing semicolon keeps the notebook from echoing train()'s return value.
trainer.train();

In [None]:
preds_output = trainer.predict(emotions_encoded["validation"])

In [None]:
preds_output.metrics

In [None]:
y_preds = np.argmax(preds_output.predictions, axis=1)

In [None]:
plot_confusion_matrix(y_preds, y_valid, labels)

In [None]:
from torch.nn.functional import cross_entropy
def forward_pass_with_label(batch):
    """Run the model on ``batch`` and return per-example loss and prediction.

    Only the entries of ``batch`` listed in ``tokenizer.model_input_names``
    are passed to the module-level ``model``; ``batch["label"]`` provides
    the targets for the loss.

    Returns:
        dict with the keys ``"loss"`` and ``"predicted_label"``, both NumPy
        arrays.
    """
    # Place all input tensors on the same device as the model
    model_inputs = {}
    for key, value in batch.items():
        if key in tokenizer.model_input_names:
            model_inputs[key] = value.to(device)
    targets = batch["label"].to(device)

    with torch.no_grad():
        logits = model(**model_inputs).logits
        predicted = torch.argmax(logits, axis=-1)
        # reduction="none" is passed so the loss is not reduced to one scalar
        per_example_loss = cross_entropy(logits, targets, reduction="none")

    # Place outputs on CPU for compatibility with other dataset columns
    return {"loss": per_example_loss.cpu().numpy(),
            "predicted_label": predicted.cpu().numpy()}

In [None]:
# Convert our dataset back to PyTorch tensors
emotions_encoded.set_format("torch",
 columns=["input_ids", "attention_mask", "label"])
# Compute loss values
emotions_encoded["validation"] = emotions_encoded["validation"].map(
 forward_pass_with_label, batched=True, batch_size=16)

In [None]:
# Switch the dataset output format to pandas and pull the columns of interest.
emotions_encoded.set_format("pandas")
cols = ["text", "label", "predicted_label", "loss"]
df_test = emotions_encoded["validation"][:][cols]
# Replace the integer class ids with their names in both label columns.
for col in ("label", "predicted_label"):
    df_test[col] = df_test[col].apply(label_int2str)

In [None]:
df_test.sort_values("loss", ascending=False).head(10) # highest losses

In [None]:
df_test.sort_values("loss", ascending=True).head(10)

In [None]:
trainer.push_to_hub(commit_message="Training completed!")

In [None]:
# Load the fine-tuned model from the Hugging Face Hub
from transformers import pipeline
# Change `nithinganesh` to your own Hub username
model_id = "nithinganesh/distilbert-base-uncased-finetuned-emotion"
classifier = pipeline("text-classification", model=model_id)

In [None]:
custom_tweet = "I saw a movie today and it was really good."
# Request the score of every class rather than only the top one.
# NOTE(review): `return_all_scores` is reported as deprecated in newer
# transformers releases in favour of `top_k` — confirm against the installed
# version. The following cell indexes `preds[0]` and plots the scores against
# `labels`, so verify output nesting and ordering before switching arguments.
preds = classifier(custom_tweet, return_all_scores=True)

In [None]:
# Bar chart of the per-class scores for the custom tweet, shown as percentages.
preds_df = pd.DataFrame(preds[0])
score_percent = 100 * preds_df["score"]
plt.bar(labels, score_percent, color='C0')
plt.ylabel("Class probability (%)")
plt.title(f'"{custom_tweet}"')
plt.show()