# 2 Text Classification

Since 2022 (the publication date), a lot has changed at Hugging Face. There are now thousands of datasets on the Hub, so listing all of them is not very meaningful.

If you still want to do this, the current approach is to use the `huggingface_hub` function `list_datasets`, which returns a generator.

In [None]:
from huggingface_hub import list_datasets

# Request the dataset listing via huggingface_hub; the result is only stored here, not iterated.
all_datasets = list_datasets()

Different data types have specific methods.

In [None]:
from datasets import load_dataset

# Load the "emotion" dataset; later cells index its "train", "validation" and "test" splits.
emotions = load_dataset("emotion")

# List the public attributes of the label feature. Only names with a leading
# underscore are hidden, so public snake_case names are kept in the listing.
methods = [name for name in dir(emotions["train"].features["label"]) if not name.startswith("_")]
print(f"{methods=}")

# Example: convert a label name to its integer id.
emotions["train"].features["label"].str2int("joy")

In [None]:
# Switch the dataset's output format to pandas so slicing yields a DataFrame.
emotions.set_format(type="pandas")
# Take every row of the train split.
df = emotions["train"][:]
df

In [None]:
def label2str(label):
    """Translate an integer label id into its name via the train split's label feature."""
    label_feature = emotions["train"].features["label"]
    return label_feature.int2str(label)


# Attach a readable name next to every integer label id.
df["label_name"] = df["label"].apply(label2str)
df

In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10, 5), dpi=300)

# Horizontal bars, one per label, ordered from the rarest label to the most common.
df["label_name"].value_counts(ascending=True).plot.barh(ax=ax)

ax.set_title("Label Frequency")
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(10, 5), dpi=300)

# Number of whitespace-separated tokens in each text.
df["token_size"] = df["text"].str.split().str.len()
df.boxplot(column="token_size", by="label_name", grid=False, ax=ax, color="black")

# Blank out the figure-level title and label the axes explicitly.
fig.suptitle("")
ax.set_xlabel("Label Name")
ax.set_ylabel("Token Size")
ax.set_title("Token Size Distribution by Labels")
plt.show()

In [None]:
# Undo the pandas output format that was set for the exploration above.
emotions.reset_format()

## Tokenization

In [None]:
from transformers import AutoTokenizer

# Checkpoint name used for the tokenizer and for the models loaded further down.
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
text = "Tokenizing text is a core task of NLP."
print(f"{text=}")

# Encode the sentence and keep only the token ids.
text_dict = tokenizer(text)
text_ids = text_dict["input_ids"]
print(f"{text_ids=}")

# Map the ids back to their token strings.
tokens = tokenizer.convert_ids_to_tokens(text_ids)
print(f"{tokens=}")

# Join the tokens back into a single string.
token2string = tokenizer.convert_tokens_to_string(tokens)
print(f"{token2string=}")

In [None]:
# Show the tokenizer's vocabulary size, maximum model length and the input names the model expects.
print(f"{tokenizer.vocab_size=} and {tokenizer.model_max_length=} and {tokenizer.model_input_names=}")

In [None]:
def tokenize_batch(batch):
    """Tokenize the ``"text"`` entries of a batch with padding and truncation enabled."""
    texts = batch["text"]
    encoded = tokenizer(texts, padding=True, truncation=True)
    return encoded


# Sanity check on the first two training examples.
print(tokenize_batch(emotions["train"][:2]))

Generally, we don't tokenize with `batch_size=None`, because every text is then padded to the longest sequence in the entire dataset. However, in this example we use the hidden states as features, and to generate them we run inference in batches, which requires all sequences to have the same length.

In [None]:
# batch_size=None passes the data to tokenize_batch as one batch instead of in
# chunks; load_from_cache_file=False forces the tokenization to be recomputed.
emotions_encoded = emotions.map(tokenize_batch, batched=True, batch_size=None, load_from_cache_file=False)
emotions_encoded

In [None]:
from transformers import AutoModel
import torch

# Reuse the checkpoint name chosen for the tokenizer so the tokenizer, the model
# and every name derived from `model_ckpt` always refer to the same checkpoint.
model_ckpt = model_checkpoint

# Pick the first available accelerator: Apple MPS, then CUDA, otherwise CPU.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the bare encoder (no task head) and move it to the selected device.
model = AutoModel.from_pretrained(model_ckpt).to(device)
print(model)

In [None]:
text = "this is a test"
# Ask for PyTorch tensors and move them to the same device as the model.
inputs = tokenizer(text, return_tensors="pt").to(device)

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    outputs = model(**inputs)
outputs

In [None]:
# Expose only the model inputs and the label, as torch tensors.
emotions_encoded.set_format("torch", columns=["input_ids", "attention_mask", "label"])

The dimensions of `last_hidden_state` are `(B, Si, D)`, where B is the batch, Si the sequence index and D the embedding dimension. The zeroth sequence element is the `[CLS]` token.

For multi-dimensional arrays, omitting the trailing dimensions implies taking all of them. For example, with 4 dimensions `[:, 0]` is equivalent to `[:, 0, :, :]`.

In [None]:
def extract_hidden_states(batch):
    """Return the hidden state at sequence position 0 for every example in a batch.

    Only the columns named in ``tokenizer.model_input_names`` are sent to the
    model. The result is a dict with a single ``"hidden_state"`` NumPy array.
    Note that the global ``model`` is looked up at call time.
    """
    model_inputs = {}
    for name, tensor in batch.items():
        if name in tokenizer.model_input_names:
            model_inputs[name] = tensor.to(device)

    # No gradients are needed for feature extraction.
    with torch.no_grad():
        hidden = model(**model_inputs).last_hidden_state

    # Keep position 0 of each sequence and bring it back to the CPU as NumPy.
    first_position = hidden[:, 0, :].cpu().numpy()
    return {"hidden_state": first_position}

In [None]:
# Add a "hidden_state" column; batch_size is not passed, so the library default is used.
emotions_hidden = emotions_encoded.map(extract_hidden_states, batched=True)

In [None]:
import numpy as np

# For each split: x_* holds the extracted hidden states, y_* the labels.
x_train = np.array(emotions_hidden["train"]["hidden_state"])
y_train = np.array(emotions_hidden["train"]["label"])

x_valid = np.array(emotions_hidden["validation"]["hidden_state"])
y_valid = np.array(emotions_hidden["validation"]["label"])

x_test = np.array(emotions_hidden["test"]["hidden_state"])
y_test = np.array(emotions_hidden["test"]["label"])

In [None]:
from umap import UMAP
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

# Min-max scale the training features before projecting them.
x_scaled = MinMaxScaler().fit_transform(x_train)
# Fit a 2-component UMAP projection with the cosine metric.
mapper = UMAP(n_components=2, metric="cosine").fit(x_scaled)
# Collect the 2-D coordinates together with each point's label.
df_emb = pd.DataFrame(mapper.embedding_, columns=["x", "y"])
df_emb["label"] = y_train
df_emb.head()

In [None]:
import matplotlib.pyplot as plt

# One panel per class on a 2x3 grid, each with its own colour map.
fig, axes = plt.subplots(2, 3, figsize=(7, 5), dpi=500)
axes = axes.flatten()
cmaps = ["Greys", "Blues", "Oranges", "Reds", "Purples", "Greens"]
labels = emotions["train"].features["label"].names

# Panel i shows the points whose label id equals i, titled with the i-th label name.
for i, (label, cmap) in enumerate(zip(labels, cmaps)):
    df_emb_sub = df_emb[df_emb.label == i]
    # NOTE(review): linewidths=(20,) is unusually large for hexbin edges - confirm it is intended.
    axes[i].hexbin(df_emb_sub["x"], df_emb_sub["y"], cmap=cmap, gridsize=20, linewidths=(20,))
    axes[i].set_title(label)
    # The projected coordinates carry no meaning of their own, so ticks are hidden.
    axes[i].set_xticks([])
    axes[i].set_yticks([])

plt.tight_layout()
plt.show()

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the hidden-state features (up to 3000 iterations).
lr_clf = LogisticRegression(max_iter=3000)
lr_clf.fit(x_train, y_train)
# Score the classifier on the validation split.
lr_clf.score(x_valid, y_valid)

In [None]:
from sklearn.dummy import DummyClassifier

# Baseline for comparison: always predict the most frequent training label.
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(x_train, y_train)
dummy_clf.score(x_valid, y_valid)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix


def draw_confusion_matrix(y_true, y_preds, labels):
    """Plot a confusion matrix normalized over the true labels.

    Args:
        y_true: ground-truth label ids.
        y_preds: predicted label ids.
        labels: display names used for the matrix axes.
    """
    normalized_cm = confusion_matrix(y_true, y_preds, normalize="true")
    display = ConfusionMatrixDisplay(confusion_matrix=normalized_cm, display_labels=labels)
    display.plot(cmap="Blues", values_format=".2f", colorbar=False)
    plt.title("Normalized Confusion Matrix")
    plt.show()


# Validation predictions of the logistic-regression classifier.
y_pred = lr_clf.predict(x_valid)

In [None]:
# Confusion matrix for the logistic-regression predictions on the validation split.
draw_confusion_matrix(y_valid, y_pred, labels)

In [None]:
from transformers import AutoModelForSequenceClassification

# One output per label name of the dataset.
num_labels = len(emotions["train"].features["label"].names)
# NOTE: this rebinds the global `model`; extract_hidden_states reads that global,
# so calling it after this cell would use the classification model instead.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels).to(device)
model

In [None]:
from sklearn.metrics import accuracy_score, f1_score
from transformers import TrainingArguments
import os

# Make sure the local output directory exists; no error if it already does.
os.makedirs("./models", exist_ok=True)


def compute_metrics(pred):
    """Return accuracy and weighted F1 for a prediction object.

    ``pred`` must expose ``label_ids`` (the true labels) and ``predictions``
    (per-class scores); the predicted class is the argmax over the last axis.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_hat),
        "f1": f1_score(y_true, y_hat, average="weighted"),
    }


batch_size = 64
# Number of training rows divided by the batch size (integer division).
logging_steps = len(emotions_encoded["train"]) // batch_size
# Output directory, placed under the local models/ folder.
model_name = f"models/{model_ckpt}-finetuned-emotion"

training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=2,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    # NOTE(review): newer transformers releases name this argument `eval_strategy` - confirm against the installed version.
    evaluation_strategy="epoch",
    disable_tqdm=False,
    logging_steps=logging_steps,
    push_to_hub=False,
    log_level="error",
)

In [None]:
from transformers import Trainer

# Fine-tune on the train split, evaluating on the validation split with compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=emotions_encoded["train"],
    eval_dataset=emotions_encoded["validation"],
    tokenizer=tokenizer,
)
trainer.train()

In [None]:
# Run the trainer's model on the validation split and show the resulting metrics.
preds_output = trainer.predict(emotions_encoded["validation"])
preds_output.metrics

In [None]:
# Print the full prediction output object.
print(preds_output)

In [None]:
# Predicted class = index of the largest score in each row.
y_preds = preds_output.predictions.argmax(axis=1)
draw_confusion_matrix(y_valid, y_preds, labels)

In [None]:
from torch.nn.functional import cross_entropy


def forward_pass_with_label(batch):
    """Compute the per-example loss and predicted class for a batch.

    Args:
        batch: mapping of column name to tensors. It must contain the model
            inputs named in ``tokenizer.model_input_names`` and a ``"label"``
            tensor.

    Returns:
        dict with ``"loss"`` (unreduced cross-entropy, one value per example)
        and ``"predicted_label"`` (argmax over the logits), both as NumPy
        arrays.
    """
    inputs = {k: v.to(device) for k, v in batch.items() if k in tokenizer.model_input_names}
    # Put the model in eval mode explicitly so dropout is disabled regardless
    # of the mode earlier cells left it in.
    model.eval()
    with torch.no_grad():
        output = model(**inputs)
        pred_label = torch.argmax(output.logits, dim=-1)
        # reduction="none" keeps one loss value per example instead of averaging.
        loss = cross_entropy(output.logits, batch["label"].to(device), reduction="none")

    return {"loss": loss.cpu().numpy(), "predicted_label": pred_label.cpu().numpy()}


In [None]:
# Expose the label and the model inputs as torch tensors for the forward pass.
emotions_encoded.set_format("torch", columns=["label", "input_ids", "attention_mask"])
# Add per-example "loss" and "predicted_label" columns to the validation split.
emotions_encoded["validation"] = emotions_encoded["validation"].map(
    forward_pass_with_label, batch_size=64, batched=True
)

In [None]:
# Do not truncate long text cells when DataFrames are displayed.
pd.set_option("display.max_colwidth", None)


emotions_encoded.set_format("pandas")
cols = ["text", "label", "predicted_label", "loss"]
# Despite its name, df_test is built from the validation split.
df_test = emotions_encoded["validation"][:][cols]
# Add readable names for the true and the predicted label ids.
df_test["label_name"] = df_test.label.apply(label2str)
df_test["predicted_label_name"] = df_test.predicted_label.apply(label2str)

df_test

In [None]:
# Order the rows by loss; no direction is passed, so pandas' default (ascending) applies.
df_test.sort_values("loss")

In [None]:
from transformers import pipeline

# Build a text-classification pipeline from a checkpoint directory on local disk.
# NOTE(review): presumably written by the training run above - confirm that checkpoint-500 exists.
model_id = "models/distilbert-base-uncased-finetuned-emotion/checkpoint-500"
classifier = pipeline(task="text-classification", model=model_id)
classifier

In [None]:
scores = classifier("This is abysmal! I have never seen such a pointless movie in my life.", return_all_scores=True)

# Element [0] of the pipeline result is turned into a DataFrame for the single input text.
scores = pd.DataFrame(scores[0])
scores
# Bars show the scores as percentages.
# NOTE(review): assumes the scores come back in the same order as `labels` - confirm.
plt.bar(labels, 100 * scores["score"], color="C0")

In [None]:
# Predict on the first ten training examples.
# NOTE(review): emotions_encoded was switched to the "pandas" format in an earlier cell - confirm trainer.predict works with that format.
result = trainer.predict(emotions_encoded["train"].select(range(10)))
print(result)