In [None]:
%%capture
%pip install datasets[audio]==1.16.1 umap-learn==0.5.1 datasets[s3] transformers[tf,torch,sentencepiece,vision,optuna,sklearn,onnxruntime]==4.11.3

In [None]:
#hide
# NOTE(review): the wildcard import hides which names come from `utils`;
# `setup_chapter` is the only one called in this cell.
from utils import *
setup_chapter()

# Text Classification

## The Dataset

### A First Look at Hugging Face Datasets

In [None]:
from datasets import list_datasets

# Fetch the listing returned by `list_datasets()`, then report its length and
# its first ten entries.
all_datasets = list_datasets()
print(f"There are {len(all_datasets)} datasets currently available on the Hub")
print(f"The first 10 are: {all_datasets[:10]}")

In [None]:
# hide_output
from datasets import load_dataset

# Load the "emotion" dataset; later cells index the result by split name
# (e.g. "train").
emotions = load_dataset("emotion")

In [None]:
# Display the loaded dataset object.
emotions

In [None]:
# Keep a handle on the training split and display it.
train_ds = emotions["train"]
train_ds

In [None]:
# Length of the training split.
len(train_ds)

In [None]:
# First item of the training split.
train_ds[0]

In [None]:
# Column names of the training split.
train_ds.column_names

In [None]:
# Print the `features` attribute of the training split.
print(train_ds.features)

In [None]:
# Slice the first five items of the training split.
print(train_ds[:5])

In [None]:
# First five entries of the "text" column.
print(train_ds["text"][:5])

### From Datasets to DataFrames

In [None]:
import pandas as pd

# Switch the dataset's output format to pandas, then materialize the whole
# training split as `df` and preview its first rows.
emotions.set_format(type="pandas")
df = emotions["train"][:]
df.head()

In [None]:
def label_int2str(row):
    """Translate an integer class id into its label name.

    The lookup goes through the ``label`` feature of the ``emotions``
    training split, so ``emotions`` must already be loaded.
    """
    label_feature = emotions["train"].features["label"]
    return label_feature.int2str(row)

# Add a readable label-name column next to the integer ids and preview it.
df["label_name"] = df["label"].apply(label_int2str)
df.head()

### Looking at the Class Distribution

In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of the number of examples per class, with the counts
# sorted in ascending order.
df["label_name"].value_counts(ascending=True).plot.barh()
plt.title("Frequency of Classes")
plt.show()

### How Long Are Our Tweets?

In [None]:
# Words per tweet: split each text on whitespace and count the pieces.
df["Words Per Tweet"] = df["text"].str.split().apply(len)
# One box per class; showfliers=False leaves outlier points out of the plot.
df.boxplot("Words Per Tweet", by="label_name", grid=False, showfliers=False,
           color="black")
# Blank out the figure-level title and the x-axis label.
plt.suptitle("")
plt.xlabel("")
plt.show()

In [None]:
# Reset the output format that was switched to pandas earlier via `set_format`.
emotions.reset_format()

## From Text to Tokens

### Character Tokenization

In [None]:
text = "Tokenizing text is a core task of NLP."
# Character-level tokenization: one token per character of the string.
tokenized_text = [char for char in text]
print(tokenized_text)

In [None]:
# Map each distinct character to an integer id; ids follow the sorted order of
# the characters.
token2idx = {ch: idx for idx, ch in enumerate(sorted(set(tokenized_text)))}
print(token2idx)

In [None]:
# Encode the token sequence as a list of integer ids via `token2idx`.
input_ids = [token2idx[token] for token in tokenized_text]
print(input_ids)

In [None]:
# Toy table pairing each name with an integer label id.
categorical_df = pd.DataFrame({
    "Name": ["Bumblebee", "Optimus Prime", "Megatron"],
    "Label ID": list(range(3)),
})
categorical_df

In [None]:
# One-hot encode the "Name" column (one indicator column per distinct name).
pd.get_dummies(categorical_df["Name"])

In [None]:
import torch
import torch.nn.functional as F

# Convert the id list to a tensor and one-hot encode it with one class per
# entry of `token2idx`.
# NOTE(review): `input_ids` is rebound from a list to a tensor here, so
# re-running this cell passes a tensor (not the original list) to torch.tensor.
input_ids = torch.tensor(input_ids)
one_hot_encodings = F.one_hot(input_ids, num_classes=len(token2idx))
one_hot_encodings.shape

In [None]:
# Inspect the first token, its id, and its one-hot vector.
print(f"Token: {tokenized_text[0]}")
print(f"Tensor index: {input_ids[0]}")
print(f"One-hot: {one_hot_encodings[0]}")

### Word Tokenization

In [None]:
# Word-level tokenization: split the sentence on whitespace.
# Note that this rebinds `tokenized_text`, which previously held characters.
tokenized_text = text.split()
print(tokenized_text)

### Subword Tokenization

In [None]:
# hide_output
from transformers import AutoTokenizer

# Load the tokenizer for the checkpoint named by `model_ckpt`.
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [None]:
# Run the tokenizer on the sample sentence and show the resulting encoding.
encoded_text = tokenizer(text)
print(encoded_text)

In [None]:
# Map the input ids back to their token strings.
tokens = tokenizer.convert_ids_to_tokens(encoded_text.input_ids)
print(tokens)

In [None]:
# Join the token strings back into a single string.
print(tokenizer.convert_tokens_to_string(tokens))

In [None]:
# Vocabulary size reported by the tokenizer.
tokenizer.vocab_size

In [None]:
# Maximum length reported by the tokenizer for its model.
tokenizer.model_max_length

In [None]:
# Names of the model input fields; later cells use this list to pick which
# batch columns are passed to the model.
tokenizer.model_input_names

### Tokenizing the Whole Dataset

In [None]:
def tokenize(batch):
    """Tokenize the ``"text"`` entries of ``batch``.

    Calls the notebook-level ``tokenizer`` with padding and truncation
    enabled and returns its output unchanged.
    """
    texts = batch["text"]
    return tokenizer(texts, padding=True, truncation=True)

In [None]:
# Try `tokenize` on the first two training examples.
print(tokenize(emotions["train"][:2]))

In [None]:
#hide_input
# Table of the tokenizer's special tokens, ordered by token id.
# Dedicated names are used so this display-only table does not overwrite the
# training DataFrame `df` built earlier in the notebook, which keeps the
# earlier cells re-runnable.
special_tokens = sorted(
    zip(tokenizer.all_special_tokens, tokenizer.all_special_ids),
    key=lambda pair: pair[-1])
special_tokens_df = pd.DataFrame(
    special_tokens, columns=["Special Token", "Special Token ID"])
special_tokens_df.T

In [None]:
# hide_output
# Apply `tokenize` to every split in batched mode.
# batch_size=None hands each split to `tokenize` as one batch (per the
# datasets `map` documentation), so padding is computed over the whole split.
emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)

In [None]:
# Columns of the training split after tokenization.
print(emotions_encoded["train"].column_names)

## Training a Text Classifier

### Transformers as Feature Extractors

#### Using pretrained models

In [None]:
# hide_output
from transformers import AutoModel

model_ckpt = "distilbert-base-uncased"
# Use the GPU when one is available, otherwise fall back to the CPU, and place
# the pretrained model on that device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModel.from_pretrained(model_ckpt).to(device)

#### Extracting the last hidden states

In [None]:
# Tokenize a short test string; return_tensors="pt" requests PyTorch tensors.
# Note that this rebinds `text`, which held the earlier sample sentence.
text = "this is a test"
inputs = tokenizer(text, return_tensors="pt")
print(f"Input tensor shape: {inputs['input_ids'].size()}")

In [None]:
# Move every input tensor to `device`, then run a forward pass with gradient
# tracking disabled.
inputs = {k:v.to(device) for k,v in inputs.items()}
with torch.no_grad():
    outputs = model(**inputs)
print(outputs)

In [None]:
# Size of the last hidden state tensor.
outputs.last_hidden_state.size()

In [None]:
# Size after selecting index 0 along the second dimension (the position the
# notebook later treats as the [CLS] token).
outputs.last_hidden_state[:,0].size()

In [None]:
def extract_hidden_states(batch):
    """Return the hidden state at sequence position 0 for each item in ``batch``.

    Only the columns named in ``tokenizer.model_input_names`` are forwarded to
    the notebook-level ``model``; they are moved to ``device`` first. The
    result is a dict with a single ``"hidden_state"`` entry holding a NumPy
    array.
    """
    # Keep just the model inputs and place them on the target device
    model_inputs = {}
    for name, tensor in batch.items():
        if name in tokenizer.model_input_names:
            model_inputs[name] = tensor.to(device)
    # Forward pass without gradient tracking
    with torch.no_grad():
        hidden = model(**model_inputs).last_hidden_state
    # Position 0 holds the vector for the [CLS] token
    cls_vectors = hidden[:, 0]
    return {"hidden_state": cls_vectors.cpu().numpy()}

In [None]:
# Have the encoded dataset return PyTorch tensors for the listed columns.
emotions_encoded.set_format("torch", 
                            columns=["input_ids", "attention_mask", "label"])

In [None]:
#hide_output
# Run `extract_hidden_states` over every split in batched mode (no batch_size
# is given, so `map` uses its default).
emotions_hidden = emotions_encoded.map(extract_hidden_states, batched=True)

In [None]:
# Columns of the training split after adding the hidden states.
emotions_hidden["train"].column_names

#### Creating a feature matrix

In [None]:
import numpy as np

# Build NumPy feature matrices (hidden states) and target vectors (labels) for
# the train and validation splits, then show the feature matrix shapes.
X_train = np.array(emotions_hidden["train"]["hidden_state"])
X_valid = np.array(emotions_hidden["validation"]["hidden_state"])
y_train = np.array(emotions_hidden["train"]["label"])
y_valid = np.array(emotions_hidden["validation"]["label"])
X_train.shape, X_valid.shape

#### Visualizing the training set

In [None]:
from umap import UMAP
from sklearn.preprocessing import MinMaxScaler

# Scale features to [0,1] range
X_scaled = MinMaxScaler().fit_transform(X_train)
# Initialize and fit UMAP
# NOTE(review): no random_state is passed, so the 2D embedding may differ
# between runs — consider fixing a seed if the figure must be reproducible.
mapper = UMAP(n_components=2, metric="cosine").fit(X_scaled)
# Create a DataFrame of 2D embeddings
df_emb = pd.DataFrame(mapper.embedding_, columns=["X", "Y"])
df_emb["label"] = y_train
df_emb.head()

In [None]:
# One hexbin density panel per class, laid out on a 2x3 grid.
fig, axes = plt.subplots(2, 3, figsize=(7,5))
axes = axes.flatten()
cmaps = ["Greys", "Blues", "Oranges", "Reds", "Purples", "Greens"]
# Class names from the dataset's label feature; the loop below pairs the i-th
# name with integer label i. `labels` is reused by later plotting cells.
labels = emotions["train"].features["label"].names

for i, (label, cmap) in enumerate(zip(labels, cmaps)):
    # Rows of the embedding whose integer label equals this panel's index.
    df_emb_sub = df_emb.query(f"label == {i}")
    axes[i].hexbin(df_emb_sub["X"], df_emb_sub["Y"], cmap=cmap,
                   gridsize=20, linewidths=(0,))
    axes[i].set_title(label)
    # Remove the tick marks on both axes.
    axes[i].set_xticks([]), axes[i].set_yticks([])

plt.tight_layout()
plt.show()

#### Training a simple classifier


In [None]:
#hide_output
# We increase `max_iter` to guarantee convergence 
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression classifier on the extracted hidden states.
lr_clf = LogisticRegression(max_iter=3000)
lr_clf.fit(X_train, y_train)

In [None]:
# Score the classifier on the validation features and labels.
lr_clf.score(X_valid, y_valid)

In [None]:
from sklearn.dummy import DummyClassifier

# Baseline for comparison: a dummy classifier using the "most_frequent"
# strategy, scored on the same validation data.
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train, y_train)
dummy_clf.score(X_valid, y_valid)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

def plot_confusion_matrix(y_preds, y_true, labels):
    """Draw a normalized confusion matrix for the given predictions.

    Args:
        y_preds: Predicted class ids.
        y_true: Ground-truth class ids.
        labels: Display names for the classes, used as tick labels.

    The matrix is computed with ``normalize="true"`` and rendered on a 6x6
    inch figure without a colorbar; the figure is shown immediately.
    """
    normalized_cm = confusion_matrix(y_true, y_preds, normalize="true")
    fig, ax = plt.subplots(figsize=(6, 6))
    display = ConfusionMatrixDisplay(
        confusion_matrix=normalized_cm,
        display_labels=labels,
    )
    display.plot(cmap="Blues", values_format=".2f", ax=ax, colorbar=False)
    ax.set_title("Normalized confusion matrix")
    plt.show()
    
# Predict on the validation features and plot the confusion matrix.
y_preds = lr_clf.predict(X_valid)
plot_confusion_matrix(y_preds, y_valid, labels)

### Fine-Tuning Transformers

#### Loading a pretrained model

In [None]:
# hide_output
from transformers import AutoModelForSequenceClassification

# presumably the number of classes in the emotion dataset — verify against `labels`
num_labels = 6
# NOTE(review): this rebinds `model`, which `extract_hidden_states` also reads;
# that function expects the earlier `AutoModel` instance.
model = (AutoModelForSequenceClassification
         .from_pretrained(model_ckpt, num_labels=num_labels)
         .to(device))

#### Defining the performance metrics

In [None]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Compute accuracy and weighted F1 from a prediction object.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is taken as the
            predicted class).

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_hat),
        "f1": f1_score(y_true, y_hat, average="weighted"),
    }

#### Training the model

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from within the notebook.
notebook_login()

In [None]:
from huggingface_hub import HfFolder

# Hub namespace under which the fine-tuned model is published.
username  = 'simonmesserli' # Replace with your own Hugging Face username.
# Read the locally stored Hub token (presumably saved by the login step above).
hub_token = HfFolder.get_token()

### Training with SageMaker

In [None]:
import sagemaker.huggingface
import sagemaker

sess = sagemaker.Session()
# SageMaker session bucket -> used for uploading data, models and logs.
# SageMaker will automatically create this bucket if it does not exist.
sagemaker_session_bucket=None
if sagemaker_session_bucket is None and sess is not None:
    # Fall back to the session's default bucket when no bucket name is given.
    sagemaker_session_bucket = sess.default_bucket()

role = sagemaker.get_execution_role()
# Re-create the session, now bound explicitly to the chosen bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

In [None]:
import botocore
from datasets.filesystems import S3FileSystem

s3 = S3FileSystem()

# Key prefix inside the session bucket under which both splits are stored.
s3_prefix = 'samples/datasets/02_classification'

# Tokenized splits that are uploaded for the training job.
train_dataset=emotions_encoded["train"]
eval_dataset=emotions_encoded["validation"]

# Save train_dataset to S3
training_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/train'
train_dataset.save_to_disk(training_input_path, fs=s3)

# Save eval_dataset to S3
eval_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/validation'
eval_dataset.save_to_disk(eval_input_path, fs=s3)

In [None]:
# Print the training script via the `pygmentize` command-line tool.
!pygmentize ./scripts/02_classification_train.py

In [None]:
from sagemaker.huggingface import HuggingFace
import time

batch_size = 64
# Number of full batches in the training split, i.e. roughly one log entry per
# epoch.
logging_steps = len(emotions_encoded["train"]) // batch_size
model_name = f"{model_ckpt}-finetuned-emotion"

# Hyperparameters passed into the training job. Each key appears exactly once;
# a repeated key in a dict literal is silently overridden by its last
# occurrence.
# NOTE(review): `hub_token` is sent as a plain hyperparameter and is therefore
# stored with the training job's configuration — consider a secrets mechanism
# instead of passing the token here.
hyperparameters = {
    'model_id': model_ckpt,
    'num_train_epochs': 2,
    'learning_rate': 2e-5,
    'per_device_train_batch_size': batch_size,
    'per_device_eval_batch_size': batch_size,
    'weight_decay': 0.01,
    'evaluation_strategy': "epoch",
    'disable_tqdm': False,
    'logging_steps': logging_steps,
    'push_to_hub': True,
    'hub_model_id': username + '/' + model_name,
    'hub_strategy': "every_save",
    'hub_token': hub_token,
}




In [None]:
# Define the training job name; the current local time is appended as a suffix.
# NOTE(review): "classificaton" in the name looks like a typo for "classification".
job_name = f'nlp-book-sagemaker-02classificaton-{time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())}'

# Create the estimator
huggingface_estimator = HuggingFace(
    entry_point          = '02_classification_train.py', # fine-tuning script used in the training job
    source_dir           = './scripts',                  # directory where the fine-tuning script is stored
    instance_type        = 'ml.p3.2xlarge',              # instance type used for the training job
    instance_count       = 1,                            # number of instances used for training
    base_job_name        = job_name,                     # base name of the training job
    role                 = role,                         # IAM role used in the training job to access AWS resources, e.g. Amazon S3
    transformers_version = '4.11',                       # transformers version used in the training job
    pytorch_version      = '1.9',                        # PyTorch version used in the training job
    py_version           = 'py38',                       # Python version used in the training job
    hyperparameters      = hyperparameters,              # hyperparameters used for running the training job
)

In [None]:
# Define a data input dictionary with our uploaded S3 URIs.
# Note that the validation split is passed under the 'test' key, and that
# `data` reuses a name bound earlier in the notebook.
data = {
    'train': training_input_path,
    'test': eval_input_path
}

# Start the training job with our uploaded datasets as input; wait=True blocks
# until the job finishes.
huggingface_estimator.fit(data, wait=True)

The logs can be found in Amazon CloudWatch: https://console.aws.amazon.com/cloudwatch/home#logsV2:log-groups/log-group/$252Faws$252Fsagemaker$252FTrainingJobs

In [None]:
# The model is saved in the S3 bucket and was also pushed to the Hugging Face Hub.
print(huggingface_estimator.model_data)

In [None]:
from transformers import Trainer, AutoModel

# Load the fine-tuned model from the Hub for local analysis. The repository id
# is built from `username` and `model_name`, the same pieces used for
# `hub_model_id` in the training hyperparameters, so changing the username in
# one place is enough.
model_finetuned = AutoModelForSequenceClassification.from_pretrained(
    f"{username}/{model_name}")

# Wrap the model in a Trainer used only for prediction. `compute_metrics` is
# passed so that `trainer.predict(...)` reports accuracy and F1 in its metrics
# in addition to the loss.
trainer = Trainer(model=model_finetuned, compute_metrics=compute_metrics)

### Deploy model with SageMaker Endpoint

In [None]:
# Deploy the trained model to a SageMaker endpoint: one instance of the given type.
predictor = huggingface_estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.g4dn.xlarge",
)

In [None]:
# Request payload for the endpoint: the text goes under the "inputs" key.
# NOTE(review): `custom_tweet` is a dict here but is rebound to a plain string
# in a later cell.
custom_tweet = {"inputs" : "I saw a movie today and it was really good."}
predictor.predict(custom_tweet)

After running your requests, make sure to delete your endpoint.

In [None]:
# Delete the endpoint created by the deploy step.
predictor.delete_endpoint()

In [None]:
# hide_output
# Run the Trainer's prediction loop over the validation split.
preds_output = trainer.predict(emotions_encoded["validation"])

In [None]:
# Metrics reported for the validation predictions.
preds_output.metrics

In [None]:
# Predicted class per example: arg-max over axis 1 of the predictions.
y_preds = np.argmax(preds_output.predictions, axis=1)

In [None]:
# Confusion matrix of the fine-tuned model's validation predictions.
plot_confusion_matrix(y_preds, y_valid, labels)

#### Error analysis

In [None]:
from torch.nn.functional import cross_entropy

def forward_pass_with_label(batch):
    """Compute the per-example loss and predicted label for a batch.

    The forward pass uses ``model_finetuned`` (the fine-tuned classifier
    loaded from the Hub). The notebook-level ``model`` is the checkpoint
    loaded before training and is never trained locally, so it would not
    reflect the fine-tuned model's errors.

    Args:
        batch: Mapping of column name to tensor. Columns named in
            ``tokenizer.model_input_names`` are fed to the model, and
            ``batch["label"]`` supplies the targets.

    Returns:
        Dict with ``"loss"`` (unreduced cross-entropy, one value per example)
        and ``"predicted_label"`` (arg-max of the logits), both as NumPy
        arrays.
    """
    # Place all input tensors on the same device as the fine-tuned model
    model_device = model_finetuned.device
    inputs = {k: v.to(model_device) for k, v in batch.items()
              if k in tokenizer.model_input_names}

    with torch.no_grad():
        output = model_finetuned(**inputs)
        pred_label = torch.argmax(output.logits, axis=-1)
        # reduction="none" keeps one loss value per example
        loss = cross_entropy(output.logits, batch["label"].to(model_device),
                             reduction="none")

    # Place outputs on CPU for compatibility with other dataset columns
    return {"loss": loss.cpu().numpy(),
            "predicted_label": pred_label.cpu().numpy()}

In [None]:
#hide_output
# Convert our dataset back to PyTorch tensors
emotions_encoded.set_format("torch", 
                            columns=["input_ids", "attention_mask", "label"])
# Compute loss values
# NOTE(review): the "validation" split is rebound in place, so re-running this
# cell maps over a split that already carries the added columns.
emotions_encoded["validation"] = emotions_encoded["validation"].map(
    forward_pass_with_label, batched=True, batch_size=16)

In [None]:
# Switch to pandas output and keep only the columns needed for error analysis.
emotions_encoded.set_format("pandas")
cols = ["text", "label", "predicted_label", "loss"]
df_test = emotions_encoded["validation"][:][cols]
# Replace the integer ids of both label columns with their label names.
df_test["label"] = df_test["label"].apply(label_int2str)
df_test["predicted_label"] = (df_test["predicted_label"]
                              .apply(label_int2str))

In [None]:
#hide_output
# Ten validation examples with the highest loss.
df_test.sort_values("loss", ascending=False).head(10)

In [None]:
#hide_output
# Ten validation examples with the lowest loss.
df_test.sort_values("loss", ascending=True).head(10)

#### Saving and sharing the model

In [None]:
#hide_output
from transformers import pipeline

# Change `simonmesserli` to your Hub username
# NOTE(review): this id repeats the value that `username` and `model_name`
# produce earlier in the notebook; keep them in sync when changing either.
model_id = "simonmesserli/distilbert-base-uncased-finetuned-emotion"
classifier = pipeline("text-classification", model=model_id)

In [None]:
# Classify a sample tweet; return_all_scores=True asks the pipeline for a
# score for every class rather than only the top one.
custom_tweet = "I saw a movie today and it was really good."
preds = classifier(custom_tweet, return_all_scores=True)

In [None]:
# `preds[0]` holds the scores for the single input text; plot them as
# percentages.
# NOTE(review): the bars are labelled with `labels`, which assumes the rows of
# `preds_df` are in the same class order — confirm.
preds_df = pd.DataFrame(preds[0])
plt.bar(labels, 100 * preds_df["score"], color='C0')
plt.title(f'"{custom_tweet}"')
plt.ylabel("Class probability (%)")
plt.show()

## Conclusion