# Notebook

## Setting up

In [31]:
import numpy as np
import pandas as pd
import torch
import warnings

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

from datasets import Dataset
from datasets import DatasetDict
from datasets import load_metric

from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
from transformers import TextClassificationPipeline
from transformers import pipeline

# suppress FutureWarning
warnings.simplefilter(action='ignore', category=FutureWarning)

In [32]:
df = pd.read_pickle("data/data_original.pkl")

# down sample data
df = df.sample(frac=0.01, random_state=1)

# rename emotions to label and map to integers
df.rename(columns={'emotions':'label'}, inplace = True)
label_map = {"sadness": 0, "joy": 1, "love": 2, "anger": 3, "fear": 4, "surprise": 5} 
df['label'] = df['label'].map(label_map)

print(df.shape)
df.head()

(4168, 2)


Unnamed: 0,text,label
98790,i wanted them to feel now i feel as though i a...,0
18398,i found myself feeling very sympathetic toward...,2
139349,ive been feeling pretty good today and tonight...,1
13416,i can use to cover my ass when i feel inadequate,0
22297,i feel like ive pissed myself again,3


In [33]:
dataset = Dataset.from_pandas(df)

# 90% train, 10% test+validation
train_test = dataset.train_test_split(test_size=0.1)

# Split the 10% test+validation set in half test, half validation
valid_test = train_test['test'].train_test_split(test_size=0.5)

# gather everyone if you want to have a single DatasetDict
train_valid_test_dataset = DatasetDict({
    'train': train_test['train'],
    'validation': valid_test['train'],
    'test': valid_test['test']
})

print(train_valid_test_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 3751
    })
    validation: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 208
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 209
    })
})


## Fine-tuning a pre-trained Hugging Face model

We need to load:
- the pre-trained model itself
- the tokenizer associated with the model (used to preprocess the data)

In [34]:
model_name = "distilbert-base-uncased" # bert-base-uncased

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=6)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Pre-processing the data with the tokenizer

In [35]:
def preprocess_function(sample):
    return tokenizer(sample["text"], padding=True, truncation=True, return_tensors="pt")

tokenized_dataset = train_valid_test_dataset.map(preprocess_function, batched=True)

print(tokenized_dataset)

Map:   0%|          | 0/3751 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/209 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 3751
    })
    validation: Dataset({
        features: ['text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 208
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 209
    })
})


The preprocessing transforms the data into a format that the model can understand. The model then processes the data to make predictions. This is known as **encoding**. Encoding is done in a two-step process: tokenization, followed by conversion to input IDs.

In [36]:
print(tokenized_dataset['train'][321])
tokens = tokenizer.tokenize(tokenized_dataset['train'][321]['text'])
print(tokens)
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

{'text': 'i am today and although its been messy and nasty at times i feel its been worthwhile', 'label': 1, '__index_level_0__': 12053, 'input_ids': [101, 1045, 2572, 2651, 1998, 2348, 2049, 2042, 18307, 1998, 11808, 2012, 2335, 1045, 2514, 2049, 2042, 4276, 19927, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
['i', 'am', 'today', 'and', 'although', 'its', 'been', 'messy', 'and', 'nasty', 'at', 'times', 'i', 'feel', 'its', 'been', 'worth', '##while']
[1045, 2572, 2651, 1998, 2348, 2049, 2042, 18307, 1998, 11808, 2012, 2335, 1045, 2514, 2049, 2042, 4276, 19927]


### Model training (Fine-tuning)

In [37]:
def predict(text):
    global model, tokenizer
    inputs = tokenizer(text, return_tensors="pt")
    outputs = model(**inputs)
    predictions = torch.nn.functional.softmax(outputs.logits, dim=1)
    print(predictions)
    predicted_class = torch.argmax(predictions)
    name = [i for i, j in label_map.items() if j == predicted_class.item()][0]
    print(f"Predicted class: {name}")
    
predict("I'm so sad")

tensor([[0.1512, 0.1698, 0.1898, 0.1465, 0.1934, 0.1494]],
       grad_fn=<SoftmaxBackward0>)
Predicted class: fear


In [38]:
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [39]:
trainer.train()

  0%|          | 0/705 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.6293045282363892, 'eval_accuracy': 0.8076923076923077, 'eval_runtime': 5.6286, 'eval_samples_per_second': 36.954, 'eval_steps_per_second': 2.31, 'epoch': 1.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.3881596028804779, 'eval_accuracy': 0.8846153846153846, 'eval_runtime': 5.936, 'eval_samples_per_second': 35.041, 'eval_steps_per_second': 2.19, 'epoch': 2.0}
{'loss': 0.7396, 'grad_norm': 2.6462337970733643, 'learning_rate': 5.815602836879432e-06, 'epoch': 2.13}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.35088875889778137, 'eval_accuracy': 0.8894230769230769, 'eval_runtime': 6.0916, 'eval_samples_per_second': 34.145, 'eval_steps_per_second': 2.134, 'epoch': 3.0}
{'train_runtime': 1387.5924, 'train_samples_per_second': 8.11, 'train_steps_per_second': 0.508, 'train_loss': 0.5906700729478336, 'epoch': 3.0}


TrainOutput(global_step=705, training_loss=0.5906700729478336, metrics={'train_runtime': 1387.5924, 'train_samples_per_second': 8.11, 'train_steps_per_second': 0.508, 'train_loss': 0.5906700729478336, 'epoch': 3.0})

In [40]:
trainer.evaluate()

  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.35088875889778137,
 'eval_accuracy': 0.8894230769230769,
 'eval_runtime': 5.9837,
 'eval_samples_per_second': 34.761,
 'eval_steps_per_second': 2.173,
 'epoch': 3.0}

In [41]:
trainer.predict(test_dataset=tokenized_dataset["test"])

  0%|          | 0/14 [00:00<?, ?it/s]

PredictionOutput(predictions=array([[ 4.524358  , -1.0213351 , -1.2203786 , -0.6296568 , -0.8963616 ,
        -1.5222168 ],
       [ 0.28062665, -0.8328911 , -0.8364359 ,  3.884433  , -0.99547154,
        -1.7395878 ],
       [-0.32302445, -1.4956123 , -1.5812507 , -0.9422427 ,  3.411387  ,
         0.24380717],
       ...,
       [-1.6013496 ,  3.419962  ,  1.2195495 , -1.649142  , -2.2356763 ,
        -1.3491142 ],
       [ 3.1103868 , -0.05479916, -1.3691216 ,  0.03162865, -0.83487964,
        -1.7870898 ],
       [-0.11887605, -0.7707486 , -0.9350333 ,  3.9928396 , -0.9128882 ,
        -1.7485352 ]], dtype=float32), label_ids=array([0, 3, 4, 1, 0, 4, 1, 1, 4, 1, 1, 4, 0, 0, 0, 4, 1, 1, 0, 2, 2, 0,
       0, 0, 1, 3, 1, 0, 4, 4, 1, 4, 1, 1, 3, 2, 1, 4, 0, 0, 0, 2, 1, 1,
       0, 0, 2, 0, 0, 2, 1, 0, 1, 0, 5, 4, 0, 1, 1, 1, 0, 2, 1, 1, 1, 3,
       3, 2, 0, 1, 1, 2, 3, 1, 0, 3, 3, 2, 2, 0, 0, 1, 3, 1, 4, 2, 0, 0,
       1, 1, 0, 1, 1, 1, 0, 0, 3, 3, 3, 2, 5, 3, 3, 1, 1, 3, 1, 3, 4, 

### Save & Load Model

In [42]:
trainer.save_model()

tokenizer2 = AutoTokenizer.from_pretrained("./results")
model2 = AutoModelForSequenceClassification.from_pretrained("./results", num_labels=6)

pipe = TextClassificationPipeline(model=model2, tokenizer=tokenizer2) #, return_all_scores=True)

In [43]:
pipe("I love this place!")

[{'label': 'LABEL_2', 'score': 0.5717018246650696}]

In [44]:
pipe("Paris was very disappointing")

[{'label': 'LABEL_0', 'score': 0.9713153839111328}]

### Evaluate Model Results

In [45]:
y_pred= []
for p in tokenized_dataset['test']['text']:
    ti = tokenizer2(p, return_tensors="pt")
    out = model2(**ti)
    pred = torch.argmax(out.logits)
    y_pred.append(pred)   # our labels are already 0 and 1
    
y_test = tokenized_dataset['test']['label']

print(confusion_matrix(y_test, y_pred))
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Precision: ', precision_score(y_test, y_pred, average='macro'))
print('Recall: ', recall_score(y_test, y_pred, average='macro'))
print('F1: ', f1_score(y_test, y_pred, average='macro'))

[[61  0  0  1  0  0]
 [ 0 68  2  1  0  0]
 [ 3  2 16  1  0  0]
 [ 3  0  0 22  2  0]
 [ 1  0  0  0 17  1]
 [ 0  0  0  0  3  5]]
Accuracy:  0.9043062200956937
Precision:  0.8739061483179129
Recall:  0.83390697180133
F1:  0.8487847331127041
