# Fine-Tuning a Transformer for a Sentiment Analysis Web App
---
## Setup

In [22]:
from pathlib import Path

import numpy as np
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import transformers
from transformers import DistilBertModel, DistilBertTokenizerFast, DistilBertForSequenceClassification, AdamW
import torch 

In [2]:
# Seed numpy and torch from a single constant so runs are repeatable.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

device(type='cuda', index=0)

## Data Exploration

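The IMDb dataset is retrieved from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz. The cells below expect the archive to be extracted into an `aclImdb/` directory next to this notebook, containing `train/` and `test/` folders, each with `pos/` and `neg/` subfolders.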

In [3]:
def read_imdb_split(split_dir):
    """Read one split of the IMDb review dataset from disk.

    The directory is expected to contain a ``pos`` and a ``neg``
    sub-directory, each holding one UTF-8 text file per review.

    Args:
        split_dir: Path (``str`` or ``Path``) of the split directory,
            e.g. ``'aclImdb/train'``.

    Returns:
        A ``(texts, labels)`` tuple of two parallel lists: the review texts
        and their integer labels (``1`` for ``pos``, ``0`` for ``neg``).
        All positive reviews come first, followed by all negative reviews.

    Raises:
        FileNotFoundError: If ``split_dir`` or one of its ``pos``/``neg``
            sub-directories does not exist.
    """
    split_dir = Path(split_dir)
    texts = []
    labels = []
    for label_dir in ["pos", "neg"]:
        # Compare by value: `is` on strings only works through interning and
        # triggers a SyntaxWarning on Python 3.8+.
        label = 0 if label_dir == "neg" else 1
        # iterdir() yields entries in arbitrary, filesystem-dependent order;
        # sorting makes the sample order (and any seeded split derived from
        # it) identical across machines.
        for text_file in sorted((split_dir / label_dir).iterdir()):
            texts.append(text_file.read_text(encoding="utf8"))
            labels.append(label)

    return texts, labels

# Load the two official splits (train first, then test) from the extracted archive.
(train_texts, train_labels), (test_texts, test_labels) = (
    read_imdb_split(split_dir) for split_dir in ('aclImdb/train', 'aclImdb/test')
)

In [4]:
# Sanity check: report how many reviews each split contains.
print("IMDb reviews (combined): train = {}, test = {}".format(
    *map(len, (train_texts, test_texts))
))

IMDb reviews (combined): train = 25000, test = 25000


In [5]:
# Show one raw review followed by its label to see what the data looks like.
print(train_texts[50], train_labels[50], sep="\n")

I find it so amazing that even after all these years, we are STILL talking about this movie! Obviously this movie wasn't THAT bad or else people wouldn't even BOTHER to talk about it. I personally enjoyed this film immensly, and still do! I guess this film isn't for everyone, but it certainly did touch the hearts of many. <br /><br />As for those that think that this film is "overrated" or "over-hyped"...well, we only have the movie-going public to thank for that! lol* You see, it's not CRITICS/article writers that make a film "HUGE" or a "HIT" with the general movie-going public. PEOPLE make the film a huge success. With Titanic, everyone was in awe. Let's face it, a film like this had never been made before. At least not with the type of special effects needed to really capture the essence of the ship actually sinking. This film is so accurate that even James Cameron timed the actual sinking of the ship in the film with the REAL sinking that fateful day in April 1912. Even the silver

## Data Preprocessing

In [6]:
# Hold out 20% of the training reviews as a validation set.
# `train_test_split` is already imported in the import cell at the top.
# `random_state` pins the shuffle to RANDOM_SEED so the split does not depend
# on how much of the global numpy RNG state was consumed before this cell.
# NOTE: this cell overwrites `train_texts` / `train_labels`; re-running it
# without first re-running the loading cell shrinks the training set again.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=.2, random_state=RANDOM_SEED
)

In [7]:
# Tokenizer belonging to the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained(
    'distilbert-base-uncased'
)

In [8]:
# Tokenize every split with identical settings (truncation and padding both
# enabled), in the order train, validation, test.
train_encodings, val_encodings, test_encodings = (
    tokenizer(split_texts, truncation=True, padding=True)
    for split_texts in (train_texts, val_texts, test_texts)
)

In [9]:
class IMDbDataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenizer encodings with sentiment labels.

    Args:
        encodings: Mapping from field name to an indexable collection holding
            one entry per sample.
        labels: Indexable collection of labels, one per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        The dict has one tensor per encoding field plus a ``'labels'`` tensor.
        """
        sample = {}
        for field_name, column in self.encodings.items():
            sample[field_name] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels in a dataset object
# (order: train, validation, test).
train_dataset, val_dataset, test_dataset = (
    IMDbDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

## Sentiment Classification with DistilBERT and Hugging Face

We select DistilBERT for its small size.

In [10]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

# Hyper-parameters and bookkeeping options for the Hugging Face Trainer.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    overwrite_output_dir=True,
    do_eval = True,
    # NOTE(review): `evaluate_during_training` is the argument name used by
    # older transformers releases; newer releases appear to replace it with an
    # evaluation-strategy argument. Confirm against the installed version.
    evaluate_during_training =True,
    seed=RANDOM_SEED,                # same seed as numpy/torch, set explicitly instead of relying on the library default
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

# Pretrained DistilBERT body with a sequence-classification head on top.
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

# Fine-tune; the returned TrainOutput is displayed as the cell result.
trainer.train()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=3.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1250.0, style=ProgressStyle(description_w…

{'loss': 0.6849543571472168, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.008, 'step': 10}
{'loss': 0.6793449401855469, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.016, 'step': 20}
{'loss': 0.6858637809753418, 'learning_rate': 3e-06, 'epoch': 0.024, 'step': 30}
{'loss': 0.6798263549804687, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.032, 'step': 40}
{'loss': 0.6834714889526368, 'learning_rate': 5e-06, 'epoch': 0.04, 'step': 50}
{'loss': 0.6793785095214844, 'learning_rate': 6e-06, 'epoch': 0.048, 'step': 60}
{'loss': 0.6545303344726563, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.056, 'step': 70}
{'loss': 0.6330799102783203, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.064, 'step': 80}
{'loss': 0.5784263610839844, 'learning_rate': 9e-06, 'epoch': 0.072, 'step': 90}
{'loss': 0.4934104919433594, 'learning_rate': 1e-05, 'epoch': 0.08, 'step': 100}
{'loss': 0.422637939453125, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.088, 'step': 110}
{'loss': 0.



{'loss': 0.23261871337890624, 'learning_rate': 4.984615384615385e-05, 'epoch': 0.408, 'step': 510}
{'loss': 0.31630096435546873, 'learning_rate': 4.969230769230769e-05, 'epoch': 0.416, 'step': 520}
{'loss': 0.289892578125, 'learning_rate': 4.953846153846154e-05, 'epoch': 0.424, 'step': 530}
{'loss': 0.25572662353515624, 'learning_rate': 4.9384615384615384e-05, 'epoch': 0.432, 'step': 540}
{'loss': 0.2310638427734375, 'learning_rate': 4.923076923076924e-05, 'epoch': 0.44, 'step': 550}
{'loss': 0.31404266357421873, 'learning_rate': 4.907692307692308e-05, 'epoch': 0.448, 'step': 560}
{'loss': 0.25184478759765627, 'learning_rate': 4.892307692307693e-05, 'epoch': 0.456, 'step': 570}
{'loss': 0.23512115478515624, 'learning_rate': 4.876923076923077e-05, 'epoch': 0.464, 'step': 580}
{'loss': 0.3182830810546875, 'learning_rate': 4.861538461538462e-05, 'epoch': 0.472, 'step': 590}
{'loss': 0.24281463623046876, 'learning_rate': 4.846153846153846e-05, 'epoch': 0.48, 'step': 600}
{'loss': 0.2389083

HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=79.0, style=ProgressStyle(description_wi…


{'eval_loss': 0.24497871668338775, 'epoch': 0.8, 'step': 1000}
{'loss': 0.2187286376953125, 'learning_rate': 4.2153846153846156e-05, 'epoch': 0.808, 'step': 1010}
{'loss': 0.2787933349609375, 'learning_rate': 4.2e-05, 'epoch': 0.816, 'step': 1020}
{'loss': 0.26097412109375, 'learning_rate': 4.1846153846153846e-05, 'epoch': 0.824, 'step': 1030}
{'loss': 0.2312164306640625, 'learning_rate': 4.169230769230769e-05, 'epoch': 0.832, 'step': 1040}
{'loss': 0.3315216064453125, 'learning_rate': 4.1538461538461544e-05, 'epoch': 0.84, 'step': 1050}
{'loss': 0.2529022216796875, 'learning_rate': 4.1384615384615386e-05, 'epoch': 0.848, 'step': 1060}
{'loss': 0.2635345458984375, 'learning_rate': 4.1230769230769234e-05, 'epoch': 0.856, 'step': 1070}
{'loss': 0.1777740478515625, 'learning_rate': 4.1076923076923076e-05, 'epoch': 0.864, 'step': 1080}
{'loss': 0.353619384765625, 'learning_rate': 4.0923076923076925e-05, 'epoch': 0.872, 'step': 1090}
{'loss': 0.26666259765625, 'learning_rate': 4.0769230769

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1250.0, style=ProgressStyle(description_w…

{'loss': 0.1304107666015625, 'learning_rate': 3.830769230769231e-05, 'epoch': 1.008, 'step': 1260}
{'loss': 0.1747039794921875, 'learning_rate': 3.8153846153846153e-05, 'epoch': 1.016, 'step': 1270}
{'loss': 0.15675048828125, 'learning_rate': 3.8e-05, 'epoch': 1.024, 'step': 1280}
{'loss': 0.186334228515625, 'learning_rate': 3.784615384615385e-05, 'epoch': 1.032, 'step': 1290}
{'loss': 0.1932647705078125, 'learning_rate': 3.769230769230769e-05, 'epoch': 1.04, 'step': 1300}
{'loss': 0.15980224609375, 'learning_rate': 3.753846153846154e-05, 'epoch': 1.048, 'step': 1310}
{'loss': 0.1334136962890625, 'learning_rate': 3.738461538461538e-05, 'epoch': 1.056, 'step': 1320}
{'loss': 0.174884033203125, 'learning_rate': 3.723076923076923e-05, 'epoch': 1.064, 'step': 1330}
{'loss': 0.1698638916015625, 'learning_rate': 3.707692307692308e-05, 'epoch': 1.072, 'step': 1340}
{'loss': 0.1145416259765625, 'learning_rate': 3.692307692307693e-05, 'epoch': 1.08, 'step': 1350}
{'loss': 0.1808868408203125, 'l

HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=79.0, style=ProgressStyle(description_wi…


{'eval_loss': 0.267195578956604, 'epoch': 1.6, 'step': 2000}
{'loss': 0.1015625, 'learning_rate': 2.676923076923077e-05, 'epoch': 1.608, 'step': 2010}
{'loss': 0.188616943359375, 'learning_rate': 2.6615384615384614e-05, 'epoch': 1.616, 'step': 2020}
{'loss': 0.17333984375, 'learning_rate': 2.6461538461538466e-05, 'epoch': 1.624, 'step': 2030}
{'loss': 0.09266357421875, 'learning_rate': 2.630769230769231e-05, 'epoch': 1.6320000000000001, 'step': 2040}
{'loss': 0.2500244140625, 'learning_rate': 2.6153846153846157e-05, 'epoch': 1.6400000000000001, 'step': 2050}
{'loss': 0.141168212890625, 'learning_rate': 2.6000000000000002e-05, 'epoch': 1.6480000000000001, 'step': 2060}
{'loss': 0.244183349609375, 'learning_rate': 2.5846153846153847e-05, 'epoch': 1.6560000000000001, 'step': 2070}
{'loss': 0.071533203125, 'learning_rate': 2.5692307692307692e-05, 'epoch': 1.6640000000000001, 'step': 2080}
{'loss': 0.180364990234375, 'learning_rate': 2.5538461538461538e-05, 'epoch': 1.6720000000000002, 'st

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1250.0, style=ProgressStyle(description_w…

{'loss': 0.02349853515625, 'learning_rate': 1.9076923076923077e-05, 'epoch': 2.008, 'step': 2510}
{'loss': 0.05341796875, 'learning_rate': 1.8923076923076925e-05, 'epoch': 2.016, 'step': 2520}
{'loss': 0.01732177734375, 'learning_rate': 1.876923076923077e-05, 'epoch': 2.024, 'step': 2530}
{'loss': 0.050555419921875, 'learning_rate': 1.8615384615384616e-05, 'epoch': 2.032, 'step': 2540}
{'loss': 0.10009765625, 'learning_rate': 1.8461538461538465e-05, 'epoch': 2.04, 'step': 2550}
{'loss': 0.0193603515625, 'learning_rate': 1.830769230769231e-05, 'epoch': 2.048, 'step': 2560}
{'loss': 0.153326416015625, 'learning_rate': 1.8153846153846155e-05, 'epoch': 2.056, 'step': 2570}
{'loss': 0.0282470703125, 'learning_rate': 1.8e-05, 'epoch': 2.064, 'step': 2580}
{'loss': 0.084930419921875, 'learning_rate': 1.7846153846153846e-05, 'epoch': 2.072, 'step': 2590}
{'loss': 0.074578857421875, 'learning_rate': 1.7692307692307694e-05, 'epoch': 2.08, 'step': 2600}
{'loss': 0.068365478515625, 'learning_rate'

HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=79.0, style=ProgressStyle(description_wi…


{'eval_loss': 0.2961650329589844, 'epoch': 2.4, 'step': 3000}
{'loss': 0.006439208984375, 'learning_rate': 1.1384615384615385e-05, 'epoch': 2.408, 'step': 3010}
{'loss': 0.06851806640625, 'learning_rate': 1.123076923076923e-05, 'epoch': 2.416, 'step': 3020}
{'loss': 0.110247802734375, 'learning_rate': 1.1076923076923077e-05, 'epoch': 2.424, 'step': 3030}
{'loss': 0.1262451171875, 'learning_rate': 1.0923076923076924e-05, 'epoch': 2.432, 'step': 3040}
{'loss': 0.041033935546875, 'learning_rate': 1.0769230769230771e-05, 'epoch': 2.44, 'step': 3050}
{'loss': 0.06932373046875, 'learning_rate': 1.0615384615384616e-05, 'epoch': 2.448, 'step': 3060}
{'loss': 0.0596923828125, 'learning_rate': 1.0461538461538462e-05, 'epoch': 2.456, 'step': 3070}
{'loss': 0.158453369140625, 'learning_rate': 1.0307692307692309e-05, 'epoch': 2.464, 'step': 3080}
{'loss': 0.1104736328125, 'learning_rate': 1.0153846153846154e-05, 'epoch': 2.472, 'step': 3090}
{'loss': 0.04942626953125, 'learning_rate': 1e-05, 'epoc

TrainOutput(global_step=3750, training_loss=0.1823802734375)

In [11]:
# Persist the fine-tuned model to the local 'model' directory.
trainer.save_model(output_dir='model')

In [13]:
# Run the fine-tuned model over the held-out test set; the result's
# `.predictions` attribute is used below to compute accuracy.
check = trainer.predict(test_dataset=test_dataset)

HBox(children=(FloatProgress(value=0.0, description='Prediction', max=391.0, style=ProgressStyle(description_w…




In [31]:
# Predicted class = index of the largest value along axis 1 of the predictions.
accuracy_score(
    y_true=test_labels,
    y_pred=np.argmax(check.predictions, axis=1),
)

0.9308

DistilBERT scores 93% accuracy on the test set, while the LSTM baseline scored 84%.