In [1]:
!pip install -q transformers datasets



In [2]:
from collections import Counter

from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

import os 

import torch
from torch import nn, optim
import transformers
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
from transformers import Trainer, TrainingArguments

Download dataset

In [3]:
imdb_dataset = load_dataset('imdb')

Downloading:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1...


Downloading:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
shuf_dataset = imdb_dataset.shuffle(seed=42)  #  shuffle so that there are different labels in our subset

In [5]:
train_texts = shuf_dataset["train"]["text"][:5000]
train_labels = shuf_dataset["train"]["label"][:5000]
test_texts = shuf_dataset["test"]["text"][:1500]
test_labels = shuf_dataset["test"]["label"][:1500]
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2)

In [6]:
Counter(test_labels), Counter(train_labels)

(Counter({1: 738, 0: 762}), Counter({0: 1996, 1: 2004}))

In [7]:
del imdb_dataset, shuf_dataset

Prepare data

In [8]:
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [9]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

In [10]:
class IMDbDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = IMDbDataset(train_encodings, train_labels)
val_dataset = IMDbDataset(val_encodings, val_labels)
test_dataset = IMDbDataset(test_encodings, test_labels)

Model

In [11]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

A joint class model with bert output type as a parameter (tasks 1, 2 and 4) 

In [12]:
class SentimentClassifier(nn.Module):

  def __init__(self, n_classes, output_type='pooled'):
    super().__init__()
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    self.drop = nn.Dropout(p=0.3)
    self.n_classes = n_classes
    self.output_type = output_type

    if self.output_type in ['pooled', 'agg_cls']:
      lin_input_size = self.bert.config.hidden_size
    elif output_type=='add_cls':
      lin_input_size = self.bert.config.hidden_size*2

    self.out = nn.Linear(lin_input_size, n_classes)
    self.loss = nn.CrossEntropyLoss()
    
  
  def forward(self, input_ids, attention_mask, token_type_ids, labels):
    last_hidden_state, pooled_output, hidden_states = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask,
      return_dict=False, 
      output_hidden_states=True)
    
    if self.output_type=='pooled':  # only pooled output
      bert_output = pooled_output
    
    elif self.output_type=='add_cls':  # pooled + cls from the last layer
      bert_output = torch.cat((pooled_output, last_hidden_state[:, 0, :]), dim=1)
    
    elif self.output_type=='agg_cls':  # cls-token aggregated from 4 last layers
      cls_embs = torch.cat(tuple([hidden_states[i][:, 0, :].unsqueeze(1) for i in [-3, -2, -1, 0]]), dim=1)
      bert_output = torch.mean(cls_embs, dim=1)
      
    output = self.drop(bert_output)
    logits = self.out(output)
    loss = self.loss(logits.view(-1, self.n_classes), labels.view(-1))
    return (loss, logits)

In [13]:
# torch.cuda.empty_cache()
# del model

A basic model, task 1

In [14]:
model = SentimentClassifier(n_classes=2, output_type='pooled')
model = model.to(device)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
os.makedirs("./results", exist_ok=True)
os.makedirs("./logs", exist_ok=True)  

training_args = TrainingArguments(
    output_dir='./results',         
    num_train_epochs=2,             
    per_device_train_batch_size=8,  
    per_device_eval_batch_size=16,   
    warmup_steps=500,                
    weight_decay=0.01,               
    logging_dir='./logs',            
    logging_steps=10,
    report_to=None
)

In [16]:
def compute_metrics(pred):
    print(pred)
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

Train and evaluate task 1 model

In [17]:
#  disable wandb in kaggle
import wandb
wandb.init(mode="disabled")



In [18]:
trainer = Trainer(
    model=model,                         
    args=training_args,                  
    train_dataset=train_dataset,         
    eval_dataset=val_dataset,            
    compute_metrics = compute_metrics
)

trainer.train()

***** Running training *****
  Num examples = 4000
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1000
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss
10,0.6851
20,0.7172
30,0.6785
40,0.6887
50,0.6728
60,0.6812
70,0.6758
80,0.6444
90,0.6325
100,0.6125


Saving model checkpoint to ./results/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to ./results/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1000, training_loss=0.3583272613286972, metrics={'train_runtime': 468.3797, 'train_samples_per_second': 17.08, 'train_steps_per_second': 2.135, 'total_flos': 0.0, 'train_loss': 0.3583272613286972, 'epoch': 2.0})

In [19]:
trainer.evaluate(eval_dataset=test_dataset, metric_key_prefix="test")

***** Running Evaluation *****
  Num examples = 1500
  Batch size = 16


EvalPrediction(predictions=array([[-3.4775002,  3.262797 ],
       [ 2.1396692, -1.3572104],
       [ 2.971143 , -2.2397625],
       ...,
       [-3.1341858,  3.1159618],
       [ 2.0713255, -1.444596 ],
       [-2.5953648,  2.612521 ]], dtype=float32), label_ids=array([1, 1, 0, ..., 1, 0, 1]))


{'test_loss': 0.3452337384223938,
 'test_accuracy': 0.9093333333333333,
 'test_f1': 0.9092122830440588,
 'test_precision': 0.8960526315789473,
 'test_recall': 0.9227642276422764,
 'test_runtime': 27.3771,
 'test_samples_per_second': 54.79,
 'test_steps_per_second': 3.434,
 'epoch': 2.0}

Add CLS-token embedding

In [20]:
model_with_cls = SentimentClassifier(n_classes=2, output_type='add_cls')
model_with_cls = model_with_cls.to(device)

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.16.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://huggingface.co/bert-base-uncased/resolve/main/pytorch_model.bin from cache 

In [21]:
trainer = Trainer(
    model=model_with_cls,                         
    args=training_args,                  
    train_dataset=train_dataset,         
    eval_dataset=val_dataset,            
    compute_metrics = compute_metrics    
)

trainer.train()

***** Running training *****
  Num examples = 4000
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1000
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss
10,0.6802
20,0.6878
30,0.7102
40,0.6883
50,0.6774
60,0.6956
70,0.6558
80,0.6036
90,0.5852
100,0.5422


Saving model checkpoint to ./results/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to ./results/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1000, training_loss=0.3546455535292625, metrics={'train_runtime': 469.5688, 'train_samples_per_second': 17.037, 'train_steps_per_second': 2.13, 'total_flos': 0.0, 'train_loss': 0.3546455535292625, 'epoch': 2.0})

In [22]:
trainer.evaluate(eval_dataset=test_dataset, metric_key_prefix="test")

***** Running Evaluation *****
  Num examples = 1500
  Batch size = 16


EvalPrediction(predictions=array([[-2.8728597 ,  3.4979503 ],
       [-0.24992041,  0.20069085],
       [ 2.685428  , -3.3175306 ],
       ...,
       [-2.7161362 ,  3.3027742 ],
       [ 1.0207161 , -1.2998348 ],
       [-2.2120135 ,  2.776309  ]], dtype=float32), label_ids=array([1, 1, 0, ..., 1, 0, 1]))


{'test_loss': 0.3123458921909332,
 'test_accuracy': 0.9133333333333333,
 'test_f1': 0.9136786188579018,
 'test_precision': 0.8958333333333334,
 'test_recall': 0.9322493224932249,
 'test_runtime': 27.3932,
 'test_samples_per_second': 54.758,
 'test_steps_per_second': 3.432,
 'epoch': 2.0}

Task 4, aggregate CLS tokens

In [35]:
model_agg_cls = SentimentClassifier(n_classes=2, output_type='agg_cls')
model_agg_cls = model_agg_cls.to(device)

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.16.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://huggingface.co/bert-base-uncased/resolve/main/pytorch_model.bin from cache 

In [36]:
trainer = Trainer(
    model=model_agg_cls,                         
    args=training_args,                  
    train_dataset=train_dataset,         
    eval_dataset=val_dataset,            
    compute_metrics = compute_metrics    
)

trainer.train()

***** Running training *****
  Num examples = 4000
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1000
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss
10,0.6718
20,0.6573
30,0.6512
40,0.6696
50,0.6797
60,0.6636
70,0.6669
80,0.6186
90,0.6132
100,0.558


Saving model checkpoint to ./results/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to ./results/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1000, training_loss=0.3403690246641636, metrics={'train_runtime': 469.1292, 'train_samples_per_second': 17.053, 'train_steps_per_second': 2.132, 'total_flos': 0.0, 'train_loss': 0.3403690246641636, 'epoch': 2.0})

In [37]:
trainer.evaluate(eval_dataset=test_dataset, metric_key_prefix="test")

***** Running Evaluation *****
  Num examples = 1500
  Batch size = 16


EvalPrediction(predictions=array([[-3.6609862 ,  3.4471428 ],
       [ 0.4066905 , -0.44584104],
       [ 3.0857577 , -2.9067018 ],
       ...,
       [-3.1954618 ,  2.962535  ],
       [ 1.1746558 , -1.2398993 ],
       [-1.4246486 ,  1.3980545 ]], dtype=float32), label_ids=array([1, 1, 0, ..., 1, 0, 1]))


{'test_loss': 0.3542517125606537,
 'test_accuracy': 0.9113333333333333,
 'test_f1': 0.9103169251517195,
 'test_precision': 0.9060402684563759,
 'test_recall': 0.9146341463414634,
 'test_runtime': 27.4206,
 'test_samples_per_second': 54.703,
 'test_steps_per_second': 3.428,
 'epoch': 2.0}

Try a BertForSequenceClassification model

In [25]:
model_for_classification = BertForSequenceClassification.from_pretrained("bert-base-uncased")

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.16.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://huggingface.co/bert-base-uncased/resolve/main/pytorch_model.bin from cache 

In [26]:
trainer = Trainer(
    model=model_for_classification,                         
    args=training_args,                  
    train_dataset=train_dataset,         
    eval_dataset=val_dataset,            
    compute_metrics = compute_metrics    
)

trainer.train()

***** Running training *****
  Num examples = 4000
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1000
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss
10,0.7506
20,0.7043
30,0.7052
40,0.7092
50,0.6688
60,0.6717
70,0.6915
80,0.6531
90,0.6377
100,0.6211


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1000, training_loss=0.3725172662436962, metrics={'train_runtime': 470.3155, 'train_samples_per_second': 17.01, 'train_steps_per_second': 2.126, 'total_flos': 2104888442880000.0, 'train_loss': 0.3725172662436962, 'epoch': 2.0})

In [27]:
trainer.evaluate(eval_dataset=test_dataset, metric_key_prefix="test")

***** Running Evaluation *****
  Num examples = 1500
  Batch size = 16


EvalPrediction(predictions=array([[-3.0266879 ,  2.781048  ],
       [-0.30217725,  0.93050414],
       [ 2.9413102 , -1.2891498 ],
       ...,
       [-2.7786074 ,  2.7022812 ],
       [ 2.4949749 , -0.6932958 ],
       [-1.2734935 ,  1.7792877 ]], dtype=float32), label_ids=array([1, 1, 0, ..., 1, 0, 1]))


{'test_loss': 0.32249999046325684,
 'test_accuracy': 0.9113333333333333,
 'test_f1': 0.9105581708137189,
 'test_precision': 0.9038718291054739,
 'test_recall': 0.9173441734417345,
 'test_runtime': 27.402,
 'test_samples_per_second': 54.741,
 'test_steps_per_second': 3.43,
 'epoch': 2.0}

Take three sample reviews from IMDB and get predictions for them on the first model

In [28]:
#very positive Stranger Things review
st_review = '''
Superb series. I am generally not into science fiction, fantasy, supernatural or horror movies or TV series but this is different. Had me hooked from the start and never let go. 
Incredibly intriguing - the mystery surrounding Will, the girl's background and powers, how all these hang together and what forces are at work. 
Very well thought-out plot, superbly executed.

Very engaging too - many likeable characters, all given decent depth. The relationships between all the main characters and how these evolve make the series. 
If it wasn't for these it would be just another horror series.

Many of the concepts aren't overly new - I was reminded of The Cabin In The Woods, Stand By Me and Cloverfield - but the way everything is brought together is.

Good performances. David Harbour is excellent as Sherriff Hopper and the kids do incredibly well - good casting. 
Winona Ryder, the only big name in the cast (until Sean Astin and Paul Reiser appear in Season 2) is a bit irritating as Joyce Byers, 
though that might be the fault of her character (and thus the writers and directors). The hysteria is laid on a bit thick...

After an excellent Season 1, Seasons 2 and 3 are just as good. There's always a worry with any TV series that the writers run out of 
ideas but don't stop production, as the money is too good. So far it is still going strong, and the writers appear keen for it to end on a high.
'''

In [29]:
#very negative Goat Story review
gs_review = '''
It is an awful movie to watch, its feels so uncomfortable and its impossible to even imagine watching this with the Kids.

Its not an animation for the Kids and it has lots of deviated ideas like a goat's love for its owner moreover goat wants to marry him. 
Cutting sbs eyes is violent its so irresponsible to put the title"child animation"or "animation for children". It should be banned from everywhere.
'''

In [30]:
#ambigious Euphoria review (rated 8 stars, should be positive)
euph_review = '''
Sex drugs and more in high school. First a warning. Although this is a show about high school teenagers it's too much for teens to watch. 
Much more so than "13 reasons Why". There are parts when the scenes are much more explicit than necessary for the story. There isn't a boundary this show doesn't try to push for TV. 
Having said all that there are some fascinating and bizarre characters that keep things morbidly watchable for 8 episodes. 
A biracial lesbian drug addict, a transgender girl who is her bestie, a jock with demons, his closeted statutory rapist dad, the list goes on.
It's hard to keep track of the different characters and plot twists some tunes. Even if you don't agree with the suitability of the content here the acting is better than it has to be. 
Zendaya is convincing as an addict very natural acting. Jacob Elordi does the mean jock well.

Hate to say it but do want to see what happens in season 2.
'''

In [31]:
def get_prediction(review_text, labels):
    with torch.no_grad():
      encoded_review = tokenizer.encode_plus(
        review_text,
        max_length=512,
        add_special_tokens=True,
        return_token_type_ids=False,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True
      )

      input_ids = encoded_review['input_ids'].to(device)
      attention_mask = encoded_review['attention_mask'].to(device)

      output = model_agg_cls(input_ids, attention_mask, None, labels.to(device))
      prediction = torch.argmax(output[1], dim=1)
    
      print('predictied label: ', prediction.item())

In [32]:
get_prediction(st_review, labels=torch.LongTensor([1]))

predictied label:  1


In [33]:
get_prediction(gs_review, labels=torch.LongTensor([0]))

predictied label:  0


In [34]:
get_prediction(euph_review, labels=torch.LongTensor([0]))

predictied label:  1


All the predictions are right