In [1]:
import numpy as np
import pandas as pd

from tqdm.notebook import tqdm

import torch 


In [2]:
df = pd.read_csv("dataset/imdb_dataset.csv")
df.head()
df['sentiment'].value_counts()

df0 = df[df['sentiment'] == 'negative'].sample(frac=.1)
df1 = df[df['sentiment'] == 'positive'].sample(frac=.1)

df = pd.concat([df0, df1], axis=0)
df.count()
df.head()

Unnamed: 0,review,sentiment
4945,"The Bermuda Triangle ,we are told in this wast...",negative
35698,A somewhat awkward spy mystery with a predicta...,negative
2841,"This was just horrible the plot was just OK, b...",negative
34542,When one considers that Carson McCullers is on...,negative
49632,This movie was so bad it was laughable. I coul...,negative


In [3]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased', lower=True)

def preprocess_function(exp):
    return tokenizer(exp, truncation=True)

In [4]:
sentiment_alias = {'positive': 1, 'negative': 0}
df['sentiment'] = df['sentiment'].map(sentiment_alias)
df.head()

Unnamed: 0,review,sentiment
4945,"The Bermuda Triangle ,we are told in this wast...",0
35698,A somewhat awkward spy mystery with a predicta...,0
2841,"This was just horrible the plot was just OK, b...",0
34542,When one considers that Carson McCullers is on...,0
49632,This movie was so bad it was laughable. I coul...,0


In [5]:
from sklearn.model_selection import train_test_split

x_tr, x_val, y_tr, y_val = train_test_split(df.index.values,
                                            df.sentiment.values,
                                            test_size=0.15,
                                            random_state=42, 
                                            stratify=df.sentiment.values)

df['data_type'] = ['not_set']*df.shape[0]

In [6]:
df.loc[x_tr, 'data_type'] = 'train'
df.loc[x_val, 'data_type'] = 'val'

df.groupby(['sentiment', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,review
sentiment,data_type,Unnamed: 2_level_1
0,train,2125
0,val,375
1,train,2125
1,val,375


In [7]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [8]:
encoded_data_train = tokenizer.batch_encode_plus(
   df[df.data_type == 'train'].review.values,
   add_special_tokens=True,
   return_attention_mask=True,
   pad_to_max_length=True,
   max_length=256,
   return_tensors="pt"
)

encoded_data_val = tokenizer.batch_encode_plus(
   df[df.data_type == 'val'].review.values,
   add_special_tokens=True,
   return_attention_mask=True,
   pad_to_max_length=True,
   max_length=256,
   return_tensors="pt"
)

input_ids_train = encoded_data_train['input_ids']
attention_mask_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type == 'train'].sentiment.values)

input_ids_val = encoded_data_val['input_ids']
attention_mask_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.data_type == 'val'].sentiment.values)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [9]:
dataset_train = TensorDataset(input_ids_train, attention_mask_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_mask_val, labels_val)


In [10]:
from transformers import BertForSequenceClassification

sentichan = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                          num_labels=2,
                                                          output_attentions=False,
                                                          output_hidden_states=False)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [11]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 10

dataloader_train = DataLoader(dataset_train, sampler=RandomSampler(dataset_train), batch_size=batch_size)
dataloader_val = DataLoader(dataset_val, sampler=RandomSampler(dataset_val), batch_size=batch_size)

In [12]:
from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(sentichan.parameters(), lr=1e-5, eps=1e-8)

epochs = 10
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0,num_training_steps=len(dataloader_train)*epochs)



In [13]:
import numpy as np
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels):
    label_dict_inverse = {1: 'positive', 0: 'negative'}

    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')

In [14]:
import random


seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed(seed_val)


device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
sentichan.to(device)
print("running on", device)

running on cuda


In [15]:
def evaluate(dataloader_val):
    sentichan.eval()

    loss_val_total = 0
    predictions, true_vals = [], []

    for batch in tqdm(dataloader_val):
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2],
        }

        with torch.no_grad():
            outputs = sentichan(**inputs)

        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)

    loss_val_avg = loss_val_total/len(dataloader_val)

    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)

    return loss_val_avg, predictions, true_vals

In [16]:
import gc

for epoch in tqdm(range(1, epochs+1)):
   sentichan.train()
   loss_train_total = 0

   progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
   
   for batch in progress_bar:
      sentichan.zero_grad()
      batch = tuple(b.to(device) for b in batch)
      inputs ={
         'input_ids': batch[0],
         'attention_mask': batch[1],
         'labels': batch[2]
      }

      outputs = sentichan(**inputs)
      loss = outputs[0]
      loss_train_total += loss.item()
      loss.backward()

      torch.nn.utils.clip_grad_norm_(sentichan.parameters(), 1.0)

      optimizer.step()
      scheduler.step()

      progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})

   tqdm.write('\n Epoch {epoch}')

   loss_train_avg = loss_train_total/len(dataloader_train)
   tqdm.write(f'Training loss: {loss_train_avg}')

   val_loss, predictions, true_vals = evaluate(dataloader_val)
   val_f1 = f1_score_func(predictions, true_vals)
   tqdm.write(f'Validation loss: {val_loss}')
   tqdm.write(f'F1 Score (weighted): {val_f1}')
   torch.cuda.empty_cache()
   gc.collect()

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/425 [00:00<?, ?it/s]


 Epoch {epoch}
Training loss: 0.38946873334400794


  0%|          | 0/75 [00:00<?, ?it/s]

Validation loss: 0.24269882713754973
F1 Score (weighted): 0.9146278128357084


Epoch 2:   0%|          | 0/425 [00:00<?, ?it/s]


 Epoch {epoch}
Training loss: 0.24267948211335083


  0%|          | 0/75 [00:00<?, ?it/s]

Validation loss: 0.27086233525226516
F1 Score (weighted): 0.9146642393383634


Epoch 3:   0%|          | 0/425 [00:00<?, ?it/s]


 Epoch {epoch}
Training loss: 0.156693429750226


  0%|          | 0/75 [00:00<?, ?it/s]

Validation loss: 0.36365657944232227
F1 Score (weighted): 0.9185901051833636


Epoch 4:   0%|          | 0/425 [00:00<?, ?it/s]


 Epoch {epoch}
Training loss: 0.08497372012694969


  0%|          | 0/75 [00:00<?, ?it/s]

Validation loss: 0.3989226032048464
F1 Score (weighted): 0.911994367639529


Epoch 5:   0%|          | 0/425 [00:00<?, ?it/s]


 Epoch {epoch}
Training loss: 0.044496999832609775


  0%|          | 0/75 [00:00<?, ?it/s]

Validation loss: 0.4386658823979087
F1 Score (weighted): 0.9173239266334303


Epoch 6:   0%|          | 0/425 [00:00<?, ?it/s]


 Epoch {epoch}
Training loss: 0.03659700005486443


  0%|          | 0/75 [00:00<?, ?it/s]

Validation loss: 0.46487797975113304
F1 Score (weighted): 0.9173327454773011


Epoch 7:   0%|          | 0/425 [00:00<?, ?it/s]


 Epoch {epoch}
Training loss: 0.022094304579830564


  0%|          | 0/75 [00:00<?, ?it/s]

Validation loss: 0.5077714692843923
F1 Score (weighted): 0.9239511935660042


Epoch 8:   0%|          | 0/425 [00:00<?, ?it/s]


 Epoch {epoch}
Training loss: 0.011812227612279137


  0%|          | 0/75 [00:00<?, ?it/s]

Validation loss: 0.5303465151181445
F1 Score (weighted): 0.9185761957730812


Epoch 9:   0%|          | 0/425 [00:00<?, ?it/s]


 Epoch {epoch}
Training loss: 0.005065673003863434


  0%|          | 0/75 [00:00<?, ?it/s]

Validation loss: 0.5344864397150619
F1 Score (weighted): 0.9199311051642659


Epoch 10:   0%|          | 0/425 [00:00<?, ?it/s]


 Epoch {epoch}
Training loss: 0.004699871873325559


  0%|          | 0/75 [00:00<?, ?it/s]

Validation loss: 0.5064046784977351
F1 Score (weighted): 0.9239987839805437


In [17]:
accuracy_per_class(predictions, true_vals)

Class: negative
Accuracy: 345/375

Class: positive
Accuracy: 348/375



In [None]:
torch.save(sentichan.state_dict(), "sentichan_v2_state.pt")
torch.save(sentichan, "sentichan_v2_entire.pt")