In [1]:
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch

import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap

from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from tqdm import tqdm

In [2]:
df = pd.read_csv("data/IMDB Dataset.csv")

In [3]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [4]:

def to_sentiment(sentiment):
    #print(sentiment)
    if sentiment == "positive":
        return 1
    else:
        return 0

df['sentiment'] = df.sentiment.apply(to_sentiment)

In [5]:
print(len(df.review[0]))

df

1761


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
...,...,...
49995,I thought this movie did a down right good job...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49997,I am a Catholic taught in parochial elementary...,0
49998,I'm going to have to disagree with the previou...,0


In [6]:
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'

tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [7]:
tokenizer

PreTrainedTokenizer(name_or_path='bert-base-cased', vocab_size=28996, model_max_len=512, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [8]:
tokens = tokenizer.tokenize(df.review[0])
token_ids = tokenizer.convert_tokens_to_ids(tokens)

encoding = tokenizer.encode_plus(
    df.review[0],
    max_length=512,
    add_special_tokens=True,
    return_token_type_ids=False,
    padding=True,
    return_attention_mask=True,
    return_tensors="pt"
    )
#print(tokenizer.convert_ids_to_tokens(encoding['input_ids'][0]))\
#encoding['attention_mask']

In [9]:
"""
token_len = []
#encoding = tokenizer.encode_plus(df.review[0],max_length=512)

#print(encoding)
for txt in tqdm(df.review):
    tokens = tokenizer.encode(txt, max_length=512)
    token_len.append(len(tokens))
    
#token_len
"""

'\ntoken_len = []\n#encoding = tokenizer.encode_plus(df.review[0],max_length=512)\n\n#print(encoding)\nfor txt in tqdm(df.review):\n    tokens = tokenizer.encode(txt, max_length=512)\n    token_len.append(len(tokens))\n    \n#token_len\n'

In [10]:
#sns.distplot(token_len)
#plt.xlim([0, 512]);
#plt.xlabel('Token count');

In [11]:
MAX_LEN = 256

In [12]:
class IMDBReview(Dataset):
    
    def __init__(self, reviews, targets, tokenizer, max_len):
        self.reviews = reviews
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.reviews)
    
    def __getitem__(self, idx):
        review = str(self.reviews[idx])
        target = self.targets[idx]
        
        encoding = tokenizer.encode_plus(
            review,
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      pad_to_max_length=True,
      return_attention_mask=True,
      return_tensors='pt',
        )
        return {
            'review_text' : review,
            'input_ids' : encoding['input_ids'].flatten(),
            'attention_mask' : encoding['attention_mask'].flatten(), 
            'target' : torch.tensor(target, dtype=torch.long)
        }
   

      

In [13]:
class GPReviewDataset(Dataset):

  def __init__(self, reviews, targets, tokenizer, max_len):
    self.reviews = reviews
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len
  
  def __len__(self):
    return len(self.reviews)
  
  def __getitem__(self, item):
    review = str(self.reviews[item])
    target = self.targets[item]
    print(type(target))
    encoding = self.tokenizer.encode_plus(
      review,
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      pad_to_max_length=True,
      return_attention_mask=True,
      return_tensors='pt',
    )

    return {
      'review_text': review,
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'targets': torch.tensor(target, dtype=torch.long)
    }

In [14]:
RANDOM_SEED = 42
df_train, df_test = train_test_split(df, test_size=0.1, random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=RANDOM_SEED)

In [15]:
df_test.shape, df_train.shape, df_val.shape

((2500, 2), (45000, 2), (2500, 2))

In [16]:
def create_data_loader(df, tokenizer, max_len, batch_size):
    ds = IMDBReview(
        reviews = df.review.to_numpy(),
        targets = df.sentiment.to_numpy(), 
        tokenizer = tokenizer,
        max_len = max_len
    )
    
    return DataLoader(
      ds, 
      batch_size=batch_size, 
      num_workers = 4
     )

In [17]:
BATCH_SIZE = 16


train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

In [18]:
print(train_data_loader)
data = next(iter(train_data_loader))
data.keys()

<torch.utils.data.dataloader.DataLoader object at 0x7f01a6f8c9d0>


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

dict_keys(['review_text', 'input_ids', 'attention_mask', 'target'])

In [19]:
print(data.keys())

dict_keys(['review_text', 'input_ids', 'attention_mask', 'target'])


In [20]:
bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME, return_dict=False)

In [21]:
last_hidden_state, pooled_output = bert_model(
    input_ids=encoding['input_ids'], 
    attention_mask=encoding['attention_mask'])

In [22]:
last_hidden_state

tensor([[[ 0.5012,  0.0094, -0.2136,  ..., -0.1368,  0.0896,  0.1009],
         [ 0.5462, -0.6505,  0.3598,  ..., -0.3123,  0.5764,  0.1291],
         [ 0.2375, -0.6279,  0.0104,  ..., -0.0887,  0.2559, -0.0110],
         ...,
         [ 0.7251, -0.7127, -0.1225,  ..., -0.2621,  0.3136, -0.3228],
         [ 0.4324,  0.0997,  0.2295,  ...,  0.4003, -0.0422,  0.2122],
         [ 1.1102,  0.4780, -0.1035,  ...,  1.0135, -0.3847,  0.1727]]],
       grad_fn=<NativeLayerNormBackward>)

In [23]:
bert_model.config.hidden_size

768

In [24]:
pooled_output.shape

torch.Size([1, 768])

In [25]:
class SentimentClassifier(nn.Module):
    def __init__(self, n_classes):
        super(SentimentClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME, return_dict=False)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
        
    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        output = self.drop(pooled_output)
        return self.out(output)

In [26]:
#temp
device = torch.device("cuda:0")
class_names = ["positive", "negative"]
model = SentimentClassifier(len(class_names))
model = model.to(device)


#from apex.parallel import DistributedDataParallel as DDP
#model = DDP(model)

n_gpu = torch.cuda.device_count()
if n_gpu > 1:
    print("Number of GPU is ", n_gpu)
    model = torch.nn.DataParallel(model)
print(model)


Number of GPU is  2
DataParallel(
  (module): SentimentClassifier(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(28996, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768

In [27]:
input_ids = data['input_ids'].to(device)
attention_mask = data['attention_mask'].to(device)

print(input_ids.shape)
print(attention_mask.shape)

torch.Size([16, 256])
torch.Size([16, 256])


In [28]:
torch.cuda.empty_cache()
F.softmax(model(input_ids, attention_mask), dim=1)

tensor([[0.3046, 0.6954],
        [0.3393, 0.6607],
        [0.3067, 0.6933],
        [0.3235, 0.6765],
        [0.2547, 0.7453],
        [0.3784, 0.6216],
        [0.3589, 0.6411],
        [0.1736, 0.8264],
        [0.2308, 0.7692],
        [0.3626, 0.6374],
        [0.3436, 0.6564],
        [0.1594, 0.8406],
        [0.1786, 0.8214],
        [0.3419, 0.6581],
        [0.3930, 0.6070],
        [0.2765, 0.7235]], device='cuda:0', grad_fn=<SoftmaxBackward>)

In [29]:
EPOCHS = 3
#what is correct bias
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_data_loader) * EPOCHS
#scheduler has linear relationship with the total number of steps and drops lr to 0 at end
scheduler = get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=0,# lr goes from 0 to the initial value
                num_training_steps=total_steps
            )

loss_function = nn.CrossEntropyLoss().to(device)



In [30]:
def train(
    model, 
    data_loader, 
    loss_function, 
    optimizer, 
    device, 
    scheduler, 
    n_examples
):
    model = model.train()
    
    losses = []
    correct_pred = 0
    
    for data in data_loader:
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        targets = data['target'].to(device)
        
        outputs = model(input_ids=input_ids, 
                        attention_mask=attention_mask
                       )
        _, preds= torch.max(outputs, dim=1)
        loss = loss_function(outputs, targets)
        
        correct_pred += torch.sum(preds == targets)
        #what does loss item return???
        losses.append(loss.item())
        
        loss.backward()
        #understand this line
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        
    return correct_pred.double() / n_examples, np.mean(losses)
        

In [31]:
def eval(model, 
        data_loader, 
        loss_function, 
        device, n_examples
        ):
    losses = []
    correct_predictions = 0
    
    with torch.no_grad():
        for data in data_loader:
            
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            targets = data['target'].to(device)

            outputs = model(input_ids=input_ids, 
                        attention_mask=attention_mask
                       )
            _, preds= torch.max(outputs, dim=1)
            loss = loss_function(outputs, targets)

            correct_predictions += torch.sum(preds == targets)
            #what does loss item return???
            losses.append(loss.item())
    return correct_predictions.double() / n_examples, np.mean(losses)
            

In [None]:
%time

history = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCHS):
    print("Epoch - ", epoch + 1)
    torch.cuda.empty_cache()

    train_accuracy, train_loss = train(
        model, train_data_loader, 
        loss_function, 
        optimizer, 
        device, 
        scheduler, 
        len(df_train)
    )
    torch.cuda.empty_cache()
    print("train done-- starting validation")
    validation_accuracy, validation_loss = eval(
        model,
        val_data_loader, 
        loss_function, 
        device, 
        len(df_val)
    )
    
    print(f'Val   loss {validation_loss} accuracy {validation_accuracy}')
    
    history['train_acc'].append(train_accuracy)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(validation_accuracy)
    history['val_loss'].append(validation_loss)
    torch.cuda.empty_cache()
    
    if validation_accuracy> best_accuracy:
        #torch.save(model.state_dict(), 'best_model_state.bin')
        best_accuracy = validation_accuracy
        
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss,
            ...
            }, PATH)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.81 µs
Epoch -  1


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

train done-- starting validation


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

Val   loss 0.2415132091920467 accuracy 0.908
Epoch -  2


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

In [66]:
#plotting
model = torch.load('best_model_state.bin')

In [67]:
test_accuracy, _ = eval(model,
                        test_data_loader, 
                        loss_function, 
                        device,
                        len(df_test)
                       )

test_accuracy.item()

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

TypeError: 'collections.OrderedDict' object is not callable

In [32]:
device = torch.device("cuda:0")
class_names = ["positive", "negative"]
model = SentimentClassifier(len(class_names))
model = model.to(device)


#from apex.parallel import DistributedDataParallel as DDP
#model = DDP(model)

n_gpu = torch.cuda.device_count()
if n_gpu > 1:
    print("Number of GPU is ", n_gpu)
    model = torch.nn.DataParallel(model)
#print(model)
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

PATH = 'best_model_state.bin'
checkpoint = torch.load(PATH)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = 3 #checkpoint['epoch']
loss = checkpoint['loss']

Number of GPU is  2


In [35]:
#Predict on Raw Text

text = "The movie ending is the worst, but overall good"

encoded_review = tokenizer.encode_plus(
    text,
    max_length=MAX_LEN,
    add_special_tokens=True,
    return_token_type_ids=False,
    pad_to_max_length=True,
    return_attention_mask=True,
    return_tensors='pt',
)



In [38]:
input_ids = encoded_review['input_ids'].to(device)
attention_mask = encoded_review['attention_mask'].to(device)

output = model(input_ids, attention_mask)
_, prediction = torch.max(output, dim=1)

print("Input Text ", text)
print(f'Sentiment : {class_names[prediction]}')

Input Text  The movie ending is good, but the starting was not up to tha mark
Sentiment : positive
