In [1]:
import pandas as pd
import json

import torch.nn as nn
import torch
import torch.nn.functional as F

from tqdm.notebook import tqdm
from sklearn.utils import shuffle

import numpy as np
from transformers import BertModel, BertConfig, BertTokenizer, AdamW

In [3]:
reviews = []
data = open("../review_dataset/Industrial_and_Scientific_5.json")
for line in data.readlines():
    reviews.append(json.loads(line))

In [34]:
review_texts = []
review_scores = []

for sample in reviews:
    if 'reviewText' in sample and 'overall' in sample:
        review_texts.append(sample['reviewText'])
        if sample['overall'] >= 4:
            review_scores.append(1)
        else:
            review_scores.append(0)
            
train_reviews = review_texts[:len(review_texts)//2]
test_reviews = review_texts[len(review_texts)//2:]
train_scores = review_scores[:len(review_texts)//2]
test_scores = review_scores[len(review_texts)//2:]

In [37]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tokenized_train_reviews = tokenizer(train_reviews, return_tensors="pt", padding='max_length', truncation=True)
tokenized_test_reviews = tokenizer(test_reviews, return_tensors="pt", padding='max_length', truncation=True)

In [38]:
from torch.utils.data import Dataset, DataLoader
import random

class ReviewDataset(torch.utils.data.Dataset):
    def __init__(self, tokens, labels):
        self.tokens = tokens
        self.labels = labels
        
    def __len__(self):
        return len(self.tokens['input_ids'])

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.tokens.items()}
        out = {"tokens": item, "label": self.labels[idx]}
        return out

In [41]:
batch_size = 12

train_dataset = ReviewDataset(tokenized_train_reviews, train_scores)
test_dataset = ReviewDataset(tokenized_test_reviews, test_scores)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [43]:
class BertMLP(nn.Module):
  
    def __init__(self, num_labels=2, hidden_size=100, dropout_prob=.1):
        super(BertMLP, self).__init__()
        self.num_labels = num_labels
        self.bert = BertModel.from_pretrained('bert-base-uncased')
#         self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(768, num_labels)

    def forward(self, batch, return_embs = False):
        bert_emb = self.bert(input_ids = batch['input_ids'], 
           attention_mask=batch['attention_mask'], 
           token_type_ids=batch['token_type_ids']).last_hidden_state[:,0]
        logits = self.classifier(bert_emb)
        
        if return_embs: 
            return logits, bert_emb
        else:
            return logits
        
def tokens_to_cuda(tokens, device):
    dictionary = {}
    for key, value in tokens.items():
        dictionary[key] = value.to(device)
    return dictionary

In [52]:
device = 'cuda'
model = BertMLP()
model.to(device)

loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(
    [
        {"params":model.bert.parameters(),"lr": 0.00001},
        {"params":model.classifier.parameters(), "lr": 0.001},       
   ])

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [53]:
epochs = 2

model.train()

for epoch in tqdm(range(epochs)):
    epoch_loss = 0
    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        
        tokens = tokens_to_cuda(batch['tokens'], device)
        scores = model(tokens)    
                
        loss = loss_function(scores, batch['label'].to(device)) / batch_size

        epoch_loss += loss
        
        loss.backward()
        optimizer.step()
       
    val_loss = 0
    with torch.no_grad():
        for batch in tqdm(test_loader):
            tokens = tokens_to_cuda(batch['tokens'], device)
            scores = model(tokens)    
            loss = loss_function(scores, batch['label'].to(device)) / batch_size

            val_loss += loss
        
    print("End of Epoch", epoch)
    print("Training loss:", epoch_loss)
    print("Test loss:", val_loss)

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3211 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.tokens.items()}


  0%|          | 0/3211 [00:00<?, ?it/s]

End of Epoch 0
Training loss: tensor(46.9083, device='cuda:0', grad_fn=<AddBackward0>)
Test loss: tensor(47.2068, device='cuda:0')


  0%|          | 0/3211 [00:00<?, ?it/s]

  0%|          | 0/3211 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [58]:
model.eval()
preds = []

with torch.no_grad():
    for batch in tqdm(test_loader):
        tokens = tokens_to_cuda(batch['tokens'], device)
        scores = F.softmax(model(tokens))
        preds.extend(torch.round(F.softmax(scores)[:,1]).to(torch.int64))

  0%|          | 0/3211 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.tokens.items()}
  scores = F.softmax(model(tokens))
  preds.extend(torch.round(F.softmax(scores)[:,1]).to(torch.int64))
  right += torch.count_nonzero(batch['label'].to(device) == torch.round(F.softmax(scores)[:,1]).to(torch.int64))


In [59]:
from sklearn.metrics import f1_score

preds_cpu = [i.to('cpu') for i in preds]
print("End of training f1 score accuracy", f1_score(test_scores, preds_cpu))

End of training test accuracy tensor(0.9371, device='cuda:0')
