# Ratting prediction using BERT

In [12]:
import pandas as pd
import sklearn
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertForSequenceClassification
from transformers import AutoTokenizer

In [13]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

## Creating data loader

In [14]:
class MyDataSet(Dataset):
    def __init__(self, example="train"):
        self.tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
        if example == "train":
            dataset = pd.read_csv('../Kaggle-dataset/pre-processed/trainDataset.csv',encoding="latin1")
        elif example == "val":
            dataset = pd.read_csv('../Kaggle-dataset/pre-processed/valDataset.csv',encoding="latin1")
        else:
            dataset = pd.read_csv('../Kaggle-dataset/pre-processed/testDataset.csv',encoding="latin1")

        self.text_data = dataset["textFull"]
        self.tokens = self.tokenizer(list(dataset["textFull"]), padding = True, truncation=True)
        self.labels = dataset["rating"]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {}
        for k, v in self.tokens.items():
            item[k] = torch.tensor(v[idx]).to(device)
        item['labels'] = torch.tensor(self.labels[idx]).to(device) - 1
        return item

In [15]:
batch_size = 32
train_dataset = MyDataSet(example = "train")
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_dataset = MyDataSet(example = "val")
val_loader = DataLoader(val_dataset, shuffle=True, batch_size=batch_size)
test_dataset = MyDataSet(example = "test")
test_loader = DataLoader(test_dataset, shuffle=True, batch_size=batch_size)

## Model

In [16]:
bert = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=5).to(device)
bert.bert.requires_grad_(False)
bert.classifier.requires_grad_(True)
optimizer = torch.optim.AdamW(bert.parameters(), lr=1e-4) 
loss_fn = torch.nn.CrossEntropyLoss()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
from tqdm import tqdm
num_epochs = 4
for epoch in tqdm(range(num_epochs)):
    if epoch >= 2:
        print("Unfreezing layers")
        optimizer = torch.optim.AdamW(bert.parameters(), lr=1e-5)
        bert.bert.requires_grad_(True)
        bert.classifier.requires_grad_(True)
        
    print("Epoch: ",(epoch + 1), "----------------------------------")
    bert.train()
    total_loss_train = 0
    print(len((train_loader))," :", end=" ")

    for i,batch in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = bert(input_ids = batch['input_ids'], attention_mask = batch['attention_mask'])
        
        pred = outputs.logits
        loss = loss_fn(pred, batch['labels'])

        loss.backward()
        optimizer.step()

        total_loss_train += loss.item()
        if i % 100 == 0:
            print(i, ":", total_loss_train, end=" ")
    print(f"\nTrain loss: {total_loss_train}")

    bert.eval()
    correct = 0
    total = 0
    total_loss_val = 0
    for i, batch in enumerate(val_loader):
        with torch.no_grad():
            outputs = bert(input_ids = batch['input_ids'], attention_mask = batch['attention_mask'])
        logits = outputs.logits
        loss = loss_fn(logits, batch['labels'])
        total_loss_val += loss.item()
        
        correct += (logits.argmax(1) == batch['labels']).sum().item()
        total += (logits.argmax(1) == logits.argmax(1)).sum().item()
    print(f"Val loss: ",total_loss_val)
    print(f"Val Acc: ",correct/total)

  0%|          | 0/4 [00:00<?, ?it/s]

Epoch:  1 ----------------------------------
1700  : 0 : 1.7215142250061035 100 : 131.0129935145378 200 : 255.11851334571838 300 : 379.512457549572 400 : 503.1920762062073 500 : 628.201465010643 600 : 749.459127664566 700 : 871.3758177757263 800 : 989.6556088328362 900 : 1107.2049055099487 1000 : 1228.9010525941849 1100 : 1350.494466483593 1200 : 1472.497771203518 1300 : 1593.3632286787033 1400 : 1711.6744503974915 1500 : 1831.2689664959908 1600 : 1949.5232699513435 
Train loss: 2064.500599563122


 25%|██▌       | 1/4 [20:59<1:02:57, 1259.24s/it]

Val loss:  299.08642315864563
Val Acc:  0.5558960529541555
Epoch:  2 ----------------------------------
1700  : 0 : 1.1871343851089478 100 : 119.28984063863754 200 : 235.25229382514954 300 : 351.8604729771614 400 : 469.61310613155365 500 : 589.9338834881783 600 : 707.850201010704 700 : 826.0017397403717 800 : 940.9992127418518 900 : 1055.1705548763275 1000 : 1172.9520829319954 1100 : 1286.08813560009 1200 : 1401.047224342823 1300 : 1517.2572385668755 1400 : 1633.3195210695267 1500 : 1749.3205134868622 1600 : 1859.8151040673256 
Train loss: 1973.696773469448


 50%|█████     | 2/4 [42:15<42:17, 1268.98s/it]  

Val loss:  289.37465238571167
Val Acc:  0.5644765873988723
Unfreezing layers
Epoch:  3 ----------------------------------
1700  : 0 : 1.2591253519058228 100 : 96.14495965838432 200 : 178.54502138495445 300 : 253.49826389551163 400 : 328.7114703953266 500 : 403.01564890146255 600 : 474.2522301375866 700 : 539.0294333994389 800 : 609.4439779222012 900 : 679.3897665143013 1000 : 745.7661032378674 1100 : 815.6719637215137 1200 : 883.5239301025867 1300 : 951.8685435950756 1400 : 1018.9645284116268 1500 : 1084.6997320353985 1600 : 1151.3484317660332 
Train loss: 1216.8886517584324


 75%|███████▌  | 3/4 [1:37:47<36:51, 2211.44s/it]

Val loss:  162.5352668762207
Val Acc:  0.7632998283893111
Unfreezing layers
Epoch:  4 ----------------------------------
1700  : 0 : 0.5634540319442749 100 : 60.730956330895424 200 : 119.76999552547932 300 : 180.47433419525623 400 : 242.06087498366833 500 : 301.8812962025404 600 : 360.6072488874197 700 : 422.6696712821722 800 : 482.9623290747404 900 : 540.7539758831263 1000 : 600.9427126795053 1100 : 660.946772262454 1200 : 721.4128616452217 1300 : 781.1115184724331 1400 : 840.459117859602 1500 : 901.9782637059689 1600 : 960.2429176270962 
Train loss: 1017.5006348788738


100%|██████████| 4/4 [2:32:56<00:00, 2294.11s/it]

Val loss:  158.40258176624775
Val Acc:  0.7696739396911008





## Save Model

In [18]:
import pickle
with open('bert/bert.pkl','wb') as f:
    pickle.dump(bert,f)
with open('bert/tokenizer.pkl','wb') as f:
    pickle.dump(AutoTokenizer.from_pretrained("bert-base-cased"),f)

# Scores

In [21]:
def predict(bert, loader):
    Y = []
    Y_pred = []
    for i, batch in tqdm(enumerate(loader)):
        with torch.no_grad():
            Y_pred += bert.to(device)(input_ids = batch['input_ids'], attention_mask = batch['attention_mask']).logits.argmax(1).to('cpu')
            Y += batch['labels'].to('cpu')
    return Y,Y_pred


valDataSet = pd.read_csv('../Kaggle-dataset/pre-processed/valDataset.csv',encoding="latin1")
testDataSet = pd.read_csv('../Kaggle-dataset/pre-processed/testDataset.csv',encoding="latin1")
            
Y_val, Y_val_pred = predict(bert, val_loader)
Y_test, Y_test_pred = predict(bert, test_loader)


0it [00:00, ?it/s]

255it [02:42,  1.57it/s]
170it [01:47,  1.58it/s]


In [22]:
acc_val = sklearn.metrics.accuracy_score(Y_val, Y_val_pred)
f1_val = sklearn.metrics.f1_score(Y_val, Y_val_pred, average=None)
acc_test = sklearn.metrics.accuracy_score(Y_test, Y_test_pred)
f1_test = sklearn.metrics.f1_score(Y_test, Y_test_pred, average=None)

print("Val")
print("\tAcuracia: ", acc_val)
print("\tF1: ", f1_val)

print("Test")
print("\tAcuracia: ", acc_test)
print("\tF1: ", f1_test)

Val
	Acuracia:  0.7696739396911008
	F1:  [0.79933481 0.275      0.42929293 0.40878828 0.89294093]
Test
	Acuracia:  0.7676470588235295
	F1:  [0.77685226 0.23369565 0.40897098 0.43112701 0.89724626]
