# Ratting prediction using BERT

In [31]:
import pandas as pd
import sklearn
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertForSequenceClassification
from transformers import AutoTokenizer

In [32]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

## Creating data loader

In [None]:
class MyDataSet(Dataset):
    def __init__(self, example="train"):
        self.tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
        if example == "train":
            dataset = pd.read_csv('../Kaggle-dataset/pre-processed/trainDataset.csv',encoding="latin1")
        elif example == "val":
            dataset = pd.read_csv('../Kaggle-dataset/pre-processed/valDataset.csv',encoding="latin1")
        else:
            dataset = pd.read_csv('../Kaggle-dataset/pre-processed/testDataset.csv',encoding="latin1")

        self.text_data = dataset["textFull"]
        self.tokens = self.tokenizer(list(dataset["textFull"]), padding = True, truncation=True)
        self.labels = dataset["rating"]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {}
        for k, v in self.tokens.items():
            item[k] = torch.tensor(v[idx]).to(device)
        item['labels'] = torch.tensor(self.labels[idx]).to(device) - 1
        return item

In [34]:
batch_size = 32
train_dataset = MyDataSet(example = "train")
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_dataset = MyDataSet(example = "val")
val_loader = DataLoader(val_dataset, shuffle=True, batch_size=batch_size)
test_dataset = MyDataSet(example = "test")
test_loader = DataLoader(test_dataset, shuffle=True, batch_size=batch_size)

## Model

In [35]:
bert = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=5).to(device)
bert.bert.requires_grad_(False)
bert.classifier.requires_grad_(True)
optimizer = torch.optim.AdamW(bert.parameters(), lr=1e-4) 
loss_fn = torch.nn.CrossEntropyLoss()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from tqdm import tqdm
num_epochs = 4
for epoch in tqdm(range(num_epochs)):
    if epoch >= 2:
        print("Unfreezing layers")
        optimizer = torch.optim.AdamW(bert.parameters(), lr=1e-5)
        bert.bert.requires_grad_(True)
        bert.classifier.requires_grad_(True)
        
    print("Epoch: ",(epoch + 1), "----------------------------------")
    bert.train()
    total_loss_train = 0
    print(len((train_loader))," :", end=" ")

    for i,batch in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = bert(input_ids = batch['input_ids'], attention_mask = batch['attention_mask'])
        
        pred = outputs.logits
        loss = loss_fn(pred, batch['labels'])

        loss.backward()
        optimizer.step()

        total_loss_train += loss.item()
        if i % 100 == 0:
            print(i, ":", total_loss_train, end=" ")
    print(f"\nTrain loss: {total_loss_train}")

    bert.eval()
    correct = 0
    total = 0
    total_loss_val = 0
    for i, batch in enumerate(val_loader):
        with torch.no_grad():
            outputs = bert(input_ids = batch['input_ids'], attention_mask = batch['attention_mask'])
        logits = outputs.logits
        loss = loss_fn(logits, batch['labels'])
        total_loss_val += loss.item()
        
        correct += (logits.argmax(1) == batch['labels']).sum().item()
        total += (logits.argmax(1) == logits.argmax(1)).sum().item()
    print(f"Val loss: ",total_loss_val)
    print(f"Val Acc: ",correct/total)

  0%|          | 0/4 [00:00<?, ?it/s]

Epoch:  1 ----------------------------------
1700  : 0 : 1.8335634469985962 100 : 132.05991274118423 200 : 256.4970574975014 300 : 381.1644026041031 400 : 503.49818456172943 500 : 627.6555376648903 600 : 748.6356809735298 700 : 869.3627625107765 800 : 991.6781249046326 900 : 1114.1847217082977 1000 : 1235.51602435112 1100 : 1355.6421993374825 1200 : 1474.2077613472939 1300 : 1591.0556537508965 1400 : 1708.5779254436493 1500 : 1828.6925369501114 1600 : 1944.4770182967186 
Train loss: 2063.0047773718834


 25%|██▌       | 1/4 [21:06<1:03:20, 1266.94s/it]

Val loss:  298.36660146713257
Val Acc:  0.5505025741603334
Epoch:  2 ----------------------------------
1700  : 0 : 1.2560410499572754 100 : 115.96689856052399 200 : 232.16897064447403 300 : 348.0586379170418 400 : 467.41283309459686 500 : 587.765111386776 600 : 704.4739812612534 700 : 820.5711499452591 800 : 936.3104489445686 900 : 1053.842625796795 1000 : 1170.2942896485329 1100 : 1284.0791102051735 1200 : 1397.992063820362 1300 : 1513.4232644438744 1400 : 1628.5628224611282 1500 : 1743.8629182577133 1600 : 1857.6377017498016 
Train loss: 1971.0829531550407


 50%|█████     | 2/4 [42:13<42:13, 1266.61s/it]  

Val loss:  288.9850385785103
Val Acc:  0.5878891885265997
Unfreezing layers
Epoch:  3 ----------------------------------
1700  : 0 : 0.9096465110778809 100 : 95.97625014185905 200 : 177.66485515236855 300 : 252.034624427557 400 : 325.9691686630249 500 : 396.1013750731945 600 : 469.35477340221405 700 : 538.5543161034584 800 : 608.3447819948196 900 : 679.7245073318481 1000 : 745.9303385019302 1100 : 815.500756919384 1200 : 884.1310867369175 1300 : 950.240194439888 1400 : 1017.9852719604969 1500 : 1086.087211072445 1600 : 1150.698146611452 
Train loss: 1214.8270719349384


 75%|███████▌  | 3/4 [1:37:41<36:47, 2207.80s/it]

Val loss:  162.9590938091278
Val Acc:  0.7623191958813434
Unfreezing layers
Epoch:  4 ----------------------------------
1700  : 0 : 0.49026045203208923 100 : 59.17432424426079 200 : 120.14272654056549 300 : 177.7812306135893 400 : 238.1061608940363 500 : 297.23331908881664 600 : 357.74955417215824 700 : 416.6139647513628 800 : 479.3064527362585 900 : 538.2805634588003 1000 : 597.270447358489 1100 : 655.2973441332579 1200 : 715.798188611865 1300 : 776.0568182766438 1400 : 839.7777493596077 1500 : 900.3527182936668 1600 : 959.1219072341919 
Train loss: 1012.5034868568182


100%|██████████| 4/4 [2:33:02<00:00, 2295.65s/it]

Val loss:  160.6129099279642
Val Acc:  0.7733513115959794





## Save Model

In [41]:
import pickle
with open('bert/bert.pkl','wb') as f:
    pickle.dump(bert,f)
with open('bert/tokenizer.pkl','wb') as f:
    pickle.dump(AutoTokenizer.from_pretrained("bert-base-cased"),f)

# Scores

In [None]:
def predict(bert, loader):
    Y = []
    Y_pred = []
    for i, batch in tqdm(enumerate(loader)):
        with torch.no_grad():
            Y_pred += bert.to(device)(input_ids = batch['input_ids'], attention_mask = batch['attention_mask']).logits.argmax(1).to('cpu')
            Y += batch['labels'].to('cpu')
        break
    return Y,Y_pred


valDataSet = pd.read_csv('../Kaggle-dataset/pre-processed/valDataset.csv',encoding="latin1")
testDataSet = pd.read_csv('../Kaggle-dataset/pre-processed/testDataset.csv',encoding="latin1")
            
Y_val, Y_val_pred = predict(bert, val_loader)
Y_test, Y_test_pred = predict(bert, test_loader)


12it [00:07,  1.62it/s]

In [None]:
acc_val = sklearn.metrics.accuracy_score(Y_val, Y_val_pred)
f1_val = sklearn.metrics.f1_score(Y_val, Y_val_pred, average=None)
acc_test = sklearn.metrics.accuracy_score(Y_test, Y_test_pred)
f1_test = sklearn.metrics.f1_score(Y_test, Y_test_pred, average=None)

print("Val")
print("\tAcuracia: ", acc_val)
print("\tF1: ", f1_val)

print("Test")
print("\tAcuracia: ", acc_test)
print("\tF1: ", f1_test)

Val
	Acuracia:  0.39801421917136554
	F1:  [0.2294068  0.0224     0.0438247  0.10458284 0.5772168 ]
Test
	Acuracia:  0.39650735294117645
	F1:  [0.18214286 0.03296703 0.07328244 0.11945117 0.57944218]


In [40]:
# example = 33
# print(testDataSet.iloc[example]["textFull"])
# print( "Original: ", Y_test[example])
# print("Predict: ", Y_test_pred[example])