# Rating prediction using SVM and Embeddings

In [1]:
import pandas as pd
import sklearn
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertForSequenceClassification
from transformers import AutoTokenizer

## Creating data loader

In [2]:
class MyDataSet(Dataset):
    """Tokenized review texts paired with their rating labels.

    One pre-processed CSV split is read, its ``textFull`` column is tokenized
    once up front with the ``bert-base-cased`` tokenizer
    (``padding=True, truncation=True``), and the ``rating`` column is kept as
    the label source. Items are served as dicts of tensors.

    Args:
        example: ``"train"`` or ``"val"`` selects that split; any other value
            loads the test split.
    """

    # Split name -> CSV path. Anything not listed here falls back to the test file.
    _SPLIT_FILES = {
        "train": '../Kaggle-dataset/pre-processed/trainDataset.csv',
        "val": '../Kaggle-dataset/pre-processed/valDataset.csv',
    }
    _FALLBACK_FILE = '../Kaggle-dataset/pre-processed/testDataset.csv'

    def __init__(self, example="train"):
        self.tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
        csv_path = self._SPLIT_FILES.get(example, self._FALLBACK_FILE)
        dataset = pd.read_csv(csv_path, encoding="latin1")

        self.text_data = dataset["textFull"]
        # The whole split is tokenized in one call, so every sample shares one padded length.
        self.tokens = self.tokenizer(list(self.text_data), padding=True, truncation=True)
        self.labels = dataset["rating"]

    def __len__(self):
        """Number of samples, i.e. the number of rating labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        The dict holds one tensor per tokenizer output field plus ``labels``,
        which is the rating minus one (presumably mapping 1..5 ratings onto
        class ids 0..4 -- confirm against the CSV).
        """
        item = {field: torch.tensor(rows[idx]) for field, rows in self.tokens.items()}
        # Positional lookup: plain ``self.labels[idx]`` is label-based on a Series and
        # fails (or picks the wrong row) whenever the index is not exactly 0..n-1.
        item['labels'] = torch.tensor(self.labels.iloc[idx]) - 1
        return item

In [3]:
batch_size = 32

# Only the training split is shuffled. Evaluation does not benefit from a random
# order, and a fixed order keeps batches reproducible and aligned with the CSV rows.
train_dataset = MyDataSet(example = "train")
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_dataset = MyDataSet(example = "val")
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)
test_dataset = MyDataSet(example = "test")
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

## Model

In [12]:
# Pretrained cased BERT with a 5-way classification head on top.
bert = BertForSequenceClassification.from_pretrained(
    'bert-base-cased',
    num_labels=5,
)

# Freeze the encoder; leave only the classification head trainable.
bert.bert.requires_grad_(False)
bert.classifier.requires_grad_(True)

# Loss over the 5 class logits, and AdamW over the model's parameters.
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(params=bert.parameters(), lr=1e-5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
num_epochs = 10
for epoch in range(num_epochs):
    print("Epoch: ",(epoch + 1), "----------------------------------")

    # ---- training pass: one optimizer step per batch ----
    bert.train()
    total_loss_train = 0
    for i, batch in enumerate(train_loader):
        # Progress as "batch index, number of batches". The length has to come from
        # the loader itself: an enumerate object has no len() and raises TypeError.
        print(i, len(train_loader))

        optimizer.zero_grad()
        outputs = bert(input_ids = batch['input_ids'], attention_mask = batch['attention_mask'])

        pred = outputs.logits
        loss = loss_fn(pred, batch['labels'])

        loss.backward()
        optimizer.step()

        total_loss_train += loss.item()
    # Reported value is the sum of the per-batch losses, not their mean.
    print(f"Train loss: {total_loss_train}")

    # ---- validation pass: no gradients, accumulate loss and accuracy ----
    bert.eval()
    correct = 0
    total = 0
    total_loss_val = 0
    with torch.no_grad():
        for i, batch in enumerate(val_loader):
            outputs = bert(input_ids = batch['input_ids'], attention_mask = batch['attention_mask'])
            logits = outputs.logits
            loss = loss_fn(logits, batch['labels'])
            total_loss_val += loss.item()

            correct += (logits.argmax(1) == batch['labels']).sum().item()
            # Number of samples in this batch (the last batch may be smaller).
            total += batch['labels'].size(0)
    print("Val loss: ",total_loss_val)
    print("Val Acuracy: ",correct/total)

Epoch:  1 ----------------------------------
0


KeyboardInterrupt: 

## Save Model

In [25]:
import os
import pickle

# open(..., 'wb') does not create missing directories, so make sure the
# output folder exists before writing into it.
os.makedirs('bert', exist_ok=True)

# NOTE(review): pickling the whole model/tokenizer objects ties these files to the
# installed library versions; only unpickle files from a trusted source.
with open('bert/bert.pkl','wb') as f:
    pickle.dump(bert,f)
with open('bert/tokenizer.pkl','wb') as f:
    pickle.dump(AutoTokenizer.from_pretrained("bert-base-cased"),f)

# Scores

In [26]:
def predict(embedding, svm, X):
    """Embed every element of ``X`` and classify the resulting vectors.

    Args:
        embedding: Callable applied to each element of ``X``; its result must
            expose a ``.vector`` attribute.
        svm: Object with a ``predict`` method that accepts the list of vectors.
        X: Iterable of inputs to embed.

    Returns:
        Whatever ``svm.predict`` returns for the embedded inputs.
    """
    vectors = []
    for sample in X:
        vectors.append(embedding(sample).vector)
    return svm.predict(vectors)

# NOTE(review): `valDataSet`, `testDataSet`, `embedding` and `svm` are not defined
# anywhere in the visible part of this notebook -- presumably leftover kernel state
# from an earlier SVM experiment. Confirm where they come from before re-running.

# Reference ratings and predictions for the validation split.
Y_val = valDataSet["rating"]
Y_val_pred = predict(embedding, svm, valDataSet["textFull"])
# Reference ratings and predictions for the test split.
Y_test = testDataSet["rating"]
Y_test_pred = predict(embedding, svm, testDataSet["textFull"])


In [27]:
# Accuracy plus F1 with average=None for the validation and test splits.
acc_val, f1_val = (
    sklearn.metrics.accuracy_score(Y_val, Y_val_pred),
    sklearn.metrics.f1_score(Y_val, Y_val_pred, average=None),
)
acc_test, f1_test = (
    sklearn.metrics.accuracy_score(Y_test, Y_test_pred),
    sklearn.metrics.f1_score(Y_test, Y_test_pred, average=None),
)

# Same three-line report for each split, validation first.
for split_label, split_acc, split_f1 in (
    ("Val", acc_val, f1_val),
    ("Test", acc_test, f1_test),
):
    print(split_label)
    print("\tAcuracia: ", split_acc)
    print("\tF1: ", split_f1)

Val
	Acuracia:  0.6518754596714881
	F1:  [0.59674389 0.04761905 0.03826087 0.13442325 0.78011332]
Test
	Acuracia:  0.659375
	F1:  [0.5943304  0.02083333 0.03743316 0.15555556 0.78945939]


In [31]:
# Row position of the test example to inspect.
example = 33
# All three lookups are positional so they refer to the same row; a plain
# `Y_test[example]` would be a label-based lookup on the Series index instead.
print(testDataSet.iloc[example]["textFull"])
print( "Original: ", Y_test.iloc[example])
print("Predict: ", Y_test_pred[example])

please read amazing phone wish couldve used really research phone wa excited purchase made amazon unfortunately next day received phone thats nightmare started phone looked brand new sealed packaging like would purchase store example literally use scissors cut packing even get provided book cut complete different package even touch phone went proper procedure activating phone purchasing bundle plan use phone wa completely done programming customer service rep tell make call error message stated phone authenticated happened february 27th today march 3 2014 learned return phone amazon tthis review customer beware purpose amazon verizon wireless excellent customer service tried hard fix issue crazy part learned tonight phone purchased actually hacked never able use granted wa upset almost week late night verizon rep wa getting old trying million thing repeatedly try fix phone tonight verizon rep told straight maam need return phone somebody ha hacked phone resold whatever site bought wa s