# 1. Loading the dataset

In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

import warnings
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
file_path = "./datasets/train.txt"
train = pd.read_csv(file_path, sep='\t', header=None, names=['LABEL', 'REVIEW'])
train

Unnamed: 0,LABEL,REVIEW
0,TRUTHFULPOSITIVE,The sheraton was a wonderful hotel! When me an...
1,TRUTHFULPOSITIVE,We stayed at the Omni between Christmas and Ne...
2,DECEPTIVENEGATIVE,I was REALLY looking forward to a nice relaxin...
3,TRUTHFULNEGATIVE,"First let me say, I try not to be too critical..."
4,DECEPTIVENEGATIVE,The Ambassador East Hotel is a terrible place ...
...,...,...
1395,TRUTHFULNEGATIVE,I stayed here for 5 nights last summer. I book...
1396,TRUTHFULPOSITIVE,Stayed here for 3 nights for a Bridgestone/Fir...
1397,TRUTHFULNEGATIVE,I am staying here now and actually am compelle...
1398,TRUTHFULNEGATIVE,We stayed at this hotel with our two teenage d...


In [3]:
file_path = "./datasets/test_just_reviews.txt"
test = pd.read_csv(file_path, sep='\t', header=None, names=['REVIEW'])
test

Unnamed: 0,REVIEW
0,My family and I stayed here while we were visi...
1,WARNING! My stay at the Talbott Hotel will go ...
2,I recently stayed at the Hard Rock Hotel in Ch...
3,O.M.G best hotel ever ! i've stayed at various...
4,We became an Ambassador member just before spe...
...,...
195,The Millennium Knickerbocker Hotel has seen be...
196,We got a spanking deal at this hotel for $99 a...
197,Just back from a business trip. The Homewood i...
198,I have just returned from a lovely shopping tr...


In [4]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
labels = encoder.fit_transform(train['LABEL'].values)
labels = labels.astype(float)

In [5]:
from sklearn.model_selection import train_test_split

train_reviews, val_reviews, train_labels, val_labels = train_test_split(train['REVIEW'].values, labels, test_size=0.2, random_state=42)
train_reviews = list(train_reviews)
val_reviews = list(val_reviews)

In [6]:
# Tokenization and Padding
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
encoded_train_inputs = tokenizer(train_reviews, padding=True, truncation=True, return_tensors='pt', max_length=256)
encoded_val_inputs = tokenizer(val_reviews, padding=True, truncation=True, return_tensors='pt', max_length=256)

In [7]:
train_labels = torch.tensor(train_labels, dtype=torch.long)
val_labels = torch.tensor(val_labels, dtype=torch.long)

# Create DataLoader for training and testing data
train_dataset = TensorDataset(encoded_train_inputs.input_ids, encoded_train_inputs.attention_mask, torch.tensor(train_labels))
val_dataset = TensorDataset(encoded_val_inputs.input_ids, encoded_val_inputs.attention_mask, torch.tensor(val_labels))

train_data_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
test_data_loader = DataLoader(val_dataset, batch_size=2, shuffle=False)


In [8]:
# Model and Optimizer
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)
optimizer = AdamW(model.parameters(), lr=1e-5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Training
num_epochs = 3  # You might need to adjust this based on your dataset and computational resources
for epoch in range(num_epochs):
    print("Epoch:", epoch)
    for i, batch in enumerate(train_data_loader):
        print("Batch:", i, "/", len(train_data_loader))
        input_ids, attention_mask, label = batch
        outputs = model(input_ids, attention_mask=attention_mask, labels=label)
        loss = outputs.loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

Epoch: 0
Batch: 0 / 560
Batch: 1 / 560
Batch: 2 / 560
Batch: 3 / 560
Batch: 4 / 560
Batch: 5 / 560
Batch: 6 / 560
Batch: 7 / 560
Batch: 8 / 560
Batch: 9 / 560
Batch: 10 / 560
Batch: 11 / 560
Batch: 12 / 560
Batch: 13 / 560
Batch: 14 / 560
Batch: 15 / 560
Batch: 16 / 560
Batch: 17 / 560
Batch: 18 / 560
Batch: 19 / 560
Batch: 20 / 560
Batch: 21 / 560
Batch: 22 / 560
Batch: 23 / 560
Batch: 24 / 560
Batch: 25 / 560
Batch: 26 / 560
Batch: 27 / 560
Batch: 28 / 560
Batch: 29 / 560
Batch: 30 / 560
Batch: 31 / 560
Batch: 32 / 560
Batch: 33 / 560
Batch: 34 / 560
Batch: 35 / 560
Batch: 36 / 560
Batch: 37 / 560
Batch: 38 / 560
Batch: 39 / 560
Batch: 40 / 560
Batch: 41 / 560
Batch: 42 / 560
Batch: 43 / 560
Batch: 44 / 560
Batch: 45 / 560
Batch: 46 / 560
Batch: 47 / 560
Batch: 48 / 560
Batch: 49 / 560
Batch: 50 / 560
Batch: 51 / 560
Batch: 52 / 560
Batch: 53 / 560
Batch: 54 / 560
Batch: 55 / 560
Batch: 56 / 560
Batch: 57 / 560
Batch: 58 / 560
Batch: 59 / 560
Batch: 60 / 560
Batch: 61 / 560
Batch: 62

In [10]:
# Evaluation
correct = 0
total = 0
with torch.no_grad():
    for batch in test_data_loader:
        input_ids, attention_mask, label = batch
        outputs = model(input_ids, attention_mask=attention_mask)
        _, predicted = torch.max(outputs.logits, 1)
        total += label.size(0)
        correct += (predicted == label).sum().item()

accuracy = correct / total
print('Accuracy: {:.2f}%'.format(accuracy * 100))

Accuracy: 66.79%


In [12]:
torch.save(model, './models/bert.pt')