In [52]:
import pandas as pd
import re
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


In [53]:
# Load the dataset
DATA_PATH = "IMDB_Dataset.csv"
N_REVIEWS = 10000  # size of the working subset taken from the top of the file

# nrows stops parsing after the first N_REVIEWS rows instead of loading the
# whole CSV and slicing it afterwards. The result is also an independent
# frame rather than a slice of a larger one, so the column assignments in
# later cells operate on their own data.
df = pd.read_csv(DATA_PATH, nrows=N_REVIEWS)
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
9995,"Fun, entertaining movie about WWII German spy ...",positive
9996,Give me a break. How can anyone say that this ...,negative
9997,This movie is a bad movie. But after watching ...,negative
9998,This is a movie that was probably made to ente...,negative


In [54]:
# Strip anything that looks like an HTML tag (e.g. "<br />") from every review.
# The pattern is non-greedy, so each "<...>" span is removed on its own.
html_tag = re.compile(r'<.*?>')
df['review'] = df['review'].map(lambda text: html_tag.sub('', text))

In [55]:
# Normalise every review to lower case
df['review'] = df['review'].map(str.lower)

In [56]:

# Display the frame to inspect the 'review' column after the text clean-up steps
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
...,...,...
9995,"fun, entertaining movie about wwii german spy ...",positive
9996,give me a break. how can anyone say that this ...,negative
9997,this movie is a bad movie. but after watching ...,negative
9998,this is a movie that was probably made to ente...,negative


In [57]:
# Map labels to binary
LABEL_TO_ID = {'positive': 1, 'negative': 0}
SPLIT_SEED = 42  # fixed so the train/validation split (and the metrics) are reproducible

# Values that are already 0/1 pass through unchanged, so re-running this cell
# does not turn the whole column into NaN the way a plain dict .map() would.
df['sentiment'] = df['sentiment'].map(lambda label: LABEL_TO_ID.get(label, label))

# Fail loudly on anything that is not a known label instead of training on NaN.
unknown_labels = df.loc[~df['sentiment'].isin(list(LABEL_TO_ID.values())), 'sentiment']
if not unknown_labels.empty:
    raise ValueError(f"Unexpected sentiment labels: {unknown_labels.unique().tolist()}")
df['sentiment'] = df['sentiment'].astype('int64')

# Split the data: 80% for training, 20% held out for validation
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['review'],
    df['sentiment'],
    test_size=0.2,
    random_state=SPLIT_SEED,
)

In [58]:
# Display the frame to confirm 'sentiment' now holds the numeric labels
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production. the filming tec...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically there's a family where a little boy ...,0
4,"petter mattei's ""love in the time of money"" is...",1
...,...,...
9995,"fun, entertaining movie about wwii german spy ...",1
9996,give me a break. how can anyone say that this ...,0
9997,this movie is a bad movie. but after watching ...,0
9998,this is a movie that was probably made to ente...,0


In [59]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Both splits are encoded with the same settings: truncation and padding
# enabled, PyTorch tensors returned.
encode_kwargs = dict(truncation=True, padding=True, return_tensors="pt")
train_encodings = tokenizer(train_texts.tolist(), **encode_kwargs)
val_encodings = tokenizer(val_texts.tolist(), **encode_kwargs)

# Rebind the label Series as tensors so they can sit next to the encodings.
train_labels, val_labels = (
    torch.tensor(train_labels.tolist()),
    torch.tensor(val_labels.tolist()),
)



In [60]:
# Each dataset item is an (input_ids, attention_mask, label) triple.
train_dataset = torch.utils.data.TensorDataset(
    train_encodings['input_ids'],
    train_encodings['attention_mask'],
    train_labels,
)
val_dataset = torch.utils.data.TensorDataset(
    val_encodings['input_ids'],
    val_encodings['attention_mask'],
    val_labels,
)

# Training batches are reshuffled on every pass; validation order stays fixed.
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


In [61]:
# Fine-tune BERT with a 2-class classification head on the training split.
NUM_EPOCHS = 3
LEARNING_RATE = 5e-5

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# transformers.AdamW is deprecated in favour of the PyTorch optimizer.
# eps and weight_decay are passed explicitly to match what transformers.AdamW
# used by default, because torch.optim.AdamW's own defaults differ
# (eps=1e-8, weight_decay=0.01) and would silently change the training setup.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=LEARNING_RATE,
    eps=1e-6,
    weight_decay=0.0,
)

for epoch in range(NUM_EPOCHS):  # loop over the dataset multiple times
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids, attention_mask, labels = batch
        # Passing labels makes the model compute the loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [63]:
# Score the validation split: collect the arg-max class for every example
# alongside its true label, then summarise per-class metrics.
model.eval()
val_preds, val_labels_list = [], []
with torch.no_grad():  # inference only, no gradients needed
    for input_ids, attention_mask, labels in val_loader:
        logits = model(input_ids, attention_mask=attention_mask).logits
        val_preds += logits.argmax(dim=1).tolist()
        val_labels_list += labels.tolist()

# Classification report
report = classification_report(val_labels_list, val_preds, target_names=['Negative', 'Positive'])
print(report)


              precision    recall  f1-score   support

    Negative       0.90      0.95      0.92       955
    Positive       0.95      0.90      0.92      1045

    accuracy                           0.92      2000
   macro avg       0.92      0.92      0.92      2000
weighted avg       0.92      0.92      0.92      2000

