In [1]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from torch.optim import AdamW
import pandas as pd
import numpy as np

In [2]:
# Location of the IMDB reviews CSV. Set the IMDB_CSV_PATH environment variable
# to run the notebook on another machine; the default is the original path.
DATA_PATH = os.environ.get(
    "IMDB_CSV_PATH",
    "/home/mufseera/Bert_for_classification/IMDB Dataset.csv",
)
df = pd.read_csv(DATA_PATH)

In [3]:
# Show the (rows, columns) size of the loaded frame.
df.shape

(50000, 2)

In [4]:
# Count the rows per sentiment value to check how balanced the classes are.
df["sentiment"].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [5]:
# Draw 1000 reviews from each class with a fixed seed, so the working subset is
# balanced and the same rows are picked on every run.
positive_mask = df["sentiment"] == "positive"
negative_mask = df["sentiment"] == "negative"
df_positive = df.loc[positive_mask].sample(n=1000, random_state=42).reset_index(drop=True)
df_negative = df.loc[negative_mask].sample(n=1000, random_state=42).reset_index(drop=True)

In [6]:
# Stack the two class samples, shuffle every row (frac=1) with a fixed seed,
# then renumber the index from zero.
sampled_df = (
    pd.concat([df_positive, df_negative])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [7]:
# Count missing values per column of the sampled frame.
sampled_df.isna().sum()

review       0
sentiment    0
dtype: int64

In [8]:
# Pull the review text and the sentiment column out as plain Python lists.
texts, labels = (sampled_df[column].tolist() for column in ("review", "sentiment"))

In [9]:
# Encode sentiment as integers: 1 for "positive", 0 for anything else.
# The values are read from sampled_df rather than from `labels` itself so that
# re-running this cell is idempotent; re-mapping the already-encoded integers
# would silently turn every label into 0.
labels = [1 if sentiment_name == "positive" else 0 for sentiment_name in sampled_df["sentiment"]]

In [10]:
class TextClassificationDataset(Dataset):
    """Dataset that pairs raw texts with labels and tokenizes each text on access."""

    def __init__(self, texts, labels, tokenizer, max_length):
        """Store the samples, the tokenizer and the fixed sequence length.

        Args:
            texts: indexable collection of input texts.
            labels: indexable collection of labels, aligned with ``texts``.
            tokenizer: callable invoked as
                ``tokenizer(text, return_tensors=..., max_length=..., padding=..., truncation=...)``
                whose result is indexed by ``"input_ids"`` and ``"attention_mask"``.
            max_length: value forwarded to the tokenizer as ``max_length``.
        """
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples, taken from the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx``.

        Returns:
            dict with the flattened ``"input_ids"`` and ``"attention_mask"``
            tensors and the label wrapped in a tensor under ``"label"``.
        """
        sample_text, sample_label = self.texts[idx], self.labels[idx]
        tokenized = self.tokenizer(
            sample_text,
            return_tensors="pt",
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
        )
        # The tokenizer output is flattened so each field is a 1-D tensor per sample.
        item = {field: tokenized[field].flatten() for field in ("input_ids", "attention_mask")}
        item["label"] = torch.tensor(sample_label)
        return item

In [11]:
# Hold out 15% of the samples for validation; the fixed seed keeps the split reproducible.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.15,
    random_state=42,
)

In [15]:
# Model / data settings
bert_model_name = "bert-base-uncased"  # pretrained checkpoint to load
num_classes = 2                        # width of the classifier output
max_length = 128                       # sequence length passed to the tokenizer

# Optimisation settings
batch_size = 16
learning_rate = 2e-5
num_epochs = 1

In [16]:

# Load the tokenizer that belongs to the checkpoint named by bert_model_name.
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

In [17]:
# Wrap each split in the tokenizing dataset. Note that `test_dataset` holds the
# validation split produced by train_test_split.
train_dataset = TextClassificationDataset(
    texts=train_texts,
    labels=train_labels,
    tokenizer=tokenizer,
    max_length=max_length,
)
test_dataset = TextClassificationDataset(
    texts=val_texts,
    labels=val_labels,
    tokenizer=tokenizer,
    max_length=max_length,
)


In [18]:
# Training batches are reshuffled; evaluation batches keep the dataset order.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)


In [19]:
class BertClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear layer on the pooled output."""

    def __init__(self, bert_model_name, num_classes):
        """Load the pretrained encoder and build the classification head.

        Args:
            bert_model_name: checkpoint name passed to ``BertModel.from_pretrained``.
            num_classes: number of output units of the final linear layer.
        """
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(p=0.1)
        # The head's input width follows the encoder's configured hidden size.
        encoder_width = self.bert.config.hidden_size
        self.fc = nn.Linear(encoder_width, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the unnormalised class scores (logits) for the given inputs."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.fc(self.dropout(encoded.pooler_output))

In [20]:
# Prefer the GPU when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Build the classifier and move its parameters to the chosen device.
model = BertClassifier(bert_model_name, num_classes)
model = model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:
# The scheduler is stepped once per batch, so the schedule length is
# (batches per epoch) * (number of epochs).
total_steps = num_epochs * len(train_dataloader)
optimizer = AdamW(model.parameters(), lr=learning_rate)
# Linear schedule configured with zero warmup steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [22]:
def train(model, data_loader, optimizer, scheduler, device):
    """Run one training pass over ``data_loader``.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``
            and returning class logits.
        data_loader: iterable of batches; each batch is a mapping with the
            tensor entries ``"input_ids"``, ``"attention_mask"`` and ``"label"``.
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler stepped once per batch, after the optimizer.
        device: device the batch tensors are moved to before the forward pass.

    Returns:
        float: mean cross-entropy loss over the batches processed, or NaN when
        ``data_loader`` yields no batches.
    """
    model.train()
    # Loop-invariant: build the criterion once instead of once per batch.
    criterion = nn.CrossEntropyLoss()
    loss_sum = 0.0
    batch_count = 0
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        # Accumulate as a Python float so no autograd graph is kept alive.
        loss_sum += loss.item()
        batch_count += 1
    if batch_count == 0:
        return float("nan")
    return loss_sum / batch_count

In [23]:
def evaluate(model, data_loader, device):
    """Score ``model`` on every batch of ``data_loader`` without tracking gradients.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``
            and returning class logits.
        data_loader: iterable of batches with the tensor entries
            ``"input_ids"``, ``"attention_mask"`` and ``"label"``.
        device: device the batch tensors are moved to.

    Returns:
        tuple: ``(accuracy, report)`` as produced by ``accuracy_score`` and
        ``classification_report`` on the collected labels and predictions.
    """
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for batch in data_loader:
            ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            targets = batch["label"].to(device)
            logits = model(input_ids=ids, attention_mask=mask)
            # The predicted class is the index of the largest logit in each row.
            best_index = torch.max(logits, dim=1).indices
            y_pred.extend(best_index.cpu().tolist())
            y_true.extend(targets.cpu().tolist())
    accuracy = accuracy_score(y_true, y_pred)
    report = classification_report(y_true, y_pred)
    return accuracy, report


In [24]:
# Train for num_epochs passes and score the held-out split after each pass.
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(model, train_dataloader, optimizer, scheduler, device)
    accuracy, report = evaluate(model, test_dataloader, device)
    # Keep a separator between the label and the value so the line is readable.
    print(f"validation accuracy: {accuracy}")
    print(report)

Epoch 1/1
validation accuracy0.86
              precision    recall  f1-score   support

           0       0.86      0.86      0.86       147
           1       0.86      0.86      0.86       153

    accuracy                           0.86       300
   macro avg       0.86      0.86      0.86       300
weighted avg       0.86      0.86      0.86       300



In [29]:
def predict_sentiment(text, model, tokenizer, device, max_length=128):
    """Classify a single text as ``"positive"`` or ``"negative"``.

    Args:
        text: the text to classify.
        model: module called as ``model(input_ids=..., attention_mask=...)``
            and returning class logits.
        tokenizer: callable invoked with the same keyword arguments used during
            training; its result is indexed by ``"input_ids"`` and ``"attention_mask"``.
        device: device the encoded inputs are moved to; pass the device the
            model was moved to.
        max_length: sequence length forwarded to the tokenizer (default 128).

    Returns:
        str: ``"positive"`` when the highest-scoring class index is 1,
        otherwise ``"negative"``.
    """
    model.eval()
    encoding = tokenizer(
        text,
        return_tensors="pt",
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )
    # The tokenizer output must be moved to the model's device; without this
    # the forward pass fails whenever the model lives on the GPU.
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, pred = torch.max(outputs, dim=1)
    return "positive" if pred.item() == 1 else "negative"

In [30]:
# Persist only the model's state_dict (not the whole module object) to a file
# given by a relative path.
# NOTE(review): the filename is spelled "bert_clasifier.pth"; it is kept as-is
# because any code loading the checkpoint must use the same name — confirm before renaming.
torch.save(model.state_dict(),"bert_clasifier.pth")

In [31]:
# Smoke-test the model on one hand-written review.
test_text = "The movie was great and I really enjoyed the performances of the actors."
sentiment = predict_sentiment(
    text=test_text,
    model=model,
    tokenizer=tokenizer,
    device=device,
)

In [32]:
# Display the label predicted for test_text.
sentiment

'positive'