In [16]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from datasets import load_dataset
import torch
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader, Dataset
import warnings
warnings.filterwarnings("ignore")

In [17]:
data = load_dataset("imdb")

train_texts = data["train"]["text"][:5000]
train_labels = data["train"]["label"][:5000]

test_texts = data["test"]["text"][:1000]
test_labels = data["test"]["label"][:1000]

#load tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [18]:
class SentimentalData(Dataset):
    def __init__(self, texts, labels, tokentizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokentizer = tokentizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index]
        encoding = self.tokentizer(
            text,
            max_length = self.max_length,
            padding = "max_length",
            truncation = True,
            return_tensors = "pt"
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype = torch.long)
        }

In [19]:
max_length = 256
batch_size = 16

train_data = SentimentalData(train_texts, train_labels, tokenizer, max_length)
test_data = SentimentalData(test_texts, test_labels, tokenizer, max_length)

train_loader = DataLoader(train_data, batch_size = batch_size, shuffle = True)
test_loader = DataLoader(test_data, batch_size = batch_size, shuffle = True)

In [20]:
#loading the model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = 2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [21]:
#optimizer
optimizer = AdamW(model.parameters(), lr = 5e-5)

In [22]:
#training parameters
epochs = 1
for i in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        outputs = model(input_ids = input_ids, attention_mask=attention_mask, labels = labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {i + 1}/{epochs}, Training Loss: {avg_loss:.4f}")

Epoch 1/1, Training Loss: 0.0093


In [23]:
# Evaluation
model.eval()
predictions, true_labels = [], []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, axis=1)
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

In [24]:
# Accuracy
accuracy = accuracy_score(true_labels, predictions)
print(f"Test Accuracy: {accuracy * 100:.2f}%")


Test Accuracy: 100.00%


Sentimental analysis using Other NLP techniques

In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.datasets import fetch_openml
from datasets import load_dataset

In [46]:
dataset = load_dataset("imdb")

# Extract texts and labels
texts = dataset["train"]["text"]  # Reviews
labels = dataset["train"]["label"] 

train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size = 0.3, random_state = 42
)


In [47]:
#pre-processing
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2), stop_words="english")
train_features = vectorizer.fit_transform(train_texts)
test_features = vectorizer.transform(test_texts)


In [48]:
#Logistic Regression
classifer = LogisticRegression(max_iter=1000)
classifer.fit(train_features, train_labels)

predictions = classifer.predict(test_features)


In [49]:
accracy = accuracy_score(test_labels, predictions)
print("Logistic Regression accuracy: ", accuracy*100)

Logistic Regression accuracy:  100.0


In [50]:
print(classification_report(test_labels, predictions))

              precision    recall  f1-score   support

           0       0.89      0.87      0.88      3752
           1       0.87      0.90      0.88      3748

    accuracy                           0.88      7500
   macro avg       0.88      0.88      0.88      7500
weighted avg       0.88      0.88      0.88      7500

