# **Google Drive**

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Installations**

In [None]:
pip install torchmetrics

Collecting torchmetrics
  Downloading torchmetrics-1.6.0-py3-none-any.whl.metadata (20 kB)
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.11.8-py3-none-any.whl.metadata (5.2 kB)
Downloading torchmetrics-1.6.0-py3-none-any.whl (926 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m926.4/926.4 kB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning_utilities-0.11.8-py3-none-any.whl (26 kB)
Installing collected packages: lightning-utilities, torchmetrics
Successfully installed lightning-utilities-0.11.8 torchmetrics-1.6.0


# **Imports**

In [None]:
import kagglehub
import re
import torch
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch.nn as nn
from torchmetrics import Accuracy
from transformers import BertModel
from transformers import BertTokenizer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import os

# **Load Dataset**

In [None]:
# Download the latest version of the Kaggle dataset through kagglehub and
# print the local path that dataset_download returns.
# NOTE(review): in the cells shown here `path` is only printed; the CSV itself
# is read from Google Drive in the next cell — confirm which copy is intended.
path = kagglehub.dataset_download("endofnight17j03/bert-sentiment-analysis")
print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/endofnight17j03/bert-sentiment-analysis?dataset_version_number=1...


100%|██████████| 43.4k/43.4k [00:00<00:00, 31.3MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/endofnight17j03/bert-sentiment-analysis/versions/1





In [None]:
# Load the sentiment CSV from the mounted Drive folder.
# NOTE(review): absolute, Colab-specific path — it only resolves after the Drive mount above.
data = pd.read_csv('/content/drive/MyDrive/colab_projects/NLP/Sentiment Analysis.csv')

In [None]:
def handle_pickle(data=None, filepath="data.pkl", mode="save"):
    """Save an object to, or load an object from, a pickle file.

    Args:
        data: Object to serialise; only used when ``mode == "save"``.
        filepath: Path of the pickle file to write or read.
        mode: ``"save"`` to write ``data``, ``"load"`` to read it back.

    Returns:
        The unpickled object when ``mode == "load"``; ``None`` when saving.

    Raises:
        ValueError: If ``mode`` is neither ``"save"`` nor ``"load"``.
    """
    if mode == "save":
        # A bare filename (such as the default "data.pkl") has an empty
        # dirname and os.makedirs("") raises FileNotFoundError, so only create
        # the parent directory when there actually is one.
        parent_dir = os.path.dirname(filepath)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(filepath, 'wb') as f:
            pickle.dump(data, f)
        return None
    if mode == "load":
        # NOTE: pickle.load can execute arbitrary code; only load files that
        # were produced by this notebook or another trusted source.
        with open(filepath, 'rb') as f:
            return pickle.load(f)
    raise ValueError("Mode should be either 'save' or 'load'")

In [None]:
data

Unnamed: 0,sentence,label
0,Ok brokeback mountain is such a horrible movie.,0
1,Brokeback Mountain was so awesome.,1
2,friday hung out with kelsie and we went and sa...,0
3,I am going to start reading the Harry Potter s...,1
4,"Is it just me, or does Harry Potter suck?...",0
...,...,...
5663,Brokeback Mountain was so awesome.,1
5664,The Da Vinci Code was absolutely AWESOME!,1
5665,"Oh, and Brokeback Mountain was a terrible movie.",0
5666,Combining the opinion / review from Gary and G...,0


In [None]:
data.shape

(5668, 2)

In [None]:
# Draw a fixed-seed subsample so the selected rows (and everything derived
# from them) are the same on every run; cap the size at the frame length so a
# frame with fewer than 1000 rows does not make sample() raise.
data = data.sample(n=min(1000, len(data)), random_state=42)

In [None]:
os.makedirs('saved_data', exist_ok=True)

In [None]:
labels = data['label']

In [None]:
handle_pickle(data=labels, filepath="saved_data/labels.pkl", mode="save")

In [None]:
labels = handle_pickle(filepath="saved_data/labels.pkl", mode="load")

In [None]:
labels

Unnamed: 0,label
5101,0
1091,1
2874,1
2569,0
4149,1
...,...
3138,0
5152,0
2297,1
2923,1


In [None]:
summaries = data['sentence']
with open('Article.txt', 'w', encoding='utf-8') as file:
    for summary in summaries:
        # Write exactly one line per row: a sentence containing a line break
        # would otherwise produce extra lines in the file and shift the
        # positional pairing between the texts read back and the labels.
        file.write(' '.join(summary.splitlines()) + '\n')

def read_text_file(file_path):
    """Return every line of a UTF-8 text file as a list of strings.

    Line terminators are kept at the end of each returned element.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return list(handle)

In [None]:
file_path = 'Article.txt'

Article = read_text_file(file_path)

for i, summary in enumerate(Article[:5]):
    print(f"Summary {i+1}: {summary.strip()}")

Summary 1: Brokeback Mountain is fucking horrible..
Summary 2: the story of Harry Potter is a deep and profound one, and I love Harry Potter.
Summary 3: the story of Harry Potter is a deep and profound one, and I love Harry Potter.
Summary 4: Watch it: 10 Things I Hate About You, The Patirot, A Knights Tale, The Four Feathers, Casanova, Lords of Dogtown, Monsters Ball, or Brokeback Mountain...
Summary 5: Brokeback mountain was beautiful...


# **Preprocessing**

In [None]:
def text_preprocessing(text):
    """Normalise a sentence for tokenisation.

    Lower-cases the text, replaces every run of characters that are not word
    characters, apostrophes or whitespace with a single space, collapses
    repeated spaces and trims leading/trailing whitespace.
    """
    lowered = text.lower()
    without_symbols = re.sub(r"[^\w\d'\s]+", " ", lowered)
    single_spaced = re.sub(' +', ' ', without_symbols)
    return single_spaced.strip()

In [None]:
preprocessed_text = [text_preprocessing(text) for text in Article]

In [None]:
for i, summary in enumerate(preprocessed_text[:5]):
    print(f"Summary {i+1}: {summary.strip()}")

Summary 1: brokeback mountain is fucking horrible
Summary 2: the story of harry potter is a deep and profound one and i love harry potter
Summary 3: the story of harry potter is a deep and profound one and i love harry potter
Summary 4: watch it 10 things i hate about you the patirot a knights tale the four feathers casanova lords of dogtown monsters ball or brokeback mountain
Summary 5: brokeback mountain was beautiful


In [None]:
# Load the tokenizer that belongs to the "bert-base-cased" checkpoint.
# NOTE(review): text_preprocessing lower-cases every sentence before it reaches
# this cased tokenizer — confirm that pairing is intended.
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
preprocessed_text[0]

'brokeback mountain is fucking horrible'

In [None]:
sample_tokens = tokenizer(preprocessed_text[0], padding="max_length",
                         max_length=10, truncation=True,
                         return_tensors="pt")

In [None]:
sample_tokens

{'input_ids': tensor([[ 101, 2795, 4197, 3231, 1110, 8750, 9210,  102,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])}

In [None]:
sample_tokens["input_ids"]

tensor([[ 101, 2795, 4197, 3231, 1110, 8750, 9210,  102,    0,    0]])

In [None]:
sample_tokens["attention_mask"]

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])

In [None]:
tokens = [tokenizer(i, padding="max_length", max_length=20,
                    truncation=True, return_tensors="pt")
         for i in tqdm(preprocessed_text)]

100%|██████████| 1000/1000 [00:00<00:00, 1827.73it/s]


In [None]:
handle_pickle(data=tokens, filepath="saved_data/tokens.pkl", mode="save")

In [None]:
handle_pickle(data=tokens, filepath="saved_data/tokens.pkl", mode="load")[0]

{'input_ids': tensor([[ 101, 2795, 4197, 3231, 1110, 8750, 9210,  102,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}

In [None]:
class BertClassifier(nn.Module):
    """BERT encoder followed by dropout, a sigmoid and a linear classification head.

    Args:
        dropout: Dropout probability applied to the encoder output.
        num_classes: Number of logits produced per input sequence.
    """

    def __init__(self, dropout, num_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-cased')
        # Fine-tune the whole encoder. The attribute autograd reads is
        # `requires_grad`; assigning to any other name only creates an unused
        # attribute on the parameter.
        for param in self.bert.parameters():
            param.requires_grad = True
        self.dropout = nn.Dropout(dropout)
        # Size the head from the encoder config so its weights exist as soon as
        # the model is built (an optimizer can be created before any forward
        # pass). Parameter names stay `fc.weight` / `fc.bias`.
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)
        self.activation = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return the classification logits for a batch of token ids.

        Args:
            input_ids: Token-id tensor passed to the encoder.
            attention_mask: Mask tensor passed to the encoder alongside ``input_ids``.

        Returns:
            The output of the linear head (raw logits, no softmax applied).
        """
        # With return_dict=False the encoder returns a tuple; only its second
        # element (the pooled output, per the transformers docs) is used here.
        _, bert_output = self.bert(input_ids=input_ids,
                                   attention_mask=attention_mask,
                                   return_dict=False)
        return self.fc(self.activation(self.dropout(bert_output)))

# **Inits**

In [None]:
model_path = "saved_data/bert_pre_trained.pth"

In [None]:
num_classes = len(labels.unique())

# **Utils**

In [None]:
class AverageMeter:
    """Running tracker for a scalar: latest value, weighted sum, count and mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Set every tracked statistic back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the running mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

In [None]:
def train(model, train_loader, loss_fn, optimizer, device, model_path, num_classes, epoch, num_epochs=None):
    """Run one training epoch and return the model together with its epoch metrics.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        train_loader: Iterable yielding ``(input_ids, attention_mask, labels)`` batches.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar loss tensor.
        optimizer: Optimizer that updates ``model``'s parameters.
        device: Device the batches and the accuracy metric are moved to.
        model_path: Not used by this function; kept so existing calls keep working.
        num_classes: Number of classes for the accuracy metric.
        epoch: Zero-based epoch index, used only for the progress-bar label.
        num_epochs: Optional total number of epochs, shown in the label when given.

    Returns:
        Tuple ``(model, mean_loss, accuracy)`` for this epoch. Histories are
        left to the caller, which already records one value per epoch.
    """
    model.train()
    loss_train = AverageMeter()
    acc_train = Accuracy(task='MULTICLASS', num_classes=num_classes).to(device)

    # Build the label from arguments only, so the function does not depend on
    # a module-level `num_epochs` being defined.
    description = f"Epoch {epoch + 1}"
    if num_epochs is not None:
        description += f" of {num_epochs}"

    with tqdm(train_loader, unit="batch") as tepoch:
        tepoch.set_description(description)
        for input_ids, attention_mask, batch_labels in tepoch:
            # Move data to device
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            batch_labels = batch_labels.to(device)
            batch_size = batch_labels.size(0)

            # Forward pass. Keep the logits as (batch, classes): a bare
            # squeeze() would also drop the batch axis for a batch of one.
            # (assumes the model emits one row of logits per sample)
            outputs = model(input_ids, attention_mask)
            outputs = outputs.reshape(batch_size, -1)

            # Compute loss and accuracy; weight the running mean by batch size
            loss = loss_fn(outputs, batch_labels)
            loss_train.update(loss.item(), n=batch_size)
            acc_train(outputs, batch_labels.int())

            # Backward pass and optimizer step
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            tepoch.set_postfix(loss=loss_train.avg,
                               accuracy=100. * acc_train.compute().item())

    return model, loss_train.avg, acc_train.compute().item()

In [None]:
from torchmetrics import Accuracy

def validation(model, valid_loader, loss_fn, device, num_classes, epoch):
    """Evaluate ``model`` on ``valid_loader`` without updating its weights.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        valid_loader: Iterable yielding ``(input_ids, attention_mask, labels)`` batches.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar loss tensor.
        device: Device the batches and the accuracy metric are moved to.
        num_classes: Number of classes for the accuracy metric.
        epoch: Not used by this function; kept so existing calls keep working.

    Returns:
        Tuple ``(mean_loss, accuracy)`` over all samples seen.
    """
    model.eval()
    loss_valid = AverageMeter()
    acc_valid = Accuracy(task='MULTICLASS', num_classes=num_classes).to(device)

    with torch.no_grad():
        for input_ids, attention_mask, batch_labels in valid_loader:

            # Move data to device
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            batch_labels = batch_labels.to(device)
            batch_size = batch_labels.size(0)

            # Forward pass. Keep the logits as (batch, classes): a bare
            # squeeze() would also drop the batch axis when the final batch
            # holds a single sample.
            # (assumes the model emits one row of logits per sample)
            outputs = model(input_ids, attention_mask)
            outputs = outputs.reshape(batch_size, -1)

            # Compute loss and accuracy. The last batch may be smaller than
            # the others, so weight each batch mean by its size to get a
            # per-sample average.
            loss = loss_fn(outputs, batch_labels)
            loss_valid.update(loss.item(), n=batch_size)
            acc_valid(outputs, batch_labels.int())

    return loss_valid.avg, acc_valid.compute().item()

In [None]:
class TextDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenised sentences with their labels.

    Args:
        tokens: Sequence of mappings holding ``'input_ids'`` and
            ``'attention_mask'`` tensors; dimension 0 of each tensor is
            squeezed away (presumably the size-1 batch axis added by
            ``return_tensors="pt"`` — confirm against the tokenizer call).
        labels: One label per entry of ``tokens`` (list, pandas Series, ...).

    Each item is the tuple ``(input_ids, attention_mask, label)``.
    """

    def __init__(self, tokens, labels):
        # Ensure tokens contain the 'input_ids' and 'attention_mask'
        self.input_ids = [token['input_ids'].squeeze(0) for token in tokens]
        self.attention_mask = [token['attention_mask'].squeeze(0) for token in tokens]

        # Check if the length of tokens and labels match
        assert len(self.input_ids) == len(labels), f"Mismatch in number of tokens ({len(self.input_ids)}) and labels ({len(labels)})"
        # Copy into a plain list so __getitem__ always indexes by position.
        # Indexing a pandas Series with an int is a lookup by index *label*,
        # which fails (or returns the wrong row) once the Series has been
        # shuffled or split.
        self.labels = list(labels)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Positional lookup into three parallel lists
        return self.input_ids[idx], self.attention_mask[idx], self.labels[idx]

In [None]:
# Reset index for consistency
labels = labels.reset_index(drop=True)

# Tokenise every preprocessed sentence once, padded/truncated to 20 tokens.
# The list order follows preprocessed_text, which is what keeps tokens and
# labels aligned by position.
tokens = [tokenizer(text, padding="max_length", max_length=20, truncation=True, return_tensors="pt") for text in tqdm(preprocessed_text)]

100%|██████████| 1000/1000 [00:00<00:00, 3081.82it/s]
100%|██████████| 1000/1000 [00:00<00:00, 2869.83it/s]


In [None]:
tokens = handle_pickle(data=tokens, filepath="saved_data/tokens.pkl", mode="load")
labels = handle_pickle(data=labels, filepath="saved_data/labels.pkl", mode="load")

In [None]:
# 80/20 train/test split, then 25% of the training part becomes validation
# (0.25 * 0.8 = 0.2), i.e. 60/20/20 overall. A fixed random_state makes the
# three subsets identical on every run.
X_train, X_test, y_train, y_test = train_test_split(tokens, labels, test_size=0.2, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [None]:
train_set = TextDataset(X_train, y_train.tolist())
valid_set = TextDataset(X_valid, y_valid.tolist())
test_set = TextDataset(X_test, y_test.tolist())

In [None]:
train_loader = DataLoader(train_set, batch_size=64, shuffle=True, drop_last=True)
valid_loader = DataLoader(valid_set, batch_size=64)
test_loader = DataLoader(test_set, batch_size=64)

In [None]:
model = BertClassifier(dropout=0.5, num_classes=num_classes)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [None]:
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [None]:
if torch.cuda.is_available():
    model = model.cuda()
    loss_fn = loss_fn.cuda()

# **Training**

In [None]:
loss_train_hist = []
loss_valid_hist = []

acc_train_hist = []
acc_valid_hist = []

best_loss_valid = torch.inf
epoch_counter = 0

In [None]:
model_path = 'saved_data/model.pth'

In [None]:
num_epochs = 20

for epoch in range(num_epochs):
  # Train. Pass the zero-based index of the current epoch (not the epoch
  # total) so the progress bar counts 1..num_epochs.
  model, loss_train, acc_train = train(model,
                                       train_loader,
                                       loss_fn,
                                       optimizer,
                                       device,
                                       model_path,
                                       num_classes,
                                       epoch)
  # Validation
  loss_valid, acc_valid = validation(model,
                                     valid_loader,
                                     loss_fn,
                                     device,
                                     num_classes,
                                     epoch)

  # Record one value per epoch
  loss_train_hist.append(loss_train)
  loss_valid_hist.append(loss_valid)

  acc_train_hist.append(acc_train)
  acc_valid_hist.append(acc_valid)

  # Checkpoint the full model whenever the validation loss improves
  if loss_valid < best_loss_valid:
    torch.save(model, 'saved_data/model.pt')
    best_loss_valid = loss_valid
    print('Model Saved!')

  print(f'Valid: Loss = {loss_valid:.4}, Acc = {acc_valid:.4}')
  print()

  epoch_counter += 1

Epoch 21 of 20: 100%|██████████| 9/9 [00:03<00:00,  2.72batch/s, accuracy=58.5, loss=0.675]


Model Saved!
Valid: Loss = 0.6766, Acc = 0.505



Epoch 21 of 20: 100%|██████████| 9/9 [00:01<00:00,  4.93batch/s, accuracy=58.3, loss=0.648]


Model Saved!
Valid: Loss = 0.6346, Acc = 0.505



Epoch 21 of 20: 100%|██████████| 9/9 [00:01<00:00,  4.94batch/s, accuracy=64.9, loss=0.601]


Model Saved!
Valid: Loss = 0.5847, Acc = 0.62



Epoch 21 of 20: 100%|██████████| 9/9 [00:01<00:00,  4.92batch/s, accuracy=72.4, loss=0.545]


Model Saved!
Valid: Loss = 0.5263, Acc = 0.895



Epoch 21 of 20: 100%|██████████| 9/9 [00:01<00:00,  4.82batch/s, accuracy=86.8, loss=0.487]


Model Saved!
Valid: Loss = 0.4559, Acc = 0.96



Epoch 21 of 20: 100%|██████████| 9/9 [00:01<00:00,  4.86batch/s, accuracy=96.9, loss=0.421]


Model Saved!
Valid: Loss = 0.39, Acc = 0.96



Epoch 21 of 20: 100%|██████████| 9/9 [00:01<00:00,  4.68batch/s, accuracy=99.3, loss=0.359]


Model Saved!
Valid: Loss = 0.3248, Acc = 0.98



Epoch 21 of 20: 100%|██████████| 9/9 [00:01<00:00,  4.88batch/s, accuracy=99.7, loss=0.299]


Model Saved!
Valid: Loss = 0.2766, Acc = 0.98



Epoch 21 of 20: 100%|██████████| 9/9 [00:01<00:00,  4.52batch/s, accuracy=99.8, loss=0.246]


Model Saved!
Valid: Loss = 0.2448, Acc = 0.98



Epoch 21 of 20: 100%|██████████| 9/9 [00:01<00:00,  4.69batch/s, accuracy=99.8, loss=0.208]


Model Saved!
Valid: Loss = 0.2269, Acc = 0.975



Epoch 21 of 20: 100%|██████████| 9/9 [00:01<00:00,  4.86batch/s, accuracy=100, loss=0.166]


Model Saved!
Valid: Loss = 0.2152, Acc = 0.97



Epoch 21 of 20: 100%|██████████| 9/9 [00:01<00:00,  4.85batch/s, accuracy=100, loss=0.135]


Model Saved!
Valid: Loss = 0.1978, Acc = 0.975



Epoch 21 of 20: 100%|██████████| 9/9 [00:01<00:00,  4.64batch/s, accuracy=100, loss=0.116]


Model Saved!
Valid: Loss = 0.1914, Acc = 0.975



Epoch 21 of 20: 100%|██████████| 9/9 [00:01<00:00,  4.63batch/s, accuracy=100, loss=0.102]


Model Saved!
Valid: Loss = 0.189, Acc = 0.975



Epoch 21 of 20: 100%|██████████| 9/9 [00:01<00:00,  4.78batch/s, accuracy=100, loss=0.0917]


Model Saved!
Valid: Loss = 0.1873, Acc = 0.975



Epoch 21 of 20: 100%|██████████| 9/9 [00:01<00:00,  4.78batch/s, accuracy=100, loss=0.0825]


Model Saved!
Valid: Loss = 0.1862, Acc = 0.975



Epoch 21 of 20: 100%|██████████| 9/9 [00:01<00:00,  4.79batch/s, accuracy=100, loss=0.0761]


Valid: Loss = 0.1877, Acc = 0.975



Epoch 21 of 20: 100%|██████████| 9/9 [00:01<00:00,  4.78batch/s, accuracy=100, loss=0.0713]


Valid: Loss = 0.1883, Acc = 0.975



Epoch 21 of 20: 100%|██████████| 9/9 [00:01<00:00,  4.76batch/s, accuracy=100, loss=0.0657]


Valid: Loss = 0.1898, Acc = 0.975



Epoch 21 of 20: 100%|██████████| 9/9 [00:01<00:00,  4.74batch/s, accuracy=100, loss=0.0629]


Valid: Loss = 0.1914, Acc = 0.975



# **Testing**

In [None]:
torch.save(model.state_dict(), 'saved_data/model.pth')

In [None]:
input_text = '''This movie is receiving undeserved review bombs from the cesspool of 1) an entitled, paranoid fan base who truly believe this film was an intentional act of subversion by Hollywood to undermine the virtues of the Batman franchise, 2) popcorn enthusiasts without a single artistic, or empathetic bone in their body who desire nothing more than gratuitous violence that is formulaic and devoid of anything meaningful to say, and 3) critics of any form of cinematic art that dares deviate from this apparent passionless palette that requires zero intellectual engagement and mere surface level considerations.

Joker: Folie á Deux explores the complexity and depth of Joker and Quinn's characters in a way that humanizes them and shines a spotlight on the damaging repercussions of institutional stigmatization of untreated childhood trauma. The musical numbers inject a rawness to the fantasy that Joker and Quinn are living and are incredibly apt in choice for adding more nuance and connotation to the scenes. I get that it was unexpected heading into it but rather than reject it outright, consider what each song might be trying to say about the dynamic experience! I suppose that is asking a bit much of the expected audience and the reviews reflect it.

In summary, don't listen to the naysayers who have the patience of a firecracker and the artistic discernment of a fascist. Go support this film and really dive into the depth it is conveying about the fragility of the human psyche yet the power of agency and redemption. Listen to the writers' subtle messages throughout, notice the tie ins between scenes, and reflect on the social commentary. There is much to be engaged with here if you just check your woefully banal expectations at the concession stands. '''

In [None]:
# Clean the review with the same function that prepared the training
# sentences, so the model sees identically normalised text at inference time
# (the training text keeps digits, so they are kept here as well).
input_text = text_preprocessing(input_text)

In [None]:
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [None]:
tokens = tokenizer(input_text, padding="max_length",
                 max_length=20, truncation=True,
                 return_tensors="pt")

In [None]:
input_ids = tokens["input_ids"]
attention_mask = tokens["attention_mask"]

In [None]:
device = torch.device("cuda" if torch.cuda.is_available()
                     else "cpu")

In [None]:
input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)

In [None]:
input_ids = torch.squeeze(input_ids, 1)

In [None]:
num_classes = len(labels.unique())

In [None]:
# Create model object
model = BertClassifier(0.5, num_classes)

# Load trained weights; map_location lets a checkpoint written on a GPU
# runtime load on whatever device is available now.
model.load_state_dict(torch.load(model_path, map_location=device, weights_only=True))

# Move the model to the selected device and switch to inference mode:
# eval() turns dropout off, so the prediction is deterministic.
model = model.to(device)
model.eval()

# Forward pass without gradient tracking.
# squeeze drops the size-1 batch axis of the single-example output.
with torch.no_grad():
    out = torch.squeeze(model(input_ids, attention_mask))

predicted_class_idx = torch.argmax(out, dim=-1)

# The model was trained directly on the integer values of the `label` column,
# so the arg-max index is itself the predicted label. It must not be used as a
# row position into `data`.
prediction = predicted_class_idx.item()
print(f"Predicted Class: {prediction}")

Predicted Class: 1
