In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd

In [None]:
df=pd.read_csv('amazone_labeled_data.csv')

In [None]:
# df=df[0:500]

In [None]:
df.head()

Unnamed: 0,Rating,comments,Review_content,Sentiment,Sentiment_Label
0,3,I Like It!!,"Call me strange, but I actually liked this mov...",negative,-1
1,4,I love Albert King,"Very good, stage directions should have been e...",positive,1
2,4,U-boat basics,If you are looking for a good overview of WWII...,positive,1
3,1,Incomplete,Help me out. If anyone has figured out how to ...,positive,1
4,2,Not as titled,Regrettably seventy five percent of the book i...,neutral,0


In [None]:
df['Sentiment'].value_counts()

positive    72734
negative    20262
neutral      7004
Name: Sentiment, dtype: int64

In [None]:
# Create balanced DataFrame
df = pd.concat([
    df[df['Sentiment'] == 'positive'].sample(n=7004, replace=True),
    df[df['Sentiment'] == 'negative'].sample(n=7004, replace=True),
    df[df['Sentiment'] == 'neutral']
])

In [None]:
df = df.sample(frac=1).reset_index(drop=True)

In [None]:
df['Sentiment'].value_counts()

neutral     7004
negative    7004
positive    7004
Name: Sentiment, dtype: int64

In [None]:
# Mapping labels to class indices
label_map = {-1: 0, 0: 1, 1: 2}


In [None]:
# Split the dataset
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
# Load the pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenize and preprocess the text
def preprocess_text(text):
    return tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,  # Adjust as needed
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )


In [None]:
# Apply tokenization to the training and testing data
train_texts = train_df['Review_content'].apply(preprocess_text)
test_texts = test_df['Review_content'].apply(preprocess_text)


In [None]:
# Convert the tokenized data to PyTorch tensors
train_inputs = torch.cat([t['input_ids'] for t in train_texts], dim=0)
train_masks = torch.cat([t['attention_mask'] for t in train_texts], dim=0)
train_labels = torch.tensor([label_map[label] for label in train_df['Sentiment_Label'].values])


In [None]:
test_inputs = torch.cat([t['input_ids'] for t in test_texts], dim=0)
test_masks = torch.cat([t['attention_mask'] for t in test_texts], dim=0)
test_labels = torch.tensor([label_map[label] for label in test_df['Sentiment_Label'].values])


In [None]:
# Create DataLoader for training and testing data
train_dataset = TensorDataset(train_inputs, train_masks, train_labels)
test_dataset = TensorDataset(test_inputs, test_masks, test_labels)


In [None]:
batch_size = 32 # Adjust as needed
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [None]:
# Set up the optimizer and loss function
optimizer = AdamW(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()





In [None]:
# Fine-tune the BERT model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
num_epochs = 5  # Adjust as needed
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    model.train()
    for batch_num, batch in enumerate(train_dataloader, 1):
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch + 1}, Batch {batch_num}/{len(train_dataloader)}, Loss: {loss.item()}")

print("Training completed.")


Epoch 1/5
Epoch 1, Batch 1/526, Loss: 1.1394665241241455
Epoch 1, Batch 2/526, Loss: 1.1355878114700317
Epoch 1, Batch 3/526, Loss: 1.1015625
Epoch 1, Batch 4/526, Loss: 1.0769981145858765
Epoch 1, Batch 5/526, Loss: 1.1334941387176514
Epoch 1, Batch 6/526, Loss: 1.1178030967712402
Epoch 1, Batch 7/526, Loss: 1.1379609107971191
Epoch 1, Batch 8/526, Loss: 1.0931389331817627
Epoch 1, Batch 9/526, Loss: 1.159854769706726
Epoch 1, Batch 10/526, Loss: 1.0638673305511475
Epoch 1, Batch 11/526, Loss: 1.061073899269104
Epoch 1, Batch 12/526, Loss: 1.1117125749588013
Epoch 1, Batch 13/526, Loss: 1.0546767711639404
Epoch 1, Batch 14/526, Loss: 1.1179567575454712
Epoch 1, Batch 15/526, Loss: 1.1937580108642578
Epoch 1, Batch 16/526, Loss: 1.101919412612915
Epoch 1, Batch 17/526, Loss: 1.0662540197372437
Epoch 1, Batch 18/526, Loss: 1.1539125442504883
Epoch 1, Batch 19/526, Loss: 1.0587520599365234
Epoch 1, Batch 20/526, Loss: 1.0946452617645264
Epoch 1, Batch 21/526, Loss: 1.1101391315460205
Epo

In [None]:
# Evaluate the model on the test set
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_dataloader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

In [None]:
model.save_pretrained('./fine-tuned-sentiment-model')

In [None]:
# Calculate evaluation metrics
accuracy = accuracy_score(all_labels, all_preds)
classification_rep = classification_report(all_labels, all_preds)
conf_matrix = confusion_matrix(all_labels, all_preds)

In [None]:
print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(classification_rep)
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.7899119676421603
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.75      0.80      1454
           1       0.70      0.72      0.71      1352
           2       0.82      0.89      0.85      1397

    accuracy                           0.79      4203
   macro avg       0.79      0.79      0.79      4203
weighted avg       0.79      0.79      0.79      4203

Confusion Matrix:
[[1094  303   57]
 [ 153  979  220]
 [  32  118 1247]]


* (1, 1): 1094 instances were correctly classified as class 1 (positive).
* (1, 2): 303 instances of class 1 were misclassified as class 2 (false negatives for class 1, false positives for class 2).
* (1, 3): 57 instances of class 1 were misclassified as class 3.
* (2, 1): 153 instances of class 2 were misclassified as class 1 (false negatives for class 2, false positives for class 1).
* (2, 2): 979 instances were correctly classified as class 2.
* (2, 3): 220 instances of class 2 were misclassified as class 3 (false negatives for class 2, false positives for class 3).
* (3, 1): 32 instances of class 3 were misclassified as class 1 (false negatives for class 3, false positives for class 1).
* (3, 2): 118 instances of class 3 were misclassified as class 2 (false negatives for class 3, false positives for class 2).
* (3, 3): 1247 instances were correctly classified as class 3.


In [None]:
param_grid = {
    'learning_rate': [1e-5, 2e-5, 3e-5, 5e-5],
    'batch_size': [4, 8, 16,32]
}
