In [12]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
import os

In [3]:
import pandas as pd
# Load the CSV files: read train.csv and test.csv (in that order) from the
# working directory, decoding both as ISO-8859-1.
train_df, test_df = (
    pd.read_csv(csv_path, encoding='ISO-8859-1')
    for csv_path in ('train.csv', 'test.csv')
)


In [4]:
# Preview the training frame exactly as loaded from train.csv.
train_df

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26
...,...,...,...,...,...,...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,night,31-45,Ghana,31072940,227540.0,137
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative,morning,46-60,Greece,10423054,128900.0,81
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,noon,60-70,Grenada,112523,340.0,331
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive,night,70-100,Guatemala,17915568,107160.0,167


In [5]:
# Preview the test frame exactly as loaded from test.csv.
test_df

Unnamed: 0,textID,text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,morning,0-20,Afghanistan,38928346.0,652860.0,60.0
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,noon,21-30,Albania,2877797.0,27400.0,105.0
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,night,31-45,Algeria,43851044.0,2381740.0,18.0
3,01082688c6,happy bday!,positive,morning,46-60,Andorra,77265.0,470.0,164.0
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,noon,60-70,Angola,32866272.0,1246700.0,26.0
...,...,...,...,...,...,...,...,...,...
4810,,,,,,,,,
4811,,,,,,,,,
4812,,,,,,,,,
4813,,,,,,,,,


In [6]:
# Discard every row that lacks either the tweet text or its sentiment; other
# columns are allowed to be missing.
train_df, test_df = (
    frame.dropna(subset=['text', 'sentiment'])
    for frame in (train_df, test_df)
)

In [7]:
# Row counts of the train and test frames after dropping rows with missing text/sentiment.
len(train_df), len(test_df)

(27480, 3534)

In [8]:
import pandas as pd

# Mapping sentiments to labels using enumerate: negative=0, neutral=1, positive=2
sentiment_to_label = {sentiment: label for label, sentiment in enumerate(['negative', 'neutral', 'positive'])}


def _encode_sentiment(frame, mapping):
    """Return a copy of ``frame`` reduced to text/sentiment plus an integer ``label``.

    Args:
        frame: DataFrame with at least the columns ``text`` and ``sentiment``.
        mapping: dict from sentiment string to integer class id.

    Returns:
        A new DataFrame with columns ``text``, ``sentiment`` and ``label`` (int64).

    Raises:
        ValueError: if any ``sentiment`` value has no entry in ``mapping``.
            Without this check ``Series.map`` would silently produce NaN and
            turn ``label`` into a float column.
    """
    encoded = frame[['text', 'sentiment']].copy()
    labels = encoded['sentiment'].map(mapping)

    unmapped = labels.isna()
    if unmapped.any():
        unknown = sorted(encoded.loc[unmapped, 'sentiment'].astype(str).unique())
        raise ValueError(f"Sentiment values without a label mapping: {unknown}")

    encoded['label'] = labels.astype('int64')
    return encoded


# Keep only 'text' and 'sentiment', then add 'label'
train_df = _encode_sentiment(train_df, sentiment_to_label)
test_df = _encode_sentiment(test_df, sentiment_to_label)


In [9]:
# Inspect the training frame after reducing it to text / sentiment / label.
train_df

Unnamed: 0,text,sentiment,label
0,"I`d have responded, if I were going",neutral,1
1,Sooo SAD I will miss you here in San Diego!!!,negative,0
2,my boss is bullying me...,negative,0
3,what interview! leave me alone,negative,0
4,"Sons of ****, why couldn`t they put them on t...",negative,0
...,...,...,...
27476,wish we could come see u on Denver husband l...,negative,0
27477,I`ve wondered about rake to. The client has ...,negative,0
27478,Yay good for both of you. Enjoy the break - y...,positive,2
27479,But it was worth it ****.,positive,2


In [10]:
# Inspect the test frame after reducing it to text / sentiment / label.
test_df

Unnamed: 0,text,sentiment,label
0,Last session of the day http://twitpic.com/67ezh,neutral,1
1,Shanghai is also really exciting (precisely -...,positive,2
2,"Recession hit Veronique Branquinho, she has to...",negative,0
3,happy bday!,positive,2
4,http://twitpic.com/4w75p - I like it!!,positive,2
...,...,...,...
3529,"its at 3 am, im very tired but i can`t sleep ...",negative,0
3530,All alone in this old house again. Thanks for...,positive,2
3531,I know what you mean. My little dog is sinkin...,negative,0
3532,_sutra what is your next youtube video gonna b...,positive,2


In [13]:
# Carve a 20% validation split out of the training frame, stratified on the
# integer `label` column and seeded for a repeatable split.
# NOTE: this rebinds `train_df` to the remaining 80%, so re-running this cell
# without re-running the cells above would split the already-reduced frame again.
train_df, val_df = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
    stratify=train_df['label'],
)

In [14]:
class customDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: sequence of raw texts (list, pandas Series, array, ...).
        labels: sequence of integer class ids, aligned with ``texts``.
        tokenizer: callable invoked as
            ``tokenizer(text, truncation=True, padding='max_length',
            max_length=..., return_tensors='pt')`` and expected to return a
            mapping with ``input_ids`` and ``attention_mask`` tensors.
        max_length: length every example is padded / truncated to.

    Raises:
        ValueError: if ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Materialise as plain lists so that __getitem__ always indexes by
        # POSITION. A pandas Series indexes by label, which breaks (KeyError or
        # wrong row) once rows were dropped or shuffled and the index is no
        # longer 0..n-1.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized example at position ``idx``.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors and a
            scalar ``label`` tensor of dtype ``torch.long``.
        """
        # str() guards against non-string cells (e.g. numbers read from CSV).
        text = str(self.texts[idx])
        label = int(self.labels[idx])

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        # The tokenizer is called on a single text, so its tensors carry a
        # leading batch dimension of 1; drop it so the DataLoader can stack items.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [15]:
# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [16]:
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def _frame_to_dataset(frame):
    """Wrap a frame's `text` and `label` columns in a customDataset.

    Uses the notebook-level `tokenizer` and customDataset's default max_length.
    """
    return customDataset(
        texts=frame["text"].tolist(),
        labels=frame["label"].tolist(),
        tokenizer=tokenizer,
    )


# Create datasets (train, validation, test — built in that order)
train_dataset, val_dataset, test_dataset = map(
    _frame_to_dataset, (train_df, val_df, test_df)
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [17]:
# One loader per split, all with batch size 40 and pinned host memory.
# Only the training loader shuffles; validation and test keep dataset order.
train_loader, val_loader, test_loader = (
    DataLoader(split_dataset, batch_size=40, shuffle=reshuffle, pin_memory=True)
    for split_dataset, reshuffle in (
        (train_dataset, True),
        (val_dataset, False),
        (test_dataset, False),
    )
)

In [18]:
# Load model: pretrained encoder plus a 3-way classification head
# (one output per sentiment class).
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
    problem_type="single_label_classification"
)
model.to(device)

# Set hyperparameters
epochs = 5
learning_rate = 3e-5
weight_decay = 0.01

# Initialize optimizer.
# Use the `weight_decay` hyperparameter defined above instead of repeating the
# literal, so that editing the hyperparameter actually changes the optimizer.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# One scheduler step is taken per training batch, so the schedule spans
# (batches per epoch) * epochs steps.
total_steps = len(train_loader) * epochs
# Warm-up lasts 10% of ONE epoch's batches (i.e. 10% / epochs of total_steps),
# not 10% of the whole run.
warmup_steps = len(train_loader) // 10

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
def train_epoch(model, data_loader, optimizer, scheduler, device):
    """Run one optimisation pass over ``data_loader`` and return the mean batch loss.

    Each batch is a mapping with ``input_ids``, ``attention_mask`` and ``label``
    tensors. For every batch the gradients are reset, the model is called with
    the labels so that it returns ``.loss``, gradients are clipped to a global
    norm of 1.0, and both the optimizer and the scheduler take one step.
    A progress line is printed roughly every 20% of the epoch.

    Returns:
        Sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    num_batches = len(data_loader)
    report_every = max(1, num_batches // 5)  # about five progress lines per epoch
    loss_sum = 0.0

    for step, batch in enumerate(data_loader, start=1):
        optimizer.zero_grad()

        # Forward pass; tensors are moved to the target device as they are passed in.
        outputs = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            labels=batch['label'].to(device),
        )
        batch_loss = outputs.loss.item()
        loss_sum += batch_loss

        # Backward pass, with gradient clipping to prevent exploding gradients.
        outputs.loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        scheduler.step()

        if step % report_every == 0:
            print(f"Batch {step}/{num_batches} | Loss: {batch_loss:.4f}")

    return loss_sum / num_batches

def evaluate_model(model, data_loader, device, num_labels=None):
    """Evaluate ``model`` on ``data_loader`` without updating any weights.

    Args:
        model: callable as ``model(input_ids=..., attention_mask=..., labels=...)``
            returning an object with ``.loss`` and ``.logits``.
        data_loader: iterable of batches with ``input_ids``, ``attention_mask``
            and ``label`` tensors; must support ``len()``.
        device: device the batch tensors are moved to.
        num_labels: number of classes. When omitted it is read from
            ``model.config.num_labels`` if that attribute exists. When known,
            the per-class metric arrays always have exactly ``num_labels``
            entries, in class-id order. When unknown, only the classes that
            occur in the labels or predictions are reported (the previous
            behaviour).

    Returns:
        Tuple ``(avg_loss, accuracy, precision, recall, f1, all_labels,
        all_predictions)`` where ``avg_loss`` is the mean of the per-batch
        losses, ``precision`` / ``recall`` / ``f1`` are per-class arrays, and
        the last two items are flat lists of true and predicted class ids.
    """
    model.eval()
    total_loss = 0.0
    all_labels = []
    all_predictions = []

    with torch.no_grad():
        for batch in data_loader:
            # Move batch to device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            # Forward pass (labels are passed so the model also returns the loss)
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            total_loss += outputs.loss.item()

            # Predicted class = index of the largest logit
            preds = outputs.logits.argmax(dim=1)

            # Store predictions and labels
            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(preds.cpu().numpy())

    avg_loss = total_loss / len(data_loader)

    # Calculate metrics
    accuracy = np.mean(np.array(all_predictions) == np.array(all_labels))

    # Pin the class ids so a class that is absent from both the targets and the
    # predictions still gets an entry. Otherwise the returned arrays shrink and
    # position i no longer corresponds to class i for callers that index them
    # by class id.
    if num_labels is None:
        num_labels = getattr(getattr(model, 'config', None), 'num_labels', None)
    class_ids = list(range(num_labels)) if num_labels is not None else None

    # Calculate precision, recall, f1-score per class
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_predictions, labels=class_ids, average=None, zero_division=0
    )

    return avg_loss, accuracy, precision, recall, f1, all_labels, all_predictions

In [21]:
label_names = ['negative', 'neutral', 'positive']

# Scalar-per-epoch curves: one flat list each.
history = {
    curve: []
    for curve in ('train_loss', 'val_loss', 'test_loss', 'val_accuracy', 'test_accuracy')
}

# Per-class validation metrics: one independent list per class name.
history.update({
    metric: {class_name: [] for class_name in label_names}
    for metric in ('val_precision', 'val_recall', 'val_f1')
})

# Add GPU memory monitoring
if torch.cuda.is_available():
    history['gpu_memory_usage'] = []


In [22]:
for epoch in range(epochs):
    print(f"\nEpoch {epoch+1}/{epochs}")
    print("-" * 50)

    # --- training pass -------------------------------------------------
    train_loss = train_epoch(model, train_loader, optimizer, scheduler, device)
    history['train_loss'].append(train_loss)

    # Record the memory currently allocated on GPU 0 (in GB) when CUDA is present.
    if torch.cuda.is_available():
        memory_allocated = torch.cuda.memory_allocated(0) / 1e9
        history['gpu_memory_usage'].append(memory_allocated)
        print(f"GPU Memory Usage: {memory_allocated:.2f} GB")

    # --- validation pass -----------------------------------------------
    val_loss, val_accuracy, val_precision, val_recall, val_f1, _, _ = evaluate_model(
        model, val_loader, device
    )
    history['val_loss'].append(val_loss)
    history['val_accuracy'].append(val_accuracy)

    # --- test pass (logged per epoch; only loss and accuracy are kept) ---
    test_loss, test_accuracy, _, _, _, _, _ = evaluate_model(model, test_loader, device)
    history['test_loss'].append(test_loss)
    history['test_accuracy'].append(test_accuracy)

    # Per-class validation metrics. zip() stops at the shortest input, so class
    # names beyond the number of returned metric entries are skipped.
    for label, class_precision, class_recall, class_f1 in zip(
        label_names, val_precision, val_recall, val_f1
    ):
        history['val_precision'][label].append(class_precision)
        history['val_recall'][label].append(class_recall)
        history['val_f1'][label].append(class_f1)

    # --- epoch summary -------------------------------------------------
    print(f"Training Loss: {train_loss:.4f}")
    print(f"Validation Loss: {val_loss:.4f} | Validation Accuracy: {val_accuracy:.4f}")
    print(f"Test Loss: {test_loss:.4f} | Test Accuracy: {test_accuracy:.4f}")

    print("\nPer-class Validation Metrics:")
    print("-" * 50)
    print(f"{'Class':<15} {'Precision':<10} {'Recall':<10} {'F1-Score':<10}")
    print("-" * 50)
    for label, class_precision, class_recall, class_f1 in zip(
        label_names, val_precision, val_recall, val_f1
    ):
        print(f"{label:<15} {class_precision:.4f}{' ':6} {class_recall:.4f}{' ':6} {class_f1:.4f}")
    print("-" * 50)


Epoch 1/5
--------------------------------------------------
Batch 110/550 | Loss: 0.7213
Batch 220/550 | Loss: 0.4257
Batch 330/550 | Loss: 0.6635
Batch 440/550 | Loss: 0.3573
Batch 550/550 | Loss: 0.5038
GPU Memory Usage: 1.78 GB
Training Loss: 0.5653
Validation Loss: 0.5335 | Validation Accuracy: 0.7833
Test Loss: 0.5276 | Test Accuracy: 0.7838

Per-class Validation Metrics:
--------------------------------------------------
Class           Precision  Recall     F1-Score  
--------------------------------------------------
negative        0.7284       0.8740       0.7946
neutral         0.8025       0.6707       0.7307
positive        0.8210       0.8468       0.8337
--------------------------------------------------

Epoch 2/5
--------------------------------------------------
Batch 110/550 | Loss: 0.2363
Batch 220/550 | Loss: 0.2025
Batch 330/550 | Loss: 0.3218
Batch 440/550 | Loss: 0.4083
Batch 550/550 | Loss: 0.7546
GPU Memory Usage: 1.78 GB
Training Loss: 0.3975
Validation Los

In [23]:
# Final evaluation of the trained model on the held-out test loader.
model.eval()
_, _, test_precision, test_recall, test_f1, all_labels, all_predictions = evaluate_model(model, test_loader, device)

# Classification report.
# `labels` is pinned to the class ids 0..len(label_names)-1 so that every name
# in `target_names` is matched to its own class id even if a class never occurs
# in the test labels or predictions. Without it the report infers the classes
# from the data, and the count can then disagree with `target_names`.
print("Final Classification Report on Test Set:")
print(classification_report(
    all_labels,
    all_predictions,
    labels=list(range(len(label_names))),
    target_names=label_names,
))

Final Classification Report on Test Set:
              precision    recall  f1-score   support

    negative       0.78      0.80      0.79      1001
     neutral       0.75      0.75      0.75      1430
    positive       0.84      0.82      0.83      1103

    accuracy                           0.78      3534
   macro avg       0.79      0.79      0.79      3534
weighted avg       0.78      0.78      0.78      3534



In [24]:
# Ad-hoc sanity check: classify a few hand-written sentences with the trained model.
test_texts = [
    "Its absolutely amazing!",
    "Nothing much to say. Okay okayish feel.",
    "I regret about my decision."
]

# Class id -> sentiment name (same order as the training label mapping).
label_map = {0: "negative", 1: "neutral", 2: "positive"}

# A single width drives the header, the rule and the rows so the columns line
# up (the header used to be padded to 50 while the rows were padded to 70).
text_col_width = 70
prediction_header = 'Prediction'

print(f"{'Text':<{text_col_width}} {prediction_header}")
print("-" * (text_col_width + 1 + len(prediction_header)))

# Loop-invariant: switch to inference mode once.
model.eval()

for text in test_texts:
    encoding = tokenizer(
        text,
        truncation=True,
        padding='max_length',
        max_length=128,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        probs = torch.nn.functional.softmax(outputs.logits, dim=1)
        preds = torch.argmax(probs, dim=1)

    sentiment = label_map[preds.item()]
    print(f"{text:<{text_col_width}} {sentiment}")


Text                                               Prediction
---------------------------------------------------------------------------
Its absolutely amazing!                                                positive
Nothing much to say. Okay okayish feel.                                neutral
I regret about my decision.                                            negative
