# Step 1: Import Libraries

In [1]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
import os
import kagglehub

In [2]:
# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


# Step 2: Download and Load Data from Kaggle

In [4]:
# Download latest version from Kaggle
dataset_path = kagglehub.dataset_download("abhi8923shriv/sentiment-analysis-dataset")
print("Path to dataset files:", dataset_path)

# Load data (adjust file names based on what's available in the dataset)
train_file = os.path.join(dataset_path, "train.csv")
test_file = os.path.join(dataset_path, "test.csv")

Path to dataset files: /kaggle/input/sentiment-analysis-dataset


In [5]:
train_df = pd.read_csv(train_file, encoding="latin1")
test_df = pd.read_csv(test_file, encoding="latin1")

In [10]:
# Display a few rows
print("Training data sample:")
display(train_df.sample(3))
print("\nTest data sample:")
display(test_df.sample(2))

Training data sample:


Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
19955,0f5fc32d1d,RAM upgrade=done! FF still slow so disabling a...,still slow,negative,night,70-100,Niger,24206644,1266700.0,19
12589,d7b2a057c4,If you come to a gig in Dublin you can be my ...,If you come to a gig in Dublin you can be my d...,neutral,noon,21-30,Saint Kitts and Nevis,53199,260.0,205
13277,d8ac4aef96,You`re watching Firepro0f?,You`re watching Firepro0f?,neutral,night,70-100,Netherlands,17134872,33720.0,508



Test data sample:


Unnamed: 0,textID,text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
2356,2fe4f05a35,_louise bless ya... I know that feeling all t...,positive,noon,21-30,Sudan,43849260.0,1765048.0,25.0
2622,15db00c707,OmG U NasTY,negative,morning,46-60,Cuba,11326616.0,106440.0,106.0


In [9]:
print(f"Training data shape: {train_df.shape}")
print(f"Testing data shape: {test_df.shape}")

Training data shape: (27481, 10)
Testing data shape: (4815, 9)


# Step 3: Data Preprocessing

In [13]:
train_df = pd.DataFrame(train_df[['text', 'sentiment']])
test_df = pd.DataFrame(test_df[['text', 'sentiment']])

In [14]:
# Check for missing values
print("\nMissing values in training data:")
print(train_df.isnull().sum())
print("\nMissing values in test data:")
print(test_df.isnull().sum())


Missing values in training data:
text         1
sentiment    0
dtype: int64

Missing values in test data:
text         1281
sentiment    1281
dtype: int64


In [16]:
# Handle missing values
train_df = train_df.dropna()
test_df = test_df.dropna()

In [17]:
print("\nMissing values in training data:")
print(train_df.isnull().sum())
print("\nMissing values in test data:")
print(test_df.isnull().sum())


Missing values in training data:
text         0
sentiment    0
dtype: int64

Missing values in test data:
text         0
sentiment    0
dtype: int64


In [19]:
print(f"Training data shape: {train_df.shape}")
print(f"Testing data shape: {test_df.shape}")

Training data shape: (27480, 2)
Testing data shape: (3534, 2)


In [20]:
# Check unique sentiment values
unique_sentiments = set(train_df["sentiment"].unique()) | set(test_df["sentiment"].unique())
print(f"\nUnique sentiment values: {unique_sentiments}")


Unique sentiment values: {'positive', 'negative', 'neutral'}


In [22]:
sentiment_map = {sentiment: i for i, sentiment in enumerate(unique_sentiments)}
sentiment_map

{'positive': 0, 'negative': 1, 'neutral': 2}

In [24]:
# Map sentiment labels to integers
train_df['label'] = train_df["sentiment"].map(sentiment_map)
test_df['label'] = test_df["sentiment"].map(sentiment_map)

In [25]:
print("\nTraining data label distribution:")
print(train_df['label'].value_counts())
print("\nTest data label distribution:")
print(test_df['label'].value_counts())


Training data label distribution:
label
2    11117
0     8582
1     7781
Name: count, dtype: int64

Test data label distribution:
label
2    1430
0    1103
1    1001
Name: count, dtype: int64


# Step 4: Create Dataset Class

In [26]:
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = int(self.labels[idx])

        # Tokenize text
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Step 5: Set up Tokenizer and Model

In [27]:
# Load pre-trained tokenizer and model
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(sentiment_map),  # Dynamic number of labels
    problem_type="single_label_classification"
)
model.to(device)
print(f"Model {model_name} loaded successfully with {len(sentiment_map)} output classes!")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model bert-base-uncased loaded successfully with 3 output classes!


# Step 6: Create Datasets and DataLoaders

In [29]:
# Create datasets
train_dataset = SentimentDataset(
    texts=train_df["text"].tolist(),
    labels=train_df['label'].tolist(),
    tokenizer=tokenizer
)

test_dataset = SentimentDataset(
    texts=test_df["text"].tolist(),
    labels=test_df['label'].tolist(),
    tokenizer=tokenizer
)

# Check dataset sizes
print(f"Training dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

Training dataset size: 27480
Test dataset size: 3534


In [31]:
# Create data loaders
batch_size = 64
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True
)

test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size*2,
    shuffle=False
)


# Step 7: Define Training,Evaluation Functions

In [32]:
def train_epoch(model, data_loader, optimizer, scheduler):
    """Train for one epoch"""
    model.train()
    running_loss = 0.0
    progress_interval = max(1, len(data_loader) // 5)  # Update every 20%

    for batch_idx, batch in enumerate(data_loader):
        # Move batch to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Zero gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        running_loss += loss.item()

        # Backward pass and optimization
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        # Print progress
        if (batch_idx + 1) % progress_interval == 0:
            print(f"Batch {batch_idx+1}/{len(data_loader)} | Loss: {loss.item():.4f}")

    epoch_loss = running_loss / len(data_loader)
    return epoch_loss

def evaluate_model(model, data_loader):
    """Evaluate the model on the given data loader"""
    model.eval()
    total_loss = 0
    all_labels = []
    all_predictions = []

    with torch.no_grad():
        for batch in data_loader:
            # Move batch to device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            # Forward pass
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            total_loss += outputs.loss.item()

            # Get predictions
            logits = outputs.logits
            _, preds = torch.max(logits, dim=1)

            # Store predictions and labels
            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(preds.cpu().numpy())

    avg_loss = total_loss / len(data_loader)
    accuracy = np.mean(np.array(all_predictions) == np.array(all_labels))

    return avg_loss, accuracy, all_labels, all_predictions

#Step 8 : hyperparameters and Training

In [None]:
# hyperparameters
epochs = 5
learning_rate = 3e-5

# Initialize optimizer and scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
total_steps = len(train_loader) * epochs
warmup_steps = len(train_loader) // 10  # 10% of steps for warmup

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)

# Training history
history = {
    'train_loss': [],
    'test_loss': [],
    'test_accuracy': []
}

# Training loop
for epoch in range(epochs):
    print(f"\nEpoch {epoch+1}/{epochs}")
    print("-" * 30)

    # Train one epoch
    train_loss = train_epoch(model, train_loader, optimizer, scheduler)
    history['train_loss'].append(train_loss)

    # Evaluate on test set
    test_loss, test_accuracy, _, _ = evaluate_model(model, test_loader)
    history['test_loss'].append(test_loss)
    history['test_accuracy'].append(test_accuracy)

    print(f"Training Loss: {train_loss:.4f}")
    print(f"Test Loss: {test_loss:.4f} | Test Accuracy: {test_accuracy:.4f}")



Epoch 1/5
------------------------------
Batch 86/430 | Loss: 0.6401


# Step  9: Plot Training History

In [None]:
plt.figure(figsize=(12, 5))

# Plot loss curves
plt.subplot(1, 2, 1)
plt.plot(history['train_loss'], label='Training Loss')
plt.plot(history['test_loss'], label='Test Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Test Loss')
plt.legend()

# Plot accuracy curve
plt.subplot(1, 2, 2)
plt.plot(history['test_accuracy'], label='Test Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Test Accuracy')
plt.legend()

plt.tight_layout()
plt.show()

# Step 10: Final Evaluation and Metrics

In [None]:
# Generate metrics on test data
model.eval()
_, _, all_labels, all_predictions = evaluate_model(model, test_loader)

# Create label names for the report
# Map the numeric labels back to sentiment names
reverse_sentiment_map = {v: k for k, v in sentiment_map.items()}
label_names = [reverse_sentiment_map.get(i, f"Class {i}") for i in range(len(sentiment_map))]

# Classification report
print("Classification Report:")
print(classification_report(all_labels, all_predictions, target_names=label_names))

# Confusion matrix
cm = confusion_matrix(all_labels, all_predictions)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=label_names,
            yticklabels=label_names)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.tight_layout()