In [37]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from torch.utils.data import Dataset, DataLoader

import sys

# Report which interpreter the kernel is running on and what CUDA hardware
# PyTorch can see, so later cells can be read with the right expectations.
print(f"Python: {sys.executable}")
cuda_ok = torch.cuda.is_available()
print(f"CUDA available: {cuda_ok}")
print(f"GPU count: {torch.cuda.device_count()}")

if not cuda_ok:
    print("❌ No GPU detected!")
else:
    print(f"GPU name: {torch.cuda.get_device_name(0)}")
    print(f"Current device: {torch.cuda.current_device()}")

    # Smoke test: move a small tensor to the GPU and show where it landed.
    x = torch.tensor([1, 2, 3]).cuda()
    print(f"Tensor on GPU: {x.device}")

Python: C:\Program Files\Python311\python.exe
CUDA available: False
GPU count: 0
❌ No GPU detected!


In [21]:
# Read the pre-cleaned comments and discard every row that still has a
# missing value in any column.
cleaned_data = pd.read_csv('../cleaned_data.csv').dropna()

# Per-class row counts, plus the share of the whole dataset taken by the
# least frequent class (reported below as the "class imbalance ratio").
sentiment_count = cleaned_data['sentiment'].value_counts()
total = cleaned_data.shape[0]
lowest_emotion_num = sentiment_count.min()
imbalance = lowest_emotion_num / total

print(f"Data loaded: {total} comments")
print(f"Sentiment distribution: {sentiment_count.to_dict()}")
print(f"Class imbalance ratio: {imbalance:.3f}")

Data loaded: 152070 comments
Sentiment distribution: {'negative': 60020, 'positive': 55872, 'neutral': 36178}
Class imbalance ratio: 0.238


In [22]:
print("\nSetting up custom BERT model...")

# Checkpoint used for both the tokenizer and the classifier.
model_name = "bert-base-uncased"  # Standard BERT

try:
    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=3  # 3 classes: negative, neutral, positive
    )
except Exception as e:
    print(f"Error loading model: {e}")
    print("Make sure you have internet connection for first-time download.")
    # Re-raise: every later cell needs `tokenizer` and `model`, so continuing
    # would only turn this failure into a confusing NameError further down.
    raise
else:
    print(f"{model_name} loaded successfully!")
    print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")



Setting up custom BERT model...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


bert-base-uncased loaded successfully!
Model parameters: 109,484,547


In [24]:
print("\nPreparing data for BERT training...")

# Turn the sentiment strings into integer class ids.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(cleaned_data['sentiment'])

# The id of each class is its position in `classes_`.
label_mapping = {name: idx for idx, name in enumerate(label_encoder.classes_)}
print(f"Label mapping: {label_mapping}")

# Hold out 20% of the comments, keeping class proportions equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    cleaned_data['comment_text'],
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

print(f"Training set: {len(X_train)} comments")
print(f"Testing set: {len(X_test)} comments")



Preparing data for BERT training...
Label mapping: {'negative': 0, 'neutral': 1, 'positive': 2}
Training set: 121656 comments
Testing set: 30414 comments


In [26]:
# Announce the start of the dataset-construction step in the cell output.
print("\nCreating custom dataset for BERT...")

class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on demand.

    Args:
        texts: Sequence of texts (pandas Series, list, or array). Items are
            always looked up by position, never by pandas index label.
        labels: Sequence of integer class ids, aligned with ``texts`` by
            position. May be a numpy array, list, or pandas Series.
        tokenizer: Callable accepting ``(text, truncation, padding,
            max_length, return_tensors)`` and returning a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors of shape
            ``(1, max_length)``.
        max_length: Length every sample is truncated/padded to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    @staticmethod
    def _positional(seq, idx):
        """Return the idx-th element of ``seq`` by position.

        A pandas Series that kept a shuffled index (for example after a
        train/test split) resolves plain ``seq[idx]`` by label, which yields
        the wrong row or a KeyError; ``.iloc`` avoids that.
        """
        if hasattr(seq, 'iloc'):
            return seq.iloc[idx]
        return seq[idx]

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors and label."""
        text = str(self._positional(self.texts, idx))
        label = int(self._positional(self.labels, idx))

        # Every sample is padded to max_length so the default collator can
        # stack items without any further padding logic.
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        # The tokenizer returns a batch of one; flatten to 1-D per-sample tensors.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Wrap each split in a SentimentDataset; both share the same tokenizer and
# the class's default max_length.
train_dataset = SentimentDataset(texts=X_train, labels=y_train, tokenizer=tokenizer)
test_dataset = SentimentDataset(texts=X_test, labels=y_test, tokenizer=tokenizer)

n_train_samples = len(train_dataset)
n_test_samples = len(test_dataset)
print(f" Training dataset: {n_train_samples} samples")
print(f" Testing dataset: {n_test_samples} samples")


Creating custom dataset for BERT...
 Training dataset: 121656 samples
 Testing dataset: 30414 samples


In [28]:
print("\nTraining BERT model...")

# GPU-only options are switched on only when CUDA is actually present:
# transformers rejects fp16 mixed precision on a CPU-only machine, and
# pinned memory is only useful for host-to-GPU transfers.
use_cuda = torch.cuda.is_available()


def compute_metrics(eval_pred):
    """Return accuracy for a transformers ``EvalPrediction``.

    The Trainer prefixes the key, so ``trainer.evaluate()`` reports it as
    ``eval_accuracy``.
    """
    predicted_ids = np.argmax(eval_pred.predictions, axis=-1)
    return {"accuracy": accuracy_score(eval_pred.label_ids, predicted_ids)}


# Training arguments
training_args = TrainingArguments(
    output_dir="./bert_sentiment_model",
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=200,                # Linear LR warmup over the first 200 steps
    weight_decay=0.01,               # Weight decay for regularization
    logging_dir="./logs",
    logging_steps=50,                # Log every 50 steps
    # NOTE(review): eval_steps is only honoured when the evaluation strategy
    # is "steps"; no strategy is set here, so it is presumably inert - confirm.
    eval_steps=500,
    save_steps=500,                  # Save a checkpoint every 500 steps
    learning_rate=3e-5,
    save_total_limit=2,              # Keep only the two most recent checkpoints
    dataloader_pin_memory=use_cuda,  # Pin host memory only when a GPU is used
    fp16=use_cuda,                   # Mixed precision requires CUDA
    gradient_accumulation_steps=2,   # Effective batch size = 16 * 2 = 32
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,  # Makes 'eval_accuracy' available
)

# Train the model
print("Starting BERT training...")
trainer.train()

print("BERT training completed!")



Training BERT model...
Starting BERT training...


  return forward_call(*args, **kwargs)


KeyboardInterrupt: 

In [8]:
print("\nEvaluating BERT model...")

# Run the test set through the model once. metric_key_prefix="eval" keeps
# the metric names ('eval_loss', and 'eval_accuracy' when the trainer has a
# compute_metrics function) identical to what trainer.evaluate() would
# return, without a second full inference pass.
predictions = trainer.predict(test_dataset, metric_key_prefix="eval")
eval_results = predictions.metrics or {}

# Predicted class id = index of the largest logit for each sample.
y_pred = np.argmax(predictions.predictions, axis=1)
y_true = y_test

# Calculate accuracy
bert_accuracy = accuracy_score(y_true, y_pred)

# 'eval_accuracy' exists only when the trainer was built with
# compute_metrics; otherwise fall back to the value computed above instead
# of failing with a KeyError.
eval_accuracy = eval_results.get('eval_accuracy', bert_accuracy)
print(f"BERT Test Accuracy: {eval_accuracy:.4f} ({eval_accuracy*100:.2f}%)")
print(f"BERT Final Accuracy: {bert_accuracy:.4f} ({bert_accuracy*100:.2f}%)")

# Classification report
print("\nClassification Report:")
print(classification_report(y_true, y_pred, target_names=label_encoder.classes_))

# Confusion matrix (rows: true class, columns: predicted class)
conf_matrix = confusion_matrix(y_true, y_pred)
print("\nConfusion Matrix:")
print(conf_matrix)


🔍 Extracting BERT features...


  return forward_call(*args, **kwargs)


BERT features shape: (100, 768)
Each text is represented by 768 features


In [9]:
print("\n📊 Comparing BERT with traditional methods...")

# Accuracy per approach: the classical baselines are entered by hand, the
# BERT entry comes from the evaluation cell's `bert_accuracy`.
traditional_results = dict([
    ("SVM + TF-IDF", 0.9122),
    ("Random Forest", 0.82),
    ("XGBoost", 0.8062),
    ("Naive Bayes", 0.7078),
    ("Custom BERT", bert_accuracy),
])

print("Performance Comparison:")
print("=" * 50)
# One row per approach: name padded to 20 characters, then the accuracy as
# a fraction and as a percentage.
for method in traditional_results:
    accuracy = traditional_results[method]
    print(f"{method:20} | {accuracy:.4f} ({accuracy*100:.2f}%)")


📊 Comparing BERT with traditional methods...
Performance Comparison:
SVM + TF-IDF         | 0.9122 (91.22%)
Random Forest        | 0.8200 (82.00%)
XGBoost              | 0.8062 (80.62%)
Naive Bayes          | 0.7078 (70.78%)

BERT Expected Performance:
Pre-trained BERT     | 0.85-0.90 (85-90%)
Fine-tuned BERT      | 0.95-0.98 (95-98%)
