In [None]:
# Cell 1: Install required packages
!pip install vaderSentiment transformers torch scikit-learn pandas numpy

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata 

In [None]:
# Cell 2: Import libraries
import pandas as pd
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
import torch

In [None]:
# Cell 3: Load and prepare the dataset
# Load the dataset (expects CSV format with 'review' and 'sentiment' columns,
# where 'sentiment' holds the strings 'negative' / 'positive').
# If you're using a differently formatted copy of the IMDB dataset,
# adjust the path and the column/label names accordingly.
df = pd.read_csv('/content/IMDB Dataset.csv')  # Replace with your file path

# Fail fast with a clear message if the file does not have the expected layout
missing_columns = {'review', 'sentiment'} - set(df.columns)
if missing_columns:
    raise ValueError(f"Dataset is missing required columns: {sorted(missing_columns)}")

# Map sentiment labels to binary (0: negative, 1: positive)
df['sentiment'] = df['sentiment'].map({'negative': 0, 'positive': 1})

# Series.map turns every label that is not a key of the dict into NaN.
# Stop here instead of letting NaN labels reach the metric computations below.
unmapped_rows = int(df['sentiment'].isna().sum())
if unmapped_rows:
    raise ValueError(
        f"{unmapped_rows} rows have a sentiment label other than "
        "'negative'/'positive'"
    )

# Sample 5000 rows (fixed seed so the same rows are drawn on every run)
df_sample = df.sample(n=5000, random_state=42).reset_index(drop=True)

print(f"Dataset shape: {df_sample.shape}")
print(f"Sentiment distribution:\n{df_sample['sentiment'].value_counts()}")

Dataset shape: (5000, 2)
Sentiment distribution:
sentiment
1    2519
0    2481
Name: count, dtype: int64


In [None]:
# Cell 4: VADER Sentiment Analysis
analyzer = SentimentIntensityAnalyzer()


def vader_predict(text):
    """Return 1 (positive) or 0 (negative) for ``text`` using VADER.

    The 'compound' entry of ``analyzer.polarity_scores(text)`` is thresholded
    at zero: a compound score >= 0 yields 1, anything below yields 0.
    """
    compound = analyzer.polarity_scores(text)['compound']
    if compound >= 0:
        return 1
    return 0


# Score every review in the 5000-row sample
df_sample['vader_pred'] = df_sample['review'].apply(vader_predict)

# Overall accuracy of the VADER predictions against the true labels
vader_accuracy = accuracy_score(
    y_true=df_sample['sentiment'],
    y_pred=df_sample['vader_pred'],
)
print(f"VADER Accuracy: {vader_accuracy:.4f}")

# Per-class precision / recall / F1
print("\nVADER Classification Report:")
print(
    classification_report(
        y_true=df_sample['sentiment'],
        y_pred=df_sample['vader_pred'],
    )
)

VADER Accuracy: 0.6964

VADER Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.53      0.63      2481
           1       0.65      0.86      0.74      2519

    accuracy                           0.70      5000
   macro avg       0.72      0.70      0.69      5000
weighted avg       0.72      0.70      0.69      5000



In [None]:
# Cell 5: BERT Sentiment Analysis
# Initialize BERT sentiment classifier.
# By default the pipeline returns only the top-scoring label for each input,
# which is all bert_predict needs.
classifier = pipeline(
    "sentiment-analysis",
    model="nlptown/bert-base-multilingual-uncased-sentiment",
)

def bert_predict(text):
    """Return 1 (positive) or 0 (negative) for ``text`` using the BERT pipeline.

    The review is truncated by the tokenizer to at most 512 *tokens*, so as
    much of the review as possible is classified. (Slicing the string to 512
    characters would discard most of a typical review before the model ever
    sees it.)

    Labels 'POSITIVE', '5 stars' and '4 stars' are treated as positive;
    every other label the model returns is treated as negative.
    """
    # Tokenizer-level truncation keeps the input within the 512-token limit
    result = classifier(text, truncation=True, max_length=512)
    return 1 if result[0]['label'] in ['POSITIVE', '5 stars', '4 stars'] else 0

# Apply BERT to a smaller sample (due to computation time)
# Using only 500 samples for BERT to save time
df_bert_sample = df_sample.sample(n=500, random_state=42)
df_bert_sample['bert_pred'] = df_bert_sample['review'].apply(bert_predict)

# Calculate accuracy
bert_accuracy = accuracy_score(df_bert_sample['sentiment'], df_bert_sample['bert_pred'])
print(f"BERT Accuracy (on 500 samples): {bert_accuracy:.4f}")

# Classification report
print("\nBERT Classification Report:")
print(classification_report(df_bert_sample['sentiment'], df_bert_sample['bert_pred']))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/669M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu


BERT Accuracy (on 500 samples): 0.8020

BERT Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.93      0.83       256
           1       0.91      0.66      0.77       244

    accuracy                           0.80       500
   macro avg       0.82      0.80      0.80       500
weighted avg       0.82      0.80      0.80       500



In [None]:
# Cell 6: Logistic Regression with TF-IDF
# Inputs are the raw review texts; targets are the binary sentiment labels
X, y = df_sample['review'], df_sample['sentiment']

# Hold out 20% of the sample for evaluation (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# TF-IDF over unigrams and bigrams, English stop words removed,
# vocabulary capped at 10,000 features
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    max_features=10000,
)
# The vectorizer is fitted on the training split only and then reused
# unchanged to transform the test split
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Fit the classifier; fit() returns the estimator itself
lr_model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_vec, y_train)

# Predict labels for the held-out reviews
lr_pred = lr_model.predict(X_test_vec)

# Overall accuracy on the held-out split
lr_accuracy = accuracy_score(y_true=y_test, y_pred=lr_pred)
print(f"Logistic Regression Accuracy: {lr_accuracy:.4f}")

# Per-class precision / recall / F1
print("\nLogistic Regression Classification Report:")
print(classification_report(y_true=y_test, y_pred=lr_pred))

Logistic Regression Accuracy: 0.8450

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.80      0.84       506
           1       0.81      0.89      0.85       494

    accuracy                           0.84      1000
   macro avg       0.85      0.85      0.84      1000
weighted avg       0.85      0.84      0.84      1000



In [None]:
# Cell 7: Compare all models
# NOTE: the three accuracies come from different evaluation sets -- VADER is
# scored on the full 5000-row sample, BERT on a 500-row subsample of it, and
# Logistic Regression on its 20% held-out split -- so read them as indicative
# rather than as a strict like-for-like comparison.
print(
    "Model Comparison:",
    f"VADER Accuracy: {vader_accuracy:.4f}",
    f"BERT Accuracy: {bert_accuracy:.4f}",
    f"Logistic Regression Accuracy: {lr_accuracy:.4f}",
    sep="\n",
)

Model Comparison:
VADER Accuracy: 0.6964
BERT Accuracy: 0.8020
Logistic Regression Accuracy: 0.8450
