# 📘 IMDb Sentiment Analysis
This notebook implements sentiment classification on IMDb reviews using pre-trained Word2Vec embeddings and logistic regression.

In [None]:
# Step 1: Download required NLTK resources
import nltk
# Resources used by Step 3: tokenizer models for word_tokenize and the
# stopword lists. Recent NLTK releases load the tokenizer data from
# 'punkt_tab' instead of 'punkt', so both are requested to keep the notebook
# working across NLTK versions.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

In [None]:
# Step 2: Load IMDb dataset using Hugging Face datasets
from datasets import load_dataset, concatenate_datasets
dataset = load_dataset("imdb")


def _has_label(wanted):
    """Return a row predicate that keeps examples whose 'label' equals *wanted*."""
    def keep(example):
        return example['label'] == wanted
    return keep


# Take the first 2500 training examples of each class (label 1 -> pos,
# label 0 -> neg), then merge them and shuffle with a fixed seed.
train_split = dataset['train']
first_2500 = range(2500)
pos = train_split.filter(_has_label(1)).select(first_2500)
neg = train_split.filter(_has_label(0)).select(first_2500)

full_dataset = concatenate_datasets([pos, neg])
full_dataset = full_dataset.shuffle(seed=42)

# Columns consumed by the later steps.
texts = full_dataset['text']
labels = full_dataset['label']

In [None]:
# Step 3: Preprocess text and generate sentence embeddings
import numpy as np
import gensim.downloader as api
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tqdm import tqdm

# Identifier of the pre-trained Word2Vec vectors requested from gensim's
# downloader API; `w2v` is the lookup object used by doc_vector below.
W2V_MODEL_NAME = "word2vec-google-news-300"
w2v = api.load(W2V_MODEL_NAME)
# Fetch the English stopword list a single time and keep it as a set.
# Calling stopwords.words('english') inside the filter re-evaluated it for
# every token and tested membership against a list.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def preprocess(text):
    """Tokenize *text* and drop non-alphabetic tokens and English stopwords.

    Args:
        text: Raw text of one review.

    Returns:
        A list of lowercased tokens, in their original order, that consist
        only of alphabetic characters and are not English stopwords.
    """
    tokens = word_tokenize(text.lower())
    return [t for t in tokens if t.isalpha() and t not in _ENGLISH_STOPWORDS]

def doc_vector(text):
    """Return the mean Word2Vec embedding of the in-vocabulary words of *text*.

    Args:
        text: Raw text of one review; it is passed through ``preprocess``
            before any embedding lookup.

    Returns:
        A NumPy vector: the element-wise mean of the embeddings of every
        preprocessed token present in ``w2v``, or an all-zero vector of
        length ``w2v.vector_size`` when none of the tokens is in the
        vocabulary.
    """
    words = preprocess(text)
    vectors = [w2v[w] for w in words if w in w2v]
    if not vectors:
        # Size the fallback from the loaded model instead of hard-coding 300,
        # so every document vector has the same length whichever embedding
        # model `w2v` holds.
        return np.zeros(w2v.vector_size)
    return np.mean(vectors, axis=0)

# Embed every review (with a progress bar) into one row of the feature
# matrix X; y holds the matching labels.
X = np.array(list(map(doc_vector, tqdm(texts, desc="Vectorizing"))))
y = np.array(labels)

print("✅ Vectorization complete.")
print("X shape:", X.shape)

In [None]:
# Step 4: Train logistic regression model and evaluate performance
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Hold out 20% of the samples for evaluation; the fixed random_state pins
# the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Fit the classifier on the training part and predict the held-out part.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = model.predict(X_test)

print("📊 Evaluation Metrics:")
for heading, metric in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(heading, metric(y_test, y_pred))

In [None]:
# Step 5: Visualize results
import matplotlib.pyplot as plt
# Recompute the four test-set metrics, keyed by the label shown on the chart.
scores = {
    label: metric(y_test, y_pred)
    for label, metric in (
        ('Accuracy', accuracy_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1 Score', f1_score),
    )
}

# One bar per metric on a fixed 0-1 axis, with dashed horizontal grid lines.
fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(scores.keys(), scores.values())
ax.set_ylim(0, 1)
ax.set_title("Sentiment Classifier Performance")
ax.set_ylabel("Score")
ax.grid(True, axis='y', linestyle='--', alpha=0.5)
plt.show()