In [None]:
# Intended for a fresh Python environment: force-reinstalls the pinned
# numpy / scipy / gensim versions below, bypassing pip's cache.
# NOTE(review): packages already imported by the kernel presumably need a
# kernel restart before the reinstalled versions take effect — confirm.
!pip install --force-reinstall --no-cache-dir numpy==1.24.3 scipy==1.10.1 gensim==4.3.0

In [None]:
!pip[ install transformers]

In [1]:
# ============================================================================
# DEPENDENCY STATUS BANNER
# Run this cell FIRST in Kaggle/Colab
# ============================================================================
# Usually pre-installed in Kaggle/Colab: numpy, pandas, scikit-learn, torch (PyTorch).
# NOTE(review): this cell only prints a status message; it runs no install
# command of its own.

print("\n".join([
    "✓ Installation complete!",
    "\nInstalled packages:",
    "- gensim (Word2Vec, GloVe, FastText)",
    "- sentence-transformers (SBERT)",
    "\nYou can now run the embedding comparison code.",
]))

✓ Installation complete!

Installed packages:
- gensim (Word2Vec, GloVe, FastText)
- sentence-transformers (SBERT)

You can now run the embedding comparison code.


 IMPORTS & SETUP

In [2]:
import re, numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
import torch, torch.nn as nn, torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader


In [3]:
# Notebook-wide configuration: dataset location, RNG seed, compute device.
CSV_PATH = "/kaggle/input/reviews/reviews.csv"
SEED = 42

# Seed both PyTorch and NumPy's global generator from the same constant.
torch.manual_seed(SEED)
np.random.seed(SEED)

# Use the GPU when one is visible to PyTorch, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [4]:
print("Loading data...")

# Read the reviews, discard rows whose "Text" is missing, and renumber the index.
df = (
    pd.read_csv(CSV_PATH)
    .dropna(subset=["Text"])
    .reset_index(drop=True)
)

# Features are the raw review strings; labels are the "Sentiment" column cast to int.
X_all = df["Text"].astype(str).tolist()
y_all = df["Sentiment"].astype(int).to_numpy()

# Seeded 80/20 hold-out split, stratified on the label.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_all,
    y_all,
    test_size=0.2,
    stratify=y_all,
    random_state=SEED,
)
print(f"✓ {len(X_train_text)} train | {len(X_test_text)} test\n")

Loading data...
✓ 40000 train | 10000 test



In [5]:
class SimpleNN(nn.Module):
    """One-hidden-layer network that outputs a single sigmoid score per input row."""

    def __init__(self, input_size):
        """Build the layers for inputs with ``input_size`` features."""
        super().__init__()
        # fc1 is created before fc2 so seeded weight initialisation is unchanged.
        self.fc1 = nn.Linear(in_features=input_size, out_features=128)
        self.dropout = nn.Dropout(p=0.3)
        self.fc2 = nn.Linear(in_features=128, out_features=1)

    def forward(self, x):
        """Map a (rows, input_size) batch to a 1-D tensor of sigmoid outputs."""
        hidden = self.dropout(self.fc1(x).relu())
        scores = self.fc2(hidden)
        # Drop the trailing size-1 dimension so the output lines up with 1-D targets.
        return scores.sigmoid().squeeze(1)

In [6]:
def train_model(X_train, X_test, y_train, y_test,
                epochs=6, batch_size=64, lr=0.001, threshold=0.5):
    """Train a fresh ``SimpleNN`` on the training split and score it on the test split.

    Args:
        X_train, X_test: 2-D array-likes of features (one row per sample).
        y_train, y_test: 1-D array-likes of labels. They are fed to ``BCELoss``,
            which requires targets within [0, 1].
        epochs: Number of passes over the training data.
        batch_size: Mini-batch size used by the training loader.
        lr: Learning rate passed to Adam.
        threshold: A model output strictly above this value is predicted as class 1.

    Returns:
        Tuple ``(accuracy, f1)`` computed on the test split.
    """
    # Convert to tensors
    X_tr = torch.tensor(X_train, dtype=torch.float32)
    X_te = torch.tensor(X_test, dtype=torch.float32)
    y_tr = torch.tensor(y_train, dtype=torch.float32)

    # Setup model; the feature count comes from the tensor so that inputs
    # without a ``.shape`` attribute (e.g. nested lists) are accepted too.
    model = SimpleNN(X_tr.shape[1]).to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.BCELoss()

    # Training
    train_loader = DataLoader(TensorDataset(X_tr, y_tr), batch_size=batch_size, shuffle=True)
    model.train()
    for _ in range(epochs):
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            loss = loss_fn(model(X_batch), y_batch)
            loss.backward()
            optimizer.step()

    # Evaluation: eval() switches dropout off; no_grad() skips gradient tracking.
    model.eval()
    with torch.no_grad():
        probabilities = model(X_te.to(device)).cpu().numpy()
    y_pred = (probabilities > threshold).astype(int)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    return acc, f1

In [7]:
print("=" * 60, "METHOD 1: COUNT VECTORIZER", "=" * 60, sep="\n")

# Unigram counts with the vocabulary capped at 5000 features; fit on train only.
vectorizer = CountVectorizer(ngram_range=(1, 1), max_features=5000)
X_train_counts = vectorizer.fit_transform(X_train_text)
X_test_counts = vectorizer.transform(X_test_text)

# Compress the count matrix to 300 SVD components (fit on train, applied to test).
svd = TruncatedSVD(random_state=SEED, n_components=300)
X_train_svd = svd.fit_transform(X_train_counts)
X_test_svd = svd.transform(X_test_counts)

# Standardise each component using statistics from the training split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_svd).astype(np.float32)
X_test_scaled = scaler.transform(X_test_svd).astype(np.float32)

acc, f1 = train_model(X_train_scaled, X_test_scaled, y_train, y_test)
print(f"Accuracy: {acc:.4f} | F1: {f1:.4f}\n")

METHOD 1: COUNT VECTORIZER
Accuracy: 0.8566 | F1: 0.8571



In [10]:
print("=" * 60, "METHOD 2: TF-IDF", "=" * 60, sep="\n")

# TF-IDF over unigrams and bigrams, English stop words removed,
# vocabulary capped at 10000 features; fit on train only.
vectorizer = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    max_features=10000,
)
X_train_tfidf = vectorizer.fit_transform(X_train_text)
X_test_tfidf = vectorizer.transform(X_test_text)

# Compress the TF-IDF matrix to 300 SVD components (fit on train, applied to test).
svd = TruncatedSVD(random_state=SEED, n_components=300)
X_train_svd = svd.fit_transform(X_train_tfidf)
X_test_svd = svd.transform(X_test_tfidf)

# Standardise each component using statistics from the training split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_svd).astype(np.float32)
X_test_scaled = scaler.transform(X_test_svd).astype(np.float32)

acc, f1 = train_model(X_train_scaled, X_test_scaled, y_train, y_test)
print(f"Accuracy: {acc:.4f} | F1: {f1:.4f}\n")

METHOD 2: TF-IDF
Accuracy: 0.8782 | F1: 0.8786



In [14]:
# Section banner for the Word2Vec experiment.
print("=" * 60, "METHOD 3: WORD2VEC (trained on your data)", "=" * 60, sep="\n")

from gensim.models import Word2Vec

def tokenize(text):
    """Lower-case ``text`` and return its runs of a-z letters/apostrophes, in order."""
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r"[a-z']+", lowered)]

# Fit Word2Vec (sg=1) on the tokenized training reviews only.
tokenized_train = list(map(tokenize, X_train_text))
w2v_model = Word2Vec(
    sentences=tokenized_train,
    vector_size=300,
    window=5,
    min_count=2,
    sg=1,
    epochs=10,
    workers=4,
)

def document_vector(text, model):
    """Return the mean word vector of ``text`` under a trained Word2Vec model.

    The text is split with ``tokenize``; tokens missing from ``model.wv`` are
    skipped.

    Args:
        text: Raw document string.
        model: Object exposing ``wv``, a keyed-vector store that supports
            ``in``, indexing by word, and a ``vector_size`` attribute.

    Returns:
        1-D array of length ``model.wv.vector_size``. It is all zeros when no
        token of ``text`` is in the vocabulary.
    """
    keyed = model.wv
    vectors = [keyed[word] for word in tokenize(text) if word in keyed]
    if vectors:
        return np.mean(vectors, axis=0)
    # Size the fallback from the model so it always matches the embedding
    # width, whatever vector_size the model was trained with.
    return np.zeros(keyed.vector_size)

# Represent every review by its averaged Word2Vec vector, stacked row-wise as float32.
X_train_w2v = np.vstack(
    [document_vector(doc, w2v_model) for doc in X_train_text]
).astype(np.float32)
X_test_w2v = np.vstack(
    [document_vector(doc, w2v_model) for doc in X_test_text]
).astype(np.float32)

acc, f1 = train_model(X_train_w2v, X_test_w2v, y_train, y_test)
print(f"Accuracy: {acc:.4f} | F1: {f1:.4f}\n")

METHOD 3: WORD2VEC (trained on your data)
Accuracy: 0.8771 | F1: 0.8796



In [15]:
# Section banner for the pretrained GloVe experiment.
print("=" * 60, "METHOD 4: GLOVE (pretrained embeddings)", "=" * 60, sep="\n")

import gensim.downloader as api

# Fetch the "glove-wiki-gigaword-100" vectors through gensim's downloader.
print("Downloading GloVe model (this may take a minute)...")
glove_model = api.load("glove-wiki-gigaword-100")

def document_vector_glove(text, model):
    """Return the mean GloVe vector of ``text``.

    The text is split with ``tokenize``; tokens missing from ``model`` are
    skipped.

    Args:
        text: Raw document string.
        model: Keyed-vector store that supports ``in``, indexing by word, and
            a ``vector_size`` attribute.

    Returns:
        1-D array of length ``model.vector_size``. It is all zeros when no
        token of ``text`` is in the vocabulary.
    """
    vectors = [model[word] for word in tokenize(text) if word in model]
    if vectors:
        return np.mean(vectors, axis=0)
    # Size the fallback from the model so it matches whichever pretrained
    # embedding width is loaded.
    return np.zeros(model.vector_size)

# Represent every review by its averaged GloVe vector, stacked row-wise as float32.
X_train_glove = np.vstack(
    [document_vector_glove(doc, glove_model) for doc in X_train_text]
).astype(np.float32)
X_test_glove = np.vstack(
    [document_vector_glove(doc, glove_model) for doc in X_test_text]
).astype(np.float32)

acc, f1 = train_model(X_train_glove, X_test_glove, y_train, y_test)
print(f"Accuracy: {acc:.4f} | F1: {f1:.4f}\n")

METHOD 4: GLOVE (pretrained embeddings)
Downloading GloVe model (this may take a minute)...
Accuracy: 0.7896 | F1: 0.7990



In [16]:
# Section banner for the pretrained FastText experiment.
print("=" * 60, "METHOD 5: FASTTEXT (pretrained embeddings)", "=" * 60, sep="\n")

# Fetch the "fasttext-wiki-news-subwords-300" vectors through gensim's downloader
# (``api`` is the gensim.downloader module imported in the GloVe cell).
print("Downloading FastText model (this may take a few minutes)...")
fasttext_model = api.load("fasttext-wiki-news-subwords-300")

def document_vector_fasttext(text, model):
    """Return the mean FastText vector of ``text``.

    The text is split with ``tokenize``; tokens missing from ``model`` are
    skipped.

    Args:
        text: Raw document string.
        model: Keyed-vector store that supports ``in``, indexing by word, and
            a ``vector_size`` attribute.

    Returns:
        1-D array of length ``model.vector_size``. It is all zeros when no
        token of ``text`` is in the vocabulary.
    """
    vectors = [model[word] for word in tokenize(text) if word in model]
    if vectors:
        return np.mean(vectors, axis=0)
    # Size the fallback from the model so it matches whichever pretrained
    # embedding width is loaded.
    return np.zeros(model.vector_size)

# Represent every review by its averaged FastText vector, stacked row-wise as float32.
X_train_ft = np.vstack(
    [document_vector_fasttext(doc, fasttext_model) for doc in X_train_text]
).astype(np.float32)
X_test_ft = np.vstack(
    [document_vector_fasttext(doc, fasttext_model) for doc in X_test_text]
).astype(np.float32)

acc, f1 = train_model(X_train_ft, X_test_ft, y_train, y_test)
print(f"Accuracy: {acc:.4f} | F1: {f1:.4f}\n")

METHOD 5: FASTTEXT (pretrained embeddings)
Downloading FastText model (this may take a few minutes)...
Accuracy: 0.8440 | F1: 0.8404



In [17]:
# Closing recap of the shared training setup, emitted with one print call.
print(
    "=" * 60,
    "SUMMARY OF RESULTS",
    "=" * 60,
    "\nAll methods tested with same simple architecture:",
    "- 1 hidden layer (128 neurons)",
    "- 0.3 dropout",
    "- 6 epochs",
    "- Learning rate: 0.001",
    "\nCheck the accuracy & F1 scores above to see which embedding works best!",
    "\nNext step: Take the best embedding and try different architectures.",
    sep="\n",
)

SUMMARY OF RESULTS

All methods tested with same simple architecture:
- 1 hidden layer (128 neurons)
- 0.3 dropout
- 6 epochs
- Learning rate: 0.001

Check the accuracy & F1 scores above to see which embedding works best!

Next step: Take the best embedding and try different architectures.
