# Word2Vec + SVM

Binary sentiment classification using pre-trained Word2Vec embeddings (mean pooling) with an RBF SVM.

Prerequisite: run `00_preprocessing.ipynb` first.

In [None]:
import string

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

## Load data and split

In [None]:
# Read the review table; column 'X' is used as the model input and column 'Y'
# is thresholded below to build the binary target.
df = pd.read_csv('data/reviews_extraidas.csv')

# Binarise the target: rows whose Y is strictly above 5 get label 1, the rest 0.
is_above_five = df['Y'] > 5
df['label'] = is_above_five.astype(int)

# Hold out 20% of the rows. Stratifying on the label keeps the class ratio the
# same in both splits, and the fixed seed makes the split reproducible.
inputs, labels = df['X'], df['label']
X_train, X_test, y_train, y_test = train_test_split(
    inputs,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)
print(f"Train: {len(X_train)}, Test: {len(X_test)}")

## Word2Vec embeddings

In [None]:
!pip install -q gensim

In [None]:
import gensim.downloader

# Load the pre-trained vectors by name through gensim's downloader API and
# report the dimensionality of the embedding space.
pretrained_name = 'word2vec-google-news-300'
w2v = gensim.downloader.load(pretrained_name)
print(f"Embedding dim: {w2v.vector_size}")

In [None]:
def text_to_embedding(text, model, dim=300):
    """Return the mean-pooled word embedding of ``text``.

    The text is lower-cased and split on whitespace. Each token is looked up
    in ``model`` as-is; if that fails, it is retried with leading and trailing
    punctuation removed, so raw sentences such as ``"Terrible film."`` still
    contribute ``"film"``. Tokens that remain unknown are ignored.

    Args:
        text: The document to embed. Anything that is not a ``str`` (for
            example ``None``, or the ``NaN`` pandas yields for an empty CSV
            cell) is treated as an empty document instead of raising.
        model: Embedding lookup supporting ``token in model`` and
            ``model[token]``.
        dim: Length of the zero vector returned when no token is found; it
            should equal the vector size of ``model``.

    Returns:
        The element-wise mean of the matched token vectors, or
        ``np.zeros(dim)`` when nothing matched.
    """
    if not isinstance(text, str):
        return np.zeros(dim)

    vectors = []
    for token in text.lower().split():
        if token not in model:
            # Second chance: "amazing!" -> "amazing". The raw form is always
            # tried first, so tokens that already matched are unaffected.
            token = token.strip(string.punctuation)
            if not token or token not in model:
                continue
        vectors.append(model[token])

    return np.mean(vectors, axis=0) if vectors else np.zeros(dim)

def _embed_all(texts):
    """Embed each text with ``text_to_embedding`` and collect the results in one array."""
    return np.array([text_to_embedding(doc, w2v) for doc in texts])


# One embedding row per review, for the training and the held-out split.
X_train_w2v = _embed_all(X_train)
X_test_w2v = _embed_all(X_test)

print(f"Train shape: {X_train_w2v.shape}")
print(f"Test shape:  {X_test_w2v.shape}")

## Train SVM

In [None]:
# Baseline classifier: RBF-kernel SVM with no class re-weighting, fitted on the
# training embeddings.
svm_params = dict(kernel='rbf', C=1.0, gamma='scale', random_state=42)
svm = SVC(**svm_params)
svm.fit(X_train_w2v, y_train)

## Evaluation

In [None]:
# Predict on the held-out embeddings with the baseline SVM.
y_pred = svm.predict(X_test_w2v)

# Summary scores: macro F1 averages the per-class F1 equally, weighted F1
# weights each class by its number of true samples.
test_accuracy = accuracy_score(y_test, y_pred)
macro_f1 = f1_score(y_test, y_pred, average='macro')
weighted_f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {test_accuracy:.4f}")
print(f"F1 (macro): {macro_f1:.4f}")
print(f"F1 (weighted): {weighted_f1:.4f}")
print()

# Per-class precision / recall / F1; label 0 is shown as Negative, 1 as Positive.
class_names = ['Negative', 'Positive']
print(classification_report(y_test, y_pred, target_names=class_names))

In [None]:
# Confusion matrix of the baseline SVM: rows are true labels, columns predictions.
cm = confusion_matrix(y_test, y_pred)
tick_names = ['Negative', 'Positive']

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Oranges',
    xticklabels=tick_names,
    yticklabels=tick_names,
    ax=ax,
)
ax.set_title('Confusion Matrix — Word2Vec + SVM')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
fig.tight_layout()
plt.show()

## With class balancing

In [None]:
# Same RBF SVM as the baseline, but with class_weight='balanced' so that the
# penalty for each class is scaled inversely to its frequency in y_train.
svm_balanced = SVC(
    kernel='rbf',
    C=1.0,
    gamma='scale',
    class_weight='balanced',
    random_state=42,
).fit(X_train_w2v, y_train)

y_pred_bal = svm_balanced.predict(X_test_w2v)

# Same set of summary metrics as reported for the unweighted model.
balanced_accuracy = accuracy_score(y_test, y_pred_bal)
balanced_macro_f1 = f1_score(y_test, y_pred_bal, average='macro')
balanced_weighted_f1 = f1_score(y_test, y_pred_bal, average='weighted')

print(f"Accuracy: {balanced_accuracy:.4f}")
print(f"F1 (macro): {balanced_macro_f1:.4f}")
print(f"F1 (weighted): {balanced_weighted_f1:.4f}")
print()
print(classification_report(y_test, y_pred_bal, target_names=['Negative', 'Positive']))

In [None]:
# Confusion matrix of the class-weighted SVM: rows are true labels, columns predictions.
cm_bal = confusion_matrix(y_test, y_pred_bal)
tick_names = ['Negative', 'Positive']

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm_bal,
    annot=True,
    fmt='d',
    cmap='Oranges',
    xticklabels=tick_names,
    yticklabels=tick_names,
    ax=ax,
)
ax.set_title('Confusion Matrix — Word2Vec + SVM (balanced)')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
fig.tight_layout()
plt.show()

## Inference on new reviews

In [None]:
# Hand-written reviews used as a quick sanity check of the class-weighted model.
samples = [
    "This movie was absolutely amazing! I loved every moment of it.",
    "Terrible film. Waste of time and money. Do not watch.",
    "It was okay, nothing special but not bad either.",
]
label_map = {0: 'Negative', 1: 'Positive'}

# Embed the samples with the same function used for the training data, then classify.
sample_vectors = [text_to_embedding(review, w2v) for review in samples]
X_samples = np.array(sample_vectors)
preds = svm_balanced.predict(X_samples)

# One line per sample: right-aligned predicted sentiment, then the original text.
for review, predicted in zip(samples, preds):
    sentiment = label_map[predicted]
    print(f"{sentiment:>8s}  |  {review}")