# Phase 4: Embedding Analysis

**Objective:** Compare embedding representations (Word2Vec vs Transformer) and visualize class separability.

In [None]:
%load_ext autoreload
%autoreload 2
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import umap

# Make the project's ../src directory importable from this notebook
sys.path.append(os.path.abspath("../src"))
from embeddings import EmbeddingGenerator
from dl_models import DLManager

## 1. Load Data Sample
We use a random subset of the corpus rather than the full dataset, so that embedding generation and the UMAP projection stay fast.

In [None]:
# Load the balanced, preprocessed corpus and keep only rows that have both
# a cleaned text and a sentiment score.
data_path = Path("../data/processed_corpus_balanced.csv")
df = pd.read_csv(data_path)
df = df.dropna(subset=['clean_text', 'sentiment_score'])

# Draw a reproducible random sample of at most SAMPLE_SIZE rows.
# DataFrame.sample raises ValueError when n exceeds the number of rows
# (sampling without replacement), so cap n at the rows actually available.
SAMPLE_SIZE = 5000
df_sample = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=42)

# Parallel lists: texts[i] is labelled by labels[i].
texts = df_sample['clean_text'].astype(str).tolist()
labels = df_sample['sentiment_score'].tolist()
print(f"Sample size: {len(df_sample)}")

## 2. Generate Transformer Embeddings (S-BERT)

In [None]:
# Encode the sampled texts with the project's EmbeddingGenerator (imported
# from the local `embeddings` module), configured for 'all-MiniLM-L6-v2'.
# NOTE(review): presumably generate() returns one vector per input text,
# in input order — confirm against the embeddings module.
sbert_model_name = 'all-MiniLM-L6-v2'
emb_gen = EmbeddingGenerator(model_name=sbert_model_name)
embeddings_bert = emb_gen.generate(texts)

## 3. Visualize with UMAP

In [None]:
# Reduce the S-BERT embeddings with UMAP; random_state is pinned so the
# projection is reproducible between runs.
reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, random_state=42)
embedding_2d = reducer.fit_transform(embeddings_bert)

# Pull out the first two columns of the projection as plot coordinates.
umap_x = embedding_2d[:, 0]
umap_y = embedding_2d[:, 1]

# Scatter the projected points, coloured by sentiment score, to eyeball
# how well the classes separate in embedding space.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(
    x=umap_x,
    y=umap_y,
    hue=labels,
    palette='viridis',
    alpha=0.6,
    ax=ax,
)
ax.set_title('UMAP Projection of S-BERT Embeddings')
plt.show()