In [None]:
import pandas as pd
import spacy
from gensim.models import Word2Vec
from tqdm import tqdm
import numpy as np

In [None]:
# 1. Load the cleaned dataset and extract the sentences as plain strings.
df = pd.read_csv("../../data/En-Ba-Dataset(20k_4)/dataset_cleaned.csv")
# Missing cells are replaced with "" *before* the string cast: astype(str)
# alone would turn NaN into the literal text "nan", which would then be
# tokenized and trained on as if it were a real word. Row order and row
# count are unchanged, so downstream outputs stay aligned with `df`.
texts = df["Sentence"].fillna("").astype(str).tolist()

In [None]:
# 2. Load the small English spaCy pipeline that backs `tokenize`, with the
#    parser, NER and tagger components disabled.
nlp = spacy.load(
    "en_core_web_sm",
    disable=["parser", "ner", "tagger"],
)


def tokenize(text):
    """Split ``text`` into a list of token strings, dropping whitespace tokens.

    Uses the module-level spaCy pipeline ``nlp``. Only the tokenizer is run
    (``nlp.make_doc``) instead of the full pipeline: this function reads just
    ``token.text`` and ``token.is_space``, so the remaining pipeline
    components would add cost without affecting the result.
    NOTE(review): assumes no enabled pipeline component merges or splits
    tokens (true for the stock ``en_core_web_sm`` pipeline) — confirm if a
    custom component is ever added.

    Args:
        text: The string to tokenize.

    Returns:
        list[str]: Token texts in document order, excluding tokens that
        consist only of whitespace.
    """
    doc = nlp.make_doc(text)
    return [token.text for token in doc if not token.is_space]

In [None]:
# 3. Tokenize every text, showing a progress bar; yields one token list per text.
sentences = list(map(tokenize, tqdm(texts, desc="Tokenizing")))

In [None]:
# 4. Train the Word2Vec model on the tokenized sentences.
w2v_model = Word2Vec(
    sentences,        # training corpus: one list of tokens per sentence
    sg=1,             # 1 selects the skip-gram training algorithm
    vector_size=300,  # dimensionality of the learned word vectors
    window=5,         # context window size
    min_count=5,      # minimum word frequency to be kept in the vocabulary
    workers=4,        # number of worker threads used for training
)

In [None]:
# 5. Sentence embeddings (mean of word vectors)
def sentence_embedding(tokens, model, dim=300):
    """Return the mean word vector of ``tokens`` under ``model``.

    Tokens that are not in the model's vocabulary are skipped.

    Args:
        tokens: Iterable of token strings for one sentence.
        model: Object exposing ``model.wv``, which must support
            ``word in model.wv`` and ``model.wv[word]`` (e.g. a trained
            gensim ``Word2Vec`` model).
        dim: Length of the zero vector returned when no token is in the
            vocabulary; should equal the model's vector size.

    Returns:
        numpy.ndarray: The element-wise mean of the known tokens' vectors, or
        a float32 zero vector of length ``dim`` when none of the tokens are
        known (including when ``tokens`` is empty).
    """
    keyed_vectors = model.wv  # looked up once instead of on every token
    vectors = [keyed_vectors[word] for word in tokens if word in keyed_vectors]
    if not vectors:
        # gensim stores word vectors as float32, so the mean branch below
        # yields float32; build the fallback with the same dtype so the
        # collected embeddings are not silently upcast to float64.
        return np.zeros(dim, dtype=np.float32)
    return np.mean(vectors, axis=0)

In [None]:
# 6. Embeddings for all sentences
# The dimensionality is read from the trained model instead of repeating the
# literal 300, so the zero-vector fallback always has the same length as the
# word vectors even if `vector_size` is changed at training time.
sentence_vectors = [
    sentence_embedding(s, w2v_model, w2v_model.wv.vector_size)
    for s in tqdm(sentences, desc="Embedding Sentences")
]

In [None]:
# 7. Write the embeddings to disk: one row per sentence, one column per
#    vector component, without the DataFrame index.
sentence_embeddings_df = pd.DataFrame(data=sentence_vectors)
sentence_embeddings_df.to_csv(path_or_buf="embeddings.csv", index=False)