In [None]:
import pandas as pd
import fasttext
import numpy as np
from sklearn.preprocessing import normalize
from sentence_transformers import SentenceTransformer
from tqdm import tqdm


In [None]:
# --- Load dataset ---
# Read the cleaned CSV and extract the "Sentence" column, cast with
# astype(str), as a plain Python list.
DATASET_CSV = "../../data/En-Ba-Dataset(20k_4)/dataset_cleaned.csv"
df = pd.read_csv(DATASET_CSV)
texts = list(df["Sentence"].astype(str))


In [None]:
# --- Prepare FastText custom corpus ---
# Write every sentence to a UTF-8 text file, one sentence per line; this file
# is the training input for the custom FastText model.
with open("texts_for_ft.txt", "w", encoding="utf-8") as corpus_file:
    corpus_file.writelines(sentence + "\n" for sentence in texts)

In [None]:
# --- Load pretrained FastText and train custom ---
# ft_en: pretrained model loaded from disk.
# ft_custom: skip-gram model trained on the corpus file written above.
PRETRAINED_FT_PATH = "cc.en.300.bin"
CUSTOM_FT_PARAMS = {
    "model": "skipgram",
    "dim": 100,
    "minn": 3,
    "maxn": 6,
}

ft_en = fasttext.load_model(PRETRAINED_FT_PATH)
ft_custom = fasttext.train_unsupervised("texts_for_ft.txt", **CUSTOM_FT_PARAMS)

In [None]:
# --- Load SBERT model for sentences ---
# The model name is kept in a constant so it is easy to swap.
SBERT_MODEL_NAME = 'all-MiniLM-L6-v2'
sbert_model = SentenceTransformer(SBERT_MODEL_NAME)


In [None]:
# --- Word vector fallback function ---
def get_word_vector(word):
    """Return a FastText vector for ``word``, preferring the pretrained model.

    The pretrained model (``ft_en``) is queried first. If that lookup raises
    an ordinary exception, or returns an all-zero vector, the vector from the
    corpus-trained model (``ft_custom``) is returned instead.

    Args:
        word: Token to embed.

    Returns:
        The vector from ``ft_en.get_word_vector(word)``, or on fallback the
        vector from ``ft_custom.get_word_vector(word)``.

    NOTE(review): ``ft_custom`` is trained with ``dim=100`` while ``ft_en`` is
    loaded from ``cc.en.300.bin`` (presumably 300-d), so the two branches
    likely return vectors of different lengths; callers that average or stack
    the results should confirm the dimensions agree.
    """
    try:
        vec = ft_en.get_word_vector(word)
    except Exception:
        # Deliberately not a bare ``except``: KeyboardInterrupt / SystemExit
        # must propagate so a long-running cell can still be interrupted.
        return ft_custom.get_word_vector(word)

    # An all-zero vector is treated as "no usable pretrained embedding".
    if np.all(vec == 0):
        return ft_custom.get_word_vector(word)
    return vec

In [None]:
# --- Sentence to vector using SBERT with FastText fallback ---
def sentence_to_vector(text):
    """Embed ``text`` with SBERT, falling back to a mean of FastText vectors.

    The SBERT embedding from ``sbert_model.encode(text)`` is returned unless
    encoding raises an ordinary exception or yields an all-zero vector. In
    that case the text is split on whitespace, each token is embedded with
    ``get_word_vector``, and the element-wise mean is returned. If the text
    has no tokens, a zero vector of length ``ft_custom.get_dimension()`` is
    returned.

    Args:
        text: Sentence to embed.

    Returns:
        The SBERT embedding, or on fallback the mean FastText vector / a zero
        vector.

    NOTE(review): the fallback vectors come from the FastText models, whose
    dimensions are not guaranteed to match the SBERT embedding size (the zero
    vector in particular uses ``ft_custom``'s dimension). If the fallback ever
    fires for only some sentences, stacking the results into one array will
    likely fail; confirm the intended dimensionality.
    """
    try:
        vec = sbert_model.encode(text)
    except Exception:
        # Deliberately not a bare ``except``: interrupting the kernel during a
        # long encoding loop must stop the loop, not silently divert to the
        # FastText fallback.
        vec = None

    # SBERT handles semantic embedding; use it whenever it gives a non-zero vector.
    if vec is not None and not np.all(vec == 0):
        return vec

    # Fallback to the FastText mean. ``str.split()`` with no arguments never
    # yields empty or whitespace-only tokens, so no extra filtering is needed.
    word_vecs = [get_word_vector(tok) for tok in text.split()]
    if not word_vecs:
        return np.zeros(ft_custom.get_dimension())
    return np.mean(word_vecs, axis=0)

In [None]:
# --- Compute sentence vectors ---
# Embed each sentence in order (with a progress bar), stack the results into
# one array, then L2-normalize each row.
progress = tqdm(texts, desc="Computing sentence vectors")
vectors = np.array([sentence_to_vector(sentence) for sentence in progress])
vectors_normalized = normalize(vectors, norm="l2")

In [None]:
# --- Save vectors and attach to DataFrame ---
VECTORS_NPY_PATH = "vectors.npy"
EMBEDDINGS_CSV_PATH = "embeddings.csv"

# Binary copy of the normalized matrix.
np.save(VECTORS_NPY_PATH, vectors_normalized)

# Alternative kept for reference: attach the vectors to the original frame.
# df["vector"] = list(vectors_normalized)
# df.to_pickle("df_with_vectors.pkl")

# Standalone frame with one row per sentence and a single "vector" column
# holding that sentence's embedding array.
# NOTE(review): each array cell is written to CSV as text; confirm downstream
# readers can parse it, or prefer the .npy file / the pickle option below.
new_df = pd.DataFrame(data={"vector": [row for row in vectors_normalized]})
# new_df.to_pickle("embeddings.pkl")
new_df.to_csv(EMBEDDINGS_CSV_PATH)