In [8]:
import pickle
import pandas as pd
import plotly.express as px
from gensim.models import Word2Vec
from sklearn.decomposition import PCA

In [9]:
# 1. Load cleaned tokens
# The tokens are read from a pickle under ../data/processed. Unpickling can run
# arbitrary code, so only point this path at a file you trust.
tokens_path = '../data/processed/clean_tokens.pkl'
with open(tokens_path, mode='rb') as token_file:
    filtered_tokens = pickle.load(token_file)

In [10]:
# 2. Train Word2Vec (creates 100-dimensional vectors)
# gensim's optimized training routines only consume the first 10,000 tokens of
# any single sentence, so handing the whole corpus over as one giant sentence
# would silently ignore everything past that point. The token stream is cut
# into chunks no longer than that limit instead. A stream shorter than the
# limit still yields exactly one sentence, as before.
MAX_SENTENCE_TOKENS = 10_000
W2V_SEED = 42

token_stream = list(filtered_tokens)
sentences = [
    token_stream[start:start + MAX_SENTENCE_TOKENS]
    for start in range(0, len(token_stream), MAX_SENTENCE_TOKENS)
]

# A fixed seed together with a single worker thread keeps the embedding (and
# the PCA map derived from it) stable across Restart & Run All; multi-threaded
# training introduces ordering jitter between runs.
# NOTE(review): gensim's docs also mention PYTHONHASHSEED for fully
# deterministic runs - confirm whether that matters for the installed version.
model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=1,  # keep every token, even ones that occur only once
    workers=1,
    seed=W2V_SEED,
)

# Vocabulary in the model's own order, with the matching vector for each word.
words = list(model.wv.index_to_key)
vectors = [model.wv[word] for word in words]

In [11]:
# 3. PCA (Reduce 100D -> 2D)
# random_state pins the projection for the cases where scikit-learn's automatic
# solver choice falls on a randomized SVD; without it the 2D coordinates could
# differ between runs of the notebook.
PCA_SEED = 42
pca = PCA(n_components=2, random_state=PCA_SEED)
components = pca.fit_transform(vectors)

In [12]:
# 4. Create DataFrame for Plotting
# Labelling rule: a token is 'English/Loanword' when its lowercased form passes
# str.islower(), which requires at least one cased character (e.g. a Latin
# letter). Every other token - including ones made only of digits or symbols -
# is labelled 'Mandarin'.
language_labels = [
    'English/Loanword' if token.lower().islower() else 'Mandarin'
    for token in words
]
df_pca = pd.DataFrame(components, columns=['PC1', 'PC2']).assign(
    word=words,
    Language=language_labels,
)

In [13]:
# 5. Visualize
# Scatter of the two principal components, one point per word, coloured by the
# language label and annotated with the word itself.
scatter_options = dict(
    x='PC1',
    y='PC2',
    text='word',
    color='Language',
    title="Zack's Semantic Map (PCA)",
    template='plotly_white',
)
fig = px.scatter(df_pca, **scatter_options)
fig.update_traces(textposition='top center')  # draw each label above its marker
fig.show()

In [14]:
# 6. Export
# Save the interactive figure as an HTML file next to the processed data.
html_output_path = "../data/processed/zack_pca_map.html"
fig.write_html(html_output_path)