<a href="https://colab.research.google.com/github/originalantoniohernandez-source/stathub-pro/blob/main/Word_Embeddings_LLM_Lab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# ======================================================
# 1️⃣ Install dependencies
# ======================================================
# Notebook shell escape (not plain Python): quietly installs the third-party
# packages that the rest of this cell imports.
!pip install -q gensim scikit-learn plotly ipywidgets

# Enable widgets support (important for Colab UI)
# NOTE(review): this import only resolves inside Google Colab; presumably it
# is what lets the ipywidgets controls created later in the cell render
# there -- confirm against the Colab docs.
from google.colab import output
output.enable_custom_widget_manager()

# ======================================================
# 2️⃣ Load lightweight embedding model
# ======================================================
from gensim import downloader
import numpy as np

# `model` is the shared embedding lookup for the rest of this script:
# later sections read `model.key_to_index`, index it by word, and call
# `most_similar` / `similarity` on it.
# The status messages use real Unicode symbols (previously mis-encoded).
print("⏳ Loading GloVe 100D model (≈130MB, may take 1–2 min)...")
model = downloader.load("glove-wiki-gigaword-100")
print(f"✅ Loaded {len(model.index_to_key):,} words with {model.vector_size}-D vectors.\n")

# ======================================================
# 3️⃣ Define semantic groups
# ======================================================
# Display name of each group -> the words plotted under it.
categories = {
    "Royalty / Gender": ["king", "queen", "man", "woman", "prince", "princess", "boy", "girl"],
    "Countries / Capitals": ["france", "paris", "italy", "rome", "germany", "berlin", "spain", "madrid"],
    "Animals / Habitats": ["dog", "wolf", "cat", "lion", "tiger", "forest", "house", "zoo"],
    "Technology": ["computer", "internet", "ai", "robot", "software", "hardware", "data", "science"],
}

# Keep only the (word, group) pairs whose word exists in the model's
# vocabulary, preserving category order and then word order.
known_pairs = [
    (word, group)
    for group, members in categories.items()
    for word in members
    if word in model.key_to_index
]

# Three parallel sequences: position i describes the same word in each.
words = [word for word, _ in known_pairs]
labels = [group for _, group in known_pairs]
vectors = np.array([model[word] for word in words])

# ======================================================
# 4️⃣ Reduce to 3D using PCA
# ======================================================
from sklearn.decomposition import PCA

# The projection is fitted only on the selected words' vectors (not on the
# whole vocabulary), so the three components describe variation among these
# words specifically.
# NOTE(review): random_state is pinned to 42; presumably this only matters
# when scikit-learn picks a randomized solver -- confirm.
pca = PCA(n_components=3, random_state=42)
# Rows of `reduced` line up with `words` / `labels`; columns 0-2 are read
# below as the x, y and z plot coordinates.
reduced = pca.fit_transform(vectors)

# ======================================================
# 5️⃣ Interactive 3D Visualization with Dropdown
# ======================================================
import plotly.graph_objects as go

# Marker colour for each semantic group.
color_map = {
    "Royalty / Gender": "purple",
    "Countries / Capitals": "green",
    "Animals / Habitats": "orange",
    "Technology": "blue"
}

# One Scatter3d trace per category so each group can be toggled on its own.
# Only the first trace starts visible, matching `active=0` in the dropdown
# below (the first button selects the first category).
traces = []
for trace_idx, cat in enumerate(categories):
    indices = [i for i, lbl in enumerate(labels) if lbl == cat]
    trace = go.Scatter3d(
        x=reduced[indices, 0],
        y=reduced[indices, 1],
        z=reduced[indices, 2],
        mode="markers+text",
        text=[words[i] for i in indices],
        textposition="top center",
        marker=dict(size=6, color=color_map[cat], opacity=0.8),
        name=cat,
        visible=(trace_idx == 0)
    )
    traces.append(trace)

# Dropdown buttons: each one applies a visibility mask with one entry per
# trace, so the masks are sized from `traces` (the thing they index).
buttons = []
for i, cat in enumerate(categories):
    visible = [False] * len(traces)
    visible[i] = True
    buttons.append(dict(label=cat, method="update", args=[{"visible": visible}]))

# Final entry turns every trace on at once.
buttons.append(dict(label="Show All", method="update", args=[{"visible": [True] * len(traces)}]))

fig = go.Figure(data=traces)
fig.update_layout(
    title="🌐 3D Word Embeddings Visualization (with Category Dropdown)",
    scene=dict(xaxis_title="PC1", yaxis_title="PC2", zaxis_title="PC3"),
    # x=1.15 places the menu to the right of the plotting area.
    updatemenus=[dict(active=0, buttons=buttons, x=1.15, y=0.9)],
    height=700
)
fig.show()

# ======================================================
# 6️⃣ Analogy Demonstrations
# ======================================================
print("\n🧩 Analogy Tests:")

# Each tuple is (pos1, neg, pos2) and is evaluated as pos1 - neg + pos2,
# e.g. king - man + woman.
tests = [
    ("king", "man", "woman"),
    ("paris", "france", "italy"),
    ("dog", "puppy", "kitten"),
    ("computer", "software", "hardware"),
]

for pos1, neg, pos2 in tests:
    try:
        # Only the best match is reported, so ask for a single neighbour.
        result = model.most_similar(positive=[pos1, pos2], negative=[neg], topn=1)
    except KeyError:
        # Raised when one of the three words is not in the vocabulary.
        print(f"⚠️ Missing word in '{pos1}, {neg}, {pos2}'")
    else:
        best_word, best_score = result[0]
        print(f"{pos1} - {neg} + {pos2} ≈ {best_word} ({best_score:.3f})")

# ======================================================
# 7️⃣ Interactive Cosine Similarity Explorer
# ======================================================
from ipywidgets import interact, Text

def similarity_explorer(word1, word2):
    """Print the cosine similarity of two words plus a rough verdict.

    Both inputs are lower-cased and stripped of surrounding whitespace
    before being looked up in the module-level ``model``.

    Args:
        word1: First word, as typed by the user.
        word2: Second word, as typed by the user.

    Returns:
        None. Everything is reported via ``print``: the similarity to four
        decimals followed by a verdict (> 0.7 very similar, > 0.4 somewhat
        related, otherwise quite different), or a warning when either word
        is missing from the vocabulary.
    """
    word1, word2 = word1.lower().strip(), word2.lower().strip()

    vocab = model.key_to_index
    if word1 not in vocab or word2 not in vocab:
        print("⚠️ One or both words not found in vocabulary.")
        return

    sim = model.similarity(word1, word2)
    print(f"\n🔍 Cosine similarity between '{word1}' and '{word2}': {sim:.4f}")
    if sim > 0.7:
        print("🟢 Very similar meanings.")
    elif sim > 0.4:
        print("🟡 Somewhat related.")
    else:
        print("🔴 Meanings are quite different.")

# Build two text boxes (pre-filled with "king" / "queen") that re-run
# similarity_explorer with their current contents. The trailing semicolon
# keeps the notebook from echoing interact's return value.
print("\n💬 Type any two words to explore their semantic similarity:")
interact(similarity_explorer, word1=Text(value="king"), word2=Text(value="queen"));


⏳ Loading GloVe 100D model (≈130MB, may take 1–2 min)...
✅ Loaded 400,000 words with 100-D vectors.




🧩 Analogy Tests:
king - man + woman ≈ queen (0.770)
paris - france + italy ≈ rome (0.819)
dog - puppy + kitten ≈ cat (0.694)
computer - software + hardware ≈ computers (0.787)

💬 Type any two words to explore their semantic similarity:


interactive(children=(Text(value='king', description='word1'), Text(value='queen', description='word2'), Outpu…