# Experiment 8 — Word2Vec and GloVe on Local Gujarati Corpus
**Objective:** Implement Word2Vec and GloVe, analyze semantic capture for Gujarati, and compare both methods.

**Generated:** 2025-10-30T08:14:28.407709

---

**Notes:** This notebook trains embeddings on your local corpus directory. Edit `corpus_dir` if needed.


In [4]:
!pip install glove-python==0.1.0


Collecting glove-python==0.1.0
  Using cached glove_python-0.1.0.tar.gz (263 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: glove-python
  Building wheel for glove-python (pyproject.toml): started
  Building wheel for glove-python (pyproject.toml): finished with status 'error'
Failed to build glove-python


  error: subprocess-exited-with-error
  
  × Building wheel for glove-python (pyproject.toml) did not run successfully.
  │ exit code: 1
  ╰─> [183 lines of output]
      !!
      
              ********************************************************************************
              Please remove any references to `setuptools.command.test` in all supported versions of the affected package.
      
              This deprecation is overdue, please update your project and remove deprecated
              calls to avoid build errors in the future.
              ********************************************************************************
      
      !!
      !!
      
              ********************************************************************************
              Usage of dash-separated 'description-file' will not be supported in future
              versions. Please use the underscore name 'description_file' instead.
              (Affected: glove_python).
      
    

In [None]:
# Install required packages (uncomment if needed)
# If your environment already has gensim and glove, these installs will be skipped or are quick.
!pip install -q gensim scikit-learn matplotlib seaborn nltk

import os
import io
import nltk
nltk.download('punkt', quiet=True)

from gensim.models import Word2Vec
from glove import Corpus, Glove
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
print('Imports ready')


ERROR: Could not find a version that satisfies the requirement glove-python-binary (from versions: none)

[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: No matching distribution found for glove-python-binary


ModuleNotFoundError: No module named 'gensim'

In [None]:
# ------------- USER: set this to your local folder -------------
corpus_dir = r"X:/DJ Sanghvi/sem 7/nlp/NLP_LAB_GYANGUJ/data/next"
# ----------------------------------------------------------------

import re

# Load all .txt files and create a list of tokenized sentences.
# os.listdir() returns names in arbitrary order, so sort them: the concatenated
# text (and therefore the training data) is then identical on every run.
texts = []
for fn in sorted(os.listdir(corpus_dir)):
    if fn.endswith('.txt'):
        path = os.path.join(corpus_dir, fn)
        # 'utf-8-sig' reads plain UTF-8 unchanged but drops a leading BOM, which
        # would otherwise stay attached to the first token of the file.
        with open(path, 'r', encoding='utf-8-sig') as f:
            texts.append(f.read())

raw = '\n'.join(texts)
print('Loaded characters:', len(raw))

# Simple sentence splitting: newline, Latin terminators (. ! ?) and the
# danda / double danda (U+0964, U+0965) used as sentence terminators in Indic text.
sentences = []
for part in re.split(r'[\n.!?\u0964\u0965]+', raw):
    # simple tokenization by whitespace (Gujarati-aware tokenizers can be added later);
    # str.split() already discards empty strings and surrounding whitespace
    tokens = part.split()
    if tokens:
        sentences.append(tokens)

# Fail here with a clear message instead of letting model training fail later.
if not sentences:
    raise ValueError(
        'No sentences found: check that corpus_dir contains non-empty .txt files: '
        + corpus_dir
    )

print('Prepared', len(sentences), 'sentences')
print('Example sentence tokens:', sentences[:3])


In [None]:
# Train Word2Vec (gensim) on the tokenized sentences.
# Hyper-parameters are kept in one dict so they can be printed and tweaked together.
w2v_params = {
    'vector_size': 100,
    'window': 5,
    'min_count': 2,
    'workers': 2,
    'epochs': 10,
}
print('Training Word2Vec with params:', w2v_params)
model_w2v = Word2Vec(sentences=sentences, **w2v_params)
print('Word2Vec vocab size:', len(model_w2v.wv))


In [None]:
# Train GloVe with the `glove` package: Corpus builds the co-occurrence matrix,
# Glove fits the vectors on it.
print('Building co-occurrence matrix for GloVe...')
corpus_model = Corpus()
corpus_model.fit(sentences, window=5)
print('Corpus dictionary size:', len(corpus_model.dictionary))

# Seed the vector initialisation so repeated runs of the notebook start from the
# same state; without random_state every run gives different neighbours.
# NOTE(review): with no_threads > 1 the parallel updates may still make results
# vary slightly between runs — confirm, and use no_threads=1 if exact
# reproducibility is required.
glove = Glove(no_components=100, learning_rate=0.05, random_state=42)
glove.fit(corpus_model.matrix, epochs=20, no_threads=4, verbose=True)
# Attach the word -> index mapping so vectors can be looked up by word.
glove.add_dictionary(corpus_model.dictionary)
print('GloVe trained. Vector size:', glove.no_components)


In [None]:
# Helper: get vector for a word from either model
import math

def get_w2v_vector(word):
    """Return the Word2Vec vector stored for ``word``, or ``None`` if it is absent.

    Reads the module-level ``model_w2v`` and looks the word up in its ``wv`` table.
    """
    keyed_vectors = model_w2v.wv
    return keyed_vectors[word] if word in keyed_vectors else None

def get_glove_vector(word):
    """Return the GloVe vector stored for ``word``, or ``None`` if it is absent.

    The word is mapped to a row index through ``glove.dictionary``; that row of
    ``glove.word_vectors`` is returned.
    """
    row = glove.dictionary.get(word)
    # Compare against None explicitly: row index 0 is a valid hit.
    if row is None:
        return None
    return glove.word_vectors[row]

# Nearest neighbors utility
from heapq import nsmallest

def topn_w2v(word, n=10):
    """Return the ``n`` most similar Word2Vec entries for ``word``.

    Delegates to ``model_w2v.wv.most_similar``; an empty list is returned when
    the word is not in the Word2Vec vocabulary.
    """
    keyed_vectors = model_w2v.wv
    return keyed_vectors.most_similar(word, topn=n) if word in keyed_vectors else []

def topn_glove(word, n=10):
    """Return up to ``n`` nearest GloVe neighbours of ``word`` by cosine similarity.

    The query word itself is excluded from the result. Otherwise it would always
    occupy the first slot with similarity 1.0, leaving only ``n - 1`` real
    neighbours and making the list incomparable with ``topn_w2v`` (gensim's
    ``most_similar`` does not return the query word).

    Args:
        word: Query word.
        n: Maximum number of neighbours to return.

    Returns:
        A list of ``(neighbour, similarity)`` tuples sorted by decreasing
        similarity, or ``[]`` if ``word`` has no GloVe vector.
    """
    vec = get_glove_vector(word)
    if vec is None:
        return []
    # Cosine similarity of the query against every row of the embedding matrix.
    sims = cosine_similarity([vec], glove.word_vectors)[0]
    own_idx = glove.dictionary[word]
    order = np.argsort(-sims)
    # Drop the query's own row before truncating so that n real neighbours remain.
    idxs = order[order != own_idx][:n]
    inv_dict = {idx: w for w, idx in glove.dictionary.items()}
    return [(inv_dict[i], float(sims[i])) for i in idxs]

# Compare top-n neighbors overlap
def neighbor_overlap(word, n=10):
    """Compare the top-``n`` neighbours of ``word`` under Word2Vec and GloVe.

    Args:
        word: Query word.
        n: Number of neighbours requested from each model.

    Returns:
        ``None`` when neither model returns any neighbour; otherwise a dict with
        keys ``'word'``, ``'w2v_top'``, ``'glove_top'``, ``'overlap_count'`` and
        ``'overlap'``. ``'overlap'`` lists the shared neighbours in Word2Vec
        rank order, so the output is identical from run to run (converting a
        set to a list gives an order that depends on string hashing).
    """
    w2 = [w for w, _ in topn_w2v(word, n)]
    gl = [w for w, _ in topn_glove(word, n)]
    if not w2 and not gl:
        return None
    gl_set = set(gl)
    # dict.fromkeys de-duplicates while preserving the Word2Vec ranking.
    overlap = [w for w in dict.fromkeys(w2) if w in gl_set]
    return {
        'word': word,
        'w2v_top': w2,
        'glove_top': gl,
        'overlap_count': len(overlap),
        'overlap': overlap
    }

print('Helpers ready')


In [None]:
# Demo words - edit as needed
demo_words = ['શાળા', 'શિક્ષણ', 'વિદ્યાર્થીઓ', 'શિક્ષક', 'મિત્ર']

# Each report is a (label, lookup function) pair; every lookup is called with n=8.
reports = (
    ('Word2Vec top:', topn_w2v),
    ('GloVe top:', topn_glove),
    ('Overlap summary:', neighbor_overlap),
)

for w in demo_words:
    print('\n===', w, '===')
    for label, lookup in reports:
        print(label, lookup(w, n=8))


In [None]:
# Visualize a small set of words with t-SNE
from collections import Counter

# Start with the demo words that have a vector in at least one of the two models.
viz_words = [
    w for w in demo_words
    if get_w2v_vector(w) is not None or get_glove_vector(w) is not None
]
# add most frequent words
freq = Counter(w for s in sentences for w in s)
for w, _ in freq.most_common(20):
    if w not in viz_words:
        viz_words.append(w)
viz_words = viz_words[:40]

# Prefer the Word2Vec vector and fall back to GloVe when Word2Vec has none.
# NOTE(review): the two models are trained independently, so fallback points are
# presumably not comparable in position with the Word2Vec points — confirm whether
# a single-model plot is wanted instead.
vecs = []
labels = []
for w in viz_words:
    v = get_w2v_vector(w)
    if v is None:
        v = get_glove_vector(w)
    if v is not None:
        vecs.append(v)
        labels.append(w)

if len(vecs) >= 2:
    # t-SNE requires perplexity < number of samples. At most 25 words are plotted
    # here, so the default perplexity of 30 is rejected; clamp it to the data size.
    perplexity = min(30, len(vecs) - 1)
    tsne = TSNE(n_components=2, random_state=42, init='pca', perplexity=perplexity)
    reduced = tsne.fit_transform(np.array(vecs))
    plt.figure(figsize=(10,8))
    plt.scatter(reduced[:,0], reduced[:,1])
    # NOTE(review): matplotlib's default font may lack Gujarati glyphs — confirm
    # that the labels render, and configure a Gujarati-capable font if they do not.
    for i, lab in enumerate(labels):
        plt.annotate(lab, (reduced[i,0], reduced[i,1]))
    plt.title('t-SNE plot of word vectors (mixed W2V/GloVe)')
    plt.show()
else:
    print('Not enough vectors to visualize')


In [None]:
# Simple intrinsic comparison: average overlap across demo words.
# Words for which neighbor_overlap returns None are left out of the average.
results = [
    summary['overlap_count']
    for summary in (neighbor_overlap(word, n=10) for word in demo_words)
    if summary is not None
]

if not results:
    print('No comparison results (likely words not in vocab)')
else:
    print('Average top-10 neighbor overlap (Word2Vec vs GloVe):', sum(results)/len(results))


In [None]:
# Save trained models for reuse.
# Write into a folder next to the notebook and create it first: the previous
# hard-coded '/mnt/data' is never created by this notebook, so saving fails
# wherever that directory does not already exist.
output_dir = 'models'
os.makedirs(output_dir, exist_ok=True)
w2v_path = os.path.join(output_dir, 'w2v_model.model')
glove_path = os.path.join(output_dir, 'glove_model.model')
model_w2v.save(w2v_path)
glove.save(glove_path)
print('Saved Word2Vec and GloVe models to', os.path.abspath(output_dir))


## Notes & next steps

- Word2Vec (gensim) and GloVe (the `glove` module, installed from `glove-python` or `glove-python-binary`) are trained on your local Gujarati corpus.
- Quality heavily depends on corpus size and variety — Gujarati corpora are often small, so consider adding more text.
- For downstream tasks (clustering, similarity, analogies), use larger vector dimensions and more training epochs.
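- If the `glove-python` build fails, try the prebuilt wheels with `pip install -q glove-python-binary`; they exist only for some Python versions and platforms, so pip may report "No matching distribution found". In that case, use a pure-Python GloVe reimplementation.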

