### Part 1
- Use a pretrained word2vec model (example - `word2vec-google-news-300` )
- Pick any 5 words of your choice and find the model's most similar words for each of these 5 words.
- Just like the experiment from the lecture where we checked `king - man + woman ~= queen`  - come up with 2-3 similar examples and test them with the pre-trained word2vec model’s vectors.


In [1]:

!pip install gensim scikit-learn tqdm




In [2]:
import gensim.downloader as api

# Pretrained GloVe Twitter embeddings, fetched through gensim's downloader
model = api.load("glove-twitter-200")

# Task 1: nearest neighbours for five hand-picked query words
words = ['coffee', 'music', 'happy', 'school', 'football']

print("Task 1: Similar Words\n")
for word in words:
    print(f"Top 5 similar words to '{word}':")
    try:
        neighbours = model.most_similar(word, topn=5)
    except KeyError:
        # The query word has no vector in this model
        print(f"  '{word}' not found in vocabulary.")
    else:
        # Each neighbour is a (word, similarity score) pair
        for similar_word, score in neighbours:
            print(f"  {similar_word} (similarity: {score:.4f})")
    print()

Task 1: Similar Words

Top 5 similar words to 'coffee':
  tea (similarity: 0.7670)
  starbucks (similarity: 0.7393)
  coffe (similarity: 0.7360)
  beer (similarity: 0.7237)
  drink (similarity: 0.7041)

Top 5 similar words to 'music':
  songs (similarity: 0.7505)
  song (similarity: 0.7472)
  listen (similarity: 0.7371)
  listening (similarity: 0.7023)
  radio (similarity: 0.6813)

Top 5 similar words to 'happy':
  birthday (similarity: 0.8999)
  day (similarity: 0.8071)
  bday (similarity: 0.7735)
  wish (similarity: 0.7572)
  merry (similarity: 0.7265)

Top 5 similar words to 'school':
  college (similarity: 0.7961)
  class (similarity: 0.7617)
  tomorrow (similarity: 0.7286)
  high (similarity: 0.7116)
  kids (similarity: 0.7063)

Top 5 similar words to 'football':
  soccer (similarity: 0.8489)
  basketball (similarity: 0.7916)
  sports (similarity: 0.7718)
  players (similarity: 0.7588)
  baseball (similarity: 0.7572)



In [3]:
import gensim.downloader as api

# Reuse `model` (the "glove-twitter-200" embeddings) loaded in the previous cell.
# Calling api.load again here would only repeat the slow load of the same vectors.

# Each tuple (a, b, c) is evaluated as the vector arithmetic  a - b + c
analogy_examples = [
    ("doctor", "hospital", "school"),     # → teacher
    ("basketball", "court", "ring"),      # → boxing
    ("iphone", "apple", "samsung"),       # → galaxy
    ("batman", "gotham", "krypton"),      # → superman
]

print(" Better Analogy Test Results:\n")

for a, b, c in analogy_examples:
    try:
        # `positive` words are added and `negative` words subtracted;
        # topn=1 keeps only the single closest word to the resulting vector.
        result = model.most_similar(positive=[a, c], negative=[b], topn=1)
        print(f"{a} - {b} + {c} = {result[0][0]} (similarity: {result[0][1]:.4f})")
    except KeyError as e:
        # Raised when any of a, b, c has no vector in the model
        print(f" One of the words is missing in vocabulary: {e}")


 Better Analogy Test Results:

doctor - hospital + school = teacher (similarity: 0.6118)
basketball - court + ring = rings (similarity: 0.6044)
iphone - apple + samsung = galaxy (similarity: 0.7864)
batman - gotham + krypton = superman (similarity: 0.4571)


### Part 2

Build a movie review sentiment classifier using `WordVectors`

- Dataset: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews/data
- Tasks:
    1. Perform text EDA
    2. Clean the text - remove noisy tokens like punctuations and stopwords
    3. Train an ML model of your choice using:
        1. A pre-trained W2V model’s vector (pick any model from the web)
        2. Custom Skip-gram vectors
        3. Custom CBoW vectors
        4. Custom FastText vectors
    4. Tabulate the model performance stats.

In [6]:
#1.imports
import pandas as pd
import numpy as np
import re
import nltk
import gensim
import matplotlib.pyplot as plt
import seaborn as sns

from wordcloud import WordCloud
from nltk.corpus import stopwords
from gensim.models import Word2Vec, FastText, KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# Register tqdm's pandas integration so Series.progress_apply is available below
tqdm.pandas()
nltk.download('stopwords')
# A set gives constant-time membership checks when filtering tokens
stop_words = set(stopwords.words("english"))

# 2. Load Data
df = pd.read_csv("IMDB Dataset.csv")
df['label'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# Series.map leaves NaN for any value that is not a key of the mapping.
# Fail fast with a clear message instead of letting NaN labels reach the classifier.
unmapped = df['label'].isna()
if unmapped.any():
    raise ValueError(
        f"{int(unmapped.sum())} rows have an unexpected 'sentiment' value, e.g. "
        f"{df.loc[unmapped, 'sentiment'].unique()[:5].tolist()}"
    )


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# 3. Clean Text
def clean_text(text):
    """Convert a raw review string into a list of lowercase word tokens.

    Steps, in order: replace HTML-like tags with a space, replace every
    character that is not an ASCII letter with a space, lowercase, split on
    whitespace, and drop tokens found in the module-level `stop_words`.

    Args:
        text: the raw review as a string.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    without_tags = re.sub("<.*?>", " ", text)
    letters_only = re.sub("[^a-zA-Z]", " ", without_tags)

    kept = []
    for token in letters_only.lower().split():
        if token in stop_words:
            continue
        kept.append(token)
    return kept

# Tokenise every review with clean_text and store the token lists in a new column.
# progress_apply is the tqdm-backed variant of Series.apply, enabled by the earlier tqdm.pandas() call.
df["cleaned_tokens"] = df["review"].progress_apply(clean_text)


100%|██████████| 50000/50000 [00:04<00:00, 11516.56it/s]


In [8]:
# 4. Helper to convert tokens to average vector
def get_avg_vector(tokens, model, dim):
    """Average the embedding vectors of the tokens that `model` contains.

    Args:
        tokens: iterable of word strings.
        model: embedding lookup supporting `word in model` and `model[word]`.
        dim: length of the vectors; sizes the accumulator and the zero fallback.

    Returns:
        A NumPy array of length `dim`: the mean of the vectors that were found,
        or all zeros when none of the tokens is present in `model`.
    """
    known_vectors = [model[token] for token in tokens if token in model]

    total = np.zeros(dim)
    for vector in known_vectors:
        total += vector

    if not known_vectors:
        # Nothing to average: hand back the zero vector rather than dividing by zero
        return total
    return total / len(known_vectors)

def build_features(df, model, dim):
    """Build the feature matrix: one averaged embedding per row of `df`.

    Args:
        df: DataFrame with a "cleaned_tokens" column holding token lists.
        model: embedding lookup passed through to get_avg_vector.
        dim: embedding length passed through to get_avg_vector.

    Returns:
        A 2-D NumPy array with one stacked row vector per row of `df`.
    """
    def embed(tokens):
        return get_avg_vector(tokens, model, dim)

    # progress_apply behaves like Series.apply but also reports progress through tqdm
    per_review = df["cleaned_tokens"].progress_apply(embed)
    return np.vstack(per_review)


In [9]:
# 5. Train + Evaluate Classifier
def train_and_evaluate(X, y):
    """Fit a logistic-regression classifier and report its hold-out accuracy.

    The data is split 80/20 with a fixed random_state of 42, the classifier is
    trained on the 80% part and scored on the remaining 20%.

    Args:
        X: feature matrix, one row per sample.
        y: labels aligned with the rows of `X`.

    Returns:
        Accuracy of the fitted classifier on the held-out 20%.
    """
    split = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train, X_test, y_train, y_test = split

    # max_iter raised to 1000 to give the solver more iterations to converge
    clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
    predictions = clf.predict(X_test)
    return accuracy_score(y_test, predictions)


In [15]:

from gensim.downloader import load

# 6. Pretrained vectors: GloVe embeddings fetched through gensim's downloader
print("Loading GloVe pretrained vectors...")
w2v_pretrained = load("glove-wiki-gigaword-100")  # 100-dimensional GloVe

# Read the dimensionality from the loaded vectors instead of repeating the literal 100
X_w2v = build_features(
    df,
    w2v_pretrained,
    w2v_pretrained.vector_size,
)
acc_w2v = train_and_evaluate(X_w2v, df['label'])
print("Pretrained GloVe Accuracy:", acc_w2v)


Loading GloVe pretrained vectors...


100%|██████████| 50000/50000 [00:12<00:00, 3943.31it/s]


Pretrained GloVe Accuracy: 0.7983


In [16]:
# 7. Custom Skip-Gram: Word2Vec trained on the cleaned reviews themselves
print("Training custom Skip-Gram model...")
w2v_sg = Word2Vec(
    sentences=df["cleaned_tokens"],
    vector_size=100,
    window=5,
    min_count=5,
    sg=1,  # sg=1 selects the skip-gram training algorithm
    workers=4,
)

# Features are built from the trained word vectors (.wv); the dimension comes from the model
X_sg = build_features(df, w2v_sg.wv, w2v_sg.wv.vector_size)
acc_sg = train_and_evaluate(X_sg, df['label'])
print("Custom Skip-Gram accuracy:", acc_sg)

Training custom Skip-Gram model...


100%|██████████| 50000/50000 [00:12<00:00, 4016.66it/s]


Custom Skip-Gram accuracy: 0.8774


In [17]:
# 8. Custom CBOW: same hyper-parameters as the skip-gram run, only `sg` differs
print("Training custom CBOW model...")
w2v_cbow = Word2Vec(
    sentences=df["cleaned_tokens"],
    vector_size=100,
    window=5,
    min_count=5,
    sg=0,  # sg=0 selects the CBOW training algorithm
    workers=4,
)

# Features are built from the trained word vectors (.wv); the dimension comes from the model
X_cbow = build_features(df, w2v_cbow.wv, w2v_cbow.wv.vector_size)
acc_cbow = train_and_evaluate(X_cbow, df['label'])
print("Custom CBOW accuracy:", acc_cbow)

Training custom CBOW model...


100%|██████████| 50000/50000 [00:12<00:00, 3858.19it/s]


Custom CBOW accuracy: 0.8657


In [18]:
# 9. FastText trained on the cleaned reviews (`sg` is not passed, so gensim's default applies)
print("Training FastText model...")
fasttext_model = FastText(
    sentences=df["cleaned_tokens"],
    vector_size=100,
    window=5,
    min_count=5,
    workers=4,
)

# Features are built from the trained word vectors (.wv); the dimension comes from the model
X_ft = build_features(df, fasttext_model.wv, fasttext_model.wv.vector_size)
acc_ft = train_and_evaluate(X_ft, df['label'])
print("FastText accuracy:", acc_ft)

# 10. Summary Table
# The pretrained embeddings used in this notebook are GloVe ("glove-wiki-gigaword-100"),
# not Word2Vec, so the first row is labelled to match the model that produced acc_w2v.
results = pd.DataFrame({
    "Model": ["Pretrained GloVe", "Custom Skip-Gram", "Custom CBOW", "Custom FastText"],
    "Accuracy": [acc_w2v, acc_sg, acc_cbow, acc_ft]
})

print("\n Performance Summary:")
print(results)

Training FastText model...


100%|██████████| 50000/50000 [00:16<00:00, 2982.78it/s]


FastText accuracy: 0.8512

 Performance Summary:
                 Model  Accuracy
0  Pretrained Word2Vec    0.7983
1     Custom Skip-Gram    0.8774
2          Custom CBOW    0.8657
3      Custom FastText    0.8512
