## Word2Vec embedding

In [4]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# Example text
sentences = [
    "I love natural language processing",
    "Word2Vec creates word embeddings",
    "Deep learning is amazing for NLP",
    "black and brown are colors"
]

# Tokenize (each sentence becomes a list of words)
tokenized = [word_tokenize(sent.lower()) for sent in sentences]

# Train Word2Vec model
model = Word2Vec(
    sentences=tokenized,
    vector_size=100,   # embedding dimension
    window=5,          # context window size
    min_count=1,       # ignore words with low frequency
    workers=4,         # parallel training
    sg=1               # 1 for skip-gram, 0 for CBOW
)

# Save model
model.save("word2vec.model")

# Load later
model = Word2Vec.load("word2vec.model")

In [5]:
# Get vector for a word
vector = model.wv["nlp"]
print(vector[:10])  # first 10 dims

# Find most similar words
print(model.wv.most_similar("nlp"))

# Check similarity between words
print(model.wv.similarity("nlp", "language"))
print(model.wv.similarity("brown", "black"))

[-0.00719094  0.00423289  0.00216339  0.00744071 -0.00488927 -0.00456435
 -0.00609817  0.00329937 -0.00449946  0.00852288]
[('are', 0.12300866842269897), ('creates', 0.08058694750070572), ('black', 0.06548557430505753), ('colors', 0.054333679378032684), ('language', 0.01606523059308529), ('learning', 0.013243247754871845), ('embeddings', 0.010695216245949268), ('i', 0.0007033531437627971), ('is', -0.0037013525143265724), ('word', -0.014541246928274632)]
0.016065232
0.05959967


In [14]:
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity

model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

# Example
print(model.most_similar("king"))
print(model.most_similar("university"))
print(model.similarity("university", "faculty"))
print(cosine_similarity(model["university"].reshape(1, -1), model["faculty"].reshape(1, -1)))

[('kings', 0.7138044834136963), ('queen', 0.6510957479476929), ('monarch', 0.6413194537162781), ('crown_prince', 0.6204219460487366), ('prince', 0.6159994602203369), ('sultan', 0.5864822864532471), ('ruler', 0.5797566175460815), ('princes', 0.5646552443504333), ('Prince_Paras', 0.5432944297790527), ('throne', 0.5422106385231018)]
[('universities', 0.7003917694091797), ('faculty', 0.6780906319618225), ('unversity', 0.6758289337158203), ('undergraduate', 0.6587095260620117), ('univeristy', 0.6585440039634705), ('campus', 0.6434986591339111), ('college', 0.6385268568992615), ('academic', 0.6317197680473328), ('professors', 0.6298648715019226), ('undergraduates', 0.6149813532829285)]
0.6780907
[[0.6780907]]


## GloVe embedding

In [9]:
!!wget https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip


["'wget' is not recognized as an internal or external command,",
 'operable program or batch file.']

In [24]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

texts = ['text', 'the', 'leader', 'prime', 'natural', 'language']

tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

print("Number of unique words in dictionary =", len(tokenizer.word_index))
print("Dictionary is =", tokenizer.word_index)

def embedding_for_vocab(filepath, word_index, embedding_dim):
    vocab_size = len(word_index) + 1  # +1 for padding token (index 0)
    embedding_matrix_vocab = np.zeros((vocab_size, embedding_dim))

    with open(filepath, encoding="utf8") as f:
        for line in f:
            word, *vector = line.split()
            if word in word_index:
                idx = word_index[word]
                embedding_matrix_vocab[idx] = np.array(vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix_vocab

embedding_dim = 50 # match this with glove file
glove_path = ".\glove-6B-300d.txt"
embedding_matrix_vocab = embedding_for_vocab(glove_path, tokenizer.word_index, embedding_dim)

first_word_index = 1  # Tokenizer indexes start from 1
print("Dense vector for word with index 1 =>", embedding_matrix_vocab[first_word_index])

Number of unique words in dictionary = 6
Dictionary is = {'text': 1, 'the': 2, 'leader': 3, 'prime': 4, 'natural': 5, 'language': 6}
Dense vector for word with index 1 => [-0.81660002 -0.27568999 -0.32951    -0.19751    -0.13119     0.58964998
 -0.47442999 -0.20592999  0.32954001 -1.58150005  0.22524001  0.19357
  0.26813     0.18815     0.20840999 -0.2043     -0.25769001 -0.0095931
  0.020479   -0.4172     -0.28132999 -0.3998     -0.16343001  0.59351999
 -0.053312   -0.073362    0.39872    -0.053993    0.62893999 -0.040527
  0.20638999  0.31568    -0.62093002  0.74989003 -0.79408002 -0.30658001
  0.25953001 -0.74764001 -0.25007999  0.28804001 -0.19340999 -0.046537
 -0.47253999  0.080808    0.045168    0.3531     -0.15433     0.26328999
 -0.66061997 -0.63936001]


In [25]:
import numpy as np

# Load GloVe file
def load_glove_embeddings(path):
    embeddings_index = {}
    with open(path, encoding="utf8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype="float32")
            embeddings_index[word] = vector
    return embeddings_index

embeddings_index = load_glove_embeddings(glove_path)

print("Words loaded:", len(embeddings_index))
print("Vector for 'king':", embeddings_index["king"][:10])


Words loaded: 400000
Vector for 'king': [ 0.0033901 -0.34614    0.28144    0.48382    0.59469    0.012965
  0.53982    0.48233    0.21463   -1.0249   ]


In [26]:
from sklearn.metrics.pairwise import cosine_similarity

def similarity(word1, word2):
    v1 = embeddings_index[word1].reshape(1, -1)
    v2 = embeddings_index[word2].reshape(1, -1)
    return cosine_similarity(v1, v2)[0][0]

print("Similarity(king, queen):", similarity("king", "queen"))
print("Similarity(dog, cat):", similarity("dog", "cat"))


Similarity(king, queen): 0.63364697
Similarity(dog, cat): 0.6816747
