# Download data

This dataset can be downloaded directly through gensim's downloader API.

Other datasets, as well as pretrained models, are also available for you to try:

https://github.com/piskvorky/gensim-data?tab=readme-ov-file


In [None]:
from gensim.models.word2vec import Word2Vec
import gensim
import gensim.downloader as api

# Fetch the 'text8' corpus through gensim's downloader API.
corpus = api.load('text8')  # download the corpus and return it opened as an iterable
# text8 is the first 100 million bytes (10^8) of cleaned plain text from a March 2006 Wikipedia dump


# Run Model

In [None]:
# DON'T RUN ME IN LECTURE

# Train word2vec on the corpus loaded above.
# The simplest call, relying on the library defaults, would be: Word2Vec(corpus)
gsmodel = gensim.models.Word2Vec(
    sentences=corpus,
    vector_size=100,  # dimensionality of each word embedding
    window=5,         # context window size
    min_count=1,      # keep every word, even those seen only once
    sg=1,             # 1 = skip-gram, 0 = CBOW
)

In [None]:
# print the learned embedding vector for the word 'car'
# (its length equals the vector_size the model was trained with)
print(gsmodel.wv['car'])

# **STOP HERE AND GO BACK TO SLIDES**

# Reduce dimensions and plot embeddings

### MDS Plot

In [None]:
import numpy as np
from scipy.spatial.distance import pdist, squareform
from sklearn.manifold import MDS
import matplotlib.pyplot as plt

# Select the vocabulary entries at positions 100-199; the very first entries
# are skipped to avoid stopwords.
terms = gsmodel.wv.index_to_key[100:200]

# One row per selected term: 100 words x 100 embedding dimensions.
termVectors = np.array([gsmodel.wv[term] for term in terms])

# Pairwise Euclidean distances: pdist returns the condensed vector form,
# squareform expands it into the full square distance matrix.
distance_matrix = pdist(termVectors, metric='euclidean')
distance_matrix_square = squareform(distance_matrix)

# Project to two dimensions with MDS, feeding it the precomputed distances.
# random_state is fixed so the layout is the same on every run.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=42)
embedding = mds.fit_transform(distance_matrix_square)

# Scatter the 2D coordinates and label every point with its term.
plt.scatter(*embedding.T)
for label, (x, y) in zip(terms, embedding):
    plt.text(x, y, label, fontsize=12, ha='right')
plt.title("MDS 2D Representation")
plt.show()

### PCA Plot

In [None]:
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Select the vocabulary entries at positions 200-299; the very first entries
# are skipped to avoid stopwords.
terms = gsmodel.wv.index_to_key[200:300]

# One row per selected term: 100 words x 100 embedding dimensions.
termVectors = np.array([gsmodel.wv[term] for term in terms])

# Fitting determines the new axes (principal components) for this data;
# fit() hands back the fitted PCA object itself.
pca = PCA().fit(termVectors)

# Re-express every term vector in the principal-component coordinates.
result = pca.transform(termVectors)


In [None]:
# Plot the terms on the first two principal components.
plt.figure()
plt.scatter(result[:, 0], result[:, 1])

# Label each point with the word it represents.
for word, point in zip(terms, result):
    plt.annotate(word, xy=(point[0], point[1]))

plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()

# Explore Results

### Most/Least similar terms based on Word Embedding

In [None]:
# the 10 terms whose embeddings are most similar to 'car'
gsmodel.wv.most_similar(positive=['car'], topn=10)

In [None]:
# 'italy' is passed as a negative example, so this ranks terms against the
# negated 'italy' vector, i.e. the "least similar" terms
gsmodel.wv.most_similar(negative=['italy'], topn=10)

### Identify similarity based on Cosine similarity

In [None]:
# Words to compare -- edit these to try other combinations.
baseW = 'car'    # the reference word
simW = 'truck'   # a word expected to be similar to the reference
diffW = 'bike'   # a word expected to be different from the reference

# Report the cosine similarity of the reference word against each candidate.
for otherW in (simW, diffW):
    print("The cosine similarity between ", baseW, " and ", otherW, " is ", gsmodel.wv.similarity(baseW, otherW))


### Vector arithmetic for analogies

In [None]:
# analogy query: king - man + woman
# (subtract man from king, then add woman) and print the 3 closest terms
print(gsmodel.wv.most_similar(positive=['king','woman'], negative=['man'], topn=3))

In [None]:
# subtract he from she (she - he): terms closest to the resulting direction
gsmodel.wv.most_similar(positive=['she'], negative=['he'])

In [None]:
# subtract she from he (he - she): terms closest to the resulting direction
gsmodel.wv.most_similar(positive=['he'], negative=['she'])

In [None]:
# combine feminine and masculine pronouns as positive examples and list the
# 50 terms most similar to the combination
gsmodel.wv.most_similar(positive=['she','her','hers','herself','he','him','his','himself'], topn=50)