In [15]:
import pandas as pd
import csv
from zipfile import ZipFile

N_DIMS = 100
z = ZipFile("embeddings/glove6b/glove.6B.zip")
f = z.open(f'glove.6B.{N_DIMS}d.txt')

word_matrix = pd.read_table(
    f, sep=" ", index_col=0, 
    header=None, quoting=csv.QUOTE_NONE
)
word_matrix.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,91,92,93,94,95,96,97,98,99,100
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
the,-0.038194,-0.24487,0.72812,-0.39961,0.083172,0.043953,-0.39141,0.3344,-0.57545,0.087459,...,0.016215,-0.017099,-0.38984,0.87424,-0.72569,-0.51058,-0.52028,-0.1459,0.8278,0.27062
",",-0.10767,0.11053,0.59812,-0.54361,0.67396,0.10663,0.038867,0.35481,0.06351,-0.094189,...,0.34951,-0.7226,0.37549,0.4441,-0.99059,0.61214,-0.35111,-0.83155,0.45293,0.082577
.,-0.33979,0.20941,0.46348,-0.64792,-0.38377,0.038034,0.17127,0.15978,0.46619,-0.019169,...,-0.063351,-0.67412,-0.068895,0.53604,-0.87773,0.31802,-0.39242,-0.23394,0.47298,-0.028803
of,-0.1529,-0.24279,0.89837,0.16996,0.53516,0.48784,-0.58826,-0.17982,-1.3581,0.42541,...,0.18712,-0.018488,-0.26757,0.727,-0.59363,-0.34839,-0.56094,-0.591,1.0039,0.20664
to,-0.1897,0.050024,0.19084,-0.049184,-0.089737,0.21006,-0.54952,0.098377,-0.20135,0.34241,...,-0.13134,0.058617,-0.31869,-0.61419,-0.62393,-0.41548,-0.038175,-0.39804,0.47647,-0.15983


In [21]:
from sklearn.metrics.pairwise import cosine_similarity
word_list = ["dog","cat","carbon"]
words = word_matrix.loc[word_list]
sims = pd.DataFrame(cosine_similarity(words))
sims.index, sims.columns = word_list, word_list
sims

Unnamed: 0,dog,cat,carbon
dog,1.0,0.879808,0.090229
cat,0.879808,1.0,0.050274
carbon,0.090229,0.050274,1.0


In [22]:
# Distance between 2 words
from scipy.spatial.distance import cosine
vec_a = word_matrix.loc["paris"]
vec_b = word_matrix.loc["france"]
1 - cosine(vec_a, vec_b)

In [28]:
# Similar words to x
vec_a = word_matrix.loc["cat"]
sims = 1 - word_matrix.apply(cosine, axis=1, args=(vec_a,))
sims.sort_values(ascending=False).head(6)

In [34]:
# function for similar words to x
def similar_words(word, word_matrix):
    vec_a = word_matrix.loc[word]
    sims = 1 - word_matrix.apply(cosine, axis=1, args=(vec_a,))
    return sims.sort_values(ascending=False)

similar_words("carbon",word_matrix).head(6)

0
cat       1.000000
dog       0.879808
rabbit    0.742443
cats      0.732300
monkey    0.728871
dtype: float64

In [35]:
diff = word_matrix.loc["paris"] - word_matrix.loc["france"] 
vec_d = word_matrix.loc["berlin"] - diff
sims = 1 - word_matrix.apply(cosine, axis=1, args=(vec_d,))
sims.sort_values(ascending=False).head(6)

0
germany    0.892766
austria    0.762186
denmark    0.748199
poland     0.745510
berlin     0.722017
france     0.721111
dtype: float64

In [None]:
## Document embeddings

In [36]:
from sklearn.feature_extraction.text import CountVectorizer
docs = [
  "The acclaimed author penned novels based on her life",
  "Nobel prize-winning writer writes autobiographical fiction"
]
vec = CountVectorizer()
dfmat = vec.fit_transform(docs).todense()
dfmat

matrix([[1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0],
        [0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1]])

In [57]:
common_features

{'acclaimed',
 'author',
 'autobiographical',
 'based',
 'fiction',
 'her',
 'life',
 'nobel',
 'novels',
 'on',
 'penned',
 'prize',
 'the',
 'winning',
 'writer',
 'writes'}

In [56]:
import numpy as np
common_features = set(word_matrix.index) & set(vec.get_feature_names_out())
vocab_ids = [vec.vocabulary_[x] for x in common_features]
glove_dfmat = dfmat[:,vocab_ids]
corpus_word_matrix = word_matrix.loc[common_features,]
doc_matrix = np.inner(glove_dfmat, corpus_word_matrix.T)
1 - cosine(doc_matrix[0,], doc_matrix[1,])

0.7854257726317803