In [None]:
import numpy as np
import pandas as pd

#CMU pronunciation (phoneme) dictionary
import cmudict

#Natural Language Toolkit
import nltk
from nltk import word_tokenize
from nltk import pos_tag

import wikipedia
import requests
import scipy.spatial

Read the GloVe vectors from a text file into a dictionary. This will take ~20 seconds.

In [None]:
#read the space-separated glove file with pandas: column 0 becomes the index (the word),
#the remaining columns hold that word's vector
#quoting=3 (csv.QUOTE_NONE) keeps quote characters as ordinary text
#keep_default_na=False stops tokens such as "null", "nan" or "n/a" from being parsed as missing values
df = pd.read_csv('glove.6B.300d.txt', sep=" ", quoting=3, header=None, index_col=0,
                 keep_default_na=False)

#format glove vector dictionary: word -> 1D numpy array (one row of the table per word)
glove_vecs = {word: vec for word, vec in zip(df.index, df.to_numpy())}

To speed things up, filter the vectors to only include words whose SUBTLEX log10 word frequency (`Lg10WF`) is greater than 1.5.

In [None]:
#load the SUBTLEX spreadsheet
subtlex = pd.read_excel('SUBTLEX.xlsx')

#pull out the word column and the Log10 word frequency column
subtlex_wordlist = subtlex['Word'].tolist()
freqs = subtlex['Lg10WF'].to_numpy()

#lookup table: word -> Log10 word frequency
word_freqs = dict(zip(subtlex_wordlist, freqs))

#walk the glove vocabulary once, keeping only words that appear in SUBTLEX
#with a Log10 word frequency above 1.5
keep_words = []
glove_vecs2 = {}
for word, vec in glove_vecs.items():
    if word not in word_freqs:
        continue
    if word_freqs[word] > 1.5:
        keep_words.append(word)
        glove_vecs2[word] = vec

#replace the full vector dict with the filtered one
glove_vecs = glove_vecs2

Check out a glove vector

In [None]:
#look up one example word and display its stored vector
example_word='spooky'
print(glove_vecs[example_word])

Find semantic neighbors with correlations

In [None]:
#get vector for 'spooky'
w1='spooky'
vec1=glove_vecs[w1]

#take the vocabulary straight from glove_vecs so that position i in corrs always
#refers to vocab[i], rather than relying on keep_words still being in the same order
vocab=list(glove_vecs)

#correlate vec1 with all other vectors in the set
#(np.corrcoef returns a 2x2 matrix; [0][1] is the correlation between the two inputs)
corrs=[np.corrcoef(vec1,glove_vecs[word])[0][1] for word in vocab]

#print top 10 neighbors, highest correlation first
#(w1 itself is expected at the top, since its correlation with itself is 1)
n=10
top10=[vocab[i] for i in np.argsort(corrs)[::-1][:n]]
print(top10)

Visualize 10 word vectors as 2D coordinates with UMAP

In [None]:
import random
import umap
import matplotlib.pyplot as plt

#draw up to 10 random words; random.sample leaves keep_words itself untouched,
#so other cells that rely on its order keep working when re-run
sample_words=random.sample(keep_words,min(10,len(keep_words)))
randvecs=[glove_vecs[w] for w in sample_words]


#Dimensionality reduction with UMAP (https://umap-learn.readthedocs.io/en/latest/basic_usage.html)
randvecs2d = umap.UMAP(n_neighbors=3,
                      min_dist=.1,
                      metric='euclidean').fit_transform(randvecs)

#plot each word based on 2d coordinates
for ind,coords in enumerate(randvecs2d):

    word=sample_words[ind]

    x=coords[0]
    y=coords[1]

    plt.text(x,y,word,fontsize=8)

#set x and y limits (plt.text does not rescale the axes, so pad the data range by 1 on each side)
plt.xlim(np.min([v[0] for v in randvecs2d])-1,np.max([v[0] for v in randvecs2d])+1)
plt.ylim(np.min([v[1] for v in randvecs2d])-1,np.max([v[1] for v in randvecs2d])+1)

plt.show()

Plot wikipedia topics in 2D by averaging the glove vectors of words in the wiki summaries

In [None]:
topics=['Halloween','Thanksgiving','Zebra','Giraffe','Episodic memory','Hippocampus']

#one mean glove vector per topic, in the same order as topics
topic_vectors=[]

for topic in topics:

    #fetch the wiki page and take its summary, lower-cased for searching the vector dict
    #(note: change 'summary' to 'content' to scrape the whole page)
    summary=wikipedia.page(topic).summary.lower()

    #tokenize the summary and collect the vectors of the tokens found in the vector dict
    topic_vecs=[]
    for token in word_tokenize(summary):
        if token in glove_vecs:
            topic_vecs.append(glove_vecs[token])

    #the topic vector is the element-wise mean of its token vectors
    topic_vectors.append(np.mean(topic_vecs,axis=0))


#Dimensionality reduction with UMAP (https://umap-learn.readthedocs.io/en/latest/basic_usage.html)
reducer=umap.UMAP(n_neighbors=3,min_dist=.1,metric='euclidean')
randvecs2d=reducer.fit_transform(topic_vectors)

#write each topic name at its 2d coordinates
for topic,(x,y) in zip(topics,randvecs2d):
    plt.text(x,y,topic,fontsize=8)

#set x and y limits: the extreme coordinates padded by 1 on each side
xs=[v[0] for v in randvecs2d]
ys=[v[1] for v in randvecs2d]
plt.xlim(np.min(xs)-1,np.max(xs)+1)
plt.ylim(np.min(ys)-1,np.max(ys)+1)

plt.show()

Or do the same thing with different subreddits instead of Wikipedia topics. First, define the URL-building helper function again:

In [None]:
#function for generating the search URL from criteria
def get_url(search_type,criteria):
    """Build a pushshift search URL from a dict of search criteria.

    Parameters
    ----------
    search_type : str
        Path segment inserted after '/reddit/search/' (the notebook uses
        'submission'; 'comment' is the other type it mentions).
    criteria : dict
        Maps parameter names to values. Values are converted with str(), so
        numbers such as 1000 are accepted as well as strings such as '1000'.

    Returns
    -------
    str
        The base URL followed by one 'name=value&' piece per criterion, in
        dict order. The 'score' criterion is written as 'score' + value with
        no '=' added, so a value like '>10' yields 'score>10&'. Every piece,
        including the last, ends with '&'. Values are inserted as-is, without
        URL-encoding.
    """
    url='https://api.pushshift.io/reddit/search/' + search_type + '/?'
    pieces=[]
    for crit,value in criteria.items():
        #str() lets callers pass ints without a TypeError on concatenation
        value=str(value)
        if crit=='score':
            #the value supplies its own comparison operator
            pieces.append(crit + value + '&')
        else:
            pieces.append(crit + '=' + value + '&')
    return url + ''.join(pieces)

Now generate lists of titles and their mean glove vectors for some different subreddits

In [None]:
subreddits=['LanguageTechnology','MachineLearning','NBA','NFL','Politics','WorldNews']

#initialize search criteria
criteria1=dict()

#type (comment or submission)
search_type='submission'

#'before' bound of the search window, as a relative offset (m (minute), h (hour), d (day))
criteria1['before']='0d'

#'after' bound of the search window, as a relative offset (m (minute), h (hour), d (day))
#NOTE(review): together these presumably select the last 300 days - confirm against the Pushshift docs
criteria1['after']='300d'

#size of results (max=1000)
criteria1['size']='1000'

#sort
criteria1['sort_type']='score'
criteria1['sort']='desc'

#one mean title vector per subreddit, in the same order as subreddits
subreddit_vecs=[]
for subreddit in subreddits:

    #subreddits
    criteria1['subreddit']=subreddit

    #get urls
    url1=get_url(search_type,criteria1)

    #get submissions; use a timeout so a stalled request cannot hang the notebook,
    #and fail with the HTTP error itself instead of a confusing JSON/KeyError later
    response=requests.get(url1,timeout=30)
    response.raise_for_status()
    submissions1 = response.json()['data']

    #get the mean vector of each submission title on this subreddit
    vecs=[]
    for submission in submissions1:

        #get title
        title=submission['title']

        #make lower case for searching vector dict
        title=title.lower()

        #tokenize title
        tokens=word_tokenize(title)


        #get vectors
        title_vecs=[glove_vecs[word] for word in tokens if word in glove_vecs]

        #skip titles with fewer than two words in the glove vector set we're using
        if len(title_vecs)<2:
            continue

        #get mean
        title_mean=np.mean(title_vecs,axis=0)

        vecs.append(title_mean)

    #an empty list would average to NaN and break the 2D projection later,
    #so stop here with a clear message instead
    if not vecs:
        raise ValueError('no usable submission titles found for subreddit ' + subreddit)

    #the subreddit vector is the mean of its title vectors
    subreddit_vecs.append(np.mean(vecs,axis=0))
    
    

Reduce to 2D and visualize:

In [None]:
#Dimensionality reduction with UMAP (https://umap-learn.readthedocs.io/en/latest/basic_usage.html)
embedder=umap.UMAP(n_neighbors=3,min_dist=.1,metric='euclidean')
vecs2d=embedder.fit_transform(subreddit_vecs)

#write each subreddit name at its 2d coordinates
for subreddit,point in zip(subreddits,vecs2d):
    plt.text(point[0],point[1],subreddit,fontsize=8)

#set x and y limits: smallest and largest coordinate on each axis, padded by 1
x_vals=[point[0] for point in vecs2d]
y_vals=[point[1] for point in vecs2d]
plt.xlim(np.min(x_vals)-1,np.max(x_vals)+1)
plt.ylim(np.min(y_vals)-1,np.max(y_vals)+1)

plt.show()

How to make your own GloVe vectors?

In [105]:
#Go Here: https://nlp.stanford.edu/projects/glove/