In [52]:
import gensim 
import pandas as pd

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline


In [53]:
# Load the mobility "sentences" table and preview its first rows.
# Each inst_sentence value is a space-separated string of tokens
# (presumably institution ids, going by the file name -- confirm).
sentences_csv = '/Users/dakotamurray/Dropbox/SME-dropbox/Data/Derived/mobility_sentences/institution_sentences_2019to2019.csv'
mobility_transitions = pd.read_csv(sentences_csv)
mobility_transitions.head()

Unnamed: 0,cluster_id,inst_sentence
0,12,1292 1292 257
1,39,3047 846 3047 846 3047 846
2,41,100189 15929 3962 100104 3961
3,50,1247 10200
4,67,2625 19001 19001 2625 10158


In [54]:
# Tokenize: turn each space-delimited sentence into its list of string tokens,
# giving the list-of-lists corpus that Word2Vec is trained on.
mobility_list = [
    sentence.split(' ')
    for sentence in mobility_transitions.inst_sentence
]

mobility_list[:5]

[['1292', '1292', '257'],
 ['3047', '846', '3047', '846', '3047', '846'],
 ['100189', '15929', '3962', '100104', '3961'],
 ['1247', '10200'],
 ['2625', '19001', '19001', '2625', '10158']]

In [55]:
# Build the vocabulary and train a Word2Vec embedding on the tokenized sentences.
# The keyword names `size` and `iter` belong to the pre-4.0 gensim API.
w2v_params = dict(
    size=80,       # dimensionality of each embedding vector
    window=10,     # max distance between target and context token; meant to span a whole sentence
    min_count=20,  # drop tokens that occur fewer than 20 times
    workers=4,     # parallelize training across 4 worker threads
    iter=50,       # number of training passes over the corpus
)
model = gensim.models.Word2Vec(mobility_list, **w2v_params)

In [56]:
# Nearest neighbours of token '1179' (IUB, per the original note) in embedding space
query_token = '1179'
model.wv.most_similar(query_token)

[('10377', 0.9343024492263794),
 ('1223', 0.8040107488632202),
 ('30001', 0.41414177417755127),
 ('10366', 0.39374053478240967),
 ('3477', 0.3867662847042084),
 ('1306', 0.3768657445907593),
 ('30008', 0.37480348348617554),
 ('15906', 0.36990493535995483),
 ('10395', 0.36608612537384033),
 ('10372', 0.3632366359233856)]

In [57]:
def tsne_coords(model):
    """Project every vocabulary embedding of `model` to 2-D with t-SNE.

    Parameters
    ----------
    model : gensim Word2Vec model
        Trained model; vectors are read through ``model.wv``.

    Returns
    -------
    pandas.DataFrame
        One row per token, in ``model.wv.vocab`` iteration order, with
        columns ``x`` and ``y`` holding the two t-SNE coordinates.
    """
    # Read vectors through model.wv: indexing the model object directly is
    # deprecated in gensim. Iterate the vocab in its native order so rows line
    # up with any label list built from the same iteration.
    tokens = [model.wv[word] for word in model.wv.vocab]

    tsne_model = TSNE(perplexity=10, n_components=2, init='pca', n_iter=2000, random_state=23, verbose = True)
    new_values = tsne_model.fit_transform(tokens)

    # Name both columns explicitly. pd.DataFrame(x, y) would use y as the
    # index and leave x as a single unnamed column.
    return pd.DataFrame(new_values, columns=['x', 'y'])
    
# Run the t-SNE projection on the trained model's embeddings (verbose progress is logged below)
tsne_coords_df = tsne_coords(model)

  import sys


[t-SNE] Computing 31 nearest neighbors...
[t-SNE] Indexed 7126 samples in 0.013s...
[t-SNE] Computed neighbors for 7126 samples in 9.899s...
[t-SNE] Computed conditional probabilities for sample 1000 / 7126
[t-SNE] Computed conditional probabilities for sample 2000 / 7126
[t-SNE] Computed conditional probabilities for sample 3000 / 7126
[t-SNE] Computed conditional probabilities for sample 4000 / 7126
[t-SNE] Computed conditional probabilities for sample 5000 / 7126
[t-SNE] Computed conditional probabilities for sample 6000 / 7126
[t-SNE] Computed conditional probabilities for sample 7000 / 7126
[t-SNE] Computed conditional probabilities for sample 7126 / 7126
[t-SNE] Mean sigma: 1.799123
[t-SNE] KL divergence after 250 iterations with early exaggeration: 101.690102
[t-SNE] KL divergence after 2000 iterations: 2.918664


In [58]:
from sklearn.decomposition import PCA

# Collect one embedding per vocabulary token, in vocab iteration order.
# Vectors are read through model.wv because indexing the model object
# directly is deprecated in gensim.
labels = list(model.wv.vocab)
tokens = [model.wv[word] for word in labels]

# Reduce the embeddings to their first two principal components
pca = PCA(n_components=2)
components = pca.fit_transform(tokens)
pca_coords_df = pd.DataFrame(data = components, columns = ['x', 'y'])

  import sys


In [None]:
import umap
# Reduce the embeddings in `tokens` (built in an earlier cell) to 2-D with UMAP.
# UMAP is stochastic, so the seed is fixed to make reruns give the same layout;
# the value matches the random_state used for the t-SNE projection.
reducer = umap.UMAP(random_state=23)
embedding = reducer.fit_transform(tokens)
umap_coords_df = pd.DataFrame(embedding, columns = ['x', 'y'])

In [67]:
# Attach the token label to each coordinate table; labels follow vocab iteration order
labels = list(model.wv.vocab)
for coords_df in (tsne_coords_df, pca_coords_df, umap_coords_df):
    coords_df['token'] = labels

In [68]:
# Write each coordinate table to its own CSV file (t-SNE, then UMAP, then PCA)
csv_targets = [
    (tsne_coords_df, '/Users/dakotamurray/Dropbox/SME-dropbox/Data/Derived/dim_reduced/inst_tsne.csv'),
    (umap_coords_df, '/Users/dakotamurray/Dropbox/SME-dropbox/Data/Derived/dim_reduced/inst_umap.csv'),
    (pca_coords_df, '/Users/dakotamurray/Dropbox/SME-dropbox/Data/Derived/dim_reduced/inst_pca.csv'),
]
for coords_df, csv_path in csv_targets:
    coords_df.to_csv(csv_path)