In [9]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from gensim.models import FastText
import numpy as np
import plotly.graph_objects as go
from pathlib import Path

# NOTE: A good chunk of the code in this NB is reused from A4.

def tsne_plot_similar_words(labels, embedding_clusters, word_clusters, a=0.7,
                            title='handicap 2020 similar words') -> go.Figure:
    """Build a Plotly scatter figure with one trace per group of similar words.

    Args:
        labels: Legend name for each cluster (one per entry of
            ``embedding_clusters``).
        embedding_clusters: Iterable of 2-D point sets, one per label. Column 0
            is used as x and column 1 as y. Each entry may be a NumPy array or
            anything ``np.asarray`` can turn into a 2-D array.
        word_clusters: Iterable of word lists, one per label; the words are
            used as the hover text of the corresponding points.
        a: Marker opacity applied to every trace.
        title: Figure title. The default keeps the title this helper has always
            produced; pass the term/decade actually being plotted so the figure
            is labelled correctly.

    Returns:
        The assembled ``go.Figure``. The figure is not displayed here; call
        ``.show()`` on the result.
    """
    fig_data = []
    # zip stops at the shortest input, so the three sequences are expected to
    # have the same length.
    for label, embeddings, words in zip(labels, embedding_clusters, word_clusters):
        # Accept plain nested lists as well as arrays: the column slicing below
        # needs NumPy-style indexing.
        points = np.asarray(embeddings)
        fig_data.append(go.Scatter(x=points[:, 0], y=points[:, 1], mode='markers',
                                   marker=dict(size=8, opacity=a),
                                   text=words, hoverinfo='text+name', name=label))
    # Transparent plot/paper backgrounds; horizontal legend placed below the axes.
    layout = go.Layout(showlegend=True, legend=dict(orientation="h", y=-0.2), plot_bgcolor="rgba(0,0,0,0)",
                       paper_bgcolor="rgba(0,0,0,0)", title=title)
    return go.Figure(data=fig_data, layout=layout)

In [10]:
# Which corpus slice to load: the search term and the decade it covers.
term, decade = 'disab', 1990

# Paths are resolved from the current working directory; the data directory is
# expected to be a sibling of it (<cwd>/../data/<term>/<decade>s/json).
script_loc = Path.cwd()
print(script_loc)
data_path = script_loc.parent.joinpath('data')
json_path = data_path.joinpath(term, f'{decade}s', 'json')

# Load the FastText model stored alongside the JSON data for this term/decade.
model_file = json_path.joinpath('fasttext_model.bin')
model = FastText.load(str(model_file))

/home/noah/Documents/disability_trends/code/embed


In [11]:
keys = ['disability', 'neurodiversity', 'autism', 'spectrum', 'handicap', 'golf']

# For each key word, collect the 30 most similar words reported by the model
# (the similarity scores themselves are discarded).
word_clusters = [
    [neighbour for neighbour, _ in model.wv.most_similar(key, topn=30)]
    for key in keys
]

# Look up the embedding vector of every similar word, keeping the same
# per-key grouping and order as word_clusters.
embedding_clusters = [
    [model.wv[neighbour] for neighbour in cluster]
    for cluster in word_clusters
]

In [12]:
# Stack the per-key vector lists into one 3-D array and read off its dimensions
# (n groups, m vectors per group, k components per vector).
embedding_clusters = np.array(embedding_clusters)
n, m, k = embedding_clusters.shape

tsne_model_en_2d = TSNE(
    n_components=2,
    perplexity=5,
    init='pca',
    max_iter=1500,
    random_state=2020,
)

# Flatten the first two axes so t-SNE receives a 2-D (n * m, k) matrix, then
# restore the per-key grouping on the 2-D result so it can be plotted per key.
flat_vectors = embedding_clusters.reshape(n * m, k)
flat_points_2d = tsne_model_en_2d.fit_transform(flat_vectors)
embeddings_en_2d = np.array(flat_points_2d).reshape(n, m, 2)

In [13]:
# Build the per-key scatter of the 2-D t-SNE points (marker opacity 0.7) and display it.
similar_words_fig = tsne_plot_similar_words(keys, embeddings_en_2d, word_clusters, a=0.7)
similar_words_fig.show()

In [14]:
from sklearn.cluster import KMeans
import plotly.express as px

# Single source of truth for the cluster count: used both to fit KMeans and to
# draw one centroid marker per cluster.
n_clusters = 4

# Collapse the (keys, similar words, 2) array into a flat (points, 2) matrix.
temp = embeddings_en_2d.reshape(-1, embeddings_en_2d.shape[-1])
kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(temp)

# Cluster index assigned to every point.
preds = kmeans.predict(temp)

# One explicit colour per cluster, shared by the points and their centroid.
# (Passing bare numbers as marker colours gives each single-point centroid
# trace its own colour scale, so centroids would not match their clusters.)
palette = px.colors.qualitative.Plotly
cluster_colors = [palette[i % len(palette)] for i in range(n_clusters)]

km_plot = go.Figure()

# Points first, so the centroid markers added afterwards are drawn on top.
km_plot.add_trace(go.Scatter(x=temp[:, 0], y=temp[:, 1], mode='markers',
                             marker=dict(color=[cluster_colors[p] for p in preds]),
                             name='Predicted clusters'))

for i in range(n_clusters):
    # Centroids share their cluster's colour; the outline keeps them
    # distinguishable from the surrounding points.
    km_plot.add_trace(go.Scatter(x=[kmeans.cluster_centers_[i, 0]], y=[kmeans.cluster_centers_[i, 1]],
                                 mode='markers',
                                 marker=dict(size=10, color=cluster_colors[i],
                                             line=dict(width=2, color='black')),
                                 name=f'KMeans cluster centroid {i + 1}'))

km_plot.show()