In [None]:
%config InlineBackend.figure_formats = ['svg']
import numpy as np
import matplotlib.pyplot as plt
import spacy
import pandas as pd
import matplotlib.cm as cm
from tqdm import tqdm
from sklearn.cluster import KMeans
from sklearn.manifold import MDS, TSNE
from sklearn.decomposition import PCA
from collections import defaultdict

In [None]:
# Path of the CSV that a later cell loads with pd.read_csv.
chunks_file = '../keyword_analysis/wi_mi_mo_oh_union_accepted.csv'
# Name of the spaCy pipeline that a later cell passes to spacy.load.
model = 'en_core_web_lg'

In [None]:
# Load the spaCy pipeline named by `model`; later cells rely on the vectors it
# attaches to each Doc (doc.vector_norm, doc.similarity).
nlp = spacy.load(model)

In [None]:
# Read the chunk table; the 'chunk' column is what gets parsed by spaCy below.
df = pd.read_csv(chunks_file)

In [None]:
# Chunk texts that are excluded when valid_docs is built.
filter_chunks = ['health education institution', 'state medium market', 'class neighborhood']  # these should have been filtered out earlier

In [None]:
# Parse every chunk with the spaCy pipeline. nlp.pipe processes the texts as a
# stream in batches instead of invoking the pipeline once per string.
all_docs = list(nlp.pipe(df['chunk']))

# Keep only docs that have a non-zero vector (otherwise similarity is undefined)
# and whose text is not on the manual exclusion list.
valid_docs = [
  doc for doc in all_docs
  if doc.vector_norm and doc.text not in filter_chunks
]

# Cap on how many chunks are analysed; raise it towards len(valid_docs) for a full run.
max_docs = 150
# Never let n exceed the number of docs actually available: the distance matrix
# is allocated as n-by-n, and any rows without a matching doc would stay all-zero
# and be treated as real points by PCA / KMeans / t-SNE.
n = min(max_docs, len(valid_docs))

In [None]:
# n-by-n matrix, initialised to zero, that the next cell fills with pairwise distances.
dists = np.zeros((n, n))

In [None]:
# Fill dists[ii, jj] with arccos(similarity) for every pair among the first n docs,
# i.e. the angle (in radians) between the two doc vectors when the similarity
# score is a cosine.
docs_subset = valid_docs[:n]  # slice once instead of once per outer iteration
for ii, outer_doc in enumerate(docs_subset):
  for jj, inner_doc in enumerate(docs_subset):
    # Floating-point rounding can push the similarity a hair outside [-1, 1]
    # (typically on the diagonal, where it should be exactly 1). np.arccos
    # returns NaN there, and a single NaN makes the PCA / t-SNE steps fail,
    # so clamp before converting to an angle.
    cosine = np.clip(inner_doc.similarity(outer_doc), -1.0, 1.0)
    dists[ii, jj] = np.arccos(cosine)

In [None]:
# Each row of `dists` (one chunk's distances to all the others) is used as that
# chunk's feature vector.

# 20-component PCA projection; this is what the KMeans cell clusters on.
# random_state only matters if PCA picks a randomized solver, but pinning it
# keeps the result stable if n is increased.
X_pca = PCA(n_components=20, random_state=0).fit_transform(dists)

# t-SNE embedding used for the scatter plots. t-SNE is stochastic, so it is
# seeded to make the layout reproducible across runs.
# Alternative that was tried: embed the PCA projection instead, i.e.
# TSNE(random_state=0).fit_transform(X_pca).
X = TSNE(random_state=0).fit_transform(dists)

In [None]:
# Colormap used to colour points by cluster; calling it with the integer label
# array yields one RGBA colour per point.
# matplotlib.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
# pyplot.get_cmap is available on both old and new releases.
cmap = plt.get_cmap('tab20')

# Cluster the PCA projection into 14 groups. random_state fixes the
# initialisation, and n_init is set explicitly because scikit-learn's default
# changed from 10 to 'auto', which would otherwise alter the clustering
# depending on the installed version.
labels = KMeans(n_clusters=14, random_state=0, n_init=10).fit(X_pca).labels_

In [None]:
# Group chunk texts under the cluster id assigned to each doc. zip stops at the
# shorter of the two sequences, so only docs that received a label are included.
chunks_by_label = defaultdict(list)
for chunk_doc, cluster_id in zip(valid_docs, labels):
  bucket = chunks_by_label[cluster_id]
  bucket.append(chunk_doc.text)

In [None]:
# Print one bullet line per cluster listing its chunk texts.
# Iterating .values() avoids binding the unused key to `_`, a name IPython
# uses for the most recent cell output.
for chunks in chunks_by_label.values():
  print('*', ', '.join(chunks))

In [None]:
# Large canvas so the per-point text labels stay legible.
fig, ax = plt.subplots(figsize=(23, 23))
point_colors = cmap(labels)
ax.scatter(X[:, 0], X[:, 1], color=point_colors)

# Write each chunk's text 10 points above its marker, horizontally centred.
label_style = dict(textcoords="offset points", xytext=(0, 10), ha='center')
for (x, y), doc in zip(X[:, :2], valid_docs):
  ax.annotate(doc.text, (x, y), **label_style)

In [None]:
# Uncoloured scatter of the first two embedding coordinates.
fig, ax = plt.subplots()
xs, ys = X[:, 0], X[:, 1]
ax.scatter(xs, ys)
plt.show()

In [None]:
# Uncoloured scatter of the first two embedding coordinates (same plot as the previous cell).
fig, ax = plt.subplots()
first_axis = X[:, 0]
second_axis = X[:, 1]
ax.scatter(first_axis, second_axis)
plt.show()

In [None]:
# Scatter of the first two embedding coordinates, coloured by cluster label.
cluster_colors = cmap(labels)
plt.scatter(X[:, 0], X[:, 1], color=cluster_colors)
plt.show()