In [None]:
%config InlineBackend.figure_formats = ['svg']
import json
import numpy as np
import matplotlib.pyplot as plt
import spacy
import pandas as pd
import matplotlib.cm as cm
from tqdm import tqdm
from sklearn.cluster import KMeans
from sklearn.manifold import MDS, TSNE
from sklearn.decomposition import PCA
from collections import defaultdict
from random import shuffle, sample
from matplotlib.patches import Patch

In [None]:
# Path to the JSON file that maps each category name to its list of keywords.
labels_file = 'labels.json'
# Name of the spaCy pipeline passed to spacy.load().
model = 'en_core_web_lg'
# CSV input paths; not referenced by any of the cells visible in this section.
corpus = ('../OHCumulativeAug10.csv', '../MOCumulativeAug10.csv', '../WICumulativeAug10.csv', '../mi_all_subs_pseudo_cois.csv')

In [None]:
# Load the spaCy pipeline named by `model`; it is used below to embed keywords.
nlp = spacy.load(model)

In [None]:
# Load the category -> keywords mapping. The context manager closes the file
# handle deterministically (the bare open() call left it to the garbage
# collector), and the explicit encoding avoids depending on the platform
# default when reading JSON.
with open(labels_file, encoding='utf-8') as labels_fh:
  labels = json.load(labels_fh)
# Drop the categories that are excluded from this analysis.
del labels['Personal-unusable-incoherent']
del labels['Named neighborhood']

In [None]:
# Give every category a stable integer index following the dict's key order.
cat_indices = dict(zip(labels, range(len(labels))))

In [None]:
# Maximum number of keywords randomly sampled from each category.
n_per_cat = 12

In [None]:
# Embed up to `n_per_cat` randomly sampled keywords per category, plus the
# category name itself. A normalized phrase is embedded at most once across
# all categories, and docs whose vector norm is zero are left out.
label_docs = defaultdict(list)
docs_with_cats = []
seen = set()

for cat, keywords in labels.items():
  n_picked = min(n_per_cat, len(keywords))
  candidates = sample(keywords, n_picked)
  candidates.append(cat)
  for raw_keyword in candidates:
    phrase = raw_keyword.lower().strip()
    if phrase in seen:
      continue
    seen.add(phrase)
    parsed = nlp(phrase)
    if parsed.vector_norm:
      label_docs[cat].append((parsed, cat))
  docs_with_cats.extend(label_docs[cat])

# Split the (doc, category) pairs into two parallel lists.
docs, doc_cats = [], []
for parsed, cat_name in docs_with_cats:
  docs.append(parsed)
  doc_cats.append(cat_name)
n = len(docs)

In [None]:
# Display how many keyword docs were embedded.
n

In [None]:
# Pairwise angular distance (arccos of the similarity, in radians) between
# every pair of keyword docs.
dists = np.zeros((n, n))
for ii, outer_doc in enumerate(docs[:n]):
  for jj, inner_doc in enumerate(docs[:n]):
    # Floating-point round-off can push the similarity of (near-)identical
    # vectors slightly outside [-1, 1], where arccos returns NaN. Clip first
    # so those pairs get a distance of ~0 rather than the NaN fallback below.
    similarity = np.clip(inner_doc.similarity(outer_doc), -1.0, 1.0)
    dists[ii, jj] = np.arccos(similarity)
# Any remaining NaN (the similarity itself was NaN) keeps the fallback of 1.
dists[np.isnan(dists)] = 1
# Each row of the distance matrix serves as that keyword's feature vector.
# PCA cannot extract more components than there are samples, so cap at n.
X_pca = PCA(n_components=min(20, n)).fit_transform(dists)
# NOTE: t-SNE is stochastic and no random_state is set, so the layout differs
# between runs.
X = TSNE().fit_transform(X_pca)

In [None]:
# Scatter the t-SNE layout, colouring each keyword by its labelled category.
fig, ax = plt.subplots(figsize=(50, 35))
# plt.get_cmap replaces matplotlib.cm.get_cmap, which was deprecated in
# Matplotlib 3.7 and removed in 3.9. The colormap is resampled to one colour
# per category; sizing it from cat_indices (the mapping actually indexed
# below) keeps this cell correct even if `labels` has been rebound since.
cmap = plt.get_cmap('jet', len(cat_indices))
ax.scatter(X[:, 0], X[:, 1], color=cmap([cat_indices[c] for c in doc_cats]))
legend = [Patch(facecolor=cmap(idx), label=c) for c, idx in cat_indices.items()]

# Label every point with its keyword text, centred 10 points above the marker.
for x, y, doc in zip(X[:, 0], X[:, 1], docs):
  ax.annotate(
    doc.text,
    (x, y),
    textcoords="offset points",
    xytext=(0, 10),
    ha='center'
  )
ax.legend(handles=legend, loc='upper left')
ax.axis('off')
# Save before plt.show() so the written file contains the rendered figure.
plt.savefig('moon_keywords_moon_clusters_top_12_per_cluster_run_3.pdf', bbox_inches='tight')
plt.show()

In [None]:
# Cluster the PCA-reduced keyword features with k-means and redraw the t-SNE
# layout coloured by cluster assignment.
n_clusters = 20
# Stored under its own name: rebinding `labels` here would replace the
# category -> keywords dict that earlier cells read, breaking any re-run of
# those cells after this one.
cluster_labels = KMeans(n_clusters=n_clusters).fit(X_pca).labels_
# plt.get_cmap replaces matplotlib.cm.get_cmap, which was deprecated in
# Matplotlib 3.7 and removed in 3.9; resample to one colour per cluster.
cmap = plt.get_cmap('jet', n_clusters)
fig, ax = plt.subplots(figsize=(50, 35))
ax.scatter(X[:, 0], X[:, 1], color=cmap(cluster_labels))

# Label every point with its keyword text, centred 10 points above the marker.
for x, y, doc in zip(X[:, 0], X[:, 1], docs):
  ax.annotate(
    doc.text,
    (x, y),
    textcoords="offset points",
    xytext=(0, 10),
    ha='center'
  )
ax.axis('off')
# Save before plt.show() so the written file contains the rendered figure.
plt.savefig(f'moon_keywords_kmeans{n_clusters}_clusters_run_5.pdf', bbox_inches='tight')
plt.show()