# Topic Modeling using K-Means

### References

* Data: Drug Dataset (400EA)
* Preprocess: https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24
* K-Means: https://lovit.github.io/nlp/2018/09/27/pyldavis_kmeans/#topic=0&lambda=1&term=

### Load Raw Data

In [99]:
import pickle

# Load the previously pickled results container from disk. Later cells index it
# by string key (e.g. saved_model['processed_docs']).
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only open
# pickle files that come from a trusted source.
with open('./drug_tables.pkl', 'rb') as f:
    saved_model = pickle.load(f)



In [9]:
# Pull the pre-processed documents out of the loaded container.
# NOTE(review): presumably a pandas Series of stringified token lists, given the
# .apply(...) string parsing applied to it later — confirm.
processed_docs = saved_model['processed_docs']

----

### T-SNE

* https://datascienceschool.net/view-notebook/3e7aadbf88ed4f0d87a76f9ddc925d69/
* https://lumiamitie.github.io/r/python/tsne-for-r-py/

In [10]:
### The TSNE model has no transform method, only fit_transform
# library import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Seed NumPy's global random number generator with a fixed value.
np.random.seed(2018)

In [19]:
# Turn each entry into a list of tokens: drop the first and last characters
# (presumably the surrounding brackets of a stringified list such as "['a', 'b']"),
# remove every single quote, then split on ', '.
# NOTE(review): assumes every entry is a str in exactly that format; a token that
# itself contains "'" or ", " would be altered or split, and "[]" becomes [''] —
# confirm against the data. Re-running this cell on already-parsed lists fails,
# because list objects have no .replace method.
processed_docs = processed_docs.apply(lambda x: x[1:-1].replace("'", "").split(', '))

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer()
%time vect.fit([' '.join(d) for d in processed_docs])
%time tsne_data = vect.transform([' '.join(d) for d in processed_docs]).toarray()

CPU times: user 1.02 s, sys: 19.7 ms, total: 1.04 s
Wall time: 1.04 s
CPU times: user 988 ms, sys: 524 ms, total: 1.51 s
Wall time: 1.52 s


In [21]:
tsne_data.shape

(27960, 9699)

----

### K-Means

In [23]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
from sklearn.cluster import KMeans

In [24]:
# Scale the count matrix to unit L2 norm before clustering (sklearn's normalize
# works per row, i.e. per document, by default).
X = normalize(tsne_data, norm='l2')

In [25]:
# Fit K-Means with 8 clusters on the normalised matrix, using randomly chosen
# initial centroids and allowing up to 30000 iterations.
# NOTE(review): no random_state is passed, so the result presumably depends on the
# global NumPy seed set earlier and on the order in which cells are run — confirm.
kmeans_model = KMeans(n_clusters=8, init="random", max_iter=30000).fit(X)

In [100]:
kmeans_model.cluster_centers_.shape

(8, 9699)

In [101]:
# Keep the fitted K-Means estimator in the results container under 'kmeans_result'.
saved_model['kmeans_result'] = kmeans_model

In [69]:
# Persist the results container to './grace_drug_table.pkl' ('wb' replaces any
# existing file at that path).
with open('./grace_drug_table.pkl', 'wb') as f:
    pickle.dump(saved_model, f)

In [68]:
saved_model['kmeans_result'].cluster_centers_.shape

(8, 9699)

In [32]:
# L2-normalised document-term matrix, computed with the same call as X.
# NOTE(review): presumably recomputed so that this part can run without the
# K-Means fitting cells — confirm; otherwise X could be reused.
bow = normalize(tsne_data, norm='l2')

In [34]:
from collections import Counter

# The fitted K-Means estimator is stored in the container under 'kmeans_result'
# (the key it is saved with); look it up once and reuse it.
kmeans_result = saved_model['kmeans_result']

n_clusters = kmeans_result.cluster_centers_.shape[0]
n_docs, n_terms = bow.shape

# Number of documents assigned to each cluster, indexed by cluster id.
# A cluster that received no documents gets a count of 0.
label_counts = Counter(kmeans_result.labels_)
cluster_size = np.asarray([label_counts.get(c, 0) for c in range(n_clusters)])

# Per-term total of the normalised weights over all documents, as a flat 1-D array.
term_frequency = np.asarray(bow.sum(axis=0)).reshape(-1)
# Replace exact zeros with a small positive value.
# NOTE(review): presumably to avoid zero frequencies in later division/log steps — confirm.
term_frequency[term_frequency == 0] = 0.01

In [35]:
n_clusters, n_docs, n_terms

(8, 27960, 9699)

In [37]:
# Scale every cluster centre by the number of documents assigned to that cluster.
# The estimator is read from 'kmeans_result', the key it is stored under.
# Broadcasting cluster_size over the term axis avoids a Python loop and leaves
# n_docs (the total document count) untouched; the result has one row per cluster.
weighted_centers = (
    saved_model['kmeans_result'].cluster_centers_ * cluster_size[:, np.newaxis]
)

In [70]:
# Read './grace_drug_table.pkl' back into a separate name.
# NOTE(review): saved_model_2 is not used by any other visible cell — presumably a
# round-trip check of the written file; confirm.
with open('./grace_drug_table.pkl', 'rb') as f:
    saved_model_2 = pickle.load(f)

In [50]:
# NOTE(review): every `with open(...) as f` block in this notebook already closes
# `f` on exit, so this explicit close() looks redundant — confirm and consider
# removing the cell.
f.close()

In [102]:
# Take the stored documents object out of the container for re-indexing.
dodo = saved_model['documents']

In [103]:
# Move the existing index into a regular column, then name the columns explicitly.
# NOTE(review): assumes reset_index() yields exactly two columns; running this cell
# a second time would add another column and make the assignment below fail.
dodo = dodo.reset_index()
dodo.columns = ['id', 'document']
# Show the first rows as a quick visual check.
dodo.head()

Unnamed: 0,id,document
0,0,Analysis of efficacy
1,1,Comparisons of postoperative CA19-9 levels on survival of ESPAC-4 with the CONOKO-01 and JASPAC-1 trials
2,2,Pattern of disease relapse
3,3,Grade 1–5 adverse events with gemcitabine alone and gemcitabine plus capecitabine
4,4,Treatment with zoledronic acid


In [104]:
# Put the re-indexed table (columns 'id' and 'document') back under 'documents'.
saved_model['documents'] = dodo

In [106]:
# Write the updated container to './grace_drug_table.pkl'; 'wb' mode replaces the
# file's previous contents.
with open('./grace_drug_table.pkl', 'wb') as f:
    pickle.dump(saved_model, f)

In [105]:
saved_model.keys()

dict_keys(['dec_result', 'documents', 'processed_docs', 'lda_result', 'kmeans_result'])