# Topic Modeling using K-Means

### References

* Data: ABC News Headlines (https://www.kaggle.com/therohk/million-headlines/version/6)
* Preprocess: https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24
* K-Means: https://lovit.github.io/nlp/2018/09/27/pyldavis_kmeans/#topic=0&lambda=1&term=

### Load Raw Data

In [1]:
import pandas as pd

pd.set_option('display.max_colwidth', 999)
news_data = pd.read_csv('./mallet_top_sen.tsv', sep='\t')

In [2]:
news_data.head()

Unnamed: 0.1,Unnamed: 0,id,Topic_Num,Topic_Perc_Contribu,Topic_Keywords,Origin_Text,Text
0,0,44029,0.0,0.2935,"analysi, multivari, regress, variabl, model, predictor, cardiac, time, univari, heart",Hazard Ratio (and 95% Confidence Intervals) in Univariate and Multivariate Analysis of Predictors of Major Cardiac Events (Cardiac Death or Worsening of Heart Failure Leading to Heart Transplantation),"['hazard', 'ratio', 'confid', 'interv', 'univari', 'multivari', 'analysi', 'predictor', 'major', 'cardiac', 'event', 'cardiac', 'death', 'worsen', 'heart', 'failur', 'lead', 'heart', 'transplant']"
1,1,23344,0.0,0.2836,"analysi, multivari, regress, variabl, model, predictor, cardiac, time, univari, heart","Left Ventricular and Right Ventricular Ejection Fractions, Left Ventricular and Right Ventricular Mean Phases, Left-to-Right Mean Phase Difference (L-RMP) and Phase Standard Deviations for Both Ventricles in 30 Cases of Left Sided WPW legend","['leav', 'ventricular', 'right', 'ventricular', 'eject', 'fraction', 'leav', 'ventricular', 'right', 'ventricular', 'mean', 'phase', 'leav', 'right', 'mean', 'phase', 'differ', 'rmp', 'phase', 'standard', 'deviat', 'ventricl', 'case', 'leav', 'side', 'wpw']"
2,2,41163,0.0,0.2817,"analysi, multivari, regress, variabl, model, predictor, cardiac, time, univari, heart","Partial Regression Coefficients (All Subjects, n = 262) for Forward Stepwise Linear Regression for Dependent Variables Augmentation Pressure and Augmentation Index legend","['partial', 'regress', 'coeffici', 'subject', 'forward', 'stepwis', 'linear', 'regress', 'depend', 'variabl', 'augment', 'pressur', 'augment', 'index']"
3,3,23343,0.0,0.2797,"analysi, multivari, regress, variabl, model, predictor, cardiac, time, univari, heart","Left Ventricular (LVEF) and Right Ventricular (RVEF) Ejection Fractions, Left Ventricular (LVMP) and Right Ventricular (RVMP) Mean Phases, Left-to-Right Mean Phase Difference (L-RMP) and Phase Standard Deviations (LVPSD and RVPSD) for Both Ventricles in 14 Cases of Right Sided WPW legend","['leav', 'ventricular', 'lvef', 'right', 'ventricular', 'rvef', 'eject', 'fraction', 'leav', 'ventricular', 'lvmp', 'right', 'ventricular', 'rvmp', 'mean', 'phase', 'leav', 'right', 'mean', 'phase', 'differ', 'rmp', 'phase', 'standard', 'deviat', 'lvpsd', 'rvpsd', 'ventricl', 'case', 'right', 'side', 'wpw']"
4,4,24968,0.0,0.2782,"analysi, multivari, regress, variabl, model, predictor, cardiac, time, univari, heart",Predictors of Mortality by Multivariable Analysis: Variables Are Shown in the Order They Entered a Stepwise Cox Regression Model,"['predictor', 'mortal', 'multivari', 'analysi', 'variabl', 'show', 'order', 'enter', 'stepwis', 'cox', 'regress', 'model']"


#### Extract target data

In [3]:
data_text = news_data[['Origin_Text']]
data_text['index'] = news_data[['Unnamed: 0']]
documents = data_text
documents.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,Origin_Text,index
0,Hazard Ratio (and 95% Confidence Intervals) in Univariate and Multivariate Analysis of Predictors of Major Cardiac Events (Cardiac Death or Worsening of Heart Failure Leading to Heart Transplantation),0
1,"Left Ventricular and Right Ventricular Ejection Fractions, Left Ventricular and Right Ventricular Mean Phases, Left-to-Right Mean Phase Difference (L-RMP) and Phase Standard Deviations for Both Ventricles in 30 Cases of Left Sided WPW legend",1
2,"Partial Regression Coefficients (All Subjects, n = 262) for Forward Stepwise Linear Regression for Dependent Variables Augmentation Pressure and Augmentation Index legend",2
3,"Left Ventricular (LVEF) and Right Ventricular (RVEF) Ejection Fractions, Left Ventricular (LVMP) and Right Ventricular (RVMP) Mean Phases, Left-to-Right Mean Phase Difference (L-RMP) and Phase Standard Deviations (LVPSD and RVPSD) for Both Ventricles in 14 Cases of Right Sided WPW legend",3
4,Predictors of Mortality by Multivariable Analysis: Variables Are Shown in the Order They Entered a Stepwise Cox Regression Model,4


----

### Preprocessing

* Import Libraries

In [25]:
!pip install --trusted-host pypi.org --trusted-host files.pythonhosted.org gensim

[33mYou are using pip version 9.0.1, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [26]:
!pip install --trusted-host pypi.org --trusted-host files.pythonhosted.org nltk

[33mYou are using pip version 9.0.1, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [27]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

import numpy as np
np.random.seed(2018)

import nltk
nltk.download('wordnet')

scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/gracelee/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

* Preprocess
 1. simple_preprocess: Split Text by whitespace
 2. STOPWORDS: Remove stopwords
 3. lemmatize_stemming
 
* lemmatize_stemming
 - Lemmatizing & Stemming Replace word with original form
 - Lemmatizing consider whether the word exist in the real world
 - pos means a position of the word
 - https://m.blog.naver.com/PostView.nhn?blogId=vangarang&logNo=220963244354&proxyReferer=https%3A%2F%2Fwww.google.com%2F

In [28]:
def lemmatize_stemming(text):
    stemmer = SnowballStemmer('english')
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

* Test

In [29]:
doc_sample = documents[documents['index'] == 100].values[0][0]
print('original document: ')

words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)

print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['Treatment', 'efficacy', 'at', 'week', '36', 'for', 'the', 'modified', 'intention-to-treat', 'population', 'in', 'the', 'open-label', 'period', 'and', 'at', 'week', '88', 'for', 'the', 'modified', 'intention-to-treat', 'subpopulations', 'in', 'the', 'double-blind', 'period']


 tokenized and lemmatized document: 
['treatment', 'efficaci', 'week', 'modifi', 'intent', 'treat', 'popul', 'open', 'label', 'period', 'week', 'modifi', 'intent', 'treat', 'subpopul', 'doubl', 'blind', 'period']


* Run

In [30]:
%time processed_docs = documents['Origin_Text'].map(preprocess)
processed_docs[:10]

CPU times: user 613 ms, sys: 2.88 ms, total: 616 ms
Wall time: 620 ms


0                                                                                        [hazard, ratio, confid, interv, univari, multivari, analysi, predictor, major, cardiac, event, cardiac, death, worsen, heart, failur, lead, heart, transplant]
1                                           [leav, ventricular, right, ventricular, eject, fraction, leav, ventricular, right, ventricular, mean, phase, leav, right, mean, phase, differ, phase, standard, deviat, ventricl, case, leav, side, legend]
2                                                                                                                   [partial, regress, coeffici, subject, forward, stepwis, linear, regress, depend, variabl, augment, pressur, augment, index, legend]
3    [leav, ventricular, lvef, right, ventricular, rvef, eject, fraction, leav, ventricular, lvmp, right, ventricular, rvmp, mean, phase, leav, right, mean, phase, differ, phase, standard, deviat, lvpsd, rvpsd, ventricl, case, right, side, legend]
4       

----

### T-SNE

* https://datascienceschool.net/view-notebook/3e7aadbf88ed4f0d87a76f9ddc925d69/
* https://lumiamitie.github.io/r/python/tsne-for-r-py/

In [33]:
### TSNE모델에는 transform 메소드가 없고 fit_transform만 있음
# library import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

np.random.seed(2018)

In [34]:
type(documents['Origin_Text'].values.tolist())

list

In [35]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()
%time vect.fit(documents['Origin_Text'].values.tolist())

CPU times: user 38.1 ms, sys: 1.96 ms, total: 40.1 ms
Wall time: 39.8 ms


CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [36]:
%time tsne_data = vect.transform(documents['Origin_Text'].values.tolist()).toarray()

CPU times: user 31.5 ms, sys: 5.96 ms, total: 37.4 ms
Wall time: 37.1 ms


In [37]:
tsne_data[:10]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [38]:
%time tsne_result = TSNE(learning_rate=300, init='pca').fit_transform(np.array(tsne_data))

CPU times: user 16.5 s, sys: 957 ms, total: 17.4 s
Wall time: 17.3 s


In [39]:
tsne_result[:10]

array([[ -5.5887737,  15.092761 ],
       [-26.751963 ,  -1.2962534],
       [ -7.7378716,   5.3128357],
       [-26.751284 ,  -1.296099 ],
       [ -5.5161123,   9.251663 ],
       [ -0.8104641,  16.026785 ],
       [ -4.9360843,  10.798    ],
       [ -7.008701 ,  11.168139 ],
       [-10.108216 ,   8.083006 ],
       [ -5.114168 ,  13.3855915]], dtype=float32)

----

### K-Means

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
from sklearn.cluster import KMeans

In [5]:
docs = list(documents.Origin_Text.values)

In [6]:
# vectorizing
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(docs)

# L2 normalizing
X = normalize(X, norm='l2')

In [56]:
X.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [7]:
kmeans_model = KMeans(n_clusters=8, init="random", max_iter=3000).fit(X)

In [8]:
# trained labels and cluster centers
labels = kmeans_model.labels_
centers = kmeans_model.cluster_centers_

----

### K-Means 결과를 LDAvis로 시각화
* https://lovit.github.io/nlp/2018/09/27/pyldavis_kmeans/#topic=0&lambda=1&term=

* topic_term_dists
* doc_topic_dists
* doc_lengths
* vocab
* term_frequency

In [9]:
import numpy as np

doc_lengths = np.asarray(X.sum(axis=1)).reshape(-1)
term_frequency = np.asarray(X.sum(axis=0)).reshape(-1)

In [10]:
docwords = [doc.split(' ') for doc in docs]
vocab = list(set(word for document in docwords for word in document))

In [15]:
from kmeans_to_pyLDAvis import kmeans_to_prepared_data

vis_data = kmeans_to_prepared_data(
    X,
    vocab[1:],
    centers,
    labels,
    n_printed_words = 10,
    radius = 5
)

In [12]:
import pyLDAvis

pyLDAvis.save_html(vis_data, 'kmeans_pyLDAvis.html')

In [24]:
vis_data.token_table.head()

Unnamed: 0,term,Topic,Freq,Term
0,1260,1,0.084599,Echocardiographic
1,514,1,0.930829,followed-up
2,1447,1,0.953266,Serum-antibody
3,1531,1,0.549004,"smear-positive,"
4,787,1,0.121786,"Size,"


----

### Visualization

#### 1. HBar Chart

In [17]:
import json

hbar_json = {}
hbar_json['labels'] = vis_data.topic_info.Category.unique().tolist()
hbar_json['max_width'] = vis_data.topic_info[vis_data.topic_info.Category != 'Default'][['Total']].max()[0]
for l in vis_data.topic_info.Category.unique().tolist():
    tmp_df = vis_data.topic_info[vis_data.topic_info.Category == l].sort_values(['Category', 'Freq'], ascending=[True, False]).groupby('Category').head()
    sub_json = {}

    hbar_json[l] = list(tmp_df[['Term', 'Freq', 'Total']].sort_values('Freq', ascending=False).reset_index().to_dict('index').values())
    
f = open('./km/hbar_data.json', 'w')
f.write(json.dumps(hbar_json, indent=4))
f.close()

#### 2. Scatter Chart

In [40]:
doc_result = documents[['index', 'Origin_Text']]
doc_result.columns = ['id', 'document']
doc_result['topic'] = kmeans_model.labels_
doc_result = pd.merge(doc_result, pd.DataFrame(tsne_result, columns=['plot_x', 'plot_y']), left_index=True, right_index=True)

doc_result.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,id,document,topic,plot_x,plot_y
0,0,Hazard Ratio (and 95% Confidence Intervals) in Univariate and Multivariate Analysis of Predictors of Major Cardiac Events (Cardiac Death or Worsening of Heart Failure Leading to Heart Transplantation),7,-5.588774,15.092761
1,1,"Left Ventricular and Right Ventricular Ejection Fractions, Left Ventricular and Right Ventricular Mean Phases, Left-to-Right Mean Phase Difference (L-RMP) and Phase Standard Deviations for Both Ventricles in 30 Cases of Left Sided WPW legend",2,-26.751963,-1.296253
2,2,"Partial Regression Coefficients (All Subjects, n = 262) for Forward Stepwise Linear Regression for Dependent Variables Augmentation Pressure and Augmentation Index legend",1,-7.737872,5.312836
3,3,"Left Ventricular (LVEF) and Right Ventricular (RVEF) Ejection Fractions, Left Ventricular (LVMP) and Right Ventricular (RVMP) Mean Phases, Left-to-Right Mean Phase Difference (L-RMP) and Phase Standard Deviations (LVPSD and RVPSD) for Both Ventricles in 14 Cases of Right Sided WPW legend",2,-26.751284,-1.296099
4,4,Predictors of Mortality by Multivariable Analysis: Variables Are Shown in the Order They Entered a Stepwise Cox Regression Model,0,-5.516112,9.251663


In [41]:
scatter_json = list(doc_result[['id', 'plot_x', 'plot_y', 'topic']].to_dict('index').values())

f = open('./km/scatter_data.json', 'w')
f.write(json.dumps(scatter_json, indent=4))
f.close()

#### 3. Table

In [187]:
import collections

doc_result['topic_word'] = processed_docs
doc_result['words_count'] = doc_result.words.apply(lambda x : len(x))

topic_words = {}
for i in doc_result.topic.unique():
    topic_words[i] = sorted({k: '%.2f' % (dict(collections.Counter(np.sum(doc_result[doc_result.topic == i].topic_word.values)))[k] / doc_result[doc_result.topic == i].words_count.sum() * 100) \
    for k in dict(collections.Counter(np.sum(doc_result[doc_result.topic == i].topic_word.values)))}.items(), key=lambda kv: kv[1], reverse=True)[:10]
    
doc_result.topic_word = doc_result.apply(lambda x: topic_words[x.topic], axis=1)

In [191]:
doc_result.to_csv('km.tsv', sep='\t', index_label=False)

In [188]:
doc_result.groupby('topic').head(1)[['topic', 'topic_word']]

Unnamed: 0,topic,topic_word
0,0,"[(year, 3.19), (patient, 3.06), (risk, 2.64), (mortal, 2.36), (cancer, 2.08), (relat, 1.94), (specif, 1.81), (number, 1.81), (score, 1.81), (ratio, 1.81)]"
1,2,"[(death, 7.22), (year, 6.67), (rat, 5.56), (ventricular, 4.44), (standardis, 3.89), (leav, 3.89), (right, 3.89), (caus, 3.89), (health, 3.33), (combin, 3.33)]"
2,3,"[(patient, 2.58), (risk, 1.50), (model, 1.29), (death, 1.29), (analysi, 1.20), (regress, 1.20), (hazard, 1.12), (accord, 1.07), (score, 1.03), (coronari, 1.03)]"
8,7,"[(seri, 6.67), (physic, 6.67), (origin, 6.67), (imag, 6.67), (activ, 6.67), (compress, 6.67), (effect, 6.67), (intervent, 6.67), (stroke, 5.56), (legend, 4.44)]"
101,1,"[(grade, 6.22), (patient, 5.78), (popul, 5.11), (emerg, 4.89), (group, 4.67), (safeti, 4.67), (relat, 3.56), (studi, 2.89), (occur, 2.89), (drug, 2.67)]"
128,6,"[(patient, 9.41), (coronari, 6.47), (year, 5.29), (myocardi, 4.12), (infarct, 4.12), (surviv, 3.53), (acut, 3.53), (random, 3.53), (hospit, 2.94), (failur, 2.94)]"
154,4,"[(legend, 7.50), (effect, 7.50), (cardiac, 2.50), (renin, 2.50), (respons, 2.50), (administ, 2.50), (interv, 2.50), (plasma, 2.50), (rate, 2.50), (myocardi, 2.50)]"
325,5,"[(chest, 5.00), (arteri, 5.00), (ctca, 5.00), (pain, 5.00), (index, 5.00), (myocardi, 5.00), (hospit, 5.00), (patient, 5.00), (accuraci, 5.00), (influenc, 5.00)]"


In [189]:
doc_result.groupby('topic').agg({'id': 'unique'})

Unnamed: 0_level_0,id
topic,Unnamed: 1_level_1
0,"[0, 5, 21, 37, 51, 52, 54, 55, 59, 63, 64, 69, 73, 77, 81, 83, 85, 87, 91, 94, 96, 98, 99, 151, 169, 176, 178, 182, 200, 201, 204, 206, 208, 209, 210, 212, 213, 215, 216, 220, 221, 223, 225, 230, 231, 232, 241, 255, 267, 271, 279, 280, 282, 296, 314, 318, 322, 326, 337, 345, 349, 351, 352, 355, 356, 357, 358, 360, 361, 374, 376, 383]"
1,"[101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 141, 142, 143, 144, 145, 146, 147, 148, 149]"
2,"[1, 3, 56, 57, 202, 207, 211, 222, 227, 229, 237, 238, 245, 247, 272, 278, 304, 327]"
3,"[2, 4, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 53, 58, 60, 61, 62, 65, 66, 67, 68, 70, 71, 72, 74, 75, 76, 78, 79, 80, 82, 84, 86, 88, 89, 90, 92, 93, 95, 97, 114, 115, 150, 152, 153, 157, 159, 160, 162, 163, 164, 165, 168, 170, 171, 172, 173, 174, 175, 177, 179, 185, 186, 187, 188, 189, 190, 191, 192, ...]"
4,"[154, 155, 156, 161]"
5,"[325, 342]"
6,"[128, 140, 158, 166, 167, 180, 181, 183, 184, 197, 203, 205, 236, 307, 310, 331, 375]"
7,"[8, 50, 100, 261, 291, 311, 312, 321, 373]"
