In [4]:
import pickle
import pandas as pd
import numpy as np

----

### Load model results

In [5]:
with open('./colab_results/reuters/lda_result.pkl', 'rb') as f:
    lda_results = pickle.load(f)
f.close()    
    
document_topic_counts, topic_word_counts, topic_counts, document_lengths, distinct_words = lda_results

In [6]:
with open('./colab_results/reuters/kmeans_result.pkl', 'rb') as f:
    km_results = pickle.load(f)
f.close()    
    
km_model = km_results

In [7]:
with open('./colab_results/reuters/dec_result.pkl', 'rb') as f:
    dec_results = pickle.load(f)
f.close()    
    
dec_model = dec_results

In [8]:
documents = pd.read_csv('./colab_results/reuters/reuter21578.tsv', sep='\t', usecols=['text', 'new_topic'])
processed_docs = pd.read_csv('./colab_results/reuters/preprocessed_docs.tsv', sep='\t', header=None)

In [9]:
documents = documents.reset_index()
documents.columns = ['id', 'document', 'new_topic']

In [10]:
documents.head()

Unnamed: 0,id,document,new_topic
0,0,Showers continued throughout the week in the B...,Commodity
1,1,The U.S. Agriculture Department reported the f...,Commodity
2,2,Argentine grain board figures show crop regist...,Commodity
3,3,Champion Products Inc said its board of direct...,Corporate
4,4,Computer Terminal Systems Inc said it has comp...,Corporate


In [11]:
processed_docs[1].head()

0    ['shower', 'continu', 'week', 'bahia', 'cocoa'...
1    ['agricultur', 'depart', 'report', 'farmer', '...
2    ['argentin', 'grain', 'board', 'figur', 'crop'...
3    ['champion', 'product', 'say', 'board', 'direc...
4    ['termin', 'system', 'say', 'complet', 'sale',...
Name: 1, dtype: object

In [14]:
topic_map = dict.fromkeys(documents.new_topic.unique().tolist(), 0)

for i, k in enumerate(topic_map):
    topic_map[k] = i
    
label_result = np.array(documents.new_topic.apply(lambda x: topic_map[x]).tolist())

In [15]:
topic_map

{'Commodity': 2,
 'Corporate': 1,
 'Currency': 4,
 'Economic': 3,
 'Energy': 0,
 'Subject': 5}

----

### Create model file

In [16]:
saved_model = {}

In [17]:
saved_model['documents'] = documents
saved_model['processed_docs'] = processed_docs[1]

* postprocess for lda

In [240]:
from sklearn.preprocessing import normalize

In [161]:
topic_term_dists = np.array([topic_word_counts[i][k] for i in range(6) for k in list(distinct_words)]).reshape((6, len(distinct_words))) 
doc_topic_dists = pd.DataFrame([d.values() for d in document_topic_counts]).fillna(0).values
doc_topic_dists = normalize(doc_topic_dists, norm='l1')
doc_lengths = np.array(document_lengths)
vocab = list(distinct_words)
term_frequency = np.array([topic_word_counts[i][k] for i in range(6) for k in list(distinct_words)]).reshape((6, len(distinct_words))).sum(axis=0)

In [27]:
saved_model['lda_result'] = {
        'document_topic_counts':document_topic_counts,
        'topic_word_counts':topic_word_counts,
        'topic_counts':topic_counts,
        'document_lengths':document_lengths,
        'distinct_words':distinct_words
}

In [26]:
saved_model['kmeans_result'] = {
        'cluster_centers' : km_model.cluster_centers_,
        'labels' : label_result
    }

In [19]:
saved_model['kmeans_result'] = km_model

In [28]:
saved_model['dec_result'] = dec_model

----

In [13]:
km_model.cluster_centers_

array([[-1.99899776e-19, -3.38813179e-20,  1.15196481e-19, ...,
        -1.18584613e-20,  4.57397792e-20,  4.65868121e-21],
       [ 1.62630326e-19,  5.84452734e-20,  1.25360876e-19, ...,
        -1.22819777e-20,  1.93123512e-19, -4.06575815e-20],
       [ 5.03544441e-05,  1.10724429e-05,  1.43995601e-19, ...,
         5.92498422e-06,  3.23660231e-05,  6.67493409e-06],
       [-9.82558219e-20, -2.28698896e-20,  9.99498878e-20, ...,
        -1.05879118e-20, -1.15196481e-19,  8.89384595e-21],
       [-3.04931861e-20, -1.01643954e-20,  1.20278679e-19, ...,
        -1.18584613e-20,  1.15196481e-19, -1.65171425e-20],
       [-8.60585474e-19,  2.48180654e-19,  4.67074692e-05, ...,
         1.66865491e-19, -5.81064602e-19, -1.07573184e-19]])

In [21]:
saved_model.keys()

dict_keys(['documents', 'processed_docs', 'lda_result', 'dec_result', 'kmeans_result'])

In [29]:
for k in saved_model.keys():
    if type(saved_model[k]) == dict:
        print("[%s] <class 'dict'>: %s(%s)" % (k, str(saved_model[k].keys()), [type(saved_model[k][t]) for t in saved_model[k].keys()]))
    else:
        print("[%s] %s" % (k, type(saved_model[k])))

[documents] <class 'pandas.core.frame.DataFrame'>
[processed_docs] <class 'pandas.core.series.Series'>
[lda_result] <class 'dict'>: dict_keys(['document_topic_counts', 'document_lengths', 'topic_counts', 'distinct_words', 'topic_word_counts'])([<class 'list'>, <class 'list'>, <class 'list'>, <class 'set'>, <class 'list'>])
[dec_result] <class 'numpy.ndarray'>
[kmeans_result] <class 'dict'>: dict_keys(['labels', 'cluster_centers'])([<class 'numpy.ndarray'>, <class 'numpy.ndarray'>])


In [69]:
with open('./colab_results/reuters/saved_model_reuters_no_km_with_label.pkl', 'wb') as f:
    pickle.dump(saved_model, f)
f.close()

In [84]:
import sklearn

In [85]:
type(saved_model['kmeans_result']) == sklearn.cluster.k_means_.KMeans

True

----

In [170]:
import pandas as pd
import numpy as np
import pyLDAvis
import pyLDAvis.gensim as genldavis
import sklearn
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer
import pickle

In [171]:
documents = saved_model['documents']
processed_docs = saved_model['processed_docs']

In [197]:
processed_docs = processed_docs.apply(lambda x: x[1:-1].replace("'", "").split(', '))

In [199]:
vect = CountVectorizer()
tsne_data = vect.fit_transform([' '.join(d) for d in processed_docs]).toarray()

In [200]:
kmeans_result = saved_model['kmeans_result']
if type(kmeans_result) == sklearn.cluster.k_means_.KMeans:
    kmeans_centers = kmeans_result.cluster_centers_
    kmeans_labels = kmeans_result.labels_
else:
    kmeans_centers = kmeans_result['cluster_centers']
    kmeans_labels = kmeans_result['labels']

In [201]:
x = normalize(tsne_data, norm='l2')

In [202]:
x.shape

(10377, 17217)

In [203]:
len(list(set(np.sum(processed_docs.values))))

17217

----

In [15]:
with open('./drug_tables.pkl', 'rb') as f:
    drug_tables = pickle.load(f)
f.close()



In [17]:
with open('./km_gracee_result.pkl', 'rb') as f:
    km_results = pickle.load(f)
f.close()    
    
km_model = km_results



In [20]:
drug_tables['kmeans_result'] = km_model

In [24]:
km_model.cluster_centers_.shape

(8, 8178)

In [22]:
with open('./grace_drug_table.pkl', 'wb') as f:
    pickle.dump(drug_tables, f)
f.close()

----

### UMAP

* https://www.analyticsvidhya.com/blog/2018/08/dimensionality-reduction-techniques-python/
* http://replet.tistory.com/67 (그냥 참고)

In [33]:
!pip install umap-learn

Collecting umap-learn
  Downloading https://files.pythonhosted.org/packages/4e/ce/33e260133f2a8e6c24a434e22de31f1dff01d58b3beec033d5c1b544bfb7/umap-learn-0.3.7.tar.gz (40kB)
[K    100% |████████████████████████████████| 40kB 566kB/s 
Collecting numba>=0.37 (from umap-learn)
  Downloading https://files.pythonhosted.org/packages/79/25/03ea2db69dfa3e2b42607afa106b54e29858f9921da9299abf447c484414/numba-0.41.0-cp35-cp35m-macosx_10_9_x86_64.whl (1.6MB)
[K    100% |████████████████████████████████| 1.6MB 450kB/s 
[?25hCollecting llvmlite>=0.26.0dev0 (from numba>=0.37->umap-learn)
  Downloading https://files.pythonhosted.org/packages/33/a5/b8d25cbdfd94b5647f6107bb39802ce67afc098cc9cd9935f40415ff4def/llvmlite-0.26.0-cp35-cp35m-macosx_10_9_x86_64.whl (12.5MB)
[K    100% |████████████████████████████████| 12.5MB 59kB/s 
[?25hBuilding wheels for collected packages: umap-learn
  Running setup.py bdist_wheel for umap-learn ... [?25ldone
[?25h  Stored in directory: /Users/gracelee/Library/Cach

In [204]:
import umap
%time umap_data = umap.UMAP().fit_transform(tsne_data)

CPU times: user 3min 51s, sys: 6.86 s, total: 3min 58s
Wall time: 3min 58s


In [None]:
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111, projection='3d')
plt.title('Decomposition using UMAP')

umap_df = pd.concat([pd.DataFrame(umap_data, columns=['pc1', 'pc2', 'pc3']), pd.DataFrame(dec_model, columns=['label'])], axis=1)

for l in umap_df.label.unique():
    clusterPoints = umap_df[umap_df.label == l]
    ax.scatter(clusterPoints.pc1, clusterPoints.pc2, clusterPoints.pc3)
    
ax.legend(umap_df.label.unique())    
    
plt.show()

In [None]:
pd.read_csv()

----

In [3]:
import pyLDAvis.gensim as genldavis

In [None]:
genldavis.prepare()