In this notebook we deal with the NLP involved in clustering the training data, and so create new top-level categories (only 10, instead of the 63 that the MSC offers).

In [2]:
import numpy as np
import pandas as pd
import pickle
from sklearn.cluster import KMeans 
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline

In [3]:
# All of this is done in 3-NLP classification
# - we're just packaging it here in a function.
def get_train_valid_test_split_from_pickles(rs=42, ts=0.3):
    
    with open('../thesis_msc_titled_filled_20190729.pickle', 'rb') as f:
        thesis_msc_titled_filled = pickle.load(f)
    with open('../cleaned_combined_msc_corpus_20190729.pickle', 'rb') as f:
        cleaned_combined_msc_corpus = pickle.load(f)
        
    # thesis_msc_titled_filled are for train and test.
    # thesis_msc_titled_unfilled are for actual classification.
    # msc titled: 220190 = 132116 filled + 88074 unfilled

    X = thesis_msc_titled_filled['thesis'].copy()
    y = thesis_msc_titled_filled['msc'].copy()
    feature_cols = ['thesis']
    
    X_tv, X_test, y_tv, y_test = train_test_split(X, y, random_state=rs, test_size=ts)
    X_train, X_valid, y_train, y_valid = train_test_split(X_tv, y_tv, 
                                                          random_state=rs, test_size=ts)
    print(f"Created train-validation-test split for X, y with sizes ", end="")
    print(f"train: {len(X_train)}, validation: {len(X_valid)}, test: {len(X_test)}.")

    # pipe.fit has problems with pandas data frames.
    X_train_list = list(X_train)
    X_valid_list = list(X_valid.values)
    X_test_list  = list(X_test.values)

    y_train_array = np.array(y_train)
    y_valid_array = np.array(y_valid)
    y_test_array  = np.array(y_test)

    # Add cleaned_combined_msc_corpus to the training set.
    for k, v in cleaned_combined_msc_corpus.items():
        # convert this list into a dictionary into a data frame
        # and append it to the training dataframe.
        X_train_list.extend(v)
        y_train_array = np.array(list(y_train_array) + [k]*len(v))
    
    print(f"After adding to train: train-validation-test split for X, y with sizes ", end="")
    print(f"train: {len(X_train)}, validation: {len(X_valid)}, test: {len(X_test)}.")

    return X_train_list, X_valid_list, X_test_list, y_train_array, y_valid_array, y_test_array

In [4]:
with open('../custom_stop_words.pickle', 'rb') as f:
    stop_words = pickle.load(f)
    
# interestingly enough, we keep getting German and French words coming out in topics.
more_stop_words = ['modles', 'berechnung', 'zum', 'quelques', 'contribution', 
                   'untersuchungen', 'thorie', 'tude', 'quations', 'etude', 
                   'aux', 'self', 'systmes', 'van', 'analyse']
stop_words = stop_words.union(more_stop_words)

In [5]:
X_train_list, X_valid_list, X_test_list, \
    y_train_array, y_valid_array, y_test_array = get_train_valid_test_split_from_pickles()

Created train-validation-test split for X, y with sizes train: 64741, validation: 27747, test: 39638.


In [128]:
len(X_train_list), len(X_valid_list), len(X_test_list), \
    len(y_train_array), len(y_valid_array), len(y_test_array)

(203580, 27747, 39638, 203580, 27747, 39638)

In [6]:
# https://towardsdatascience.com/limericking-part-2-topic-modeling-with-lda-45476ab9af15
# thanks Max!

# we run the LDA multiple times, each time we get a topic with more non-English words 
# to add to the stop words list.

#sklearn makes it easy to vectorize and perform LDA:
#cv = CountVectorizer(max_df=.8, min_df=3, stop_words='english')
cv = CountVectorizer(stop_words=stop_words)
vectors = cv.fit_transform(X_train_list)

LDA = LatentDirichletAllocation(n_components=10, n_jobs=-1, verbose=20)
LDA.fit(vectors)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:   46.4s
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   47.2s remaining:   47.2s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   47.6s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   47.6s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


iteration: 1 of max_iter: 10


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:   40.1s
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   42.0s remaining:   42.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   42.5s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   42.5s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


iteration: 2 of max_iter: 10


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:   35.0s
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   35.8s remaining:   35.8s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   37.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   37.0s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


iteration: 3 of max_iter: 10


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:   39.1s
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   39.7s remaining:   39.7s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   40.4s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   40.4s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


iteration: 4 of max_iter: 10


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:   50.3s
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   50.7s remaining:   50.7s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   52.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   52.0s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


iteration: 5 of max_iter: 10


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:   42.1s
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   43.3s remaining:   43.3s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   44.4s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   44.4s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


iteration: 6 of max_iter: 10


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:   31.8s
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   32.3s remaining:   32.3s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   32.9s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   32.9s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


iteration: 7 of max_iter: 10


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:   24.0s
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   24.6s remaining:   24.6s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   25.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   25.0s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


iteration: 8 of max_iter: 10


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:   21.8s
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   22.6s remaining:   22.6s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   23.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   23.0s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


iteration: 9 of max_iter: 10


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:   26.9s
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   27.6s remaining:   27.6s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   28.1s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   28.1s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


iteration: 10 of max_iter: 10


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:   21.8s
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   22.3s remaining:   22.3s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   22.8s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   22.8s finished


LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=10, n_jobs=-1,
                          perp_tol=0.1, random_state=None,
                          topic_word_prior=None, total_samples=1000000.0,
                          verbose=20)

In [3]:
# pickle the LDA model!
#with open('cv_10topics.pickle', 'wb') as f:
#    pickle.dump(cv, f)
#with open('LDA_10topics.pickle', 'wb') as f:
#    pickle.dump(LDA, f)
#with open('vectors_10topics.pickle', 'wb') as f:
#    pickle.dump(vectors, f)
with open('cv_10topics.pickle', 'rb') as f:
    cv = pickle.load(f)
with open('LDA_10topics.pickle', 'rb') as f:
    LDA = pickle.load(f)
with open('vectors_10topics.pickle', 'rb') as f:
    vectors = pickle.load(f)

In [10]:
len(LDA.components_[0])

76860

In [4]:
num_keywords = 15
#And to demonstrate what the topics it finds look like:
for index,topic in enumerate(LDA.components_):
    print(f'THE TOP {num_keywords} WORDS FOR TOPIC #{index}')
    print([cv.get_feature_names()[i] for i in topic.argsort()[-num_keywords:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['regression', 'based', 'using', 'time', 'inference', 'bayesian', 'applications', 'model', 'random', 'statistical', 'methods', 'estimation', 'analysis', 'data', 'models']


THE TOP 15 WORDS FOR TOPIC #1
['approach', 'algorithms', 'time', 'programming', 'analysis', 'methods', 'optimal', 'theory', 'processes', 'applications', 'optimization', 'stochastic', 'problems', 'control', 'systems']


THE TOP 15 WORDS FOR TOPIC #2
['decision', 'methods', 'waves', 'phase', 'models', 'fluid', 'mathematical', 'study', 'modeling', 'simulation', 'analysis', 'flows', 'numerical', 'dynamics', 'flow']


THE TOP 15 WORDS FOR TOPIC #3
['empirical', 'international', 'trade', 'analysis', 'growth', 'labor', 'markets', 'effects', 'financial', 'policy', 'theory', 'market', 'economics', 'economic', 'essays']


THE TOP 15 WORDS FOR TOPIC #4
['complex', 'hilbert', 'space', 'classes', 'algebraic', 'cohomology', 'modules', 'algebra', 'properties', 'curves', 'rings', 'surfaces', 'spaces', 

In [9]:
# now to save the topic keywords:
topic_keywords = []
for index,topic in enumerate(LDA.components_):
    topic_keywords.append([cv.get_feature_names()[i] for i in topic.argsort()[-num_keywords:]])

In [5]:
#with open('10topics_15keywords_20190729_3.pickle', 'wb') as f:
#    pickle.dump(topic_keywords, f)
with open('10topics_15keywords_20190729_3.pickle', 'rb') as f:
    LDA_topic_keywords = pickle.load(f)

In [42]:
lda_ft = LDA.fit_transform(vectors)  # before all we did was fit

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:   48.2s
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   49.0s remaining:   49.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   49.5s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   49.5s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


iteration: 1 of max_iter: 10


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:   53.1s
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   56.1s remaining:   56.1s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   57.1s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   57.1s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


iteration: 2 of max_iter: 10


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:   41.5s
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   41.5s remaining:   41.5s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   42.7s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   42.7s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


iteration: 3 of max_iter: 10


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:   33.8s
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   34.7s remaining:   34.7s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   35.3s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   35.3s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


iteration: 4 of max_iter: 10


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:   33.3s
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   34.0s remaining:   34.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   34.4s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   34.4s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


iteration: 5 of max_iter: 10


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:   30.9s
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   31.7s remaining:   31.7s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   32.4s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   32.4s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


iteration: 6 of max_iter: 10


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:   33.7s
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   33.8s remaining:   33.8s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   34.5s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   34.5s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


iteration: 7 of max_iter: 10


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:   29.9s
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   30.3s remaining:   30.3s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   30.7s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   30.7s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


iteration: 8 of max_iter: 10


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:   28.4s
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   29.0s remaining:   29.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   29.6s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   29.6s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


iteration: 9 of max_iter: 10


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:   24.7s
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   25.0s remaining:   25.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   25.5s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   25.5s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


iteration: 10 of max_iter: 10


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:   21.3s
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   21.4s remaining:   21.4s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   21.9s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   21.9s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:   17.0s
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   17.1s remaining:   17.1s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   17.5s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   17.5s finished


In [43]:
# Now to visualize the LDA via TSNE?

# https://shuaiw.github.io/2016/12/22/topic-modeling-and-tsne-visualzation.html

from sklearn.manifold import TSNE

# a t-SNE model
# angle value close to 1 means sacrificing accuracy for speed
# pca initializtion usually leads to better results 
tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
# possibly use TruncatedSVD next time? this is taking a while to run w/o verbosity

# many-many-D -> 2-D
tsne_lda = tsne_model.fit_transform(lda_ft)  # lda_ft is called X_topics in the tutorial

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 203580 samples in 0.278s...
[t-SNE] Computed neighbors for 203580 samples in 128.642s...
[t-SNE] Computed conditional probabilities for sample 1000 / 203580
[t-SNE] Computed conditional probabilities for sample 2000 / 203580
[t-SNE] Computed conditional probabilities for sample 3000 / 203580
[t-SNE] Computed conditional probabilities for sample 4000 / 203580
[t-SNE] Computed conditional probabilities for sample 5000 / 203580
[t-SNE] Computed conditional probabilities for sample 6000 / 203580
[t-SNE] Computed conditional probabilities for sample 7000 / 203580
[t-SNE] Computed conditional probabilities for sample 8000 / 203580
[t-SNE] Computed conditional probabilities for sample 9000 / 203580
[t-SNE] Computed conditional probabilities for sample 10000 / 203580
[t-SNE] Computed conditional probabilities for sample 11000 / 203580
[t-SNE] Computed conditional probabilities for sample 12000 / 203580
[t-SNE] Computed conditional proba

[t-SNE] Computed conditional probabilities for sample 118000 / 203580
[t-SNE] Computed conditional probabilities for sample 119000 / 203580
[t-SNE] Computed conditional probabilities for sample 120000 / 203580
[t-SNE] Computed conditional probabilities for sample 121000 / 203580
[t-SNE] Computed conditional probabilities for sample 122000 / 203580
[t-SNE] Computed conditional probabilities for sample 123000 / 203580
[t-SNE] Computed conditional probabilities for sample 124000 / 203580
[t-SNE] Computed conditional probabilities for sample 125000 / 203580
[t-SNE] Computed conditional probabilities for sample 126000 / 203580
[t-SNE] Computed conditional probabilities for sample 127000 / 203580
[t-SNE] Computed conditional probabilities for sample 128000 / 203580
[t-SNE] Computed conditional probabilities for sample 129000 / 203580
[t-SNE] Computed conditional probabilities for sample 130000 / 203580
[t-SNE] Computed conditional probabilities for sample 131000 / 203580
[t-SNE] Computed con

In [123]:
with open('tsne_lda_20190730.pickle', 'wb') as f:
    pickle.dump(tsne_lda, f)

In [125]:
with open('lda_ft_20190730.pickle', 'wb') as f:
    pickle.dump(lda_ft, f)

In [44]:
import numpy as np
import bokeh.plotting as bp
from bokeh.plotting import save
from bokeh.models import HoverTool

n_top_words = 15 # number of keywords we show

# 20 colors
colormap = np.array([
    "#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78", "#2ca02c",
    "#98df8a", "#d62728", "#ff9896", "#9467bd", "#c5b0d5",
    "#8c564b", "#c49c94", "#e377c2", "#f7b6d2", "#7f7f7f",
    "#c7c7c7", "#bcbd22", "#dbdb8d", "#17becf", "#9edae5"
])

In [47]:
_lda_keys = []
for i in range(lda_ft.shape[0]):
    _lda_keys +=  lda_ft[i].argmax(), 
    # the LDA key topic is the index with the largest probability

In [61]:
# also found this piece in https://pythonhosted.org/lda/

topic_summaries = []
#topic_word = LDA.topic_word_  # all topic words
topic_word = LDA.components_  # all topic words... but not really
vocab = cv.get_feature_names()
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1] # get!
    topic_summaries.append(' '.join(topic_words)) # append!

In [127]:
#print(X_train_list[:10])
#print(_lda_keys[:10])
#colormap[_lda_keys][:num_example]
len(tsne_lda[:, 0]), len(X_train_list)

(203580, 203580)

In [460]:
X_train_list = list(np.array(X_train_list))
_lda_keys = list(np.array(_lda_keys))
x = tsne_lda[:, 0]
y = tsne_lda[:, 1]
colormap = colormap[_lda_keys]


# if we wish to see fewer points, use these "cuts"
threshold = 0.5
_idx = np.amax(lda_ft, axis=1) > threshold  # idx of doc that above the threshold
lda_ft_cut = lda_ft[_idx]

X_train_list_cut = list(np.array(X_train_list)[_idx])
_lda_keys_cut = list(np.array(_lda_keys)[_idx])
x_cut = tsne_lda[:, 0][_idx]
y_cut = tsne_lda[:, 1][_idx]
colormap_cut = colormap[_lda_keys][_idx]

In [499]:
title = 'Mathematics Genealogy Project: LDA t-SNE PCA of 10 topics in 2 dimensions'
filename = 'MGP_topics_LDA_viz'
num_example = len(lda_ft)

plot_lda = bp.figure(plot_width=1200, plot_height=1000,
                     title=title,
#                     tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
                     tools="pan,wheel_zoom,box_zoom,reset,previewsave",
                     x_axis_type=None, y_axis_type=None, min_border=1)

plot_lda.scatter(x='x', y='y', fill_color='color', fill_alpha=0.6, line_color=None,
                 source=bp.ColumnDataSource({
                   "x": x, "y": y, "color": colormap,
                   "content": X_train_list, # all of it was [:num_example]
                   "topic_key": _lda_keys
                   }))

In [500]:
# randomly choose a title (within a topic) coordinate as the crucial words coordinate
topic_coord = np.empty((lda_ft.shape[1], 2)) * np.nan
for topic_num in _lda_keys:
    if not np.isnan(topic_coord).any():
        break
    topic_coord[topic_num] = tsne_lda[_lda_keys.index(topic_num)]

# plot crucial words
for i in range(lda_ft.shape[1]):
    split_topic = topic_summaries[i].split(' ')
    half = len(split_topic) // 2
    print_text = ' '.join(split_topic[:half]) + '\n' + ' '.join(split_topic[half:])
#    plot_lda.text(topic_coord[i, 0], topic_coord[i, 1], [topic_summaries[i]])
    plot_lda.text(topic_coord[i, 0], topic_coord[i, 1], [print_text])
#    print(print_text)

    
# hover tools
#hover = plot_lda.select(dict(type=HoverTool))
#hover.tooltips = {"content": "@content - topic: @topic_key"}

# save the plot
save(plot_lda, '{}.html'.format(filename))

'/Users/michael/Dropbox/code/Flatiron School/042219/mod7-final/write-up/MGP_topics_LDA_viz.html'

In [12]:
# Another way we can do clustering is with k-means clustering. 
# Let's see if we get similar results.

# k-means clustering for vectorized documents, with truncated SVD

kmeans_steps = [('count', CountVectorizer(stop_words=stop_words)),     # vectorize,
                ('svd', TruncatedSVD(random_state=42)),                # reduce dimensionality, 
                ('kmeans', KMeans(n_clusters=10, n_jobs=-1, n_init=10, # then cluster
                                  random_state=42, verbose=10))]
kmeans_pipe = Pipeline(steps=kmeans_steps)

In [13]:
kmeans_pipe.fit(X_train_list) # Run the clustering algorithm
cluster_assignments = kmeans_pipe.predict(X_valid_list) # Generate cluster index values for each row in df

In [14]:
len(cluster_assignments), len(X_valid_list) 
# we have 10 clusters for the validation set.
# we can spot check them to decide what these clusters are.

(27747, 27747)

In [15]:
X_valid_cluster_df = pd.DataFrame(data={'thesis': X_valid_list, 'cluster': cluster_assignments})

In [16]:
#X_valid_cluster_df.head()
#X_valid_cluster_df[X_valid_cluster_df['cluster']==6]

While there is no good way to visualize k-means, we can compare the overlap of different topics.

Let's get the clusters for X_test_list for both k-means and LDA.

In [511]:
cluster_test_kmeans = kmeans_pipe.predict(X_test_list)

In [522]:
vectors_test = cv.fit_transform(X_test_list)
lda_test = LDA.fit_transform(vectors_test)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:   10.3s
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   10.4s remaining:   10.4s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   10.5s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   10.5s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


iteration: 1 of max_iter: 10


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    5.7s
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    5.8s remaining:    5.8s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    5.9s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    5.9s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


iteration: 2 of max_iter: 10


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    4.8s
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    4.8s remaining:    4.8s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    5.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    5.0s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


iteration: 3 of max_iter: 10


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    3.6s
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    3.6s remaining:    3.6s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    3.7s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    3.7s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


iteration: 4 of max_iter: 10


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    4.0s
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    4.1s remaining:    4.1s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    4.2s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    4.2s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


iteration: 5 of max_iter: 10


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    3.7s
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    3.8s remaining:    3.8s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    3.9s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    3.9s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


iteration: 6 of max_iter: 10


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    3.1s
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    3.2s remaining:    3.2s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    3.2s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    3.2s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


iteration: 7 of max_iter: 10


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    4.2s
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    4.2s remaining:    4.2s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    4.3s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    4.3s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


iteration: 8 of max_iter: 10


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    3.4s
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    3.4s remaining:    3.4s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    3.5s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    3.5s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


iteration: 9 of max_iter: 10


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    3.0s
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    3.0s remaining:    3.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    3.1s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    3.1s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


iteration: 10 of max_iter: 10


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    3.4s
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    3.5s remaining:    3.5s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    3.5s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    3.5s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    2.1s
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    2.1s remaining:    2.1s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    2.2s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    2.2s finished


In [527]:
cluster_test_lda = []
for i in range(lda_test.shape[0]):
    cluster_test_lda += lda_test[i].argmax(),
    # the LDA key topic is the index with the largest probability

In [528]:
len(cluster_test_kmeans), len(cluster_test_lda)

(39638, 39638)

In [536]:
cluster_test_kmeans[:20]

array([5, 4, 8, 0, 1, 4, 8, 8, 0, 8, 0, 4, 8, 5, 4, 8, 0, 1, 0, 8],
      dtype=int32)

In [532]:
cluster_test_lda[:20]

[4, 7, 9, 0, 1, 7, 9, 9, 3, 7, 7, 7, 9, 3, 8, 9, 2, 3, 6, 6]

There are 39,638 titles in the testing set. On how many do LDA and k-means agree on clustering?

In [546]:
# build a list of lists holding the indices of each cluster
cluster_test_kmeans_index = []
cluster_test_lda_index = []
for i in range(10):
    cluster_test_kmeans_index.append(set())  # use sets, b/c indices are unique
    cluster_test_lda_index.append(set())

In [547]:
for i in range(len(cluster_test_kmeans)):
    cluster_test_kmeans_index[cluster_test_kmeans[i]].add(i)
    cluster_test_lda_index[cluster_test_lda[i]].add(i)

In [552]:
# Now we have the index lists for each of the 10 clusters 
# in each of the two clustering methods.

# Now we compare the intersections of each pairing and find 
# the highest percentages, to measure how much the two 
# clusterings "agree".

count_overlap = []
# count_overlap[i][j] contains a triple:
# (len(kmeans[i]), len(lda[j]), len(intersection)))

for i in range(10):
    count_overlap.append([])
    for j in range(10):
        intersect = cluster_test_kmeans_index[i].intersection(cluster_test_lda_index[j])
        count_overlap[i].append((len(cluster_test_kmeans_index[i]), 
                                 len(cluster_test_lda_index[j]), 
                                 len(intersect)))

In [554]:
# Which of the LDA's clusters does the first cluster of k-means agree with?
count_overlap[0]
# Seemingly, none of them.

[(7706, 3794, 987),
 (7706, 2754, 514),
 (7706, 5749, 1344),
 (7706, 8932, 2063),
 (7706, 2732, 390),
 (7706, 3260, 551),
 (7706, 2768, 515),
 (7706, 3837, 521),
 (7706, 2657, 375),
 (7706, 3155, 446)]

We see that the two clustering methods came up with completely different clusters.

In [555]:
with open('kmeans_lda_cluster_overlap_sizes.pickle', 'wb') as f:
    pickle.dump(count_overlap, f)