In [7]:
import re
import lda
import json
import spacy
import warnings
import numpy as np
import pandas as pd
import pyLDAvis.sklearn

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import pairwise_distances
from nltk.corpus import stopwords

def tokenize(string):
    doc = model(string)
    words = []
    for w in doc:
        if w.pos_ not in discard and len(w.lemma_) > 1:
            words.append(w.lemma_ + "_" + w.pos_)
    return words

stopwords = stopwords.words('english')
warnings.simplefilter('ignore')
reg = re.compile(r'[0-9]+')
discard = {'PUNCT', 'SPACE', 'SYM', 'NUM', 'X', 'PRON', 'DET', 'PART', 'ADP'}

pyLDAvis.enable_notebook()
model = spacy.load('en')

In [12]:
data = fetch_20newsgroups(categories=['rec.autos','rec.motorcycles', 'sci.crypt', 
                                      'sci.electronics','sci.med','sci.space','talk.politics.guns',
                                      'talk.politics.mideast','talk.religion.misc'])

data.data = [reg.sub("", i) for i in data.data]

vectorizer = CountVectorizer(stop_words=stopwords, max_df=0.3, min_df=10, lowercase=False, tokenizer=tokenize)
vectorized_data = vectorizer.fit_transform(data.data)

In [107]:
model = lda.LDA(n_topics=20, n_iter=2500)
transformed_data = model.fit_transform(vectorized_data)

INFO:lda:n_documents: 5052
INFO:lda:vocab_size: 8074
INFO:lda:n_words: 667023
INFO:lda:n_topics: 20
INFO:lda:n_iter: 2500
INFO:lda:<0> log likelihood: -8075724
INFO:lda:<10> log likelihood: -6085954
INFO:lda:<20> log likelihood: -5806069
INFO:lda:<30> log likelihood: -5708973
INFO:lda:<40> log likelihood: -5652210
INFO:lda:<50> log likelihood: -5616562
INFO:lda:<60> log likelihood: -5591478
INFO:lda:<70> log likelihood: -5571757
INFO:lda:<80> log likelihood: -5558738
INFO:lda:<90> log likelihood: -5547197
INFO:lda:<100> log likelihood: -5538730
INFO:lda:<110> log likelihood: -5530909
INFO:lda:<120> log likelihood: -5523478
INFO:lda:<130> log likelihood: -5519177
INFO:lda:<140> log likelihood: -5510194
INFO:lda:<150> log likelihood: -5503387
INFO:lda:<160> log likelihood: -5501162
INFO:lda:<170> log likelihood: -5498268
INFO:lda:<180> log likelihood: -5493835
INFO:lda:<190> log likelihood: -5491690
INFO:lda:<200> log likelihood: -5488625
INFO:lda:<210> log likelihood: -5487304
INFO:lda:

INFO:lda:<2000> log likelihood: -5459852
INFO:lda:<2010> log likelihood: -5460463
INFO:lda:<2020> log likelihood: -5461173
INFO:lda:<2030> log likelihood: -5461748
INFO:lda:<2040> log likelihood: -5460525
INFO:lda:<2050> log likelihood: -5461119
INFO:lda:<2060> log likelihood: -5462224
INFO:lda:<2070> log likelihood: -5459958
INFO:lda:<2080> log likelihood: -5459628
INFO:lda:<2090> log likelihood: -5460618
INFO:lda:<2100> log likelihood: -5459756
INFO:lda:<2110> log likelihood: -5460554
INFO:lda:<2120> log likelihood: -5459299
INFO:lda:<2130> log likelihood: -5461376
INFO:lda:<2140> log likelihood: -5461082
INFO:lda:<2150> log likelihood: -5462392
INFO:lda:<2160> log likelihood: -5460230
INFO:lda:<2170> log likelihood: -5462199
INFO:lda:<2180> log likelihood: -5461663
INFO:lda:<2190> log likelihood: -5460601
INFO:lda:<2200> log likelihood: -5461062
INFO:lda:<2210> log likelihood: -5459802
INFO:lda:<2220> log likelihood: -5459434
INFO:lda:<2230> log likelihood: -5460776
INFO:lda:<2240> 

In [21]:
pylda = pyLDAvis.sklearn.prepare(model, vectorized_data, vectorizer, mds='mmds')
#pylda

In [134]:
top = {}
for i in range(1, 11):
    top[i] = pylda.token_table[pylda.token_table['Topic'] == i].sort_values('Freq', ascending=False)
    
for i in range(1,11):
    print('Topic:', i)
    print('Top words:', list(top[i]['Term'].head(10)))
    print()

Topic: 1
Top words: ['quite_ADJ', 'logical_ADJ', 'influence_NOUN', 'confuse_VERB', 'convince_VERB', 'whatsoever_ADV', 'nevertheless_ADV', 'fashion_NOUN', 'inherently_ADV', 'accusation_NOUN']

Topic: 2
Top words: ['smash_VERB', 'scratch_VERB', 'upstairs_ADV', 'accounts_PROPN', 'violations_PROPN', 'azerbaijanis_PROPN', 'yerevan_PROPN', 'hall_NOUN', 'apartment_NOUN', 'dseg_PROPN']

Topic: 3
Top words: ['annual_PROPN', 'isbn_PROPN', 'convention_PROPN', 'commission_PROPN', 'participation_NOUN', 'chair_PROPN', 'transportation_PROPN', 'assembly_PROPN', 'staff_PROPN', 'editor_PROPN']

Topic: 4
Top words: ['crypto_ADJ', 'crypto_ADV', 'cellular_ADJ', 'denning_PROPN', 'toal_PROPN', 'decrypt_VERB', 'escrow_VERB', 'encryption_NOUN', 'clipper_NOUN', 'encryption_PROPN']

Topic: 5
Top words: ['fm_PROPN', 'sat_PROPN', 'simms_PROPN', 'bulletin_PROPN', 'compuserve_PROPN', 'williams_PROPN', 'brookline_PROPN', 'barry_PROPN', 'mailing_PROPN', 'babb_PROPN']

Topic: 6
Top words: ['odometer_NOUN', 'ak@yfn.ysu.

In [109]:
distances = pairwise_distances(transformed_data, metric='cosine')
np.fill_diagonal(distances, np.inf)
distances

array([[       inf, 0.96506414, 0.74076223, ..., 0.6835988 , 0.74007829,
        0.73183347],
       [0.96506414,        inf, 0.93861441, ..., 0.96581955, 0.80646567,
        0.96829524],
       [0.74076223, 0.93861441,        inf, ..., 0.89028857, 0.88864282,
        0.91518388],
       ...,
       [0.6835988 , 0.96581955, 0.89028857, ...,        inf, 0.70277978,
        0.63332169],
       [0.74007829, 0.80646567, 0.88864282, ..., 0.70277978,        inf,
        0.68347683],
       [0.73183347, 0.96829524, 0.91518388, ..., 0.63332169, 0.68347683,
               inf]])

In [None]:
score = np.mean(data.target[np.argpartition(distances, 99)[:,:100]] == data.target[:,None])
print('Top 100 mAP is {}'.format(score))

In [101]:
scores = {}
for i in range(len(distances)):
    if data.target[i] not in scores:
        scores[data.target[i]] = []
    scores[data.target[i]].append(np.mean(data.target[np.argsort(distances[i])[:100]] == data.target[i]))
    
for i in scores.keys():
    scores[i] = np.mean(scores[i])
    
dataframe = pd.DataFrame(list(scores.values()), index=scores.keys())
dataframe.index = [data.target_names[i] for i in dataframe.index]
dataframe.columns = ['Top 100 AP']
dataframe.sort_values('Top 100 AP', ascending=False)

Unnamed: 0,Top 100 AP
talk.politics.mideast,0.825762
sci.crypt,0.797966
rec.motorcycles,0.775
sci.electronics,0.733384
sci.space,0.729933
sci.med,0.694327
talk.politics.guns,0.686722
rec.autos,0.686684
talk.religion.misc,0.59244
talk.politics.misc,0.467376


In [153]:
data.target[0]

5

In [151]:
data.target[np.argsort(distances[0])[1:101]] == 5

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

In [152]:
np.mean(data.target[np.argsort(distances[0])[1:101]] == 5)

0.99