In [3]:
import pandas as pd
import os

os.chdir('..')

#read data into papers
papers = pd.read_csv(r'C:\Users\Nalin\Desktop\papers.csv')
papers.head()

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a..."


In [4]:
#remove the columns
papers = papers.drop(columns=['id','title','abstract','event_type','pdf_name','year'],axis=1)

#sample only 100 papers
papers = papers.sample(100)

#print out the first rows of papers
papers.head()

Unnamed: 0,paper_text
1925,Brain Inspired Reinforcement Learning\nFran?oi...
5621,Dimension-Free Iteration Complexity of Finite ...
4085,Efficient Spike-Coding with Multiplicative\nAd...
1576,An MDP-Based Approach to Online\nMechanism Des...
1483,Finding the M Most Probable\nConfigurations Us...


In [5]:
#load the regular expression library
import re

#remove punctuation
papers['paper_text_processed'] = papers['paper_text'].map(lambda x: re.sub('[,\.!?]','',x))

#convert the titles to lowercase
papers['paper_text_processed'] = papers['paper_text_processed'].map(lambda x: x.lower())

#print out the first rows of papers
papers['paper_text_processed'].head()

1925    brain inspired reinforcement learning\nfranois...
5621    dimension-free iteration complexity of finite ...
4085    efficient spike-coding with multiplicative\nad...
1576    an mdp-based approach to online\nmechanism des...
1483    finding the m most probable\nconfigurations us...
Name: paper_text_processed, dtype: object

In [6]:
import gensim
from gensim.utils import simple_preprocess

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True)) #deacc=True removes punctuations
        
data = papers.paper_text_processed.values.tolist()
data_words = list(sent_to_words(data))

print(data_words[:1][0][:30])

['brain', 'inspired', 'reinforcement', 'learning', 'franois', 'rivest', 'yoshua', 'bengio', 'dpartement', 'dinformatique', 'et', 'de', 'recherche', 'oprationnelle', 'universit', 'de', 'montral', 'cp', 'succ', 'centre', 'ville', 'montral', 'qc', 'canada', 'francoisrivest', 'mailmcgillca', 'bengioy', 'iroumontrealca', 'john', 'kalaska']


In [7]:
#build the bigram and trigram models
bigram = gensim.models.Phrases(data_words,min_count=5,threshold=100) #higher threshold fewer phrases
trigram = gensim.models.Phrases(bigram[data_words],threshold=100)

#faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [8]:
#nltk stowords
#import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
stop_words.extend(['from','subject','re','edu','use'])

#define functions for stopwords, bigrams, trigrsms and lemmatization

def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts,allowed_postags=['NOUN','ADJ','VERB','ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [9]:
import spacy

#remove stop words
data_words_nostops = remove_stopwords(data_words)

#from bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

#initialize spacy 'en' model, keeping only tagger component (for efficiency)
nlp = spacy.load("en_core_web_sm", disable=['parser','ner'])

#do lemmtization keeping only noun ,adj ,vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN','ADJ','VERB','ADV'])

print(data_lemmatized[:1])

[['brain', 'inspire', 'reinforcement', 'learning', 'franois', 'rivest', 'yoshua_bengio', 'dpartement', 'dinformatique', 'recherche', 'oprationnelle', 'universit', 'montral', 'succ', 'centre', 'ville', 'montral', 'qc', 'canada', 'francoisriv', 'mailmcgillca', 'bengioy', 'john', 'kalaska', 'dpartement', 'physiologie', 'universit', 'montral', 'kalaskaj', 'abstract', 'successful', 'application', 'reinforcement', 'learning', 'algorithm', 'often', 'involve', 'considerable', 'hand', 'craft', 'necessary', 'non', 'linear', 'feature', 'reduce', 'complexity', 'value', 'function', 'hence', 'promote', 'convergence', 'algorithm', 'contrast', 'human', 'brain', 'readily', 'autonomously', 'find', 'complex', 'feature', 'provide', 'sufficient', 'training', 'recent', 'work', 'machine', 'learning', 'neurophysiology', 'demonstrate', 'role', 'basal_ganglia', 'frontal_cortex', 'mammalian', 'reinforcement', 'learning', 'paper', 'develop', 'explore', 'new', 'reinforcement', 'learning', 'algorithm', 'inspire', '

In [10]:
import gensim.corpora as corpora

#create dictionary
id2word = corpora.Dictionary(data_lemmatized)

#create corpus
texts = data_lemmatized

#term document frequency
corpus = [id2word.doc2bow(text) for text in texts]

#view
print(corpus[:1])

[[(0, 1), (1, 1), (2, 3), (3, 1), (4, 5), (5, 2), (6, 12), (7, 1), (8, 7), (9, 8), (10, 7), (11, 7), (12, 5), (13, 3), (14, 1), (15, 1), (16, 1), (17, 2), (18, 17), (19, 2), (20, 1), (21, 13), (22, 1), (23, 1), (24, 4), (25, 2), (26, 1), (27, 1), (28, 1), (29, 2), (30, 3), (31, 2), (32, 1), (33, 4), (34, 4), (35, 1), (36, 1), (37, 1), (38, 1), (39, 2), (40, 1), (41, 1), (42, 4), (43, 1), (44, 1), (45, 3), (46, 10), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 4), (53, 1), (54, 2), (55, 19), (56, 6), (57, 10), (58, 1), (59, 1), (60, 1), (61, 2), (62, 1), (63, 1), (64, 2), (65, 2), (66, 1), (67, 5), (68, 1), (69, 1), (70, 1), (71, 1), (72, 3), (73, 1), (74, 2), (75, 1), (76, 1), (77, 3), (78, 1), (79, 1), (80, 1), (81, 1), (82, 1), (83, 2), (84, 2), (85, 2), (86, 1), (87, 7), (88, 1), (89, 2), (90, 5), (91, 1), (92, 1), (93, 1), (94, 1), (95, 2), (96, 1), (97, 1), (98, 1), (99, 1), (100, 1), (101, 1), (102, 1), (103, 3), (104, 1), (105, 3), (106, 1), (107, 1), (108, 1), (109, 2), (1

In [11]:
#build lda model
lda_model = gensim.models.LdaMulticore(corpus=corpus,id2word=id2word,num_topics=10,random_state=100,chunksize=100,passes=10,per_word_topics=True)

In [12]:
#create dictionary
id2word = corpora.Dictionary(data_lemmatized)

#create corpus
texts = data_lemmatized

#term document frequency
corpus = [id2word.doc2bow(text) for text in texts]

#view
print(corpus[:1])

[[(0, 1), (1, 1), (2, 3), (3, 1), (4, 5), (5, 2), (6, 12), (7, 1), (8, 7), (9, 8), (10, 7), (11, 7), (12, 5), (13, 3), (14, 1), (15, 1), (16, 1), (17, 2), (18, 17), (19, 2), (20, 1), (21, 13), (22, 1), (23, 1), (24, 4), (25, 2), (26, 1), (27, 1), (28, 1), (29, 2), (30, 3), (31, 2), (32, 1), (33, 4), (34, 4), (35, 1), (36, 1), (37, 1), (38, 1), (39, 2), (40, 1), (41, 1), (42, 4), (43, 1), (44, 1), (45, 3), (46, 10), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 4), (53, 1), (54, 2), (55, 19), (56, 6), (57, 10), (58, 1), (59, 1), (60, 1), (61, 2), (62, 1), (63, 1), (64, 2), (65, 2), (66, 1), (67, 5), (68, 1), (69, 1), (70, 1), (71, 1), (72, 3), (73, 1), (74, 2), (75, 1), (76, 1), (77, 3), (78, 1), (79, 1), (80, 1), (81, 1), (82, 1), (83, 2), (84, 2), (85, 2), (86, 1), (87, 7), (88, 1), (89, 2), (90, 5), (91, 1), (92, 1), (93, 1), (94, 1), (95, 2), (96, 1), (97, 1), (98, 1), (99, 1), (100, 1), (101, 1), (102, 1), (103, 3), (104, 1), (105, 3), (106, 1), (107, 1), (108, 1), (109, 2), (1

In [13]:
id2word[0]

'able'

In [14]:
#human readable format of corpus (term-frequency)
[[(id2word[id],freq) for id, freq in cp] for cp in corpus[:1]]

[[('able', 1),
  ('abstract', 1),
  ('achieve', 3),
  ('acknowledgment', 1),
  ('acrobot', 5),
  ('act', 2),
  ('action', 12),
  ('activate', 1),
  ('activation', 7),
  ('activity', 8),
  ('actor', 7),
  ('actor_critic', 7),
  ('adaptive', 5),
  ('add', 3),
  ('address', 1),
  ('advance', 1),
  ('advantage', 1),
  ('agent', 2),
  ('algorithm', 17),
  ('allow', 2),
  ('already', 1),
  ('also', 13),
  ('amari', 1),
  ('analog', 1),
  ('analysis', 4),
  ('angle', 2),
  ('angular', 1),
  ('anticipatory', 1),
  ('application', 1),
  ('apply', 2),
  ('approach', 3),
  ('approximate', 2),
  ('approximation', 1),
  ('approximator', 4),
  ('architecture', 4),
  ('area', 1),
  ('argmax', 1),
  ('argue', 1),
  ('arise', 1),
  ('assess', 2),
  ('associate', 1),
  ('assume', 1),
  ('attempt', 4),
  ('autonomous', 1),
  ('autonomously', 1),
  ('available', 3),
  ('average', 10),
  ('avoid', 1),
  ('away', 1),
  ('back', 1),
  ('background', 1),
  ('backprop', 1),
  ('backpropagation', 4),
  ('bad', 

In [15]:
#build the lda model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word = id2word,
                                            num_topics = 20,
                                            random_state=100,
                                            update_every = 1,
                                            chunksize=100,
                                            passes=10,
                                            alpha='auto',
                                            per_word_topics=True)

In [16]:
from pprint import pprint

#print the keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.021*"agent" + 0.013*"mechanism" + 0.013*"value" + 0.009*"learn" + '
  '0.009*"model" + 0.008*"decision" + 0.008*"learning" + 0.008*"song" + '
  '0.008*"system" + 0.008*"mdp"'),
 (1,
  '0.023*"transformation" + 0.019*"model" + 0.017*"image" + 0.010*"set" + '
  '0.010*"use" + 0.010*"cluster" + 0.009*"mean" + 0.009*"fig" + 0.009*"show" + '
  '0.008*"mixture"'),
 (2,
  '0.018*"policy" + 0.013*"despot" + 0.011*"algorithm" + 0.010*"belief" + '
  '0.009*"state" + 0.009*"sample" + 0.009*"value" + 0.009*"action" + '
  '0.008*"distribution" + 0.007*"tree"'),
 (3,
  '0.013*"distribution" + 0.013*"factor" + 0.011*"variable" + 0.008*"gradient" '
  '+ 0.007*"network" + 0.007*"bit" + 0.007*"qsgd" + 0.006*"algorithm" + '
  '0.006*"stochastic" + 0.006*"communication"'),
 (4,
  '0.011*"bidder" + 0.010*"network" + 0.009*"depth" + 0.007*"image" + '
  '0.007*"model" + 0.006*"use" + 0.006*"auction" + 0.006*"round" + '
  '0.005*"scale" + 0.005*"bid"'),
 (5,
  '0.016*"learn" + 0.013*"feature" + 0.01

In [17]:
from gensim.models import CoherenceModel

#compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model,texts=data_lemmatized,dictionary=id2word,coherence='c_v')
coherence_lda= coherence_model_lda.get_coherence()

print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.32418326935807473


In [18]:
#supporting function
def compute_coherence_values(corpus,dictionary,k,a,b):
    
    lda_model = gensim.models.LdaMulticore(corpus=corpus,id2word=dictionary,num_topics=k,random_state=100,chunksize=100,passes=10,alpha=a,eta=b)
    coherence_model_lda= CoherenceModel(model=lda_model,texts=data_lemmatized,dictionary=id2word,coherence='c_v')
    return coherence_model_lda.get_coherence()

In [19]:
import numpy as np
import tqdm

grid = {}
grid['Validation_Set']  ={}

#Topics range
min_topics = 2
max_topics = 11
step_size = 1
topics_range = range(min_topics,max_topics,step_size)

#alpha parameter
alpha = list(np.arange(0.01,1,0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

#beta parameter
beta = list(np.arange(0.01,1,0.3))
beta.append('symmetric')

#validation sets
num_of_docs = len(corpus)
corpus_sets = [ gensim.utils.ClippedCorpus(corpus,int(num_of_docs*0.75)),corpus]
corpus_title = ['75% Corpus','100% Corpus']

model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []}

#can take a long time to run
if 1==1:
    pbar = tqdm.tqdm(total=(len(beta)*len(alpha)*len(topics_range)*len(corpus_title)))
    
    #iterate through validation corpuses
    for i in range(len(corpus_sets)):
        #iterate through number of topics
        for k in topics_range:
            #iterate through alpha values
            for a in alpha:
                #iterate through beta values
                for b in beta:
                    #get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus_sets[i],dictionary=id2word,k=k,a=a,b=b)
                    #save the model results
                    model_results['Validation_Set'].append(corpus_title[i])
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)
                    
                    pbar.update(1)
    pd.DataFrame(model_results).to_csv(r'C:\Users\Nalin\Desktop\lda_tuning_results.csv',index=False)
    pbar.close()

100%|████████████████████████████████████████████████████████████████████████████| 540/540 [25:29:04<00:00, 169.90s/it]


In [22]:
num_topics = 8

lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                          id2word=id2word,
                                          num_topics=num_topics,
                                          random_state=100,
                                          chunksize=100,
                                          passes=10,
                                          alpha=0.01,
                                          eta=0.9)

In [23]:
from pprint import pprint 

#print th keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.005*"object" + 0.004*"action" + 0.004*"search" + 0.003*"song" + '
  '0.003*"window" + 0.003*"proposal" + 0.002*"iou" + 0.002*"syllable" + '
  '0.002*"recall" + 0.002*"learn"'),
 (1,
  '0.002*"cell" + 0.002*"fire" + 0.001*"firing" + 0.001*"die" + 0.001*"array" '
  '+ 0.001*"tion" + 0.001*"spontaneous" + 0.000*"trap" + 0.000*"transi" + '
  '0.000*"neighborhood"'),
 (2,
  '0.005*"policy" + 0.005*"bidder" + 0.004*"despot" + 0.004*"belief" + '
  '0.003*"action" + 0.003*"state" + 0.003*"auction" + 0.002*"rule" + '
  '0.002*"agent" + 0.002*"search"'),
 (3,
  '0.002*"wattle" + 0.001*"chip" + 0.001*"synapse" + 0.001*"gain" + '
  '0.001*"analogue" + 0.001*"neuron" + 0.001*"jabri" + 0.001*"current" + '
  '0.000*"transient" + 0.000*"energy"'),
 (4,
  '0.010*"network" + 0.004*"learn" + 0.004*"model" + 0.004*"image" + '
  '0.004*"layer" + 0.003*"error" + 0.003*"use" + 0.003*"theorem" + '
  '0.003*"gradient" + 0.003*"show"'),
 (5,
  '0.008*"learn" + 0.007*"network" + 0.006*"image" + 0.006*"

In [32]:
import pyLDAvis.gensim_models as gensimvis
import pickle
import pyLDAvis

#visualize the topics
pyLDAvis.enable_notebook()

LDAvis_prepared = gensimvis.prepare(lda_model,corpus,id2word)

LDAvis_prepared

  and should_run_async(code)


[(0,
  '0.060*"team" + 0.058*"game" + 0.038*"play" + 0.036*"year" + 0.033*"win" + '
  '0.019*"season" + 0.018*"fan" + 0.016*"goal" + 0.015*"nhl" + 0.014*"score"'),
 (1,
  '0.107*"space" + 0.034*"earth" + 0.028*"launch" + 0.025*"moon" + '
  '0.025*"nasa" + 0.024*"mission" + 0.024*"orbit" + 0.020*"satellite" + '
  '0.016*"brown" + 0.015*"flight"'),
 (2,
  '0.043*"md" + 0.042*"richard" + 0.039*"hole" + 0.032*"motto" + 0.031*"mu" + '
  '0.030*"quick" + 0.026*"insert" + 0.024*"strip" + 0.018*"caltech" + '
  '0.017*"rs"'),
 (3,
  '0.034*"government" + 0.030*"israel" + 0.026*"public" + 0.025*"state" + '
  '0.018*"american" + 0.015*"israeli" + 0.013*"country" + 0.013*"security" + '
  '0.013*"encryption" + 0.012*"right"'),
 (4,
  '0.044*"line" + 0.040*"organization" + 0.028*"write" + 0.024*"article" + '
  '0.019*"get" + 0.017*"nntp_poste" + 0.016*"university" + 0.015*"host" + '
  '0.010*"reply" + 0.009*"problem"'),
 (5,
  '0.097*"gun" + 0.078*"law" + 0.031*"crime" + 0.031*"weapon" + '
  '0.030*

TypeError: '<' not supported between instances of 'int' and 'tuple'