## LDA

In [24]:
#import packages
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import nltk
from sklearn.feature_extraction.text import CountVectorizer

# Clean up for LDA
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel

In [4]:
#check dataset
papers = pd.read_csv('../data/papers.csv')
print('Size of data: ', papers.shape)
papers.head()

Size of data:  (7241, 7)


Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a..."


## Data Cleaning
- Might need to consider removing latex symbol and numbers

In [8]:
#replace abstract missing with na
papers.abstract = papers.abstract.apply(lambda x: np.nan if 'Abstract Missing' else x)

#replace \n with space
def getSpace(x):
    for i in range(len(x)):
        try:
            x[i] = x[i].replace('\n', ' ')
        except:
            x[i] = x[i]
    return x

papers[['abstract','paper_text']] = papers[['abstract','paper_text']].apply(lambda x: getSpace(x), axis = 1)

Cleaning to be done on "paper_text":
1. Remove Title (anything before "ABSTRACT")
2. Remove LaTeX or math symbols
3. Remove References (anything after "REFERENCES")

In [9]:
papers.head()

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,,767 SELF-ORGANIZATION OF ASSOCIATIVE DATABASE...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,,683 A MEAN FIELD THEORY OF LAYER IV OF VISUAL...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,,394 STORING COVARIANCE BY THE ASSOCIATIVE LON...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,,Bayesian Query Construction for Neural Network...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,,"Neural Network Ensembles, Cross Validation, an..."


In [17]:
flatten = lambda l: [item for sublist in l for item in sublist]
stop_words = set(stopwords.words('english'))
stop_words.update(['one', 'time', 'two', 'number', 'figure', 'problem', 'also'])

tokenizer = RegexpTokenizer(r'[\w\']+')

### Using a subset of the paper to try out models

In [36]:
papers_subset = papers.sample(frac=0.1, replace=False, random_state=33)

In [37]:
# Clean individual paper_text
paper_text_list = list(papers_subset['paper_text'])
for i in range(len(paper_text_list)):
    paper_text_list[i] = paper_text_list[i].lower()
    paper_text_list[i] = tokenizer.tokenize(paper_text_list[i])
    paper_text_list[i] = [j for j in paper_text_list[i] if j not in stop_words]


In [38]:
# Count max occurrence word within the corpus
countvec = CountVectorizer()
X = countvec.fit_transform(flatten(paper_text_list))
Xsum = X.sum(axis=0)
words_freq = [(word, Xsum[0, idx]) for word, idx in countvec.vocabulary_.items()]
words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
words_freq[:50]

[('learning', 11445),
 ('model', 10079),
 ('data', 9675),
 ('algorithm', 8868),
 ('set', 7575),
 ('function', 7408),
 ('10', 7212),
 ('time', 6563),
 ('using', 6416),
 ('one', 6400),
 ('two', 5911),
 ('problem', 5437),
 ('number', 5406),
 ('figure', 5385),
 ('training', 4842),
 ('also', 4724),
 ('models', 4651),
 ('results', 4576),
 ('used', 4534),
 ('given', 4519),
 ('matrix', 4426),
 ('based', 4317),
 ('distribution', 4251),
 ('first', 4145),
 ('neural', 4135),
 ('log', 4113),
 ('information', 4017),
 ('error', 3953),
 ('network', 3950),
 ('state', 3946),
 ('use', 3794),
 ('linear', 3749),
 ('methods', 3744),
 ('method', 3723),
 ('different', 3707),
 ('algorithms', 3535),
 ('input', 3505),
 ('performance', 3439),
 ('probability', 3408),
 ('approach', 3382),
 ('parameters', 3272),
 ('case', 3220),
 ('let', 3169),
 ('value', 3137),
 ('space', 3133),
 ('12', 3071),
 ('random', 3022),
 ('section', 2992),
 ('20', 2981),
 ('11', 2958)]

## LDA model

In [39]:
#your code here
paper_dict = Dictionary(paper_text_list)
paper_corpus = [paper_dict.doc2bow(paper) for paper in paper_text_list]
# coherence_vals = []
# for ntop in range(1,15):
#     mod = LdaModel(tweets_corpus, num_topics = ntop, 
#                    id2word = tweets_dict, passes=20)
    
#     cmod = CoherenceModel(model=mod, corpus=tweets_corpus, 
#                           dictionary=tweets_dict, coherence='u_mass')
    
#     cval = cmod.get_coherence()
#     print(ntop,cval)
#     coherence_vals.append(cval)
    
# opt_topics = np.argmax(coherence_vals) + 1
# print('The optimal number of topics is {}.'.format(opt_topics))

In [52]:
paper_dict.get("machine")

In [40]:
paper_corpus[0]

[(0, 17),
 (1, 1),
 (2, 1),
 (3, 112),
 (4, 8),
 (5, 4),
 (6, 1),
 (7, 1),
 (8, 1),
 (9, 1),
 (10, 4),
 (11, 1),
 (12, 10),
 (13, 2),
 (14, 12),
 (15, 1),
 (16, 1),
 (17, 2),
 (18, 2),
 (19, 1),
 (20, 1),
 (21, 4),
 (22, 1),
 (23, 1),
 (24, 1),
 (25, 1),
 (26, 2),
 (27, 2),
 (28, 1),
 (29, 3),
 (30, 3),
 (31, 2),
 (32, 3),
 (33, 1),
 (34, 1),
 (35, 1),
 (36, 3),
 (37, 1),
 (38, 97),
 (39, 2),
 (40, 2),
 (41, 1),
 (42, 1),
 (43, 1),
 (44, 2),
 (45, 1),
 (46, 2),
 (47, 5),
 (48, 1),
 (49, 5),
 (50, 4),
 (51, 2),
 (52, 5),
 (53, 2),
 (54, 1),
 (55, 1),
 (56, 2),
 (57, 1),
 (58, 2),
 (59, 1),
 (60, 1),
 (61, 1),
 (62, 1),
 (63, 1),
 (64, 1),
 (65, 1),
 (66, 2),
 (67, 2),
 (68, 26),
 (69, 2),
 (70, 1),
 (71, 2),
 (72, 1),
 (73, 1),
 (74, 1),
 (75, 1),
 (76, 19),
 (77, 1),
 (78, 16),
 (79, 1),
 (80, 1),
 (81, 1),
 (82, 1),
 (83, 1),
 (84, 13),
 (85, 2),
 (86, 1),
 (87, 7),
 (88, 1),
 (89, 1),
 (90, 1),
 (91, 9),
 (92, 2),
 (93, 2),
 (94, 1),
 (95, 1),
 (96, 8),
 (97, 1),
 (98, 8),
 (99, 4),


In [41]:
len(paper_corpus)

724

In [42]:
num_topic = 10
ldamod = LdaModel(paper_corpus, num_topics=num_topic, id2word = paper_dict, passes=20)

In [45]:
type(paper_corpus[0])

list

In [46]:
paper_corpus[0]

[(0, 17),
 (1, 1),
 (2, 1),
 (3, 112),
 (4, 8),
 (5, 4),
 (6, 1),
 (7, 1),
 (8, 1),
 (9, 1),
 (10, 4),
 (11, 1),
 (12, 10),
 (13, 2),
 (14, 12),
 (15, 1),
 (16, 1),
 (17, 2),
 (18, 2),
 (19, 1),
 (20, 1),
 (21, 4),
 (22, 1),
 (23, 1),
 (24, 1),
 (25, 1),
 (26, 2),
 (27, 2),
 (28, 1),
 (29, 3),
 (30, 3),
 (31, 2),
 (32, 3),
 (33, 1),
 (34, 1),
 (35, 1),
 (36, 3),
 (37, 1),
 (38, 97),
 (39, 2),
 (40, 2),
 (41, 1),
 (42, 1),
 (43, 1),
 (44, 2),
 (45, 1),
 (46, 2),
 (47, 5),
 (48, 1),
 (49, 5),
 (50, 4),
 (51, 2),
 (52, 5),
 (53, 2),
 (54, 1),
 (55, 1),
 (56, 2),
 (57, 1),
 (58, 2),
 (59, 1),
 (60, 1),
 (61, 1),
 (62, 1),
 (63, 1),
 (64, 1),
 (65, 1),
 (66, 2),
 (67, 2),
 (68, 26),
 (69, 2),
 (70, 1),
 (71, 2),
 (72, 1),
 (73, 1),
 (74, 1),
 (75, 1),
 (76, 19),
 (77, 1),
 (78, 16),
 (79, 1),
 (80, 1),
 (81, 1),
 (82, 1),
 (83, 1),
 (84, 13),
 (85, 2),
 (86, 1),
 (87, 7),
 (88, 1),
 (89, 1),
 (90, 1),
 (91, 9),
 (92, 2),
 (93, 2),
 (94, 1),
 (95, 1),
 (96, 8),
 (97, 1),
 (98, 8),
 (99, 4),


In [47]:
ldamod2 = LdaModel(list(paper_corpus[0]), num_topics=num_topic, id2word = paper_dict, passes=20)

TypeError: 'int' object is not iterable

In [48]:
top_words = [[word for word,_ in ldamod.show_topic(topicno, topn=50)] for topicno in range(ldamod.num_topics)]
top_betas = [[beta for _,beta in ldamod.show_topic(topicno, topn=50)] for topicno in range(ldamod.num_topics)]
# print("Top Topics:")
# for topicno, words in enumerate(top_words):
#     print("%i: %s" % (topicno, ' '.join(words[:15])))
# print("\nTop Topic Betas:")
# for topicno, betas in enumerate(top_betas):
#     print("%i: %s" % (topicno, ' '.join(map(str,betas[:15]))))

In [49]:
print("Top Topics:")
for topicno, words in enumerate(top_words):
    print("%i: %s" % (topicno, ' '.join(words[:15])))
print("\nTop Topic Betas:")
for topicno, betas in enumerate(top_betas):
    print("%i: %s" % (topicno, ' '.join(map(str,betas[:15]))))

Top Topics:
0: 1 0 2 model p n x 3 j 4 e c 5 models f
1: 1 x 2 0 k data p 3 j model 4 5 n l e
2: 0 1 2 p 3 n 5 4 k x b c 6 data 10
3: 1 word 2 model 0 topic p words models l set c 3 4 j
4: 1 x 2 k 0 p f n q 3 r 10 4 g 5
5: 1 2 x n 0 h p v algorithm k r learning f 3 e
6: 1 2 x n 0 k algorithm 3 e b j r c f 4
7: learning 1 0 policy state 2 value model function reward action 3 r time reinforcement
8: 1 training image learning 2 model 3 0 4 images object 5 using networks network
9: 1 0 2 j network neural model time input 3 e 4 figure w 5

Top Topic Betas:
0: 0.022364337 0.018619636 0.010769323 0.0075186836 0.007084995 0.006425088 0.0062066936 0.006083487 0.0057909223 0.0055424618 0.0051271142 0.004925727 0.0046227216 0.00454257 0.0042009414
1: 0.023858158 0.01481365 0.013836226 0.012438043 0.010586638 0.010309945 0.009864036 0.006970543 0.0061266813 0.005941026 0.005736457 0.0055671697 0.005518928 0.0054501505 0.0053167064
2: 0.04650962 0.031987153 0.017859139 0.0111059705 0.008189089 0.00

## Save model

In [59]:
path = "model/ldamod1.gensim"
ldamod.save(path)

In [61]:
del ldamod

In [62]:
lda_2 = LdaModel.load("model/ldamod1.gensim")

In [64]:
top_words = [[word for word,_ in lda_2.show_topic(topicno, topn=50)] for topicno in range(lda_2.num_topics)]
top_betas = [[beta for _,beta in lda_2.show_topic(topicno, topn=50)] for topicno in range(lda_2.num_topics)]