In [17]:
# Author: Olivier Grisel <olivier.grisel@ensta.org>
#         Lars Buitinck
#         Chyi-Kwei Yau <chyikwei.yau@gmail.com>
# License: BSD 3 clause

from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups
from nltk.corpus import stopwords

n_features = 1000
n_topics = 15
n_top_words = 15

In [18]:
#Build the corpus (This cell only needed to be called once to make the .txt files)
'''
filenames = open('../Papers/15/pdflist.txt').read().splitlines()
for f in filenames:
    print(f)
    !pdftotext ../Papers/15/$f 
'''

"\nfilenames = open('../Papers/15/pdflist.txt').read().splitlines()\nfor f in filenames:\n    print(f)\n    !pdftotext ../Papers/15/$f \n"

In [19]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

In [20]:
# Strip common English words, words occurring in
# only one document or in at least 95%, as well as a list of other words
t0 = time()
docslist = []
filenames = open('../Papers/15/textlist.txt').read().splitlines()
for f in filenames:
    txtfile = open('../Papers/15/'+f).read()
    docslist.append(txtfile)
filenames = open('../Papers/05/textlist.txt').read().splitlines()
for f in filenames:
    txtfile = open('../Papers/05/'+f).read()
    docslist.append(txtfile)
filenames = open('../Papers/00/textlist.txt').read().splitlines()
for f in filenames:
    txtfile = open('../Papers/00/'+f).read()
    docslist.append(txtfile)
filenames = open('../Papers/95/textlist.txt').read().splitlines()
for f in filenames:
    txtfile = open('../Papers/95/'+f).read()
    docslist.append(txtfile)
filenames = open('../Papers/90/textlist.txt').read().splitlines()
for f in filenames:
    txtfile = open('../Papers/90/'+f).read()
    docslist.append(txtfile)
filenames = open('../Papers/79-82/textlist.txt').read().splitlines()
for f in filenames:
    txtfile = open('../Papers/79-82/'+f).read()
    docslist.append(txtfile)

print(len(docslist))
dataset = docslist
data_samples = dataset

sw = list(stopwords.words('english'))
ignore_words = ['annual','review','journal','may','anderson','reviews','ann','rev','rights','reserved','institution',
               'annu','fig','b','equation','equations','biological','biology','a','b','c','d','e','f','g','h','i','j',\
               'k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z','use', 'uses','year','figure','studied','per',\
               'see','however','new','utc','http','https','et','al','one','two','study','studying','studies', 'also','used',\
               'thus','\u03c3','\u03c9','aug','thu','would','many','found','downloaded','press','university','within','de',\
               'could','should','results','result','effect','effects','total','given','using','table','likely','therefore',\
               'however','important','pp','among','ro','since','significant','first','\uxe4','must','well','levin','section',\
               'much','particular','second','first','show','based','part','biol','either','society','2001','2002','2003',
               '1999','2000','2018','org','jstor','10','165','daszak','di','1998','eids','nh','ctl','www','com','main,'\
               '1','2','3','4','5','130','17','203','75','18','cambridge','press','pdf','2006','1996','2005','contents',\
               '1979','1982','164','09','02','76','1990','200','08','48','58','42','41','27','21','23','1017','31','16',\
               '36','46','35','vol','pg','1981','1985','1988','47','1980','oxford',
               '2012','2011','2010','2009','2008','2007','2006','2005','2004','2003','2002','2001','2000','1979','1980',\
               '1981','1982','1983','1984','1985','1986','1987','1988','1989','1990','1991','1992','1993','1994','1995',\
               '1978','1996','1997','1998','1999']
sw_all = sw+ignore_words

330


In [21]:
# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.98, min_df=5,
                                   max_features=n_features,
                                   stop_words=sw_all)
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

# Fit the NMF model
print("Fitting the NMF model with tf-idf features,")
t0 = time()
nmf = NMF(n_components=n_topics, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

Extracting tf-idf features for NMF...
done in 2.612s.
Fitting the NMF model with tf-idf features,
done in 0.379s.

Topics in NMF model:
Topic #0:
parasite host parasites hosts infection population species density infected core parasitology daphnia fecundity growth life
Topic #1:
equilibrium model population r0 models disease rate dt theorem endemic math time stable system epidemic
Topic #2:
disease population species infection populations diseases wildlife human animals health infectious transmission host pathogen control
Topic #3:
s2 main class six five four three energy ds differential eq co adult 13 stochastic
Topic #4:
virulence strain transmission evolution strains virulent hosts host vertical rate pathogen infected horizontal mortality optimal
Topic #5:
larvae transmission density larval virus infected densities insect experiment host rate pathogen action dynamics hosts
Topic #6:
virus viruses viral rabbits rabbit cells cell strains host gene strain human hiv load immune
Topic #7