![](images/mk.png)

<h1><center>ONLINE SUPPLEMENT</center></h1>

John McLevey & Reid McIlroy-Young. **Introducing *metaknowledge*: Software for Computational Research in Information Science, Science of Science, and Network Analysis.** *Journal of Informetrics*. XX(XX):XX-XX.

<h1><center>Part 3: Text Analysis</center></h1>

These supplementary notebooks were prepared by Dr. [John McLevey](http://www.johnmclevey.com/) (University of Waterloo), [Reid McIlroy-Young](http://reidmcy.com/) (University of Chicago), and [Jillian Anderson](http://networkslab.org/) (NetLab, University of Waterloo). The code in this notebook is current as of *metaknowledge* version XXX.

In [24]:
import metaknowledge as mk
from stop_words import get_stop_words
from nltk.tokenize import RegexpTokenizer
import seaborn as sns
import numpy
import matplotlib as plt
import pandas
import os

# Imports for gensim.
import gensim
from gensim import corpora, models

# Imports for pyLDAvis.
import pyLDAvis.gensim as gensimvis
import pyLDAvis

# Imports for sklearn.
from __future__ import print_function
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

# Apply the white background and the smaller font scale in ONE call.
# `sns.set()` re-applies every seaborn default it is not given explicitly
# (its `style` argument defaults to "darkgrid"), so a separate
# `sns.set_style("white")` issued before it would be silently overridden.
sns.set(style="white", font_scale=.7)

# NOTE: in this notebook `plt` is bound to the top-level `matplotlib` package
# (see the imports), not to `matplotlib.pyplot`; `rc` is available on both.
plt.rc("savefig", dpi=300) # improve default resolution of saved graphics

# No-op as written (stays in the current working directory); edit the path
# here if the data folders used below live somewhere else.
os.chdir('.')

In [25]:
# Build the record collection from the files under raw_data/imetrics/.
# NOTE(review): `cached = True` presumably lets metaknowledge reuse a cache on later runs — confirm in its docs.
RC = mk.RecordCollection('raw_data/imetrics/', cached = True)

# Topic Models


* Create topic models using the Gensim package or Scikit-Learn.
* Create interactive visualizations using Gensim models and pyLDAvis.

In [3]:
# Transform the record collection into a format for use with natural language processing applications.
# The first argument is a CSV path (presumably where a copy of the result is written — confirm);
# the keyword flags switch the individual text-cleaning steps on or off.
# The result is indexed by column name below (raw['abstract']).
raw = RC.forNLP('generated_datasets/topic_model/topic_model.csv', lower=True, removeNumbers=True,
         removeNonWords=True, removeWhitespace=True, extractCopyright=False)

In [27]:
# Pull the abstracts out of the NLP data; each entry is treated as one document below.
documents = raw['abstract']

## SKLearn

* Basic topic models using code from 
[this tutorial](http://scikit-learn.org/stable/auto_examples/applications/topics_extraction_with_nmf_lda.html) written by Olivier Grisel, Lars Buitinck, and Chyi-Kwei Yau.

In [28]:
# For use with scikit-learn, convert the abstracts to a numpy array.
# `docs` is the input to both vectorizers' fit_transform calls below.
docs = numpy.asarray(documents)

In [53]:
# Shared settings for the scikit-learn models below.
# Increasing the number of features may give a better model, but it may also increase the runtime.
features = 1000  # passed as max_features to both vectorizers
topics = 10  # number of topics requested from the NMF and LDA models
top_words = 20  # number of words printed for each topic

In [54]:
# Initialize the TF-IDF vectorizer (its output feeds the NMF model).
# Per the scikit-learn docs: max_df=0.95 ignores terms found in more than 95% of documents,
# min_df=2 ignores terms found in fewer than two documents, and stop_words='english'
# removes scikit-learn's built-in English stop words.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   max_features=features,
                                   stop_words='english')

In [55]:
# Fit the TF-IDF vectorizer on the documents and transform them into the matrix used to fit NMF.
tfidf = tfidf_vectorizer.fit_transform(docs)

In [56]:
# Output helper, adapted from the scikit-learn topic-extraction tutorial.
def print_top_words(model, feature_names, top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``, an iterable of per-topic
        weight arrays (one weight per feature).
    feature_names : sequence mapping a feature index to its term.
    top_words : number of terms to print for each topic.

    For each topic a ``Topic #<index>:`` header line is printed, followed by
    one line of space-separated terms ordered from highest to lowest weight.
    A single blank line is printed after the last topic.
    """
    for number, weights in enumerate(model.components_):
        # argsort() is ascending, so reverse it and keep the leading entries.
        best = weights.argsort()[::-1][:top_words]
        print("Topic #%d:" % number)
        print(" ".join(feature_names[idx] for idx in best))
    print()

In [57]:
# Initialize the term-count vectorizer, configured like the TF-IDF vectorizer.
# Its raw counts (rather than TF-IDF weights) are what the LDA model is fitted on below.
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=features,
                                stop_words='english')

In [58]:
# Fit the count vectorizer on the documents and transform them into the matrix used to fit LDA.
tf = tf_vectorizer.fit_transform(docs)

In [59]:
# Configure the non-negative matrix factorization (NMF) model, then fit it on the TF-IDF matrix.
# fit() returns the fitted estimator, so `nmf` ends up bound to the fitted model.
nmf = NMF(n_components=topics, random_state=1, alpha=.1, l1_ratio=.5)
nmf = nmf.fit(tfidf)

In [60]:
# Show the top words of each NMF topic.
# Adjust `topics` or `top_words` in the settings cell to change how many topics / words are shown.
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(model=nmf, feature_names=tfidf_feature_names, top_words=top_words)

Topic #0:
papers citation citations cited number authors published articles highly paper references author citing years counts received year publication average total
Topic #1:
information use behavior systems users retrieval seeking article study design social model library internet user science digital online health theory
Topic #2:
journals journal impact factor articles citation published factors jcr subject jif isi oa reports categories year citing sciences citations editorial
Topic #3:
patent patents technology technological innovation rd technologies firms patenting companies citations industry knowledge development applications analysis indicators nanotechnology paper value
Topic #4:
search web users query queries user engines results searching retrieval engine pages documents tasks terms sites interface task relevance searchers
Topic #5:
collaboration international collaborative coauthorship network networks countries collaborations chinese domestic impact coauthored instituti

In [66]:
# Configure the LDA model (online learning, fixed random_state), then fit it on the term counts.
# NOTE(review): `n_topics` matches the scikit-learn release this notebook was run with; later releases
# are believed to name this parameter `n_components` — confirm against the installed version.
lda = LatentDirichletAllocation(n_topics=topics, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
lda.fit(tf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_jobs=1, n_topics=10, perp_tol=0.1, random_state=0,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

In [67]:
# Show the top words of each LDA topic.
# Adjust `topics` or `top_words` in the settings cell to change how many topics / words are shown.
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(model=lda, feature_names=tf_feature_names, top_words=top_words)

Topic #0:
data web patent analysis technology patents research technological study new results using model paper innovation used based information knowledge approach
Topic #1:
research countries collaboration scientific international science output national institutions universities china chinese production country paper publications funding performance university study
Topic #2:
authors number publications productivity researchers author publication scientists study results research total academic medical authorship university period differences scientific data
Topic #3:
information knowledge study use research social article behavior online systems work theory digital model design factors library internet findings understanding
Topic #4:
journals citation articles papers citations cited published journal impact number science publication research references years publications scientific article year citing
Topic #5:
citation index number hindex different model scientific paper distri


## Gensim

* For model coefficients and more analysis options.
* Code adapted from [the gensim tutorial](https://radimrehurek.com/gensim/tutorial.html) by Radim Řehůřek.
* Visualizations created using pyLDAvis, using code from the [notebooks](https://github.com/bmabey/pyLDAvis/tree/master/notebooks) by Ben Mabey.

In [68]:
# Create a list of English stopwords, using the stop_words package.
stopwords = get_stop_words('en')

In [69]:
# Initialize the tokenizer. The pattern \w+ matches runs of word characters,
# so punctuation and whitespace are not kept as tokens.
tokenizer = RegexpTokenizer(r'\w+')

In [70]:
# Start with an empty container; one token list per document is stored here.
tokens = list()

In [71]:
# Tokenize every document, producing one list of tokens per document.
# The list is built in a single assignment instead of appending to a list
# created in another cell: appending made this cell non-idempotent, so
# re-running it silently duplicated every document in `tokens`.
tokens = [tokenizer.tokenize(abstract) for abstract in documents]

In [73]:
# Start with an empty container; the stopword-free token lists are stored here.
cleaned_tokens = list()

In [74]:
# Keep tokens only if they do not appear in the list of stopwords.
# A set gives constant-time membership tests; testing against the list
# rescanned it for every single token.
stopword_set = set(stopwords)

# Built in a single assignment so that re-running this cell does not append
# a second copy of every document to `cleaned_tokens`.
cleaned_tokens = [
    [token for token in doc_tokens if token not in stopword_set]
    for doc_tokens in tokens
]

In [75]:
# Create a gensim dictionary from the cleaned tokens.
# It is used below to convert each document to bag-of-words form (doc2bow)
# and is handed to the LDA model as its id2word mapping.
dictionary = corpora.Dictionary(cleaned_tokens)

In [76]:
# Convert the cleaned tokens into a numpy array holding one token list per document.
# Documents differ in length, so the nested list is ragged. Recent NumPy
# releases raise a ValueError when asked to build an array from ragged input
# unless dtype=object is requested explicitly; older releases accept it too.
array = numpy.asarray(cleaned_tokens, dtype=object)

In [77]:
# Build the corpus: run every document's token list through the dictionary's doc2bow
# to obtain its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, array))

In [78]:
# Generate the LDA model using gensim.
# LDA training is stochastic; random_state is fixed (as it is for the scikit-learn
# models above) so that a fresh run of the notebook yields the same topics.
ldamodel = gensim.models.ldamodel.LdaModel(corpus=corpus, num_topics=50, id2word=dictionary,
                                           passes=20, random_state=0)

In [79]:
# The format for printing 50 topics is not very good.
# NOTE(review): called without arguments, print_topics() is believed to show only a subset
# of the topics (gensim's default num_topics) — confirm in the gensim docs.
ldamodel.print_topics()

[(16, '0.092*"patent" + 0.076*"patents" + 0.022*"technological" + 0.021*"technology" + 0.019*"patenting" + 0.014*"data" + 0.014*"us" + 0.013*"study" + 0.012*"inventors" + 0.012*"academic"'), (25, '0.064*"time" + 0.030*"years" + 0.024*"downloads" + 0.022*"period" + 0.021*"periods" + 0.016*"data" + 0.016*"increase" + 0.015*"brazilian" + 0.014*"stress" + 0.014*"series"'), (10, '0.030*"sadc" + 0.025*"south" + 0.024*"africa" + 0.019*"african" + 0.018*"researchers" + 0.017*"collaborators" + 0.015*"world" + 0.014*"ranked" + 0.013*"consolidation" + 0.011*"highlycited"'), (48, '0.021*"information" + 0.019*"method" + 0.016*"results" + 0.016*"approaches" + 0.013*"approach" + 0.012*"methods" + 0.011*"new" + 0.011*"based" + 0.010*"retrieval" + 0.010*"using"'), (43, '0.027*"access" + 0.018*"scholarly" + 0.016*"research" + 0.015*"resources" + 0.015*"information" + 0.014*"communication" + 0.014*"work" + 0.012*"open" + 0.011*"internet" + 0.010*"article"'), (34, '0.094*"c" + 0.085*"elsevier" + 0.083*"ri

In [None]:
# You also have the option of saving the corpus, dictionary, and model.
# MmCorpus is reached through the imported `corpora` module; the bare name
# `MmCorpus` is never imported in this notebook and raised a NameError.
corpora.MmCorpus.serialize('paper_abstracts.mm', corpus)
dictionary.save('paper_abstracts.dict')
ldamodel.save('paper_abstracts_lda.model')

### Using the model created with Gensim for an interactive pyLDAvis visualization

In [80]:
# Prepare the visualization data from the fitted gensim LDA model, the bag-of-words corpus,
# and the dictionary the corpus was built with.
vis_data = gensimvis.prepare(ldamodel, corpus, dictionary)

In [81]:
# Visualize the topic model: render the prepared pyLDAvis data inline in the notebook.
pyLDAvis.display(vis_data)

# Burst Detection

In [None]:
# Export the record collection's keywords in the form used for burst detection
# (forBurst is also given a CSV path under generated_datasets/), then load the
# returned data into a DataFrame.
bursts = RC.forBurst('keywords', 'generated_datasets/bursts.csv')
bursts_df = pandas.DataFrame(bursts)

# Preview the first ten rows only.
bursts_df.head(10)