## Topic Modeling 

In [1]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt


## Let's see what the data looks like

In [2]:
# lines=True makes pandas read the file as JSON Lines (one JSON object per line).
filename = "sample-S2-records"
df = pd.read_json(filename, lines=True)
# Preview the first rows only.
df.head()

Unnamed: 0,authors,doi,doiUrl,entities,id,inCitations,journalName,journalPages,journalVolume,outCitations,paperAbstract,pdfUrls,pmid,s2PdfUrl,s2Url,sources,title,venue,year
0,"[{'ids': ['38280253'], 'name': 'Kate Jack'}]",,,[Jack Device Component],f2320c08c7d95bbf8bb72e4d6deaa6845ea4cf27,[],Nursing times,26,109 49-50,[],,[],24568020v1,,https://semanticscholar.org/paper/f2320c08c7d9...,[Medline],60 seconds with Kate Jack.,Nursing times,2013.0
1,"[{'ids': ['5862934'], 'name': 'W N Spellacy'},...",,,"[Decision Making, Laboratory Certification Doc...",5432a99cdd9f8b248c50274cd3d2a6016f3d081e,[],The Journal of reproductive medicine,127-30,31 2,[],The search for new administrators in complex s...,[],3514907v1,,https://semanticscholar.org/paper/5432a99cdd9f...,[Medline],Organizing a search for an academic administra...,The Journal of reproductive medicine,1986.0
2,"[{'ids': ['39900230'], 'name': 'Stefanie Ernst...",,,"[Annexin A1, Annexins, Bacterial Infections, C...",155663331ea93379e99997bd43340eb54ab41a73,"[3738fad17126054f03cfe736b7156b6d6eef0481, 927...",Journal of immunology,7669-76,172 12,"[c2b53b26c004fe57e85424df6ad101d283150648, d30...",The human N-formyl peptide receptor (FPR) is a...,[http://www.jimmunol.org/content/jimmunol/172/...,15187149v1,http://pdfs.semanticscholar.org/cb73/147dc0bf1...,https://semanticscholar.org/paper/155663331ea9...,[Medline],An annexin 1 N-terminal peptide activates leuk...,Journal of immunology,2004.0
3,"[{'ids': ['1801874'], 'name': 'S Yamamoto'}, {...",,,"[Adrenal Cortex Hormones, Bladder Neoplasm, Ca...",b5a25960ebee9a6e5db79196e6b07f0edfcf5313,"[8bcedf8512f672310326a6cc0ec897939d28c6d1, 8b7...",Nihon Rinsho Men'eki Gakkai kaishi = Japanese ...,128-35,19 2,[],Serum CA 19-9 (2-3 sialyl Le(a)) is a marker o...,[],8705689v1,,https://semanticscholar.org/paper/b5a25960ebee...,[Medline],[Serum CA 19-9 levels in rheumatic diseases wi...,Nihon Rinsho Men'eki Gakkai kaishi = Japanese ...,1996.0
4,"[{'ids': ['14380299'], 'name': 'Edwards'}, {'i...",,,"[Cell Nucleus, Dependence, Nucleic Acids]",3b7538465b0559e2d3ff2b65991c8e399e457822,[],"Physical review. A, Atomic, molecular, and opt...",2709-2717,44 4,[],,[],9906253v1,,https://semanticscholar.org/paper/3b7538465b05...,[Medline],Sequence dependence of low-frequency Raman-act...,"Physical review. A, Atomic, molecular, and opt...",1991.0


### We only care about the abstract column, so extract it

In [3]:
# Pull out the abstract column and keep only records that really have text.
# The previous `abstract.replace('', np.NaN)` call discarded its result (so it
# never changed anything) and `np.NaN` no longer exists in NumPy 2.0.
abstract = df['paperAbstract']

# Filter in one pass: skip empty strings (as before) and also skip missing
# values (NaN/None), which are not strings and would break tokenization later.
absData = [text for text in abstract if isinstance(text, str) and text != '']

# Show a small sample instead of dumping every abstract into the output.
absData[:3]

['The search for new administrators in complex systems is an important activity. The special requirements of academic organizations, particularly those with health centers, present some unique considerations that can confound this important and difficult process. Typically, national searches attract a sizable candidate list composed of persons with diverse backgrounds and experiences, and a committee is empowered to sort through their qualifications. A critical step in the planning of each search is the development of a process that allows participatory decision making while not requiring too much time. Too often the search becomes an unmanageable activity that confuses the searchers and frustrates the administration. A seven-step process has proven successful for use by committees to attract and sort through written candidate applications, to agree upon a preliminary ranking of candidates and to reach a consensus on a final list of recommendations. The process could be applied in almo

## Data Cleanup

In [4]:
import spacy
from spacy.lang.en import English

# Only a blank English pipeline is used below (tokenization plus lexical
# attributes such as like_url / lower_). The former `spacy.load('en')` call
# threw its result away, and the 'en' shortcut name raises OSError on spaCy 3+,
# so it has been dropped.
parser = English()
def tokenize(text):
    """Split `text` into normalized tokens using the module-level `parser`.

    Whitespace-only tokens are dropped, URL-like tokens become 'URL', tokens
    starting with '@' become 'SCREEN_NAME', and every other token is
    lower-cased.
    """
    def _normalize(token):
        # Checks are ordered exactly as the placeholder rules apply.
        if token.like_url:
            return 'URL'
        if token.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return token.lower_

    return [_normalize(token) for token in parser(text) if not token.orth_.isspace()]

In [5]:
import nltk
# Fetch the WordNet corpus; the lemma helpers in this notebook import nltk.corpus.wordnet.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/nahalam/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
from nltk.corpus import wordnet as wn
def get_lemma(word):
    """Return the WordNet base form of `word`, or `word` itself if morphy finds none."""
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    """Lemmatize `word` with a freshly constructed NLTK WordNetLemmatizer."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

In [7]:
# Download the stop-word corpus, then keep the English list as a set so the
# membership test in prepare_text_for_lda is a constant-time lookup.
nltk.download('stopwords')
en_stop = {word for word in nltk.corpus.stopwords.words('english')}

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nahalam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
def prepare_text_for_lda(text):
    """Turn raw text into a list of cleaned tokens for topic modeling.

    Tokens come from `tokenize`; those of four characters or fewer and those
    in `en_stop` are discarded, and the survivors are mapped through
    `get_lemma`.
    """
    return [
        get_lemma(candidate)
        for candidate in tokenize(text)
        if len(candidate) > 4 and candidate not in en_stop
    ]

In [9]:
# Clean every abstract and join its tokens back into one space-separated
# string, giving one string per document for the vectorizer.
textTokens = [" ".join(prepare_text_for_lda(item)) for item in absData]
textTokens


['search administrator complex system important activity special requirement academic organization particularly health center present unique consideration confound important difficult process typically national search attract sizable candidate compose person diverse background experience committee empower qualification critical planning search development process allow participatory decision making require often search become unmanageable activity confuse searcher frustrate administration seven process prove successful committee attract write candidate application agree preliminary ranking candidate reach consensus final recommendation process could apply almost organizational setting',
 'human formyl peptide receptor modulator chemotaxis direct granulocyte toward site bacterial infection founding member subfamily protein couple receptor thought function inflammatory process member fprl)1 fprl2 greatly reduce affinity bacterial peptide fprl2 consider orphan receptor study peptide deriv

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Upper bound on the vocabulary size kept by the vectorizer.
no_features = 1000

# LDA is a probabilistic graphical model, so it is fitted on raw term counts
# (CountVectorizer) rather than on TF-IDF weights.
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(textTokens)

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on old versions that lack it. Either way the result is a
# plain list of vocabulary terms indexed by feature column.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    tf_feature_names = list(tf_vectorizer.get_feature_names_out())
else:
    tf_feature_names = tf_vectorizer.get_feature_names()

In [11]:
from sklearn.decomposition import NMF, LatentDirichletAllocation

# Number of topics to extract.
no_topics = 20

# Run LDA on the term-count matrix.
# `n_topics` was deprecated in scikit-learn 0.19 and later removed; the
# keyword is now `n_components`. random_state is fixed for reproducibility.
lda = LatentDirichletAllocation(
    n_components=no_topics,
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=0,
).fit(tf)



## Evaluating Topics

In [12]:
def display_topics(model, feature_names, no_top_words):
    """Print every topic of `model` with its `no_top_words` highest-weighted terms.

    `model.components_` is iterated row by row (one row per topic) and
    `feature_names[i]` supplies the term for column `i`. For each topic a
    "Topic <idx>:" line is printed, followed by the terms joined by spaces.
    """
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending, so the reversed tail slice yields the indices
        # of the largest weights first.
        top_indices = topic.argsort()[:-no_top_words - 1:-1]
        top_terms = [feature_names[i] for i in top_indices]
        print ("Topic %d:" % (topic_idx))
        print (" ".join(top_terms))

# Print the ten strongest terms of each fitted topic.
no_top_words = 10
display_topics(model=lda, feature_names=tf_feature_names, no_top_words=no_top_words)

Topic 0:
factor disease explain consider primary 12 activating complex chronic platelet
Topic 1:
technique using reduce propose demonstrate paper network various algorithm problem
Topic 2:
metastasis liver hydrogen study glutathione patient synchronous fusion result relate
Topic 3:
knowledge ratio population expression academic sensor study trait design affective
Topic 4:
model lower conditions cation allow determine expression concentration alkaloid site
Topic 5:
relate marker tumor characteristic effort complex organization english observation structure
Topic 6:
services health possible regulation clinic establish environment magnetic classical require
Topic 7:
peptide bioassay activity result esophageal relate earlier specific obtain vary
Topic 8:
relate model oxygen tumor cell apoptosis treatment group esophageal change
Topic 9:
peptide member measure receptor conditions different bacterial family presence influence
Topic 10:
strongly consideration cytokine disorder complete local 