In [1]:
import sqlite3
import pandas as pd
from gensim import corpora, models, similarities
import nltk
from collections import Counter

In [2]:
# Topic modelling (latent Dirichlet allocation) of journal articles published on Nature.com.
# Nature does not appear to offer an API, so the documents were scraped and stored in a
# SQLite database; that database can be extracted from the zip folder called article_db.
# Scraping scripts: collect-articles-html.py, process-scraped-html.py

conn = sqlite3.connect('./database/nature_articles.db')
cursor = conn.cursor()

# A COUNT query always yields exactly one row, so fetchone() is enough to read the total.
num_articles = cursor.execute(
    'SELECT count(distinct title) FROM articles WHERE wc > 1500;'
).fetchone()[0]
print('Number of unquie articles in dataset: ', num_articles)

# Load the articles with wc > 1500 in random order and preview the first rows.
# NOTE(review): DISTINCT applies to the whole selected row, not only to `title`, so this
# frame can hold more rows than the distinct-title count printed above -- confirm intent.
df = pd.read_sql_query(
    "SELECT distinct(title), text, url, journal, date FROM articles WHERE wc > 1500 ORDER BY random();",
    conn,
)
df.head()

Number of unquie articles in dataset:  3147


Unnamed: 0,title,text,url,journal,date
0,Membrane Targeting of Disheveled Can Bypass th...,the highly conserved wnt signaling pathway reg...,http://www.nature.com/articles/s41598-017-04414-0,Scientific Reports,31 July 2017
1,Early skeletal muscle loss during target thera...,skeletal muscle depletion is common in patient...,http://www.nature.com/articles/s41598-017-07955-6,Scientific Reports,08 August 2017
2,Predicting Magnetostimulation Thresholds in th...,rapid switching of applied magnetic fields in ...,http://www.nature.com/articles/s41598-017-05493-9,Scientific Reports,13 July 2017
3,An outbreak of severe infections among Austral...,human parechovirus types hpev are positive str...,http://www.nature.com/srep/2017/170314/srep444...,Scientific Reports,14 March 2017
4,Asymptomatic Transmission and the Dynamics of ...,following the outbreak in french polynesia the...,http://www.nature.com/articles/s41598-017-05013-9,Scientific Reports,19 July 2017


In [3]:
# Pull a single randomly chosen article (any length) and print its title,
# subject label and full text.
title, subject, article = cursor.execute(
    "SELECT title, topic, text FROM articles ORDER BY random() LIMIT 1;"
).fetchall()[0]

print("\n", title)
print("\nSubject:", subject)
print("\n\t", article)


 Generation of   Gene-Edited Channel Catfish ( ) via Zygote Injection of CRISPR/Cas9 System

Subject: biotechnology

	 the myostatin mstn gene is important because of its role in regulation of skeletal muscle growth in all vertebrates in this study crispr cas was utilized to successfully target the channel catfish ictalurus punctatus muscle suppressor gene mstn crispr cas induced high rates of mutagenesis in the target protein encoding sites of mstn mstn edited fry had more muscle cells p than controls and the mean body weight of gene edited fry increased by the nucleic acid alignment of the mutated sequences against the wild type sequence revealed multiple insertions and deletions these results demonstrate that crispr cas is a highly efficient tool for editing the channel catfish genome and opens ways for facilitating channel catfish genetic enhancement and functional genomics this approach may produce growth enhanced channel catfish and increase productivity introduction although fi

In [4]:
# Every distinct subject label in the articles table; each row is a 1-tuple.
subjects = cursor.execute("SELECT distinct topic FROM articles;").fetchall()

print("Subjects in dataset:\n")
for (topic_name,) in subjects:
    print('\t', topic_name)

Subjects in dataset:

	 biotechnology
	 anatomy
	 anthropology
	 physics
	 psychology
	 mathematics-and-computing
	 computational-biology-and-bioinformatics
	 ecology
	 cell-biology
	 microbiology
	 biogeochemistry
	 zoology
	 climate-sciences
	 neuroscience
	 genetics
	 cancer
	 plant-sciences
	 immunology
	 chemical-biology
	 chemistry
	 evolution
	 stem-cells
	 ocean-sciences
	 diseases
	 molecular-medicine
	 engineering
	 materials-science
	 nanoscience-and-technology
	 drug-discovery
	 philosophy
	 business-and-industry
	 developmental-biology


In [5]:
def render_topics(subjects, num_topics=3, stem=False, filter_n_most_common_words=500, num_words=30,
                  random_state=None):
    """Fit an LDA model on the articles of the given subject(s) and print each topic's top words.

    Articles are read through the notebook-level ``conn`` connection, restricted to rows
    with ``wc > 1500`` whose ``topic`` matches one of ``subjects``.

    Parameters
    ----------
    subjects : str or iterable of str
        A single subject name, or a tuple/list of subject names to model together.
    num_topics : int
        Number of LDA topics to train and print.
    stem : bool
        If truthy, tokens are stemmed (via ``stem_and_stopword_filter``); otherwise they
        are only stopword-filtered (via ``stopword_filter``).
    filter_n_most_common_words : int
        The N most frequent tokens across the selected articles are added to the
        stopword list before modelling.
    num_words : int
        Number of top words printed per topic.
    random_state : optional
        Forwarded to gensim's ``LdaMulticore`` as its ``random_state``; the default
        ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    None
        The report is printed.
    """
    # Normalise to a list so a single name, a tuple or a list all take the same path.
    subject_list = [subjects] if isinstance(subjects, str) else list(subjects)

    # Bind the subjects as query parameters instead of formatting them into the SQL:
    # formatting a one-element tuple yields "IN ('x',)" and a list yields "IN [...]",
    # both invalid SQL, and a quote inside a subject name would break the statement.
    placeholders = ", ".join("?" for _ in subject_list)
    query = ("SELECT distinct(title), text FROM articles "
             "WHERE wc > 1500 and topic IN ({});".format(placeholders))
    df = pd.read_sql_query(query, conn, params=subject_list)

    docs = df['text'].values
    if len(docs) == 0:
        # Nothing to model for this selection; report it instead of handing gensim
        # an empty corpus.
        print(subjects)
        print("\nNo articles with wc > 1500 found for the requested subject(s).")
        return

    # Count tokens with the same whitespace tokenisation the filter helpers use
    # (str.split()), so empty strings from repeated spaces cannot occupy a
    # "most common" slot.
    wcount = Counter(word for doc in docs for word in doc.split())

    # A set makes the per-token stopword lookup constant time.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    # Filter out terms used as section titles in most research papers.
    stopwords.update(['introduction', 'conclusion'])
    stopwords.update(word for word, _ in wcount.most_common(filter_n_most_common_words))

    token_filter = stem_and_stopword_filter if stem else stopword_filter
    tokenized_docs = [token_filter(doc, stopwords) for doc in docs]

    dictionary = corpora.Dictionary(tokenized_docs)
    corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]
    lda_model = models.LdaMulticore(corpus, id2word=dictionary, num_topics=num_topics,
                                    random_state=random_state)
    # Ask for every trained topic explicitly; show_topics otherwise returns at most
    # its own default number of topics.
    topics = lda_model.show_topics(num_topics=num_topics, formatted=False, num_words=num_words)

    print(subjects)

    for position, (_, word_weights) in enumerate(topics, start=1):
        print("\nTopic {}, top {} words:".format(position, num_words))
        print(" ".join(word for word, _ in word_weights))
        
    
        
        
def stem_and_stopword_filter(text, filter_list):
    """Tokenise ``text`` on whitespace, drop unwanted tokens and stem the rest.

    A token is kept when it is longer than two characters and is not in
    ``filter_list``; kept tokens are stemmed with NLTK's English Snowball stemmer.
    The stopword check is applied to the unstemmed token.

    Parameters
    ----------
    text : str
        Document text.
    filter_list : iterable of str
        Tokens to discard (list, tuple, set, ...).

    Returns
    -------
    list of str
        Stemmed tokens in their original order.
    """
    # Membership in a list is a linear scan per token; convert once to a set.
    if not isinstance(filter_list, (set, frozenset)):
        filter_list = set(filter_list)
    stemmer = nltk.stem.snowball.SnowballStemmer('english')
    return [stemmer.stem(word) for word in text.split()
            if len(word) > 2 and word not in filter_list]

def stopword_filter(text, filter_list):
    """Tokenise ``text`` on whitespace and drop unwanted tokens.

    A token is kept when it is longer than two characters and is not in
    ``filter_list``.

    Parameters
    ----------
    text : str
        Document text.
    filter_list : iterable of str
        Tokens to discard (list, tuple, set, ...).

    Returns
    -------
    list of str
        Surviving tokens in their original order.
    """
    # Membership in a list is a linear scan per token; convert once to a set.
    if not isinstance(filter_list, (set, frozenset)):
        filter_list = set(filter_list)
    return [word for word in text.split()
            if len(word) > 2 and word not in filter_list]

In [6]:
# Subject(s) to analyse for topics.
# Several subjects are given as a tuple of strings, e.g.
#   subjects = ('philosophy', 'nanoscience-and-technology', 'biotechnology')
# A single subject is given as a plain string. Note that ('philosophy') without a
# trailing comma is just the string 'philosophy', not a one-element tuple.
subjects = 'philosophy'

render_topics(subjects, num_topics=9, stem=False, filter_n_most_common_words=500)

philosophy

Topic 1, top 30 words:
pgt consumers immanence climate latour theatre aim www actual transcendence cda diseases perspective object internal providers material professional traditional understood rational due offer law direct never particularly contemporary essential importance

Topic 2, top 30 words:
minors theatre ngs parents deliberation essential climate technology address technologies reasons children external national child law acts central situation impact size interior action countries described structures developed economic particularly whose

Topic 3, top 30 words:
ifs diseases samples stakeholder projects pgt central ukk models benefits consumers professional greed heidegger core allow genotype hope participation third procedures peace appropriate existing established involves http technologies kantian require

Topic 4, top 30 words:
taken laws computing acts later csp technology far law provides interior children ngs projects christ http event theatre perspective

In [8]:
# Model several subjects together by passing them as one tuple.
render_topics(
    ('mathematics-and-computing', 'computational-biology-and-bioinformatics', 'nanoscience-and-technology'),
    num_topics=9,
    stem=False,
    filter_n_most_common_words=500,
)

('mathematics-and-computing', 'computational-biology-and-bioinformatics', 'nanoscience-and-technology')

Topic 1, top 30 words:
fractal scaling film problem variables snps mirna thermal equivalent self weight output transmission applications furthermore devices risk stress infection infected beam optimization identify overall diseases relatively mechanical critical solutions indicate

Topic 2, top 30 words:
pressure theta food pairs common insulin diseases reaction particles threshold directly dynamic particle cycle achieved indicate whether graphene overall devices stress mir hand connectivity rna impact statistical electric stability strategy

Topic 3, top 30 words:
edges community voltage connectivity likely detected part weighted sleep profiles cycle dynamic term way hand mechanical consistent global tested scaling quality dependence scattering zero computational condition dimensional mice find details

Topic 4, top 30 words:
sleep connectivity sequencing resonance fields dynamic c

In [None]:
# Run the topic model once per subject in the database, printing a ruler line
# after each report. Each fetched row is a 1-tuple holding the subject label.
subjects = cursor.execute('SELECT distinct topic FROM articles;').fetchall()

for (topic_name,) in subjects:
    render_topics(topic_name, num_topics=9, stem=False, filter_n_most_common_words=500)
    print('==================================================================================================')

biotechnology

Topic 1, top 30 words:
hscgn ago arg terminal wheat gaba mesh extraction ngo domain accumulation dms form taarg synthesis indicates infected phenamil hydrogel signaling scgn bar mir stable mirna insulin carbon enhanced gut fatty

Topic 2, top 30 words:
slt pcz salmonella antigen copulation success strain typhimurium strains flies pqk salt heterologous primers recombinant tcp male induction major cluster impact sgp repeats sea exposure silver lps males grown plasmid

Topic 3, top 30 words:
wound hydrogel silk mesh puma dga dms correlation needle prior agnps ecm infected crispr reduction breast mrna endothelial components amount impact csps imaging hydrogels targeted cpgs variation patterns transcriptome measurements

Topic 4, top 30 words:
zdiii cold phyb differentially regulation ago phya hbcag mesh wheat gfp produced salt eta cpg breast determine mir antigen measurements end shell indicates reduction regulatory seq compounds viral ssp plate

Topic 5, top 30 words:
subst

psychology

Topic 1, top 30 words:
depth anger plane punishment disparity violent coherence distraction engagement implicit posterior rumination points front back people functions applied stimulation investigate means games red input intervention vwm sizes psychological affect report

Topic 2, top 30 words:
orientation observers school videos amplitudes computed fit volume states rest symptoms identified feedback intervention hypnosis valence electrodes least involved sucrose anxiety dependent rpe viewing near treatment demonstrated dependence many people

Topic 3, top 30 words:
videos methylation vstm engagement anxiety video oxtr significance infants amplitudes amygdala schizophrenia females locations trait examined events volume sizes investigated turning patterns indicating load feedback durations childhood shape underlying health

Topic 4, top 30 words:
rake rats csf tool gender schizophrenia engagement old tone log disorder tactile common pair reading males towards partner violen