In [1]:
import sqlite3
import pandas as pd
from gensim import corpora, models, similarities
import nltk
from collections import Counter

In [2]:
# This work uses latent Dirichlet allocation to analyse journal articles from Nature.com
# Nature doesn't seem to have an API so the documents were scraped and stored in a SQLite database
# The sqlite database can be extracted from the zip folder called article_db
# The scripts used to scrape the data: collect-articles-html.py, process-scraped-html.py

# Open the scraped-article database; `conn` and `cursor` are reused by later cells.
conn = sqlite3.connect('./database/nature_articles.db')
cursor = conn.cursor()

# How many distinct titles exist among rows with wc > 1500.
cursor.execute('SELECT count(distinct title) FROM articles WHERE wc > 1500;')
num_articles = cursor.fetchone()[0]
print('Number of unquie articles in dataset: ', num_articles)

# Load the same selection, shuffled by the database, into a DataFrame.
# NOTE(review): DISTINCT applies to the whole selected row, not only to `title`, so this
# frame can hold more rows than `num_articles` if one title is stored with differing
# text/url/journal/date values -- confirm against the data.
df = pd.read_sql_query(
    "SELECT distinct(title), text, url, journal, date FROM articles WHERE wc > 1500 ORDER BY random();",
    conn,
)
df.head()

Number of unquie articles in dataset:  3147


Unnamed: 0,title,text,url,journal,date
0,Formulating formation mechanism of natural gas...,a large amount of energy perhaps twice the tot...,http://www.nature.com/articles/s41598-017-06717-8,Scientific Reports,25 July 2017
1,Endolysosomal targeting of a clinical chlorin ...,a major problem with many promising nano sized...,http://www.nature.com/articles/s41598-017-06109-y,Scientific Reports,20 July 2017
2,"The trophic role of a large marine predator, t...",tiger sharks were sampled off the western ning...,http://www.nature.com/articles/s41598-017-07751-2,Scientific Reports,09 August 2017
3,Projection of American dustiness in the late 2...,climate models project rising drought risks ov...,http://www.nature.com/articles/s41598-017-05431-9,Scientific Reports,17 July 2017
4,Programmable Deployment of Tensegrity Structur...,tensegrity structures with detachedstruts are ...,http://www.nature.com/articles/s41598-017-03412-6,Scientific Reports,14 June 2017


In [3]:
# Pull a single randomly chosen row (title, topic, text) and print it in full.
random_rows = cursor.execute(
    "SELECT title, topic, text FROM articles ORDER BY random() LIMIT 1;"
).fetchall()
title, subject, article = random_rows[0]

print("\n", title)
print("\nSubject:", subject)
print("\n\t", article)


 Global Trends and Regional Variations in Studies of HIV/AIDS

Subject: psychology

	 we conduct textual analysis of a sample of more than papers written on hiv aids during the past three decades using the latent dirichlet allocation method we disentangle studies that address behavioral and social aspects from other studies and measure the trends of different topics as related to hiv aids we show that there is a regional variation in scientists approach to the problem of hiv aids our results show that controlling for the economy proximity to the hiv aids problem correlates with the extent to which scientists look at the behavioral and social aspects of the disease rather than biomedical introduction since it was first detected in the early s aids has been studied by scientists around the world biomedical and epidemiological scientists have made substantial progress in understanding and controlling the disease over time in the early years hiv was discovered to be the cause of aids late

In [4]:
# Every distinct value of the `topic` column, as a list of one-element rows.
subjects = cursor.execute("SELECT distinct topic FROM articles;").fetchall()

print("Subjects in dataset:\n")
for (topic_name,) in subjects:
    print('\t', topic_name)

Subjects in dataset:

	 biotechnology
	 anatomy
	 anthropology
	 physics
	 psychology
	 mathematics-and-computing
	 computational-biology-and-bioinformatics
	 ecology
	 cell-biology
	 microbiology
	 biogeochemistry
	 zoology
	 climate-sciences
	 neuroscience
	 genetics
	 cancer
	 plant-sciences
	 immunology
	 chemical-biology
	 chemistry
	 evolution
	 stem-cells
	 ocean-sciences
	 diseases
	 molecular-medicine
	 engineering
	 materials-science
	 nanoscience-and-technology
	 drug-discovery
	 philosophy
	 business-and-industry
	 developmental-biology


In [6]:
def render_topics(subjects, num_topics=3, stem=False, filter_n_most_common_words=500, num_words=30,
                  random_state=None):
    """Fit an LDA model on the articles of the given subject(s) and print each topic's top words.

    Articles are read through the module-level ``conn`` connection. Only rows with
    ``wc > 1500`` whose ``topic`` equals one of ``subjects`` are used.

    Parameters
    ----------
    subjects : str or iterable of str
        A single topic name, or any iterable (tuple, list, ...) of topic names.
    num_topics : int
        Number of LDA topics to fit and to print.
    stem : bool
        If true, tokens are stemmed with ``stem_and_stopword_filter``; otherwise
        ``stopword_filter`` is used.
    filter_n_most_common_words : int
        The N most frequent whitespace-separated tokens of the selected articles
        are added to the stopword set before filtering.
    num_words : int
        Number of words printed per topic.
    random_state : optional
        Forwarded unchanged to ``models.LdaMulticore``. The default ``None`` keeps
        the previous unseeded behaviour; pass a seed to aim for repeatable runs.

    Raises
    ------
    ValueError
        If ``subjects`` is empty or no article matches it.
    """
    # Accept a bare string as shorthand for a one-element selection.
    topic_names = [subjects] if isinstance(subjects, str) else list(subjects)
    if not topic_names:
        raise ValueError("subjects must contain at least one topic name")

    # Bind topic names as query parameters instead of formatting them into the SQL text:
    # the repr of a one-element tuple ("('x',)") or of a list ("['x']") is not valid SQL,
    # and formatted values are not escaped.
    placeholders = ", ".join("?" * len(topic_names))
    query = ("SELECT distinct(title), text FROM articles "
             "WHERE wc > 1500 and topic IN ({});".format(placeholders))
    df = pd.read_sql_query(query, conn, params=topic_names)
    if df.empty:
        raise ValueError("no articles found for subjects: {!r}".format(subjects))

    raw_docs = df['text'].values

    # Count tokens with the same whitespace split the filter functions use, so empty
    # strings produced by repeated spaces cannot occupy a "most common word" slot.
    wcount = Counter(word for doc in raw_docs for word in doc.split())

    # A set gives constant-time membership tests inside the per-token filters.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    # Terms used as section titles in most research papers.
    stopwords.update(['introduction', 'conclusion'])
    stopwords.update(word for word, _ in wcount.most_common(filter_n_most_common_words))

    token_filter = stem_and_stopword_filter if stem else stopword_filter
    docs = [token_filter(doc, stopwords) for doc in raw_docs]

    dictionary = corpora.Dictionary(docs)
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    lda_model = models.LdaMulticore(corpus, id2word=dictionary, num_topics=num_topics,
                                    random_state=random_state)
    # Ask for all fitted topics explicitly; relying on show_topics' own default limit
    # would print only part of a model fitted with many topics.
    topics = lda_model.show_topics(num_topics=num_topics, formatted=False, num_words=num_words)

    print(subjects)

    for position, (_, word_weights) in enumerate(topics, start=1):
        print("\nTopic {}, top {} words:".format(position, num_words))
        print(" ".join(word for word, _ in word_weights))
        
    
        
        
def stem_and_stopword_filter(text, filter_list):
    """Split ``text`` on whitespace, drop unwanted tokens and stem the rest.

    A token is dropped when it appears in ``filter_list`` or is two characters
    long or shorter. Surviving tokens are passed through an English Snowball
    stemmer and returned as a list in their original order.
    """
    stemmer = nltk.stem.snowball.SnowballStemmer('english')
    stems = []
    for token in text.split():
        if token in filter_list or len(token) <= 2:
            continue
        stems.append(stemmer.stem(token))
    return stems

def stopword_filter(text, filter_list):
    """Split ``text`` on whitespace and return the tokens worth keeping.

    A token is kept only if it is absent from ``filter_list`` and longer than
    two characters; order and duplicates are preserved.
    """
    kept = []
    for token in text.split():
        if token in filter_list or len(token) <= 2:
            continue
        kept.append(token)
    return kept

In [8]:
# Subjects to analyse for topics.
# Several subjects are given as a tuple of strings,
#   e.g. subjects = ('philosophy', 'nanoscience-and-technology', 'biotechnology')
# A single subject is given as a plain string: parentheses without a trailing comma,
# as in ('philosophy'), do not create a tuple -- the value is just the string.
subjects = 'philosophy'

render_topics(subjects, num_topics=9, stem=False, filter_n_most_common_words=500)

philosophy

Topic 1, top 30 words:
environmental deliberation ssh particularly funding actors standard climate digital actual benefits types national seen kantian music models traditional literature instance projects ipcc past city perspective central external politics heidegger several

Topic 2, top 30 words:
ssh christ funding collaboration identity interior environment field attitudes projects theatre stakeholders taken published transcendence light disciplines suggest single never exist integration size major forces basic peace final main programmes

Topic 3, top 30 words:
interior ngs technologies ipcc seems children samples child christ national views hope minors empirical law procedures impact regard address peace described acts private deliberation light critique later greater essential existing

Topic 4, top 30 words:
pgt consumers communication mohr screening diseases immanence existing ukk theatre concerns internal appropriate technologies every directly taken sloterdijk ont