In [1]:
import sqlite3
import pandas as pd
from gensim import corpora, models, similarities
import nltk
from collections import Counter

In [2]:
# This work uses latent Dirichlet allocation to analyse journal articles from Nature.com.
# Nature doesn't seem to have an API, so the documents were scraped and stored in a SQLite database.
# The SQLite database can be extracted from the zip folder called article_db.
# The scripts used to scrape the data: collect-articles-html.py, process-scraped-html.py

conn = sqlite3.connect('./database/nature_articles.db')
cursor = conn.cursor()

# Count the distinct titles among rows with wc > 1500 (the same filter used for df below).
# An aggregate query always yields exactly one row, so fetchone() is sufficient.
num_articles = cursor.execute(
    'SELECT count(distinct title) FROM articles WHERE wc > 1500;'
).fetchone()[0]
print('Number of unique articles in dataset: ', num_articles)

# In SQL, DISTINCT applies to the whole selected row, so `distinct(title), text, ...`
# would still return a title more than once whenever any other column differs.
# Load the rows in random order and de-duplicate on title explicitly, keeping the
# first occurrence, so the frame holds one row per title.
# NOTE(review): count(distinct title) ignores NULL titles while drop_duplicates keeps
# one such row, so len(df) == num_articles only if no title is NULL -- confirm.
df = (
    pd.read_sql_query(
        "SELECT title, text, url, journal, date FROM articles WHERE wc > 1500 ORDER BY random();",
        conn,
    )
    .drop_duplicates(subset='title')
    .reset_index(drop=True)
)
df.head()

Number of unquie articles in dataset:  3147


Unnamed: 0,title,text,url,journal,date
0,Liquid-phase sequence capture and targeted re-...,tomato solanum lycopersicum l plants are chara...,http://www.nature.com/articles/s41598-017-06120-3,Scientific Reports,17 July 2017
1,The European Society of Human Genetics: beginn...,the european society of human genetics eshg wa...,http://www.nature.com/ejhg/journal/vaop/ncurre...,European Journal of Human Genetics,10 May 2017
2,Midnolin is a novel regulator of parkin expres...,midnolin midn was first discovered in embryoni...,http://www.nature.com/articles/s41598-017-05456-0,Scientific Reports,19 July 2017
3,Cardiac spheroids as promising models to stu...,three dimensional in vitro cell systems are a ...,http://www.nature.com/articles/s41598-017-06385-8,Scientific Reports,01 August 2017
4,Lower school performance in late chronotypes: ...,success at school determines future career opp...,http://www.nature.com/articles/s41598-017-04076-y,Scientific Reports,29 June 2017


In [4]:
# Pull a single randomly chosen row from the articles table and print its
# title, topic and full text.
random_article_sql = "SELECT title, topic, text FROM articles ORDER BY random() LIMIT 1;"
sampled_rows = cursor.execute(random_article_sql).fetchall()

# LIMIT 1 means at most one row comes back; unpack its three selected columns.
title, subject, article = sampled_rows[0]

print("\n", title)
print("\nSubject:", subject)
print("\n\t", article)


 A genomic perspective on stoichiometric regulation of soil carbon cycling

Subject: climate-sciences

	 similar to plant growth soil carbon c cycling is constrained by the availability of nitrogen n and phosphorus p we hypothesized that stoichiometric control over soil microbial c cycling may be shaped by functional guilds with distinct nutrient substrate preferences across a series of rice fields spanning soil c n p from to c turnover was best correlated with p availability and increased with experimental n addition only in lower c mineral soils with n p microbial community membership also varied with soil stoichiometry but not with n addition shotgun metagenome data revealed changes in community functions with increasing c turnover including a shift from aromatic c to carbohydrate utilization accompanied by lower n uptake and p scavenging similar patterns of c n and p acquisition along with higher ribosomal rna operon copy numbers distinguished that microbial taxa positively correl

In [5]:
# Fetch every distinct value of the `topic` column and print one per line.
distinct_topics_sql = "SELECT distinct topic FROM articles;"
subjects = cursor.execute(distinct_topics_sql).fetchall()

print("Subjects in dataset:\n")
# Each fetched row is a 1-tuple holding the single selected column; unpack it directly.
for (topic_name,) in subjects:
    print('\t', topic_name)

Subjects in dataset:

	 biotechnology
	 anatomy
	 anthropology
	 physics
	 psychology
	 mathematics-and-computing
	 computational-biology-and-bioinformatics
	 ecology
	 cell-biology
	 microbiology
	 biogeochemistry
	 zoology
	 climate-sciences
	 neuroscience
	 genetics
	 cancer
	 plant-sciences
	 immunology
	 chemical-biology
	 chemistry
	 evolution
	 stem-cells
	 ocean-sciences
	 diseases
	 molecular-medicine
	 engineering
	 materials-science
	 nanoscience-and-technology
	 drug-discovery
	 philosophy
	 business-and-industry
	 developmental-biology
