## Pub Med Scraper

In [1]:
# scrape dependencies
import requests
import re
from bs4 import BeautifulSoup as bs

# data analysis dependencies
import pandas as pd
import numpy as np
import csv

# ipynb dependencies
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import warnings
warnings.filterwarnings('ignore')

# viz dependencies
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

import datetime as dt
import time

In [2]:
# set the url to scrape
url = 'https://www.ncbi.nlm.nih.gov/pubmed/?term=parkinsons+disease'
print(url)

https://www.ncbi.nlm.nih.gov/pubmed/?term=parkinsons+disease


In [3]:
# set up beautiful soup to scrape
response = requests.get(url)
soup = bs(response.text, 'html.parser')

In [4]:
# lets scrape the article titles
journals = soup.find_all("p", attrs={'class':'title'})

In [5]:
# searching for the journal titles
journals_len = len(journals)
print(f"There are {journals_len} journals to scrape on the first page.")

There are 20 journals to scrape on the first page.


In [6]:
# loop through journals to print titles
for i in range(0,20):
    journals[i].text.strip()

'[Parkinson Disease and Pain - diagnostic and therapeutic approaches to a challenging non-motor symptom].'

"Assessing the validity of proxy caregiver reporting for potential palliative care outcome measures in Parkinson's disease."

"Resting-state connectivity after visuo-motor skill learning is inversely associated with offline consolidation in Parkinson's disease and healthy controls."

"Effects of blackberry (Morus nigra) fruit juice on levodopa-induced dyskinesia in a mice model of Parkinson's disease."

'Testing the Protein Propagation Hypothesis of Parkinson Disease.'

'Understanding the Care Needs and Profile of People Living at Home With Moderate to Advanced Stage Parkinson Disease.'

'A chemical probe to monitor the parkinsonism-associated protein DJ-1 in live cells.'

"Recruitment and Retention in Clinical Trials of Deep Brain Stimulation in Early-Stage Parkinson's Disease: Past Experiences and Future Considerations."

'Evolvability and Neurodegenerative Disease: Antagonistic Pleiotropy Phenomena Derived from Amyloid Aggregates.'

'Aging modulates microglia phenotypes in neuroinflammation of MPTP-PD mice.'

'The Sinister Face of Heme Oxygenase-1 in Brain Aging and Disease.'

'In vivo Quantification of Glial Activation in Minipigs Overexpressing human α-synuclein.'

"The REM Sleep Behavior Disorder Screening Questionnaire is not Valid in De Novo Parkinson's Disease."

'Immunosuppressants and risk of Parkinson disease.'

"Do We Need to Rethink the Epidemiology and Healthcare Utilization of Parkinson's Disease in Germany?"

"Hypermetabolism in the cerebellum and brainstem and cortical hypometabolism are independently associated with cognitive impairment in Parkinson's disease."

'Sensorimotor integration training in Parkinson`s disease.'

'Professional occupation and the risk of Parkinson Disease.'

'Target-specific forebrain projections and appropriate synaptic inputs of hESC-derived dopamine neurons grafted to the midbrain of parkinsonian rats.'

'Apomorphine and levodopa infusion for motor fluctuations and dyskinesia in advanced Parkinson disease.'

## Set main url to concat with pubmed ids

In [7]:
# set the main url that we will concatanate with the pubmed id
main_url = 'https://www.ncbi.nlm.nih.gov/pubmed/'
print(main_url)

https://www.ncbi.nlm.nih.gov/pubmed/


<br>
<br>
# TESTING SECTION ------------------------------------------------------------------

In [8]:
# set empty links_all list to append to 
links_all = []

# set pubmed ids list to append to
pubmed_ids = []

# set empty list to append scrape_links to
scrape_links = []

In [9]:
# use bs to scarpe p tags with class - title
links = soup.find_all("p",attrs={'class':'title'})
print(links[0])

<p class="title" xmlns:mml="http://www.w3.org/1998/Math/MathML"><a href="/pubmed/30016807" ref="ordinalpos=1&amp;ncbi_uid=30016807&amp;link_uid=30016807&amp;linksrc=docsum_title">[<b>Parkinson Disease</b> and Pain - diagnostic and therapeutic approaches to a challenging non-motor symptom].</a></p>


In [10]:
# testing to see how my links / journals to scrape
articles_to_scrape = len(links)
print(f"There are {articles_to_scrape} articles to scrape.")
print("----------------------------------------------")

There are 20 articles to scrape.
----------------------------------------------


In [11]:
# loop through links to convert to string
for i in range (len(links)):
    links_all.append(str(links[i]))
    print(links[i])
    print("----------------------------------------------")

<p class="title" xmlns:mml="http://www.w3.org/1998/Math/MathML"><a href="/pubmed/30016807" ref="ordinalpos=1&amp;ncbi_uid=30016807&amp;link_uid=30016807&amp;linksrc=docsum_title">[<b>Parkinson Disease</b> and Pain - diagnostic and therapeutic approaches to a challenging non-motor symptom].</a></p>
----------------------------------------------
<p class="title" xmlns:mml="http://www.w3.org/1998/Math/MathML"><a href="/pubmed/30015552" ref="ordinalpos=2&amp;ncbi_uid=30015552&amp;link_uid=30015552&amp;linksrc=docsum_title">Assessing the validity of proxy caregiver reporting for potential palliative care outcome measures in <b>Parkinson</b>'s <b>disease</b>.</a></p>
----------------------------------------------
<p class="title" xmlns:mml="http://www.w3.org/1998/Math/MathML"><a href="/pubmed/30015056" ref="ordinalpos=3&amp;ncbi_uid=30015056&amp;link_uid=30015056&amp;linksrc=docsum_title">Resting-state connectivity after visuo-motor skill learning is inversely associated with offline consoli

In [12]:
# for i in links_all:
#     print(i)
#     print("----------------------------------------------")

In [13]:
# slice through links_all to test
len(links_all)
links_all[5]

20

'<p class="title" xmlns:mml="http://www.w3.org/1998/Math/MathML"><a href="/pubmed/30012028" ref="ordinalpos=6&amp;ncbi_uid=30012028&amp;link_uid=30012028&amp;linksrc=docsum_title">Understanding the Care Needs and Profile of People Living at Home With Moderate to Advanced Stage <b>Parkinson Disease</b>.</a></p>'

### Regex test

In [14]:
regex_test = '<p class="title" xmlns:mml="http://www.w3.org/1998/Math/MathML"><a href="/pubmed/29987868" ref="ordinalpos=6&amp;ncbi_uid=29987868&amp;link_uid=29987868&amp;linksrc=docsum_title">Personal computer-based cognitive training in <b>Parkinson</b>\'s <b>disease</b>: a case study.</a></p>'

In [15]:
print(regex_test)

<p class="title" xmlns:mml="http://www.w3.org/1998/Math/MathML"><a href="/pubmed/29987868" ref="ordinalpos=6&amp;ncbi_uid=29987868&amp;link_uid=29987868&amp;linksrc=docsum_title">Personal computer-based cognitive training in <b>Parkinson</b>'s <b>disease</b>: a case study.</a></p>


In [16]:
re.findall(r'\d{8}',regex_test)

['29987868', '29987868', '29987868']

### End Regex Test

In [17]:
# loop through links all and use regex to grab the id numbers
for i in range (len(links_all)):
    pubmed_ids.append(re.findall(r'\d{8}',links_all[i]))

In [18]:
# print out info for pubmed_ids
len(pubmed_ids)
type(pubmed_ids)
print(pubmed_ids)
print("----------------------------------------------")

20

list

[['30016807', '30016807', '30016807'], ['30015552', '30015552', '30015552'], ['30015056', '30015056', '30015056'], ['30013404', '30013404', '30013404'], ['30013389', '30013389', '30013389'], ['30012028', '30012028', '30012028'], ['30011180', '30011180', '30011180'], ['30010145', '30010145', '30010145'], ['30010144', '30010144', '30010144'], ['30009921', '30009921', '30009921'], ['30009872', '30009872', '30009872'], ['30009467', '30009467', '30009467'], ['30009211', '30009211', '30009211'], ['30009205', '30009205', '30009205'], ['30008693', '30008693', '30008693'], ['30008111', '30008111', '30008111'], ['30007996', '30007996', '30007996'], ['30007105', '30007105', '30007105'], ['30007046', '30007046', '30007046'], ['30006821', '30006821', '30006821']]
----------------------------------------------


In [19]:
# use itertools to transform pubmed ids from an array withn an array into one list
import itertools
pubmed_merged = list(itertools.chain.from_iterable(pubmed_ids))

In [20]:
# slice through pubmed_merged to see what itertools did
pubmed_merged[0]
print(pubmed_merged)

'30016807'

['30016807', '30016807', '30016807', '30015552', '30015552', '30015552', '30015056', '30015056', '30015056', '30013404', '30013404', '30013404', '30013389', '30013389', '30013389', '30012028', '30012028', '30012028', '30011180', '30011180', '30011180', '30010145', '30010145', '30010145', '30010144', '30010144', '30010144', '30009921', '30009921', '30009921', '30009872', '30009872', '30009872', '30009467', '30009467', '30009467', '30009211', '30009211', '30009211', '30009205', '30009205', '30009205', '30008693', '30008693', '30008693', '30008111', '30008111', '30008111', '30007996', '30007996', '30007996', '30007105', '30007105', '30007105', '30007046', '30007046', '30007046', '30006821', '30006821', '30006821']


In [21]:
# concat main_url with a slice of pubmed_merged before we loop
print(main_url + str(pubmed_merged[0]))

https://www.ncbi.nlm.nih.gov/pubmed/30016807


In [22]:
# append merged links to links_all
for i in range (len(pubmed_merged)):
    scrape_links.append(main_url + str(pubmed_merged[i]))

In [23]:
for i in scrape_links:
    print(i)

https://www.ncbi.nlm.nih.gov/pubmed/30016807
https://www.ncbi.nlm.nih.gov/pubmed/30016807
https://www.ncbi.nlm.nih.gov/pubmed/30016807
https://www.ncbi.nlm.nih.gov/pubmed/30015552
https://www.ncbi.nlm.nih.gov/pubmed/30015552
https://www.ncbi.nlm.nih.gov/pubmed/30015552
https://www.ncbi.nlm.nih.gov/pubmed/30015056
https://www.ncbi.nlm.nih.gov/pubmed/30015056
https://www.ncbi.nlm.nih.gov/pubmed/30015056
https://www.ncbi.nlm.nih.gov/pubmed/30013404
https://www.ncbi.nlm.nih.gov/pubmed/30013404
https://www.ncbi.nlm.nih.gov/pubmed/30013404
https://www.ncbi.nlm.nih.gov/pubmed/30013389
https://www.ncbi.nlm.nih.gov/pubmed/30013389
https://www.ncbi.nlm.nih.gov/pubmed/30013389
https://www.ncbi.nlm.nih.gov/pubmed/30012028
https://www.ncbi.nlm.nih.gov/pubmed/30012028
https://www.ncbi.nlm.nih.gov/pubmed/30012028
https://www.ncbi.nlm.nih.gov/pubmed/30011180
https://www.ncbi.nlm.nih.gov/pubmed/30011180
https://www.ncbi.nlm.nih.gov/pubmed/30011180
https://www.ncbi.nlm.nih.gov/pubmed/30010145
https://ww

# END TESTING SECTION------------------------------------------------------------
<br>
<br>

## Function to create array of links to scrape

In [24]:
# set empty links_all list to append to 
links_all = []

# set pubmed ids list to append to
pubmed_ids = []

# set empty list to append scrape_links to
scrape_links = []

# function to get links
def get_links(main_url):
    
    # use bs to scarpe p tags with class - title
    links = soup.find_all("p",attrs={'class':'title'})
      
    # testing to see how my links / journals to scrape
    articles_to_scrape = len(links)
    print(f"There are {articles_to_scrape} articles to scrape.")
    print("----------------------------------------------")
    
    # loop through links to convert to string
    for i in range (len(links)):
        links_all.append(str(links[i]))
        print(links[i])
        print("----------------------------------------------")
        
    # slice through links_all to test
    len(links_all)
    links_all[1]
    
    # loop through links all and use regex to grab the id numbers
    for i in range (len(links_all)):
        pubmed_ids.append(re.findall(r'\d{8}',links_all[i]))
    
    # print out info for pubmed_ids
    len(pubmed_ids)
    type(pubmed_ids)
    print(pubmed_ids)
    print("----------------------------------------------")
    
    # use itertools to transform pubmed ids from an array withn an array into one list
    import itertools
    pubmed_merged = list(itertools.chain.from_iterable(pubmed_ids))
    
    # slice through pubmed_merged to see what itertools did
    pubmed_merged[0]
    
    # concat main_url with a slice of pubmed_merged before we loop
    print(main_url + str(pubmed_merged[0]))
    
    # append merged links to links_all
    for i in range (len(pubmed_merged)):
        scrape_links.append(main_url + str(pubmed_merged[i]))

In [25]:
# RUN FUNCTION
get_links(main_url)

There are 20 articles to scrape.
----------------------------------------------
<p class="title" xmlns:mml="http://www.w3.org/1998/Math/MathML"><a href="/pubmed/30016807" ref="ordinalpos=1&amp;ncbi_uid=30016807&amp;link_uid=30016807&amp;linksrc=docsum_title">[<b>Parkinson Disease</b> and Pain - diagnostic and therapeutic approaches to a challenging non-motor symptom].</a></p>
----------------------------------------------
<p class="title" xmlns:mml="http://www.w3.org/1998/Math/MathML"><a href="/pubmed/30015552" ref="ordinalpos=2&amp;ncbi_uid=30015552&amp;link_uid=30015552&amp;linksrc=docsum_title">Assessing the validity of proxy caregiver reporting for potential palliative care outcome measures in <b>Parkinson</b>'s <b>disease</b>.</a></p>
----------------------------------------------
<p class="title" xmlns:mml="http://www.w3.org/1998/Math/MathML"><a href="/pubmed/30015056" ref="ordinalpos=3&amp;ncbi_uid=30015056&amp;link_uid=30015056&amp;linksrc=docsum_title">Resting-state connectivi

<br>
There are duplicates in our **scrape_links** array. Use `list` to delete the duplicates.

In [26]:
# delete duplicates in scrape_links and assign to new variable scrape_links_final
scrape_links_final = list(set(scrape_links))
len(scrape_links_final)
scrape_links_final

20

['https://www.ncbi.nlm.nih.gov/pubmed/30008693',
 'https://www.ncbi.nlm.nih.gov/pubmed/30016807',
 'https://www.ncbi.nlm.nih.gov/pubmed/30013404',
 'https://www.ncbi.nlm.nih.gov/pubmed/30007105',
 'https://www.ncbi.nlm.nih.gov/pubmed/30015552',
 'https://www.ncbi.nlm.nih.gov/pubmed/30009467',
 'https://www.ncbi.nlm.nih.gov/pubmed/30006821',
 'https://www.ncbi.nlm.nih.gov/pubmed/30009921',
 'https://www.ncbi.nlm.nih.gov/pubmed/30007996',
 'https://www.ncbi.nlm.nih.gov/pubmed/30010144',
 'https://www.ncbi.nlm.nih.gov/pubmed/30009205',
 'https://www.ncbi.nlm.nih.gov/pubmed/30007046',
 'https://www.ncbi.nlm.nih.gov/pubmed/30012028',
 'https://www.ncbi.nlm.nih.gov/pubmed/30008111',
 'https://www.ncbi.nlm.nih.gov/pubmed/30013389',
 'https://www.ncbi.nlm.nih.gov/pubmed/30010145',
 'https://www.ncbi.nlm.nih.gov/pubmed/30011180',
 'https://www.ncbi.nlm.nih.gov/pubmed/30009211',
 'https://www.ncbi.nlm.nih.gov/pubmed/30015056',
 'https://www.ncbi.nlm.nih.gov/pubmed/30009872']

## Main array of links to scrape:

Here we use selenium to iterate through these links. Seleium will click on each link then scrape the title and abstracts on each page. 

In [27]:
# testing scrape_links
for i in scrape_links_final:
    print(i)

https://www.ncbi.nlm.nih.gov/pubmed/30008693
https://www.ncbi.nlm.nih.gov/pubmed/30016807
https://www.ncbi.nlm.nih.gov/pubmed/30013404
https://www.ncbi.nlm.nih.gov/pubmed/30007105
https://www.ncbi.nlm.nih.gov/pubmed/30015552
https://www.ncbi.nlm.nih.gov/pubmed/30009467
https://www.ncbi.nlm.nih.gov/pubmed/30006821
https://www.ncbi.nlm.nih.gov/pubmed/30009921
https://www.ncbi.nlm.nih.gov/pubmed/30007996
https://www.ncbi.nlm.nih.gov/pubmed/30010144
https://www.ncbi.nlm.nih.gov/pubmed/30009205
https://www.ncbi.nlm.nih.gov/pubmed/30007046
https://www.ncbi.nlm.nih.gov/pubmed/30012028
https://www.ncbi.nlm.nih.gov/pubmed/30008111
https://www.ncbi.nlm.nih.gov/pubmed/30013389
https://www.ncbi.nlm.nih.gov/pubmed/30010145
https://www.ncbi.nlm.nih.gov/pubmed/30011180
https://www.ncbi.nlm.nih.gov/pubmed/30009211
https://www.ncbi.nlm.nih.gov/pubmed/30015056
https://www.ncbi.nlm.nih.gov/pubmed/30009872


In [28]:
# slice out scrape_links_final so we can scrape 5 articles at a time
links_1 = scrape_links_final[0:5]
links_1

links_2 = scrape_links_final[5:10]
links_2

links_3 = scrape_links_final[10:15]
links_3

links_4 = scrape_links_final[15:20]
links_4

['https://www.ncbi.nlm.nih.gov/pubmed/30008693',
 'https://www.ncbi.nlm.nih.gov/pubmed/30016807',
 'https://www.ncbi.nlm.nih.gov/pubmed/30013404',
 'https://www.ncbi.nlm.nih.gov/pubmed/30007105',
 'https://www.ncbi.nlm.nih.gov/pubmed/30015552']

['https://www.ncbi.nlm.nih.gov/pubmed/30009467',
 'https://www.ncbi.nlm.nih.gov/pubmed/30006821',
 'https://www.ncbi.nlm.nih.gov/pubmed/30009921',
 'https://www.ncbi.nlm.nih.gov/pubmed/30007996',
 'https://www.ncbi.nlm.nih.gov/pubmed/30010144']

['https://www.ncbi.nlm.nih.gov/pubmed/30009205',
 'https://www.ncbi.nlm.nih.gov/pubmed/30007046',
 'https://www.ncbi.nlm.nih.gov/pubmed/30012028',
 'https://www.ncbi.nlm.nih.gov/pubmed/30008111',
 'https://www.ncbi.nlm.nih.gov/pubmed/30013389']

['https://www.ncbi.nlm.nih.gov/pubmed/30010145',
 'https://www.ncbi.nlm.nih.gov/pubmed/30011180',
 'https://www.ncbi.nlm.nih.gov/pubmed/30009211',
 'https://www.ncbi.nlm.nih.gov/pubmed/30015056',
 'https://www.ncbi.nlm.nih.gov/pubmed/30009872']

## Regex Notes

In [29]:
# Regex
# Identifiers:
# \d any number
# \D anything but a number
# \s space
# \S anything but a space
# \w any character
# \W anything but a character
# . any character, except for a newline
# \b the whitespace around words
# \. a period

# Modifiers:
# {1,3} we're expecting 1-3 \d{1-3}
# + Match 1 or more
# ? Match 0 or more
# * Match 0 or more
# $ Match the end of a string
# ^ matching the beginning of a string
# | either or
# [] range or "variance" [A-Za-z] [1-5a-qA-Z]
# {x} expecting "x" amount

# White Space Characters: 
# \n new line
# \s space
# \t tab
# \e escape
# \f form feed
# \r return

# DONT FORGET!:
# . + * [] $ ^ () {} | \

## Selenium
**Web Browser Automation**

In [30]:
from splinter import Browser
from selenium import webdriver

In [31]:
# make sure chrome browser exe is in current directory
# chrome browser exe is not necessary for MACS
executable_path = {'executable_path': 'chromedriver'}

## Set up dictionary to append data to

In [32]:
article_dict = {}

## Set up SQLite

In [33]:
# Import sqlalchemy Dependencies
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine
from sqlalchemy import func
from sqlalchemy import Column, Float, Integer, String, Date
from sqlalchemy.ext.declarative import declarative_base

In [34]:
# Create an engine for the photopharm.sqlite database
engine = create_engine("sqlite:///photopharm.sqlite")


In [35]:
# Reflect Database into ORM classes
Base = declarative_base()

In [36]:
# Create ORM class for titles and abstracts
class Articles(Base):
    __tablename__ = "Title_Abstracts"
    
    __table_args__ = {'extend_existing': True} 
    
    id = Column(Integer, primary_key = True)
    title = Column(String)
    abstract = Column(String)
    

In [37]:
# create tables
Base.metadata.create_all(engine)

## Create get_article_info function

In [None]:
import json

In [38]:
title = []
abstract = []

def get_article_info(links_1):
    
    # iterate through articles
    for i in links_1:
        
        # sets up scraper
        browser = Browser('chrome', headless=False)
        html = browser.html
        response2 = requests.get(i)
        soup2 = bs(response2.text, 'html.parser')
    
        browser.visit(i)
    
        # there are two 'h1' tags on this page. slice out index 0
        title_one = soup2.find_all('h1')
        article_one_title = title_one[1].text.strip()
    
        # slice h1 at index 1 to grab article title
        title.append(article_one_title)
    
        # get abstract 
        # abstract.append(soup2.find("div", attrs={'class': 'rprt_all'}).text.strip())     
        abstract.append(soup2.find("div", attrs={'class': 'abstr'}).text.strip())
        
        article_dict["title"] = title
        article_dict["abstract"] = abstract

## Run article_info function

In [39]:
get_article_info(links_1)

In [None]:
get_article_info(links_2)

In [None]:
get_article_info(links_3)

In [None]:
get_article_info(links_4)

In [40]:
for i in title:
    print(i + "\n")

len(title)

Do We Need to Rethink the Epidemiology and Healthcare Utilization of Parkinson's Disease in Germany?

[Parkinson Disease and Pain - diagnostic and therapeutic approaches to a challenging non-motor symptom].

Effects of blackberry (Morus nigra) fruit juice on levodopa-induced dyskinesia in a mice model of Parkinson's disease.

Professional occupation and the risk of Parkinson Disease.

Assessing the validity of proxy caregiver reporting for potential palliative care outcome measures in Parkinson's disease.



5

In [41]:
for i in abstract:
    print(i)
    print("\n")

len(abstract)

AbstractEpidemiological aspects of Parkinson's disease (PD), co-occurring diseases and medical healthcare utilization of PD patients are still largely elusive. Based on claims data of 3.7 million statutory insurance members in Germany in 2015 the prevalence and incidence of PD was determined. PD cases had at least one main hospital discharge diagnosis of PD, or one physician diagnosis confirmed by a subsequent or independent diagnosis or by PD medication in 2015. Prevalence of (co-)occurring diseases, mortality, and healthcare measures in PD cases and matched controls were compared. In 2015, 21,714 prevalent PD cases (standardized prevalence: 511.4/100,000 persons) and 3,541 incident PD cases (standardized incidence: 84.1/100,000 persons) were identified. Prevalence of several (co-)occurring diseases/complications, e.g., dementia (PD/controls: 39/13%), depression (45/22%), bladder dysfunction (46/22%), and diabetes (35/31%), as well as mortality (10.7/5.8%) differed between PD cases an

5

## Add title and abstract to article_dict

In [42]:
#article_dict["title"].append(title)
#article_dict["abstract"].append(abstract)
print(article_dict)

{'title': ["Do We Need to Rethink the Epidemiology and Healthcare Utilization of Parkinson's Disease in Germany?", '[Parkinson Disease and Pain - diagnostic and therapeutic approaches to a challenging non-motor symptom].', "Effects of blackberry (Morus nigra) fruit juice on levodopa-induced dyskinesia in a mice model of Parkinson's disease.", 'Professional occupation and the risk of Parkinson Disease.', "Assessing the validity of proxy caregiver reporting for potential palliative care outcome measures in Parkinson's disease."], 'abstract': ["AbstractEpidemiological aspects of Parkinson's disease (PD), co-occurring diseases and medical healthcare utilization of PD patients are still largely elusive. Based on claims data of 3.7 million statutory insurance members in Germany in 2015 the prevalence and incidence of PD was determined. PD cases had at least one main hospital discharge diagnosis of PD, or one physician diagnosis confirmed by a subsequent or independent diagnosis or by PD medi

## Save article_dict to json

In [43]:
import json

In [44]:
json = json.dumps(article_dict)
f = open("parkinsons.json","w")
f.write(json)
f.close()

9899

In [45]:
# read in parkinsons json into a dataframe
df2 = pd.read_json('parkinsons.json')

In [47]:
# import json to sqlite
df2.to_sql(con=engine, index_label='id', name=Articles.__tablename__, if_exists='append')