## Pub Med Scraper

In [1]:
# scrape dependencies
import requests
import re
from bs4 import BeautifulSoup as bs

# data analysis dependencies
import pandas as pd
import numpy as np
import csv

# ipynb dependencies
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import warnings
warnings.filterwarnings('ignore')

# viz dependencies
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

import datetime as dt
import time

In [2]:
# set the url to scrape
url = 'https://www.ncbi.nlm.nih.gov/pubmed/?term=parkinsons+disease'
print(url)

https://www.ncbi.nlm.nih.gov/pubmed/?term=parkinsons+disease


In [3]:
# set up beautiful soup to scrape
response = requests.get(url)
soup = bs(response.text, 'html.parser')

In [4]:
# lets scrape the article titles
journals = soup.find_all("p", attrs={'class':'title'})

In [5]:
# searching for the journal titles
journals_len = len(journals)
print(f"There are {journals_len} journals to scrape on the first page.")

There are 20 journals to scrape on the first page.


In [7]:
# loop through journals to print titles
for i in range(0,20):
    journals[i].text.strip()

'Modeling, Detecting, and Tracking Freezing of Gait in Parkinson Disease using Inertial Sensors.'

'The lysosomal membrane protein LAMP2A promotes autophagic flux and prevents SNCA-induced Parkinson disease-like symptoms in the Drosophila brain.'

"LRRK2 G2019S Parkinson's disease with more benign phenotype than idiopathic."

"Bioactive constituents from cinnamon, hemp seed and polygonum cuspidatum protect against H2O2 but not rotenone toxicity in a cellular model of Parkinson's disease."

"Dilated Virchow-Robin space and Parkinson's disease: A case report of combined MRI and diffusion tensor imaging."

"Personal computer-based cognitive training in Parkinson's disease: a case study."

'Angiotensin Type 1 Receptor Antagonists Protect Against Alpha-Synuclein-Induced Neuroinflammation and Dopaminergic Neuron Death.'

'Quantitative Intensity Harmonization of Dopamine Transporter SPECT Images Using Gamma Mixture Models.'

"Inflammation: a highly conserved, Janus-like phenomenon-a gastroenterologist' perspective."

'Author response: A predictive model to identify Parkinson disease from administrative claims data.'

'Reader response: A predictive model to identify Parkinson disease from administrative claims data.'

"Editors' note: A predictive model to identify Parkinson disease from administrative claims data."

'Deliberate gait scissoring to overcome freezing of gait in Parkinson disease.'

'Association of Low Lysosomal Enzymes Activity With Brain Arterial Dilatation: A Pilot Study.'

"Quantitative assessment of upper limb functional impairments in people with Parkinson's disease."

'The efficacy of repetitive transcranial magnetic stimulation for Parkinson disease patients with depression.'

'Age-dependent determinants of antipsychotic use among newly admitted residents of skilled nursing facilities: A population-based study.'

'Beta-band oscillations in the supplementary motor cortex are modulated by levodopa and associated with functional activity in the basal ganglia.'

"Inflammatory bowel disease and risk of Parkinson's disease in medicare beneficiaries."

"Does nephrotic syndrome without chronic kidney disease increase the risk of Parkinson's disease and secondary parkinsonism? A nationwide population-based study in Taiwan."

## Set main url to concat with pubmed ids

In [8]:
# set the main url that we will concatanate with the pubmed id
main_url = 'https://www.ncbi.nlm.nih.gov/pubmed/'
print(main_url)

https://www.ncbi.nlm.nih.gov/pubmed/


<br>
<br>
# TESTING SECTION ------------------------------------------------------------------

In [9]:
# set empty links_all list to append to 
links_all = []

# set pubmed ids list to append to
pubmed_ids = []

# set empty list to append scrape_links to
scrape_links = []

In [10]:
# use bs to scarpe p tags with class - title
links = soup.find_all("p",attrs={'class':'title'})
print(links[0])

<p class="title" xmlns:mml="http://www.w3.org/1998/Math/MathML"><a href="/pubmed/29989948" ref="ordinalpos=1&amp;ncbi_uid=29989948&amp;link_uid=29989948&amp;linksrc=docsum_title">Modeling, Detecting, and Tracking Freezing of Gait in <b>Parkinson Disease</b> using Inertial Sensors.</a></p>


In [11]:
# testing to see how my links / journals to scrape
articles_to_scrape = len(links)
print(f"There are {articles_to_scrape} articles to scrape.")
print("----------------------------------------------")

There are 20 articles to scrape.
----------------------------------------------


In [12]:
# loop through links to convert to string
for i in range (len(links)):
    links_all.append(str(links[i]))
    print(links[i])
    print("----------------------------------------------")

<p class="title" xmlns:mml="http://www.w3.org/1998/Math/MathML"><a href="/pubmed/29989948" ref="ordinalpos=1&amp;ncbi_uid=29989948&amp;link_uid=29989948&amp;linksrc=docsum_title">Modeling, Detecting, and Tracking Freezing of Gait in <b>Parkinson Disease</b> using Inertial Sensors.</a></p>
----------------------------------------------
<p class="title" xmlns:mml="http://www.w3.org/1998/Math/MathML"><a href="/pubmed/29989488" ref="ordinalpos=2&amp;ncbi_uid=29989488&amp;link_uid=29989488&amp;linksrc=docsum_title">The lysosomal membrane protein LAMP2A promotes autophagic flux and prevents SNCA-induced <b>Parkinson disease</b>-like symptoms in the Drosophila brain.</a></p>
----------------------------------------------
<p class="title" xmlns:mml="http://www.w3.org/1998/Math/MathML"><a href="/pubmed/29989150" ref="ordinalpos=3&amp;ncbi_uid=29989150&amp;link_uid=29989150&amp;linksrc=docsum_title">LRRK2 G2019S <b>Parkinson</b>'s <b>disease</b> with more benign phenotype than idiopathic.</a></p

In [21]:
# for i in links_all:
#     print(i)
#     print("----------------------------------------------")

In [24]:
# slice through links_all to test
len(links_all)
links_all[5]

20

'<p class="title" xmlns:mml="http://www.w3.org/1998/Math/MathML"><a href="/pubmed/29987868" ref="ordinalpos=6&amp;ncbi_uid=29987868&amp;link_uid=29987868&amp;linksrc=docsum_title">Personal computer-based cognitive training in <b>Parkinson</b>\'s <b>disease</b>: a case study.</a></p>'

### Regex test

In [30]:
regex_test = '<p class="title" xmlns:mml="http://www.w3.org/1998/Math/MathML"><a href="/pubmed/29987868" ref="ordinalpos=6&amp;ncbi_uid=29987868&amp;link_uid=29987868&amp;linksrc=docsum_title">Personal computer-based cognitive training in <b>Parkinson</b>\'s <b>disease</b>: a case study.</a></p>'

In [31]:
print(regex_test)

<p class="title" xmlns:mml="http://www.w3.org/1998/Math/MathML"><a href="/pubmed/29987868" ref="ordinalpos=6&amp;ncbi_uid=29987868&amp;link_uid=29987868&amp;linksrc=docsum_title">Personal computer-based cognitive training in <b>Parkinson</b>'s <b>disease</b>: a case study.</a></p>


In [32]:
re.findall(r'\d{8}',regex_test)

['29987868', '29987868', '29987868']

### End Regex Test

In [33]:
# loop through links all and use regex to grab the id numbers
for i in range (len(links_all)):
    pubmed_ids.append(re.findall(r'\d{8}',links_all[i]))

In [34]:
# print out info for pubmed_ids
len(pubmed_ids)
type(pubmed_ids)
print(pubmed_ids)
print("----------------------------------------------")

20

list

[['29989948', '29989948', '29989948'], ['29989488', '29989488', '29989488'], ['29989150', '29989150', '29989150'], ['29989058', '29989058', '29989058'], ['29988793', '29988793', '29988793'], ['29987868', '29987868', '29987868'], ['29987762', '29987762', '29987762'], ['29987621', '29987621', '29987621'], ['29987405', '29987405', '29987405'], ['29987183', '29987183', '29987183'], ['29987182', '29987182', '29987182'], ['29987181', '29987181', '29987181'], ['29987176', '29987176', '29987176'], ['29986930', '29986930', '29986930'], ['29986276', '29986276', '29986276'], ['29985089', '29985089', '29985089'], ['29984493', '29984493', '29984493'], ['29984164', '29984164', '29984164'], ['29983328', '29983328', '29983328'], ['29982207', '29982207', '29982207']]
----------------------------------------------


In [35]:
# use itertools to transform pubmed ids from an array withn an array into one list
import itertools
pubmed_merged = list(itertools.chain.from_iterable(pubmed_ids))

In [42]:
# slice through pubmed_merged to see what itertools did
pubmed_merged[0]
print(pubmed_merged)

'29989948'

['29989948', '29989948', '29989948', '29989488', '29989488', '29989488', '29989150', '29989150', '29989150', '29989058', '29989058', '29989058', '29988793', '29988793', '29988793', '29987868', '29987868', '29987868', '29987762', '29987762', '29987762', '29987621', '29987621', '29987621', '29987405', '29987405', '29987405', '29987183', '29987183', '29987183', '29987182', '29987182', '29987182', '29987181', '29987181', '29987181', '29987176', '29987176', '29987176', '29986930', '29986930', '29986930', '29986276', '29986276', '29986276', '29985089', '29985089', '29985089', '29984493', '29984493', '29984493', '29984164', '29984164', '29984164', '29983328', '29983328', '29983328', '29982207', '29982207', '29982207']


In [43]:
# concat main_url with a slice of pubmed_merged before we loop
print(main_url + str(pubmed_merged[0]))

https://www.ncbi.nlm.nih.gov/pubmed/29989948


In [44]:
# append merged links to links_all
for i in range (len(pubmed_merged)):
    scrape_links.append(main_url + str(pubmed_merged[i]))

In [46]:
for i in scrape_links:
    print(i)

https://www.ncbi.nlm.nih.gov/pubmed/29989948
https://www.ncbi.nlm.nih.gov/pubmed/29989948
https://www.ncbi.nlm.nih.gov/pubmed/29989948
https://www.ncbi.nlm.nih.gov/pubmed/29989488
https://www.ncbi.nlm.nih.gov/pubmed/29989488
https://www.ncbi.nlm.nih.gov/pubmed/29989488
https://www.ncbi.nlm.nih.gov/pubmed/29989150
https://www.ncbi.nlm.nih.gov/pubmed/29989150
https://www.ncbi.nlm.nih.gov/pubmed/29989150
https://www.ncbi.nlm.nih.gov/pubmed/29989058
https://www.ncbi.nlm.nih.gov/pubmed/29989058
https://www.ncbi.nlm.nih.gov/pubmed/29989058
https://www.ncbi.nlm.nih.gov/pubmed/29988793
https://www.ncbi.nlm.nih.gov/pubmed/29988793
https://www.ncbi.nlm.nih.gov/pubmed/29988793
https://www.ncbi.nlm.nih.gov/pubmed/29987868
https://www.ncbi.nlm.nih.gov/pubmed/29987868
https://www.ncbi.nlm.nih.gov/pubmed/29987868
https://www.ncbi.nlm.nih.gov/pubmed/29987762
https://www.ncbi.nlm.nih.gov/pubmed/29987762
https://www.ncbi.nlm.nih.gov/pubmed/29987762
https://www.ncbi.nlm.nih.gov/pubmed/29987621
https://ww

# END TESTING SECTION------------------------------------------------------------
<br>
<br>

## Function to create array of links to scrape

In [None]:
# set empty links_all list to append to 
links_all = []

# set pubmed ids list to append to
pubmed_ids = []

# set empty list to append scrape_links to
scrape_links = []

# function to get links
def get_links(main_url):
    
    # use bs to scarpe p tags with class - title
    links = soup.find_all("p",attrs={'class':'title'})
      
    # testing to see how my links / journals to scrape
    articles_to_scrape = len(links)
    print(f"There are {articles_to_scrape} articles to scrape.")
    print("----------------------------------------------")
    
    # loop through links to convert to string
    for i in range (len(links)):
        links_all.append(str(links[i]))
        print(links[i])
        print("----------------------------------------------")
        
    # slice through links_all to test
    len(links_all)
    links_all[1]
    
    # loop through links all and use regex to grab the id numbers
    for i in range (len(links_all)):
        pubmed_ids.append(re.findall(r'\d{8}',links_all[i]))
    
    # print out info for pubmed_ids
    len(pubmed_ids)
    type(pubmed_ids)
    print(pubmed_ids)
    print("----------------------------------------------")
    
    # use itertools to transform pubmed ids from an array withn an array into one list
    import itertools
    pubmed_merged = list(itertools.chain.from_iterable(pubmed_ids))
    
    # slice through pubmed_merged to see what itertools did
    pubmed_merged[0]
    
    # concat main_url with a slice of pubmed_merged before we loop
    print(main_url + str(pubmed_merged[0]))
    
    # append merged links to links_all
    for i in range (len(pubmed_merged)):
        scrape_links.append(main_url + str(pubmed_merged[i]))

In [None]:
# RUN FUNCTION
get_links(main_url)

<br>
There are duplicates in our **scrape_links** array. Use `list` to delete the duplicates.

In [47]:
# delete duplicates in scrape_links and assign to new variable scrape_links_final
scrape_links_final = list(set(scrape_links))
len(scrape_links_final)
scrape_links_final

20

['https://www.ncbi.nlm.nih.gov/pubmed/29988793',
 'https://www.ncbi.nlm.nih.gov/pubmed/29987182',
 'https://www.ncbi.nlm.nih.gov/pubmed/29987183',
 'https://www.ncbi.nlm.nih.gov/pubmed/29986276',
 'https://www.ncbi.nlm.nih.gov/pubmed/29987762',
 'https://www.ncbi.nlm.nih.gov/pubmed/29985089',
 'https://www.ncbi.nlm.nih.gov/pubmed/29984164',
 'https://www.ncbi.nlm.nih.gov/pubmed/29987181',
 'https://www.ncbi.nlm.nih.gov/pubmed/29983328',
 'https://www.ncbi.nlm.nih.gov/pubmed/29989058',
 'https://www.ncbi.nlm.nih.gov/pubmed/29989948',
 'https://www.ncbi.nlm.nih.gov/pubmed/29989150',
 'https://www.ncbi.nlm.nih.gov/pubmed/29987868',
 'https://www.ncbi.nlm.nih.gov/pubmed/29987405',
 'https://www.ncbi.nlm.nih.gov/pubmed/29987176',
 'https://www.ncbi.nlm.nih.gov/pubmed/29982207',
 'https://www.ncbi.nlm.nih.gov/pubmed/29989488',
 'https://www.ncbi.nlm.nih.gov/pubmed/29986930',
 'https://www.ncbi.nlm.nih.gov/pubmed/29987621',
 'https://www.ncbi.nlm.nih.gov/pubmed/29984493']

## Main array of links to scrape:

Here we use selenium to iterate through these links. Seleium will click on each link then scrape the title and abstracts on each page. 

In [48]:
# testing scrape_links
for i in scrape_links_final:
    print(i)

https://www.ncbi.nlm.nih.gov/pubmed/29988793
https://www.ncbi.nlm.nih.gov/pubmed/29987182
https://www.ncbi.nlm.nih.gov/pubmed/29987183
https://www.ncbi.nlm.nih.gov/pubmed/29986276
https://www.ncbi.nlm.nih.gov/pubmed/29987762
https://www.ncbi.nlm.nih.gov/pubmed/29985089
https://www.ncbi.nlm.nih.gov/pubmed/29984164
https://www.ncbi.nlm.nih.gov/pubmed/29987181
https://www.ncbi.nlm.nih.gov/pubmed/29983328
https://www.ncbi.nlm.nih.gov/pubmed/29989058
https://www.ncbi.nlm.nih.gov/pubmed/29989948
https://www.ncbi.nlm.nih.gov/pubmed/29989150
https://www.ncbi.nlm.nih.gov/pubmed/29987868
https://www.ncbi.nlm.nih.gov/pubmed/29987405
https://www.ncbi.nlm.nih.gov/pubmed/29987176
https://www.ncbi.nlm.nih.gov/pubmed/29982207
https://www.ncbi.nlm.nih.gov/pubmed/29989488
https://www.ncbi.nlm.nih.gov/pubmed/29986930
https://www.ncbi.nlm.nih.gov/pubmed/29987621
https://www.ncbi.nlm.nih.gov/pubmed/29984493


In [60]:
# slice out scrape_links_final so we can scrape 5 articles at a time
links_1 = scrape_links_final[0:5]
links_1

links_2 = scrape_links_final[5:10]
links_2

links_3 = scrape_links_final[10:15]
links_3

links_4 = scrape_links_final[15:20]
links_4

['https://www.ncbi.nlm.nih.gov/pubmed/29988793',
 'https://www.ncbi.nlm.nih.gov/pubmed/29987182',
 'https://www.ncbi.nlm.nih.gov/pubmed/29987183',
 'https://www.ncbi.nlm.nih.gov/pubmed/29986276',
 'https://www.ncbi.nlm.nih.gov/pubmed/29987762']

['https://www.ncbi.nlm.nih.gov/pubmed/29985089',
 'https://www.ncbi.nlm.nih.gov/pubmed/29984164',
 'https://www.ncbi.nlm.nih.gov/pubmed/29987181',
 'https://www.ncbi.nlm.nih.gov/pubmed/29983328',
 'https://www.ncbi.nlm.nih.gov/pubmed/29989058']

['https://www.ncbi.nlm.nih.gov/pubmed/29989948',
 'https://www.ncbi.nlm.nih.gov/pubmed/29989150',
 'https://www.ncbi.nlm.nih.gov/pubmed/29987868',
 'https://www.ncbi.nlm.nih.gov/pubmed/29987405',
 'https://www.ncbi.nlm.nih.gov/pubmed/29987176']

['https://www.ncbi.nlm.nih.gov/pubmed/29982207',
 'https://www.ncbi.nlm.nih.gov/pubmed/29989488',
 'https://www.ncbi.nlm.nih.gov/pubmed/29986930',
 'https://www.ncbi.nlm.nih.gov/pubmed/29987621',
 'https://www.ncbi.nlm.nih.gov/pubmed/29984493']

## Regex Notes

In [None]:
# Regex
# Identifiers:
# \d any number
# \D anything but a number
# \s space
# \S anything but a space
# \w any character
# \W anything but a character
# . any character, except for a newline
# \b the whitespace around words
# \. a period

# Modifiers:
# {1,3} we're expecting 1-3 \d{1-3}
# + Match 1 or more
# ? Match 0 or more
# * Match 0 or more
# $ Match the end of a string
# ^ matching the beginning of a string
# | either or
# [] range or "variance" [A-Za-z] [1-5a-qA-Z]
# {x} expecting "x" amount

# White Space Characters: 
# \n new line
# \s space
# \t tab
# \e escape
# \f form feed
# \r return

# DONT FORGET!:
# . + * [] $ ^ () {} | \

## Selenium
**Web Browser Automation**

In [50]:
from splinter import Browser
from selenium import webdriver

In [51]:
# make sure chrome browser exe is in current directory
# chrome browser exe is not necessary for MACS
executable_path = {'executable_path': 'chromedriver'}

## Set up dictionary to append data to

In [52]:
article_dict = {"title": [],
               "abstract": []}

## Create get_article_info function

In [2]:
title = []
abstract = []

def get_article_info(links_1):
    
    # iterate through articles
    for i in links_1:
        
        # sets up scraper
        browser = Browser('chrome', headless=False)
        html = browser.html
        response2 = requests.get(i)
        soup2 = bs(response2.text, 'html.parser')
    
        browser.visit(i)
    
        # there are two 'h1' tags on this page. slice out index 0
        title_one = soup2.find_all('h1')
        article_one_title = title_one[1].text.strip()
    
        # slice h1 at index 1 to grab article title
        title.append(article_one_title)
    
        # get abstract 
        # abstract.append(soup2.find("div", attrs={'class': 'rprt_all'}).text.strip())     
        abstract.append(soup2.find("div", attrs={'class': 'rprt abstract'}).text.strip())

## Run article_info function

In [65]:
get_article_info(links_1)

In [68]:
get_article_info(links_2)

In [69]:
get_article_info(links_3)

In [70]:
get_article_info(links_4)

In [3]:
for i in title:
    print(i + "\n")

len(title)

0

In [72]:
for i in abstract:
    print(i)
    print("\n")

len(abstract)

Radiol Case Rep. 2018 Jun 30;13(4):871-877. doi: 10.1016/j.radcr.2018.05.011. eCollection  2018 Aug.Dilated Virchow-Robin space and Parkinson's disease: A case report of combined MRI and diffusion tensor imaging.Conforti R1, Sardaro A2, Negro A1, Caiazzo G3, Paccone A3, De Micco R4, Cirillo S1, Tessitore A4.Author information1Università degli Studi della Campania Luigi Vanvitelli, Section of Neuroradiology, viale Colli Aminei 21, Napoli 80131, Italy.2Università degli Studi della Campania Luigi Vanvitelli, piazza Miraglia 2, Napoli 80138, Italy.3MRI Research Center SUN-FISM, Università degli Studi della Campania Luigi Vanvitelli, piazza Miraglia 2, Napoli 80138, Italy.4Università degli Studi della Campania Luigi Vanvitelli, Dipartimento di Scienze Mediche, Chirurgiche, Neurologiche, Metaboliche e dell'Invecchiamento, piazza Miraglia 2, Napoli 80138, Italy.AbstractIn this manuscript we report the case of a 69-year-old female patient, who suffers from Parkinson's disease (PD) with a dilat

20

## Add title and abstract to article_dict

In [73]:
article_dict["title"].append(title)
article_dict["abstract"].append(abstract)
print(article_dict)

{'title': [["Dilated Virchow-Robin space and Parkinson's disease: A case report of combined MRI and diffusion tensor imaging.", 'Reader response: A predictive model to identify Parkinson disease from administrative claims data.', 'Author response: A predictive model to identify Parkinson disease from administrative claims data.', "Quantitative assessment of upper limb functional impairments in people with Parkinson's disease.", 'Angiotensin Type 1 Receptor Antagonists Protect Against Alpha-Synuclein-Induced Neuroinflammation and Dopaminergic Neuron Death.', 'The efficacy of repetitive transcranial magnetic stimulation for Parkinson disease patients with depression.', 'Beta-band oscillations in the supplementary motor cortex are modulated by levodopa and associated with functional activity in the basal ganglia.', "Editors' note: A predictive model to identify Parkinson disease from administrative claims data.", "Inflammatory bowel disease and risk of Parkinson's disease in medicare bene

## Save article_dict to json

In [74]:
import json

In [75]:
json = json.dumps(article_dict)
f = open("parkinsons.json","w")
f.write(json)
f.close()

53904

<br>
<br>
<br>