## PubMed Scraper

In [1]:
# scrape dependencies
import requests
import re
from bs4 import BeautifulSoup as bs

# data analysis dependencies
import pandas as pd
import numpy as np
import csv

# ipynb dependencies
from IPython.core.interactiveshell import InteractiveShell
# "all" makes every bare expression in a cell display its value, not only the
# last one; later cells rely on this to show results without print().
InteractiveShell.ast_node_interactivity = "all"
import warnings
# NOTE(review): this silences every warning for the rest of the session.
warnings.filterwarnings('ignore')

# viz dependencies
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

import datetime as dt
import time

In [2]:
# PubMed search-results page for the term "parkinsons disease"
url = "https://www.ncbi.nlm.nih.gov/pubmed/?term=parkinsons+disease"
print(url)

https://www.ncbi.nlm.nih.gov/pubmed/?term=parkinsons+disease


In [3]:
# Download the search-results page and parse it with BeautifulSoup.
# The timeout (seconds) keeps the cell from hanging forever if the server
# never answers; requests applies no timeout unless one is given.
response = requests.get(url, timeout=30)
# Fail here on a 4xx/5xx reply instead of silently parsing an error page,
# which would only surface later as an empty result list or an IndexError.
response.raise_for_status()
soup = bs(response.text, 'html.parser')

In [4]:
# Collect every <p class="title"> element of the results page; each one
# holds the title of one article.
journals = soup.find_all("p", class_="title")

In [5]:
# Count the title elements found on the first results page.
# (Each element is an article title, even though the variable name and the
# printed message call them "journals".)
journals_len = len(journals)
print(f"There are {journals_len} journals to scrape on the first page.")

There are 20 journals to scrape on the first page.


In [6]:
# Show the whitespace-stripped title text of every result.
# Iterating over the list itself (instead of a hard-coded range(0, 20)) keeps
# the cell working when the page returns more or fewer than 20 results.
# The bare expression is displayed because ast_node_interactivity is "all".
for journal in journals:
    journal.text.strip()

'Now is the Critical Time for Engineered Neuroplasticity.'

"Episodic memory decline in Parkinson' s disease: relation with white matter hyperintense lesions and influence of quantification method."

"Blunted Cardiovascular Responses to Exercise in Parkinson's disease Patients: Role of the Muscle Metaboreflex."

'Age- and disease-dependent increase of the mitophagy marker phospho-ubiquitin in normal aging and Lewy body disease.'

"Inflammation and fatigue in early, untreated Parkinson's Disease."

'Association of Retinal Neurodegeneration on Optical Coherence Tomography With Dementia: A Population-Based Study.'

'Effects of Deep Brain Stimulation on Eye Movements and Vestibular Function.'

"Virtual research visits and direct-to-consumer genetic testing in Parkinson's disease."

'[Status and development of the role as Parkinson Nurse in Germany - an online survey].'

"Beyond 35\xa0years of Parkinson's disease: a comprehensive clinical and instrumental assessment."

'TNF inhibits catecholamine production from induced sympathetic neuron-like cells in rheumatoid arthritis and osteoarthritis in vitro.'

'Structure-Activity Relationship of Cannabis Derived Compounds for the Treatment of Neuronal Activity-Related Diseases.'

"Pallidal Stimulation Modulates Pedunculopontine Nuclei in Parkinson's Disease."

'Attenuated dopaminergic neurodegeneration and motor dysfunction in hemiparkinsonian mice lacking the α5 nicotinic acetylcholine receptor subunit.'

'Lipophilic antioxidants in neurodegenerative diseases.'

"Experience of care for Parkinson's disease in European countries: A survey by the European Parkinson's Disease Association."

'Nutrition and Nutraceuticals in Neuroinflammatory and Brain Metabolic Stress: Implications for Neurodegenerative Disorders.'

"Retinal changes in Parkinson's disease and glaucoma."

'Complementary Medicine in Parkinson Disease: Once Again, Surprisingly Effective.'

"Progression Rate Associated Peripheral Blood Biomarkers of Parkinson's Disease."

In [7]:
# Base URL of a single article page; a PubMed ID is concatenated onto it later.
main_url = "https://www.ncbi.nlm.nih.gov/pubmed/"
print(main_url)

https://www.ncbi.nlm.nih.gov/pubmed/


In [8]:
# Grab the "Similar articles" link paragraph of every result; the PubMed ID
# is embedded in each link's from_uid query parameter.
links = soup.find_all("p", class_="links")
# NOTE(review): this removes the first result's link (ordinalpos=1), so one
# fewer link than titles is carried forward — confirm this is intended.
links.pop(0)

<p class="links nohighlight"><a href="/pubmed?linkname=pubmed_pubmed&amp;from_uid=29948920" ref="ordinalpos=1">Similar articles</a> </p>

In [9]:
# check how many result links remain to scrape
print(len(links))

19


In [10]:
# Keep the HTML of every link paragraph as a string so a regex can be run
# over it in a later cell.
links_all = []

# Iterate over the list itself rather than a hard-coded range(0, 19), so the
# cell still works when the page yields a different number of links.
for link in links:
    links_all.append(str(link))
    print(link)
    print("----------------------------------------------")

<p class="links nohighlight"><a href="/pubmed?linkname=pubmed_pubmed&amp;from_uid=29948903" ref="ordinalpos=2">Similar articles</a> </p>
----------------------------------------------
<p class="links nohighlight"><a href="/pubmed?linkname=pubmed_pubmed&amp;from_uid=29947592" ref="ordinalpos=3">Similar articles</a> </p>
----------------------------------------------
<p class="links nohighlight"><a href="/pubmed?linkname=pubmed_pubmed&amp;from_uid=29947276" ref="ordinalpos=4">Similar articles</a> </p>
----------------------------------------------
<p class="links nohighlight"><a href="/pubmed?linkname=pubmed_pubmed&amp;from_uid=29947088" ref="ordinalpos=5">Similar articles</a> </p>
----------------------------------------------
<p class="links nohighlight"><a href="/pubmed?linkname=pubmed_pubmed&amp;from_uid=29946702" ref="ordinalpos=6">Similar articles</a> </p>
----------------------------------------------
<p class="links nohighlight"><a href="/pubmed?linkname=pubmed_pubmed&amp;from_ui

In [11]:
# sanity check: how many links were stored, then the second one as raw HTML
len(links_all)
links_all[1]

19

'<p class="links nohighlight"><a href="/pubmed?linkname=pubmed_pubmed&amp;from_uid=29947592" ref="ordinalpos=3">Similar articles</a> </p>'

In [12]:
# Extract the PubMed ID from the HTML of each link.
# The pattern is anchored on "from_uid=" and accepts any number of digits, so
# IDs that are not exactly 8 digits long are captured in full and unrelated
# digit runs elsewhere in the markup are never mistaken for an ID.
# re.findall returns a list for every link, so pubmed_ids is a list of lists
# (one inner list per link).
pubmed_ids = [re.findall(r'from_uid=(\d+)', link_html) for link_html in links_all]

In [13]:
# inspect pubmed_ids: its length, its type, and its full contents
# (the contents show a list of lists, one inner list per link)
len(pubmed_ids)
type(pubmed_ids)
print(pubmed_ids)

19

list

[['29948903'], ['29947592'], ['29947276'], ['29947088'], ['29946702'], ['29946295'], ['29942542'], ['29944066'], ['29943201'], ['29941879'], ['29941830'], ['29941788'], ['29940207'], ['29940147'], ['29939446'], ['29938622'], ['29937099'], ['29937088'], ['29936662']]


In [14]:
# Flatten pubmed_ids (a list of lists) into one flat list of ID strings.
import itertools
pubmed_merged = list(itertools.chain(*pubmed_ids))

In [15]:
# look at the first element of pubmed_merged to confirm the list was flattened
pubmed_merged[0]

'29948903'

In [16]:
# Preview one complete article URL (base URL + first PubMed ID) before
# building the whole list.
print(f"{main_url}{pubmed_merged[0]}")

https://www.ncbi.nlm.nih.gov/pubmed/29948903


In [17]:
# Build the full article URL for every PubMed ID.
# Note: this rebinds links_all, which until now held the raw HTML of each link.
links_all = [main_url + str(pubmed_id) for pubmed_id in pubmed_merged]

## Main array of links to scrape:

Here we use Selenium to iterate through these links. Selenium will click on each link, then scrape the title and abstract on each page.

In [18]:
# List every article URL that will be visited.
for article_url in links_all:
    print(article_url)

https://www.ncbi.nlm.nih.gov/pubmed/29948903
https://www.ncbi.nlm.nih.gov/pubmed/29947592
https://www.ncbi.nlm.nih.gov/pubmed/29947276
https://www.ncbi.nlm.nih.gov/pubmed/29947088
https://www.ncbi.nlm.nih.gov/pubmed/29946702
https://www.ncbi.nlm.nih.gov/pubmed/29946295
https://www.ncbi.nlm.nih.gov/pubmed/29942542
https://www.ncbi.nlm.nih.gov/pubmed/29944066
https://www.ncbi.nlm.nih.gov/pubmed/29943201
https://www.ncbi.nlm.nih.gov/pubmed/29941879
https://www.ncbi.nlm.nih.gov/pubmed/29941830
https://www.ncbi.nlm.nih.gov/pubmed/29941788
https://www.ncbi.nlm.nih.gov/pubmed/29940207
https://www.ncbi.nlm.nih.gov/pubmed/29940147
https://www.ncbi.nlm.nih.gov/pubmed/29939446
https://www.ncbi.nlm.nih.gov/pubmed/29938622
https://www.ncbi.nlm.nih.gov/pubmed/29937099
https://www.ncbi.nlm.nih.gov/pubmed/29937088
https://www.ncbi.nlm.nih.gov/pubmed/29936662


## Regex Notes

In [19]:
# Regex
# Identifiers:
# \d any number
# \D anything but a number
# \s space
# \S anything but a space
# \w any word character (letter, digit or underscore)
# \W anything but a word character
# . any character, except for a newline
# \b a word boundary (zero-width position at the edge of a word, not the whitespace itself)
# \. a period

# Modifiers:
# {1,3} we're expecting 1-3 repetitions, e.g. \d{1,3}
# + Match 1 or more
# ? Match 0 or 1
# * Match 0 or more
# $ Match the end of a string
# ^ matching the beginning of a string
# | either or
# [] range or "variance" [A-Za-z] [1-5a-qA-Z]
# {x} expecting "x" amount

# White Space Characters: 
# \n new line
# \s space
# \t tab
# \e escape (not supported by Python's re module; use \x1b instead)
# \f form feed
# \r return

# DON'T FORGET! These metacharacters must be escaped with a backslash to match them literally:
# . + * ? [] $ ^ () {} | \