## PubMed Scraper

In [1]:
# scrape dependencies
import requests
import re
from bs4 import BeautifulSoup as bs

# data analysis dependencies
import pandas as pd
import numpy as np
import csv

# ipynb dependencies
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import warnings
warnings.filterwarnings('ignore')

# viz dependencies
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

import datetime as dt
import time

In [2]:
# search-results page to scrape (PubMed query term: parkinsons+disease)
url = "https://www.ncbi.nlm.nih.gov/pubmed/?term=parkinsons+disease"
print(url)

https://www.ncbi.nlm.nih.gov/pubmed/?term=parkinsons+disease


In [3]:
# Fetch the search-results page and parse it with BeautifulSoup.
# timeout: requests waits indefinitely by default, so a server that never
# answers would hang the kernel.
response = requests.get(url, timeout=30)
# Stop here on an HTTP error status instead of parsing an error page and
# getting empty results in the cells that follow.
response.raise_for_status()
soup = bs(response.text, 'html.parser')

In [4]:
# collect every <p class="title"> element; these hold the article titles
journals = soup.find_all("p", class_="title")

In [5]:
# report how many title elements were found on the results page
journals_len = len(journals)
summary = f"There are {journals_len} journals to scrape on the first page."
print(summary)

There are 20 journals to scrape on the first page.


In [6]:
# Show the stripped title text of every result.
# Iterating over `journals` itself (instead of a hard-coded range(0, 20))
# avoids an IndexError when fewer than 20 results come back and does not
# silently skip results when there are more.
# The bare expression is echoed only because ast_node_interactivity is set to
# "all" in the setup cell.
for journal in journals:
    journal.text.strip()

"Visual hallucinations in dementia and Parkinson's disease: A qualitative exploration of patient and caregiver experiences."

'Therapy With Mesenchymal Stem Cells in Parkinson Disease: History and Perspectives.'

'Level of uric acid and uric acid/creatinine ratios in correlation with stage of Parkinson disease.'

'Improvement During Inpatient Rehabilitation Among Older Adults with Guillain-Barré Syndrome, Multiple Sclerosis, Parkinson Disease, and Stroke.'

"Longitudinal white matter microstructural change in Parkinson's disease."

"Cognitive impairment in Parkinson's disease: a report from a multidisciplinary symposium on unmet needs and future directions to maintain cognitive health."

'Closed- and Open-loop Deep Brain Stimulation: Methods, Challenges, Current and Future Aspects.'

"Alzheimer 's Disease: Possible Mechanisms Behind Neurohormesis Induced by Exposure to Low Doses of Ionizing Radiation."

"Apathy following Bilateral Deep Brain Stimulation of Subthalamic Nucleus in Parkinson's Disease: A Meta-Analysis."

'Rating Scales for Movement Disorders With Sleep Disturbances: A Narrative Review.'

"Osteoarthritis Increases Paresthestic and Akathisic Pain, Anxiety Case-ness, and Depression Severity in Patients With Parkinson's Disease."

'Now is the Critical Time for Engineered Neuroplasticity.'

"Episodic memory decline in Parkinson' s disease: relation with white matter hyperintense lesions and influence of quantification method."

"Blunted Cardiovascular Responses to Exercise in Parkinson's disease Patients: Role of the Muscle Metaboreflex."

'Age- and disease-dependent increase of the mitophagy marker phospho-ubiquitin in normal aging and Lewy body disease.'

"Inflammation and fatigue in early, untreated Parkinson's Disease."

'Association of Retinal Neurodegeneration on Optical Coherence Tomography With Dementia: A Population-Based Study.'

'Effects of Deep Brain Stimulation on Eye Movements and Vestibular Function.'

"Virtual research visits and direct-to-consumer genetic testing in Parkinson's disease."

'[Status and development of the role as Parkinson Nurse in Germany - an online survey].'

### Set main url to concat with pubmed ids

In [7]:
# base URL; a PubMed id is concatenated onto it to form each article link
main_url = "https://www.ncbi.nlm.nih.gov/pubmed/"
print(main_url)

https://www.ncbi.nlm.nih.gov/pubmed/


In [8]:
# scrape the <p class="links"> element of each result, then drop the first one
# NOTE(review): in the saved output the element removed by pop(0) is the
# "Similar articles" link with from_uid=29953689 / ordinalpos=1, so it looks
# like the first search result is excluded from everything downstream —
# confirm this is intended.
links = soup.find_all("p",attrs={'class':'links'})
links.pop(0)

<p class="links nohighlight"><a href="/pubmed?linkname=pubmed_pubmed&amp;from_uid=29953689" ref="ordinalpos=1">Similar articles</a> </p>

In [9]:
# sanity check: how many link elements are left to process
remaining_links = len(links)
print(remaining_links)

19


In [10]:
# string form of every link element, consumed by the regex step that follows
links_all = []

# Iterate over `links` directly: the previous hard-coded range(0, 19) raised
# IndexError with fewer than 19 links and ignored any beyond the 19th.
for link in links:
    links_all.append(str(link))
    print(link)
    print("----------------------------------------------")

<p class="links nohighlight"><a href="/pubmed?linkname=pubmed_pubmed&amp;from_uid=29953040" ref="ordinalpos=2">Similar articles</a> </p>
----------------------------------------------
<p class="links nohighlight"><a href="/pubmed?linkname=pubmed_pubmed&amp;from_uid=29952939" ref="ordinalpos=3">Similar articles</a> </p>
----------------------------------------------
<p class="links nohighlight"><a href="/pubmed?linkname=pubmed_pubmed&amp;from_uid=29952780" ref="ordinalpos=4">Similar articles</a> </p>
----------------------------------------------
<p class="links nohighlight"><a href="/pubmed?linkname=pubmed_pubmed&amp;from_uid=29952102" ref="ordinalpos=5">Similar articles</a> </p>
----------------------------------------------
<p class="links nohighlight"><a href="/pubmed?linkname=pubmed_pubmed&amp;from_uid=29951580" ref="ordinalpos=6">Similar articles</a> </p>
----------------------------------------------
<p class="links nohighlight"><a href="/pubmed?linkname=pubmed_pubmed&amp;from_ui

In [11]:
# spot-check links_all: element count, then the second entry
len(links_all)
links_all[1]

19

'<p class="links nohighlight"><a href="/pubmed?linkname=pubmed_pubmed&amp;from_uid=29952939" ref="ordinalpos=3">Similar articles</a> </p>'

In [12]:
# Extract the PubMed id from each link string.
# The id is captured from the `from_uid=` query parameter instead of being
# matched as "any 8 digits": \d{8} misses ids shorter than 8 digits and
# truncates longer ones.
# re.findall returns the list of captured groups, so every entry is still a
# list and pubmed_ids keeps its list-of-lists shape.
pubmed_ids = [re.findall(r'from_uid=(\d+)', link_html) for link_html in links_all]

In [13]:
# inspect pubmed_ids: length, container type, and full contents
len(pubmed_ids)
type(pubmed_ids)
print(pubmed_ids)

19

list

[['29953040'], ['29952939'], ['29952780'], ['29952102'], ['29951580'], ['29951448'], ['29951441'], ['29951186'], ['29951032'], ['29951029'], ['29948920'], ['29948903'], ['29947592'], ['29947276'], ['29947088'], ['29946702'], ['29946295'], ['29942542'], ['29944066']]


In [14]:
# flatten the list of per-link id lists into one flat list of id strings
import itertools
pubmed_merged = list(itertools.chain(*pubmed_ids))

In [15]:
# look at the first element of pubmed_merged to confirm the flattening worked
pubmed_merged[0]

'29953040'

In [16]:
# preview one full article URL before building all of them
first_article_url = main_url + str(pubmed_merged[0])
print(first_article_url)

https://www.ncbi.nlm.nih.gov/pubmed/29953040


In [17]:
# rebuild links_all as full article URLs (base URL + each PubMed id);
# this replaces the HTML strings the name held earlier
links_all = [main_url + str(pmid) for pmid in pubmed_merged]

## Main array of links to scrape:

Here we use Selenium to iterate through these links. Selenium will open each link and then scrape the title and abstract on each page.

In [18]:
# list every article URL, one per line
for article_url in links_all:
    print(article_url)

https://www.ncbi.nlm.nih.gov/pubmed/29953040
https://www.ncbi.nlm.nih.gov/pubmed/29952939
https://www.ncbi.nlm.nih.gov/pubmed/29952780
https://www.ncbi.nlm.nih.gov/pubmed/29952102
https://www.ncbi.nlm.nih.gov/pubmed/29951580
https://www.ncbi.nlm.nih.gov/pubmed/29951448
https://www.ncbi.nlm.nih.gov/pubmed/29951441
https://www.ncbi.nlm.nih.gov/pubmed/29951186
https://www.ncbi.nlm.nih.gov/pubmed/29951032
https://www.ncbi.nlm.nih.gov/pubmed/29951029
https://www.ncbi.nlm.nih.gov/pubmed/29948920
https://www.ncbi.nlm.nih.gov/pubmed/29948903
https://www.ncbi.nlm.nih.gov/pubmed/29947592
https://www.ncbi.nlm.nih.gov/pubmed/29947276
https://www.ncbi.nlm.nih.gov/pubmed/29947088
https://www.ncbi.nlm.nih.gov/pubmed/29946702
https://www.ncbi.nlm.nih.gov/pubmed/29946295
https://www.ncbi.nlm.nih.gov/pubmed/29942542
https://www.ncbi.nlm.nih.gov/pubmed/29944066


## Regex Notes

In [19]:
# Regex
# Identifiers:
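# \d any digit
# \D anything but a digit
# \s any whitespace character
# \S anything but a whitespace character
# \w any word character (letter, digit or underscore)
# \W anything but a word character
# . any character, except for a newline
# \b a word boundary (the zero-width position at the edge of a word)
# \. a literal period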

# Modifiers:
# {1,3} we're expecting 1-3 repetitions, e.g. \d{1,3}
# + Match 1 or more
# ? Match 0 or 1
# * Match 0 or more
# $ Match the end of a string
# ^ Match the beginning of a string
# | either or
# [] range or "variance" [A-Za-z] [1-5a-qA-Z]
# {x} expecting exactly "x" repetitions

# White Space Characters: 
# \n new line
# \s space
# \t tab
# \e escape (not recognised by Python's re module; use \x1b instead)
# \f form feed
# \r return

# DONT FORGET!:
# . + * [] $ ^ () {} | \

## Selenium
**Web Browser Automation**

In [20]:
from splinter import Browser
from selenium import webdriver

In [21]:
# chromedriver location for the browser automation below; the driver binary is
# expected to sit in the current directory
# (per the original note, this is not necessary on Macs)
executable_path = dict(executable_path='chromedriver')

In [22]:
# single hard-coded article URL used to prototype the per-article scrape
article_one = 'https://www.ncbi.nlm.nih.gov/pubmed/29953040'

In [23]:
# Open a visible (non-headless) Chrome window through splinter.
browser = Browser('chrome', headless=False)
# NOTE(review): no page has been visited in this cell yet, so this holds
# whatever the freshly opened browser shows — confirm it is still needed.
html = browser.html
# The article page that gets parsed is fetched with requests, not taken from
# the browser. timeout keeps a stalled connection from hanging the kernel;
# raise_for_status stops here on an HTTP error instead of parsing an error page.
response2 = requests.get(article_one, timeout=30)
response2.raise_for_status()
soup2 = bs(response2.text, 'html.parser')

In [24]:
# point the automated browser at the test article
browser.visit(article_one)

In [25]:
# stripped text of the <div class="rprt_all"> container, which carries both
# the article title and the abstract
report_div = soup2.find("div", class_="rprt_all")
test5 = report_div.text.strip()

In [26]:
# collect every <h1> on the page; per the original note there are two of them
# and the one at index 0 is not the article title (index 1 is read next)
title_one = soup2.find_all('h1')

## Article Title

In [27]:
# the article title is taken from the second <h1> (index 1)
article_one_title = title_one[1].text.strip()
article_one_title

'Therapy With Mesenchymal Stem Cells in Parkinson Disease: History and Perspectives.'

In [28]:
# collect every <p> tag on the page; the abstract paragraphs are picked out of
# this list by position in the cells that follow
text_main = soup2.find_all('p')

## Abstract

In [29]:
# first abstract paragraph
# NOTE(review): index 9 is a position among all <p> tags of this one page —
# confirm it holds for other articles before looping over links_all
abstract = text_main[9].text.strip()
abstract

'Parkinson disease (PD) is a neurodegenerative disorder affecting the basal nuclei, causing motor and cognitive disorders. Bearing in mind that standard treatments are ineffective in delaying the disease progression, alternative treatments capable of eliminating symptoms and reversing the clinical condition have been sought. Possible alternative treatments include cell therapy, especially with the use of mesenchymal stem cells (MSC).'

## Review Summary

In [30]:
# second abstract paragraph (shown under "Review Summary")
# NOTE(review): index 10 is a position among all <p> tags of this one page —
# confirm it holds for other articles
review_summary = text_main[10].text.strip()
review_summary

'MSC are adult stem cells which have demonstrated remarkable therapeutic power in parkinsonian animals due to their differentiation competence, migratory capacity and the production of bioactive molecules. This review aims to analyze the main studies involving MSC and PD in more than a decade of studies, addressing their different methodologies and common characteristics, as well as suggesting perspectives on the application of MSC in PD.'

## Conclusion

In [31]:
# third abstract paragraph (shown under "Conclusion")
# NOTE(review): index 11 is a position among all <p> tags of this one page —
# confirm it holds for other articles
conclusion = text_main[11].text.strip()
conclusion

'The results of MSC therapy in animal models and some clinical trials suggest that such cellular therapy may slow the progression of PD and promote neuroregeneration. However, further research is needed to address the limitations of an eventual clinical application.'