## PubMed Scraper

In [1]:
# scrape dependencies
import requests
import re
from bs4 import BeautifulSoup as bs

# data analysis dependencies
import pandas as pd
import numpy as np
import csv

# ipynb dependencies
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import warnings
warnings.filterwarnings('ignore')

# viz dependencies
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

import datetime as dt
import time

In [2]:
# PubMed search-results URL to scrape (query term: "parkinsons disease")
url = 'https://www.ncbi.nlm.nih.gov/pubmed/?term=parkinsons+disease'
# echo the URL so the cell output records which page was scraped
print(url)

https://www.ncbi.nlm.nih.gov/pubmed/?term=parkinsons+disease


In [3]:
# download the search-results page and parse it with BeautifulSoup
# a timeout keeps the cell from hanging forever if the server never answers
response = requests.get(url, timeout=30)
# fail loudly on an HTTP error instead of silently parsing an error page
response.raise_for_status()
soup = bs(response.text, 'html.parser')

In [4]:
# collect every <p class="title"> element on the results page
# NOTE: despite the variable name, these elements hold article titles (see the printed output below)
journals = soup.find_all("p", attrs={'class':'title'})

In [5]:
# count the title elements found on the first results page and report the number
journals_len = len(journals)
print(f"There are {journals_len} journals to scrape on the first page.")

There are 20 journals to scrape on the first page.


In [6]:
# show the stripped text of every scraped title element
# iterating over the list itself (instead of a fixed range(0, 20)) avoids an
# IndexError when the results page holds fewer than 20 entries
# the bare expression is displayed because ast_node_interactivity is set to "all"
for journal in journals:
    journal.text.strip()

"The factors associated with impulse control behaviors in Parkinson's disease: A 2-year longitudinal retrospective cohort study."

"Evaluation of Linguistic Markers of Word-Finding Difficulty and Cognition in Parkinson's Disease."

'Implementation and evaluation of Parkinson disease management in an outpatient clinical pharmacist-run neurology telephone clinic.'

'Treatment of psychotic symptoms in patients with Parkinson disease.'

'Pimavanserin (Nuplazid™) for the treatment of Parkinson disease psychosis: A review of the literature.'

'Drug-induced parkinsonism: A case report.'

'Evidence for the use of "medical marijuana" in psychiatric and neurologic disorders.'

'Interaction between Monoamine Oxidase B Inhibitors and Selective Serotonin Reuptake Inhibitors.'

'Comparative Study of MRI Biomarkers in the Substantia Nigra to Discriminate Idiopathic Parkinson Disease.'

"Visual hallucinations in dementia and Parkinson's disease: A qualitative exploration of patient and caregiver experiences."

'Therapy With Mesenchymal Stem Cells in Parkinson Disease: History and Perspectives.'

'Level of uric acid and uric acid/creatinine ratios in correlation with stage of Parkinson disease.'

'Improvement During Inpatient Rehabilitation Among Older Adults with Guillain-Barré Syndrome, Multiple Sclerosis, Parkinson Disease, and Stroke.'

"Longitudinal white matter microstructural change in Parkinson's disease."

"Cognitive impairment in Parkinson's disease: a report from a multidisciplinary symposium on unmet needs and future directions to maintain cognitive health."

'Closed- and Open-loop Deep Brain Stimulation: Methods, Challenges, Current and Future Aspects.'

"Alzheimer 's Disease: Possible Mechanisms Behind Neurohormesis Induced by Exposure to Low Doses of Ionizing Radiation."

"Apathy following Bilateral Deep Brain Stimulation of Subthalamic Nucleus in Parkinson's Disease: A Meta-Analysis."

'Rating Scales for Movement Disorders With Sleep Disturbances: A Narrative Review.'

"Osteoarthritis Increases Paresthestic and Akathisic Pain, Anxiety Case-ness, and Depression Severity in Patients With Parkinson's Disease."

### Set main url to concat with pubmed ids

In [7]:
# base URL that each PubMed id is concatenated onto to form an article link
main_url = 'https://www.ncbi.nlm.nih.gov/pubmed/'
print(main_url)

https://www.ncbi.nlm.nih.gov/pubmed/


In [8]:
# Module-level result lists. get_links() refills them in place on every call,
# so later cells can keep reading them directly.

# HTML (as strings) of every "links" paragraph that was kept
links_all = []

# regex matches per kept paragraph: a list of lists of id strings
pubmed_ids = []

# article URLs built as main_url + id
scrape_links = []


def get_links(main_url, page=None, id_pattern=r'\d{8}'):
    """Build article URLs from the <p class="links"> elements of a results page.

    Parameters
    ----------
    main_url : str
        Prefix that every extracted id is appended to.
    page : object with a ``find_all(name, attrs=...)`` method, optional
        Parsed page to read from. When omitted, the notebook-level ``soup``
        created in an earlier cell is used.
    id_pattern : str, optional
        Regular expression passed to ``re.findall`` for each paragraph's HTML.
        The default keeps the original behavior of matching runs of 8 digits.

    Returns
    -------
    list of str
        The module-level ``scrape_links`` list, holding one URL per extracted id.

    Notes
    -----
    ``links_all``, ``pubmed_ids`` and ``scrape_links`` are cleared and refilled
    on each call, so calling the function again does not duplicate entries.
    A page without any matching paragraph yields an empty list instead of an
    IndexError.
    """
    if page is None:
        # fall back to the results page parsed earlier in the notebook
        page = soup

    # The first match is skipped, exactly as the original code did.
    # NOTE(review): the saved output shows 20 titles but only 19 links, starting
    # at ordinalpos=2, so this appears to drop the first article -- confirm intent.
    links = list(page.find_all("p", attrs={'class': 'links'}))[1:]

    # reset the shared lists in place so repeated calls stay idempotent
    links_all.clear()
    pubmed_ids.clear()
    scrape_links.clear()

    # report how many paragraphs will be processed
    print(f"There are {len(links)} articles to scrape.")
    print("----------------------------------------------")

    # keep each paragraph's HTML as a string and echo it
    for link in links:
        links_all.append(str(link))
        print(link)
        print("----------------------------------------------")

    # one list of regex matches per paragraph
    pubmed_ids.extend(re.findall(id_pattern, html) for html in links_all)
    print(pubmed_ids)

    # flatten the per-paragraph matches and prefix each id with main_url
    scrape_links.extend(main_url + str(pmid) for ids in pubmed_ids for pmid in ids)

    # echo the final URLs
    for link in scrape_links:
        print(link)

    return scrape_links

In [9]:
# build the article URLs from the results page; the returned list is also shown as the cell output
get_links(main_url)

There are 19 articles to scrape.
----------------------------------------------
<p class="links nohighlight"><a href="/pubmed?linkname=pubmed_pubmed&amp;from_uid=29955824" ref="ordinalpos=2">Similar articles</a> </p>
----------------------------------------------
<p class="links nohighlight"><a href="/pubmed?linkname=pubmed_pubmed&amp;from_uid=29955562" ref="ordinalpos=3">Similar articles</a> </p>
----------------------------------------------
<p class="links nohighlight"><a href="/pubmed?linkname=pubmed_pubmed&amp;from_uid=29955532" ref="ordinalpos=4">Similar articles</a> </p>
----------------------------------------------
<p class="links nohighlight"><a href="/pubmed?linkname=pubmed_pubmed&amp;from_uid=29955528" ref="ordinalpos=5">Similar articles</a> </p>
----------------------------------------------
<p class="links nohighlight"><a href="/pubmed?linkname=pubmed_pubmed&amp;from_uid=29955500" ref="ordinalpos=6">Similar articles</a> </p>
----------------------------------------------


['https://www.ncbi.nlm.nih.gov/pubmed/29955824',
 'https://www.ncbi.nlm.nih.gov/pubmed/29955562',
 'https://www.ncbi.nlm.nih.gov/pubmed/29955532',
 'https://www.ncbi.nlm.nih.gov/pubmed/29955528',
 'https://www.ncbi.nlm.nih.gov/pubmed/29955500',
 'https://www.ncbi.nlm.nih.gov/pubmed/29955495',
 'https://www.ncbi.nlm.nih.gov/pubmed/29955193',
 'https://www.ncbi.nlm.nih.gov/pubmed/29954816',
 'https://www.ncbi.nlm.nih.gov/pubmed/29953689',
 'https://www.ncbi.nlm.nih.gov/pubmed/29953040',
 'https://www.ncbi.nlm.nih.gov/pubmed/29952939',
 'https://www.ncbi.nlm.nih.gov/pubmed/29952780',
 'https://www.ncbi.nlm.nih.gov/pubmed/29952102',
 'https://www.ncbi.nlm.nih.gov/pubmed/29951580',
 'https://www.ncbi.nlm.nih.gov/pubmed/29951448',
 'https://www.ncbi.nlm.nih.gov/pubmed/29951441',
 'https://www.ncbi.nlm.nih.gov/pubmed/29951186',
 'https://www.ncbi.nlm.nih.gov/pubmed/29951032',
 'https://www.ncbi.nlm.nih.gov/pubmed/29951029']

## Main array of links to scrape:

Here we use Selenium to iterate through these links. Selenium will click on each link, then scrape the title and abstract on each page.

In [10]:
# sanity check: list every article URL collected so far, one per line
for article_url in scrape_links:
    print(article_url)

https://www.ncbi.nlm.nih.gov/pubmed/29955824
https://www.ncbi.nlm.nih.gov/pubmed/29955562
https://www.ncbi.nlm.nih.gov/pubmed/29955532
https://www.ncbi.nlm.nih.gov/pubmed/29955528
https://www.ncbi.nlm.nih.gov/pubmed/29955500
https://www.ncbi.nlm.nih.gov/pubmed/29955495
https://www.ncbi.nlm.nih.gov/pubmed/29955193
https://www.ncbi.nlm.nih.gov/pubmed/29954816
https://www.ncbi.nlm.nih.gov/pubmed/29953689
https://www.ncbi.nlm.nih.gov/pubmed/29953040
https://www.ncbi.nlm.nih.gov/pubmed/29952939
https://www.ncbi.nlm.nih.gov/pubmed/29952780
https://www.ncbi.nlm.nih.gov/pubmed/29952102
https://www.ncbi.nlm.nih.gov/pubmed/29951580
https://www.ncbi.nlm.nih.gov/pubmed/29951448
https://www.ncbi.nlm.nih.gov/pubmed/29951441
https://www.ncbi.nlm.nih.gov/pubmed/29951186
https://www.ncbi.nlm.nih.gov/pubmed/29951032
https://www.ncbi.nlm.nih.gov/pubmed/29951029


### OLD SCRIPT - DONT RUN ###
***The following is nested inside the function above***

In [11]:
# # scrape journal links, delete first link
# links = soup.find_all("p",attrs={'class':'links'})
# links.pop(0)

# # testing to see how my links / journals to scrape
# print(len(links))

# # set empty array to append all links to
# links_all = []

# # loop through links to convert to string
# for i in range (0,19):
#     links_all.append(str(links[i]))
#     print(links[i])
#     print("----------------------------------------------")

# # slice through links_all to test
# len(links_all)
# links_all[1]

# # create empty list to append pubmed_ids to
# pubmed_ids = []

# # loop through links all and use regex to grab the id numbers
# for i in range (len(links_all)):
#     pubmed_ids.append(re.findall(r'\d{8}',links_all[i]))

# # print out info for pubmed_ids
# len(pubmed_ids)
# type(pubmed_ids)
# print(pubmed_ids)

# # use itertools to transform pubmed ids from an array withn an array into one list
# import itertools
# pubmed_merged = list(itertools.chain.from_iterable(pubmed_ids))

# # slice through pubmed_merged to see what itertools did
# pubmed_merged[0]

# # concat main_url with a slice of pubmed_merged before we loop
# print(main_url + str(pubmed_merged[0]))

# links_all = []

# for i in range (len(pubmed_merged)):
#     links_all.append(main_url + str(pubmed_merged[i]))

## END OF OLD SCRIPT
<br>
<br>

## Regex Notes

In [12]:
# Regex
# Identifiers:
# \d any number
# \D anything but a number
# \s space
# \S anything but a space
# \w any word character (letter, digit or underscore)
# \W anything but a word character
# . any character, except for a newline
# \b a word boundary (the zero-width position between a word character and a non-word character)
# \. a period

# Modifiers:
# {1,3} we're expecting 1-3 repetitions, e.g. \d{1,3}
# + Match 1 or more
# ? Match 0 or 1
# * Match 0 or more
# $ Match the end of a string
# ^ matching the beginning of a string
# | either or
# [] range or "variance" [A-Za-z] [1-5a-qA-Z]
# {x} expecting "x" amount

# White Space Characters: 
# \n new line
# \s space
# \t tab
# \e escape
# \f form feed
# \r return

# DONT FORGET!:
# . + * [] $ ^ () {} | \

## Selenium
**Web Browser Automation**

In [13]:
from splinter import Browser
from selenium import webdriver

In [14]:
# location of the chromedriver executable; it is expected in the current directory
# (per the original note, the executable is not necessary on Macs)
# NOTE(review): no visible cell passes this dict to Browser() -- confirm whether it is still needed
executable_path = {'executable_path': 'chromedriver'}

In [15]:
# single hard-coded article URL used to prototype the per-article scrape
# (the same URL also appears in the scrape_links output above)
article_one = "https://www.ncbi.nlm.nih.gov/pubmed/29953040"

In [16]:
# open a visible (non-headless) Chrome window through splinter
browser = Browser('chrome', headless=False)
# NOTE(review): this is read before the notebook's browser.visit() call (a later
# cell), so it does not hold the article page's HTML -- confirm it is needed
html = browser.html
# the article page itself is downloaded with requests and parsed with BeautifulSoup
# a timeout keeps the cell from hanging forever if the server never answers
response2 = requests.get(article_one, timeout=30)
# fail loudly on an HTTP error instead of silently parsing an error page
response2.raise_for_status()
soup2 = bs(response2.text, 'html.parser')

In [17]:
# load the test article in the automated browser window
browser.visit(article_one)

In [18]:
# stripped text of the first <div class="rprt_all">; per the original note this
# holds both the article title and the abstract
# NOTE(review): soup2.find() returns None when no such div exists, which would
# raise AttributeError on .text here
test5 = soup2.find("div", attrs={'class': 'rprt_all'}).text.strip()

In [19]:
# collect every <h1> on the article page (the original note says there are two);
# the article title is the one at index 1, which the next cell reads
title_one = soup2.find_all('h1')

## Article Title

In [20]:
# second <h1> on the page is the article title; strip surrounding whitespace
# NOTE(review): positional index -- presumably breaks if the page layout changes
article_one_title = title_one[1].text.strip()
# display the title as the cell output
article_one_title

'Therapy With Mesenchymal Stem Cells in Parkinson Disease: History and Perspectives.'

In [21]:
# collect every <p> element on the article page; the abstract paragraphs are
# picked out of this list by index in the cells below
text_main = soup2.find_all('p')

## Abstract

In [22]:
# for this article, the <p> at index 9 holds the first abstract paragraph
# NOTE(review): positional index taken from this one page -- verify before reusing on other articles
abstract = text_main[9].text.strip()
# display the text as the cell output
abstract

'Parkinson disease (PD) is a neurodegenerative disorder affecting the basal nuclei, causing motor and cognitive disorders. Bearing in mind that standard treatments are ineffective in delaying the disease progression, alternative treatments capable of eliminating symptoms and reversing the clinical condition have been sought. Possible alternative treatments include cell therapy, especially with the use of mesenchymal stem cells (MSC).'

## Review Summary

In [23]:
# for this article, the <p> at index 10 holds the review-summary paragraph
# NOTE(review): positional index taken from this one page -- verify before reusing on other articles
review_summary = text_main[10].text.strip()
# display the text as the cell output
review_summary

'MSC are adult stem cells which have demonstrated remarkable therapeutic power in parkinsonian animals due to their differentiation competence, migratory capacity and the production of bioactive molecules. This review aims to analyze the main studies involving MSC and PD in more than a decade of studies, addressing their different methodologies and common characteristics, as well as suggesting perspectives on the application of MSC in PD.'

## Conclusion

In [24]:
# for this article, the <p> at index 11 holds the conclusion paragraph
# NOTE(review): positional index taken from this one page -- verify before reusing on other articles
conclusion = text_main[11].text.strip()
# display the text as the cell output
conclusion

'The results of MSC therapy in animal models and some clinical trials suggest that such cellular therapy may slow the progression of PD and promote neuroregeneration. However, further research is needed to address the limitations of an eventual clinical application.'