## Pub Med Scraper

In [1]:
# scrape dependencies
import requests
import re
from bs4 import BeautifulSoup as bs

# data analysis dependencies
import pandas as pd
import numpy as np
import csv

# ipynb dependencies
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import warnings
warnings.filterwarnings('ignore')

# viz dependencies
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

import datetime as dt
import time

In [2]:
# set the url to scrape
url = 'https://www.ncbi.nlm.nih.gov/pubmed/?term=parkinsons+disease'
print(url)

https://www.ncbi.nlm.nih.gov/pubmed/?term=parkinsons+disease


In [3]:
# set up beautiful soup to scrape
response = requests.get(url)
soup = bs(response.text, 'html.parser')

In [4]:
# lets scrape the article titles
journals = soup.find_all("p", attrs={'class':'title'})

In [5]:
# searching for the journal titles
journals_len = len(journals)
print(f"There are {journals_len} journals to scrape on the first page.")

There are 20 journals to scrape on the first page.


In [6]:
# loop through journals to print titles
for i in range(0,20):
    journals[i].text.strip()

"Trx-1 ameliorates learning and memory deficits in MPTP-induced Parkinson's disease model in mice."

'Lipid vesicles affect the aggregation of 4-hydroxy-2-nonenal-modified α-synuclein oligomers.'

"MANF protects dopamine neurons and locomotion defects from a human α-synuclein induced Parkinson's disease model in C. elegans by regulating ER stress and autophagy pathways."

'Long-term evolution of patient-reported outcome measures in spinocerebellar ataxias.'

'Effects of deep brain stimulation on rest tremor progression in early stage Parkinson disease.'

"Patients' shifting goals for deep brain stimulation and informed consent."

'Glycosaminoglycans have variable effects on α-synuclein aggregation and differentially affect the activities of the resulting amyloid fibrils.'

'Association between attention-deficit/hyperactivity disorder and amyotrophic lateral sclerosis.'

"The factors associated with impulse control behaviors in Parkinson's disease: A 2-year longitudinal retrospective cohort study."

"Evaluation of Linguistic Markers of Word-Finding Difficulty and Cognition in Parkinson's Disease."

'Implementation and evaluation of Parkinson disease management in an outpatient clinical pharmacist-run neurology telephone clinic.'

'Treatment of psychotic symptoms in patients with Parkinson disease.'

'Pimavanserin (Nuplazid™) for the treatment of Parkinson disease psychosis: A review of the literature.'

'Drug-induced parkinsonism: A case report.'

'Evidence for the use of "medical marijuana" in psychiatric and neurologic disorders.'

'Interaction between Monoamine Oxidase B Inhibitors and Selective Serotonin Reuptake Inhibitors.'

'Comparative Study of MRI Biomarkers in the Substantia Nigra to Discriminate Idiopathic Parkinson Disease.'

"Visual hallucinations in dementia and Parkinson's disease: A qualitative exploration of patient and caregiver experiences."

'Therapy With Mesenchymal Stem Cells in Parkinson Disease: History and Perspectives.'

'Level of uric acid and uric acid/creatinine ratios in correlation with stage of Parkinson disease.'

## Set main url to concat with pubmed ids

In [7]:
# set the main url that we will concatanate with the pubmed id
main_url = 'https://www.ncbi.nlm.nih.gov/pubmed/'
print(main_url)

https://www.ncbi.nlm.nih.gov/pubmed/


## Function to create array of links to scrape

In [8]:
# set empty links_all list to append to 
links_all = []

# set pubmed ids list to append to
pubmed_ids = []

# set empty list to append scrape_links to
scrape_links = []

# function to get links
def get_links(main_url):
    
    # use bs to scarpe p tags with class - title
    links = soup.find_all("p",attrs={'class':'title'})
      
    # testing to see how my links / journals to scrape
    articles_to_scrape = len(links)
    print(f"There are {articles_to_scrape} articles to scrape.")
    print("----------------------------------------------")
    
    # loop through links to convert to string
    for i in range (len(links)):
        links_all.append(str(links[i]))
        print(links[i])
        print("----------------------------------------------")
        
    # slice through links_all to test
    len(links_all)
    links_all[1]
    
    # loop through links all and use regex to grab the id numbers
    for i in range (len(links_all)):
        pubmed_ids.append(re.findall(r'\d{8}',links_all[i]))
    
    # print out info for pubmed_ids
    len(pubmed_ids)
    type(pubmed_ids)
    print(pubmed_ids)
    print("----------------------------------------------")
    
    # use itertools to transform pubmed ids from an array withn an array into one list
    import itertools
    pubmed_merged = list(itertools.chain.from_iterable(pubmed_ids))
    
    # slice through pubmed_merged to see what itertools did
    pubmed_merged[0]
    
    # concat main_url with a slice of pubmed_merged before we loop
    print(main_url + str(pubmed_merged[0]))
    
    # append merged links to links_all
    for i in range (len(pubmed_merged)):
        scrape_links.append(main_url + str(pubmed_merged[i]))

In [9]:
# RUN FUNCTION
get_links(main_url)

There are 20 articles to scrape.
----------------------------------------------
<p class="title" xmlns:mml="http://www.w3.org/1998/Math/MathML"><a href="/pubmed/29960099" ref="ordinalpos=1&amp;ncbi_uid=29960099&amp;link_uid=29960099&amp;linksrc=docsum_title">Trx-1 ameliorates learning and memory deficits in MPTP-induced <b>Parkinson</b>'s <b>disease</b> model in mice.</a></p>
----------------------------------------------
<p class="title" xmlns:mml="http://www.w3.org/1998/Math/MathML"><a href="/pubmed/29960040" ref="ordinalpos=2&amp;ncbi_uid=29960040&amp;link_uid=29960040&amp;linksrc=docsum_title">Lipid vesicles affect the aggregation of 4-hydroxy-2-nonenal-modified α-synuclein oligomers.</a></p>
----------------------------------------------
<p class="title" xmlns:mml="http://www.w3.org/1998/Math/MathML"><a href="/pubmed/29959908" ref="ordinalpos=3&amp;ncbi_uid=29959908&amp;link_uid=29959908&amp;linksrc=docsum_title">MANF protects dopamine neurons and locomotion defects from a human α

There are duplicates in our **scrape_links** array. Use `list` to delete the duplicates.

In [10]:
# delete duplicates in scrape_links and assign to new variable scrape_links_final
scrape_links_final = list(set(scrape_links))
len(scrape_links_final)
scrape_links_final

20

['https://www.ncbi.nlm.nih.gov/pubmed/29955532',
 'https://www.ncbi.nlm.nih.gov/pubmed/29959266',
 'https://www.ncbi.nlm.nih.gov/pubmed/29953689',
 'https://www.ncbi.nlm.nih.gov/pubmed/29955500',
 'https://www.ncbi.nlm.nih.gov/pubmed/29959262',
 'https://www.ncbi.nlm.nih.gov/pubmed/29955562',
 'https://www.ncbi.nlm.nih.gov/pubmed/29955528',
 'https://www.ncbi.nlm.nih.gov/pubmed/29953040',
 'https://www.ncbi.nlm.nih.gov/pubmed/29959225',
 'https://www.ncbi.nlm.nih.gov/pubmed/29952939',
 'https://www.ncbi.nlm.nih.gov/pubmed/29955495',
 'https://www.ncbi.nlm.nih.gov/pubmed/29954816',
 'https://www.ncbi.nlm.nih.gov/pubmed/29959908',
 'https://www.ncbi.nlm.nih.gov/pubmed/29959555',
 'https://www.ncbi.nlm.nih.gov/pubmed/29960099',
 'https://www.ncbi.nlm.nih.gov/pubmed/29955824',
 'https://www.ncbi.nlm.nih.gov/pubmed/29956879',
 'https://www.ncbi.nlm.nih.gov/pubmed/29955193',
 'https://www.ncbi.nlm.nih.gov/pubmed/29960040',
 'https://www.ncbi.nlm.nih.gov/pubmed/29958655']

## Main array of links to scrape:

Here we use selenium to iterate through these links. Seleium will click on each link then scrape the title and abstracts on each page. 

In [11]:
# testing scrape_links
for i in scrape_links_final:
    print(i)

https://www.ncbi.nlm.nih.gov/pubmed/29955532
https://www.ncbi.nlm.nih.gov/pubmed/29959266
https://www.ncbi.nlm.nih.gov/pubmed/29953689
https://www.ncbi.nlm.nih.gov/pubmed/29955500
https://www.ncbi.nlm.nih.gov/pubmed/29959262
https://www.ncbi.nlm.nih.gov/pubmed/29955562
https://www.ncbi.nlm.nih.gov/pubmed/29955528
https://www.ncbi.nlm.nih.gov/pubmed/29953040
https://www.ncbi.nlm.nih.gov/pubmed/29959225
https://www.ncbi.nlm.nih.gov/pubmed/29952939
https://www.ncbi.nlm.nih.gov/pubmed/29955495
https://www.ncbi.nlm.nih.gov/pubmed/29954816
https://www.ncbi.nlm.nih.gov/pubmed/29959908
https://www.ncbi.nlm.nih.gov/pubmed/29959555
https://www.ncbi.nlm.nih.gov/pubmed/29960099
https://www.ncbi.nlm.nih.gov/pubmed/29955824
https://www.ncbi.nlm.nih.gov/pubmed/29956879
https://www.ncbi.nlm.nih.gov/pubmed/29955193
https://www.ncbi.nlm.nih.gov/pubmed/29960040
https://www.ncbi.nlm.nih.gov/pubmed/29958655


## OLD SCRIPT - DONT RUN 
***The following is nested inside the function above***

In [None]:
# # scrape journal links, delete first link
# links = soup.find_all("p",attrs={'class':'links'})
# links.pop(0)

# # testing to see how my links / journals to scrape
# print(len(links))

# # set empty array to append all links to
# links_all = []

# # loop through links to convert to string
# for i in range (0,19):
#     links_all.append(str(links[i]))
#     print(links[i])
#     print("----------------------------------------------")

# # slice through links_all to test
# len(links_all)
# links_all[1]

# # create empty list to append pubmed_ids to
# pubmed_ids = []

# # loop through links all and use regex to grab the id numbers
# for i in range (len(links_all)):
#     pubmed_ids.append(re.findall(r'\d{8}',links_all[i]))

# # print out info for pubmed_ids
# len(pubmed_ids)
# type(pubmed_ids)
# print(pubmed_ids)

# # use itertools to transform pubmed ids from an array withn an array into one list
# import itertools
# pubmed_merged = list(itertools.chain.from_iterable(pubmed_ids))

# # slice through pubmed_merged to see what itertools did
# pubmed_merged[0]

# # concat main_url with a slice of pubmed_merged before we loop
# print(main_url + str(pubmed_merged[0]))

# links_all = []

# for i in range (len(pubmed_merged)):
#     links_all.append(main_url + str(pubmed_merged[i]))

## END OF OLD SCRIPT 
<br>
<br>

## Regex Notes

In [None]:
# Regex
# Identifiers:
# \d any number
# \D anything but a number
# \s space
# \S anything but a space
# \w any character
# \W anything but a character
# . any character, except for a newline
# \b the whitespace around words
# \. a period

# Modifiers:
# {1,3} we're expecting 1-3 \d{1-3}
# + Match 1 or more
# ? Match 0 or more
# * Match 0 or more
# $ Match the end of a string
# ^ matching the beginning of a string
# | either or
# [] range or "variance" [A-Za-z] [1-5a-qA-Z]
# {x} expecting "x" amount

# White Space Characters: 
# \n new line
# \s space
# \t tab
# \e escape
# \f form feed
# \r return

# DONT FORGET!:
# . + * [] $ ^ () {} | \

## Selenium
**Web Browser Automation**

In [12]:
from splinter import Browser
from selenium import webdriver

In [13]:
# make sure chrome browser exe is in current directory
# chrome browser exe is not necessary for MACS
executable_path = {'executable_path': 'chromedriver'}

In [14]:
# test with one link
# hard code article_one
article_one = scrape_links_final[0]

### start of original bs scrape script - DONT RUN

In [50]:
# # script to set up bs scraper
# browser = Browser('chrome', headless=False)
# html = browser.html
# response2 = requests.get(article_one)
# soup2 = bs(response2.text, 'html.parser')
# browser.visit(article_one)

### ^ end of original bs scrape script

## Set up dictionary to append data to

In [27]:
article_dict = {"title": [],
               "abstract": []}

## Create get title function

In [51]:
title = []
abstract = []

def get_title(article_one):
    
    # sets up scraper
    browser = Browser('chrome', headless=False)
    html = browser.html
    response2 = requests.get(article_one)
    soup2 = bs(response2.text, 'html.parser')
    
    browser.visit(article_one)
    
    # there are two 'h1' tags on this page. slice out index 0
    title_one = soup2.find_all('h1')
    article_one_title = title_one[1].text.strip()
    
    # slice h1 at index 1 to grab article title
    title.append(article_one_title)
    
    # get abstract
    abstract.append(soup2.find("div", attrs={'class': 'rprt_all'}).text.strip())
    
    # return title and abstract
    # return title
    print (title)
    print('\n')
    print(abstract)

## Run get_title function

In [52]:
get_title(article_one)

['Treatment of psychotic symptoms in patients with Parkinson disease.']


["Ment Health Clin. 2018 Mar 23;7(6):262-270. doi: 10.9740/mhc.2017.11.262. eCollection  2017 Nov.Treatment of psychotic symptoms in patients with Parkinson disease.Chen JJ1.Author information1Professor and Chair, Department of Pharmacy Practice, College of Pharmacy, Marshall B. Ketchum University, Fullerton, California; Professor, Department of Neurology, Loma Linda University, Loma Linda, California, jchen@ketchum.edu.AbstractPersistent psychotic symptoms will develop in up to 60% of patients with Parkinson disease (PD). The initial approach to the management of PD psychosis (PDP) begins with addressing concurrent systemic conditions associated with psychotic behavior, such as delirium, medical conditions (eg, infections), psychiatric disorders (eg, major depression with psychotic symptoms, mania, schizophrenia), and substance misuse or withdrawal. A review of current medications is recommended, and medications

In [58]:
print(title)

['Treatment of psychotic symptoms in patients with Parkinson disease.']


In [59]:
print(abstract)

["Ment Health Clin. 2018 Mar 23;7(6):262-270. doi: 10.9740/mhc.2017.11.262. eCollection  2017 Nov.Treatment of psychotic symptoms in patients with Parkinson disease.Chen JJ1.Author information1Professor and Chair, Department of Pharmacy Practice, College of Pharmacy, Marshall B. Ketchum University, Fullerton, California; Professor, Department of Neurology, Loma Linda University, Loma Linda, California, jchen@ketchum.edu.AbstractPersistent psychotic symptoms will develop in up to 60% of patients with Parkinson disease (PD). The initial approach to the management of PD psychosis (PDP) begins with addressing concurrent systemic conditions associated with psychotic behavior, such as delirium, medical conditions (eg, infections), psychiatric disorders (eg, major depression with psychotic symptoms, mania, schizophrenia), and substance misuse or withdrawal. A review of current medications is recommended, and medications that may trigger psychotic symptoms should be eliminated. If possible, an

## Add title and abstract to article_dict

In [55]:
article_dict["title"].append(title)
article_dict["abstract"].append(abstract)

In [57]:
print(article_dict)

{'title': [['Treatment of psychotic symptoms in patients with Parkinson disease.']], 'abstract': [["Ment Health Clin. 2018 Mar 23;7(6):262-270. doi: 10.9740/mhc.2017.11.262. eCollection  2017 Nov.Treatment of psychotic symptoms in patients with Parkinson disease.Chen JJ1.Author information1Professor and Chair, Department of Pharmacy Practice, College of Pharmacy, Marshall B. Ketchum University, Fullerton, California; Professor, Department of Neurology, Loma Linda University, Loma Linda, California, jchen@ketchum.edu.AbstractPersistent psychotic symptoms will develop in up to 60% of patients with Parkinson disease (PD). The initial approach to the management of PD psychosis (PDP) begins with addressing concurrent systemic conditions associated with psychotic behavior, such as delirium, medical conditions (eg, infections), psychiatric disorders (eg, major depression with psychotic symptoms, mania, schizophrenia), and substance misuse or withdrawal. A review of current medications is reco

<br>
<br>
<br>
<br>
<br>

## Older scripts

<br>
<br>
<br>

# Older section to scrape abstract

In [20]:
# grabs the entire abstracted nested with the <p> tag
text_main = soup2.find_all('p')

## Abstract

In [21]:
abstract = text_main[9].text.strip()
abstract

'Persistent psychotic symptoms will develop in up to 60% of patients with Parkinson disease (PD). The initial approach to the management of PD psychosis (PDP) begins with addressing concurrent systemic conditions associated with psychotic behavior, such as delirium, medical conditions (eg, infections), psychiatric disorders (eg, major depression with psychotic symptoms, mania, schizophrenia), and substance misuse or withdrawal. A review of current medications is recommended, and medications that may trigger psychotic symptoms should be eliminated. If possible, antiparkinson medications should be reduced to the minimum therapeutic dose or discontinued in a sequential manner. Generally, dose reduction or discontinuation of anticholinergics is attempted first, followed by that of monoamine oxidase B inhibitors, amantadine, dopamine agonists, catechol-O-methyltransferase inhibitors, and lastly carbidopa/levodopa. The aim of antiparkinson medication dose reduction is to achieve a balance be

## Review Summary

In [22]:
review_summary = text_main[10].text.strip()
review_summary

'Parkinson disease; antipsychotics; clozapine; movement disorders; nonmotor symptoms; pimavanserin; psychosis; quetiapine'

## Conclusion

In [23]:
conclusion = text_main[11].text.strip()
conclusion

"Disclosures: J.J.C. is on the Speaker's Bureau for ACADIA Pharmaceuticals. Psychopharmacology Pearls are review articles intended to highlight both the evidence base available and/or controversial areas of clinical care for psychiatric and neurologic conditions, as well as strategies of clinical decision-making used by expert clinicians. As pearls, articles reflect the views and practice of each author as substantiated with evidence-based facts as well as opinion and experience. Articles are edited by members of the Psychopharmacology Pearls Editorial Board and are peer reviewed by MHC reviewers. This article was developed as part of the 2017 Psychopharmacology Pearls product for BCPP recertification credit. The course information and testing center is at cpnp.org/322903."