## Pub Med Scraper

In [1]:
# scrape dependencies
import requests
import re
from bs4 import BeautifulSoup as bs

# data analysis dependencies
import pandas as pd
import numpy as np
import csv

# ipynb dependencies
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import warnings
warnings.filterwarnings('ignore')

# viz dependencies
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

import datetime as dt
import time

In [3]:
# set the url to scrape
url = 'https://www.ncbi.nlm.nih.gov/pubmed/trending/'
print(url)

https://www.ncbi.nlm.nih.gov/pubmed/trending/


In [4]:
# set up beautiful soup to scrape
response = requests.get(url)
soup = bs(response.text, 'html.parser')

In [5]:
# lets scrape the article titles
journals = soup.find_all("p", attrs={'class':'title'})

In [6]:
# searching for the journal titles
journals_len = len(journals)
print(f"There are {journals_len} journals to scrape on the first page.")

There are 20 journals to scrape on the first page.


In [7]:
# loop through journals to print titles
for i in range(journals_len):
    journals[i].text.strip()

'Author Correction: What models eat.'

'Physical and Mental Effects of Bathing: A Randomized Intervention Study.'

'Günter Blobel (1936–2018)'

'Enhancing T cell therapy through TCR-signaling-responsive nanoparticle drug delivery.'

'Cancer-associated thrombosis in patients with implanted ports: a prospective multicenter French cohort study (ONCOCIP).'

'Long-acting Reversible Contraception-Highly Efficacious, Safe, and Underutilized.'

'Chimeric Antigen Receptor Therapy.'

'The Link between the Appendix and Ulcerative Colitis: Clinical Relevance and Potential Immunological Mechanisms.'

'Ketorolac for postoperative pain in children.'

'Human pluripotent reprogramming with CRISPR activators.'

'Validation of Plasma Biomarker Candidates for the Prediction of eGFR Decline in Patients With Type 2 Diabetes.'

'Optimized base editors enable efficient editing in cells, organoids and mice.'

'Author Correction: Reduced mutation rate in exons due to differential mismatch repair.'

'Species-specific activity of antibacterial drug combinations.'

'Heavily and fully modified RNAs guide efficient SpyCas9-mediated genome editing.'

'CRISPR screens identify genomic ribonucleotides as a source of PARP-trapping lesions.'

'Patisiran, an RNAi Therapeutic, for Hereditary Transthyretin Amyloidosis.'

'DNA-induced liquid phase condensation of cGAS activates innate immune signaling.'

'A novel chemical compound SINCRO with dual function in the STING-type I interferon and tumor cell death pathways.'

'Publisher Correction: A naturally occurring antiviral ribonucleotide encoded by the human genome.'

## Set main url to concat with pubmed ids

In [8]:
# set the main url that we will concatanate with the pubmed id
main_url = 'https://www.ncbi.nlm.nih.gov/pubmed/'
print(main_url)

https://www.ncbi.nlm.nih.gov/pubmed/


<br>
<br>
# TESTING SECTION ------------------------------------------------------------------

In [9]:
# set empty links_all list to append to 
links_all = []

# set pubmed ids list to append to
pubmed_ids = []

# set empty list to append scrape_links to
scrape_links = []

In [10]:
# use bs to scarpe p tags with class - title
links = soup.find_all("p",attrs={'class':'title'})
print(links[0])

<p class="title" xmlns:mml="http://www.w3.org/1998/Math/MathML"><a href="/pubmed/29985486" ref="ordinalpos=1&amp;ncbi_uid=29985486&amp;link_uid=29985486&amp;linksrc=docsum_title">Author Correction: What models eat.</a></p>


In [11]:
# testing to see how my links / journals to scrape
articles_to_scrape = len(links)
print(f"There are {articles_to_scrape} articles to scrape.")
print("----------------------------------------------")

There are 20 articles to scrape.
----------------------------------------------


In [12]:
# loop through links to convert to string
for i in range (len(links)):
    links_all.append(str(links[i]))
    print(links[i])
    print("----------------------------------------------")

<p class="title" xmlns:mml="http://www.w3.org/1998/Math/MathML"><a href="/pubmed/29985486" ref="ordinalpos=1&amp;ncbi_uid=29985486&amp;link_uid=29985486&amp;linksrc=docsum_title">Author Correction: What models eat.</a></p>
----------------------------------------------
<p class="title" xmlns:mml="http://www.w3.org/1998/Math/MathML"><a href="/pubmed/29977318" ref="ordinalpos=2&amp;ncbi_uid=29977318&amp;link_uid=29977318&amp;linksrc=docsum_title">Physical and Mental Effects of Bathing: A Randomized Intervention Study.</a></p>
----------------------------------------------
<p class="title" xmlns:mml="http://www.w3.org/1998/Math/MathML"><a href="/pubmed/29975496" ref="ordinalpos=3&amp;ncbi_uid=29975496&amp;link_uid=29975496&amp;linksrc=docsum_title">Günter Blobel (1936–2018)</a></p>
----------------------------------------------
<p class="title" xmlns:mml="http://www.w3.org/1998/Math/MathML"><a href="/pubmed/29985479" ref="ordinalpos=4&amp;ncbi_uid=29985479&amp;link_uid=29985479&amp;linksr

In [None]:
# for i in links_all:
#     print(i)
#     print("----------------------------------------------")

In [13]:
# slice through links_all to test
len(links_all)
links_all[0]

20

'<p class="title" xmlns:mml="http://www.w3.org/1998/Math/MathML"><a href="/pubmed/29985486" ref="ordinalpos=1&amp;ncbi_uid=29985486&amp;link_uid=29985486&amp;linksrc=docsum_title">Author Correction: What models eat.</a></p>'

### Regex test

In [14]:
regex_test = '<p class="title" xmlns:mml="http://www.w3.org/1998/Math/MathML"><a href="/pubmed/29985486" ref="ordinalpos=1&amp;ncbi_uid=29985486&amp;link_uid=29985486&amp;linksrc=docsum_title">Author Correction: What models eat.</a></p>'

In [15]:
print(regex_test)

<p class="title" xmlns:mml="http://www.w3.org/1998/Math/MathML"><a href="/pubmed/29985486" ref="ordinalpos=1&amp;ncbi_uid=29985486&amp;link_uid=29985486&amp;linksrc=docsum_title">Author Correction: What models eat.</a></p>


In [16]:
re.findall(r'\d{8}',regex_test)

['29985486', '29985486', '29985486']

### End Regex Test

In [17]:
# loop through links all and use regex to grab the id numbers
for i in range (len(links_all)):
    pubmed_ids.append(re.findall(r'\d{8}',links_all[i]))

In [18]:
# print out info for pubmed_ids
len(pubmed_ids)
type(pubmed_ids)
print(pubmed_ids)
print("----------------------------------------------")

20

list

[['29985486', '29985486', '29985486'], ['29977318', '29977318', '29977318'], ['29975496', '29975496', '29975496'], ['29985479', '29985479', '29985479'], ['29980524', '29980524', '29980524'], ['29984374', '29984374', '29984374'], ['29972754', '29972754', '29972754'], ['26416189', '26416189', '26416189'], ['29981164', '29981164', '29981164'], ['29980666', '29980666', '29980666'], ['29980527', '29980527', '29980527'], ['29969439', '29969439', '29969439'], ['29973711', '29973711', '29973711'], ['29973719', '29973719', '29973719'], ['29980686', '29980686', '29980686'], ['29973717', '29973717', '29973717'], ['29972753', '29972753', '29972753'], ['29976794', '29976794', '29976794'], ['29981256', '29981256', '29981256'], ['29980769', '29980769', '29980769']]
----------------------------------------------


In [19]:
# use itertools to transform pubmed ids from an array withn an array into one list
import itertools
pubmed_merged = list(itertools.chain.from_iterable(pubmed_ids))

In [20]:
# slice through pubmed_merged to see what itertools did
pubmed_merged[0]
print(pubmed_merged)

'29985486'

['29985486', '29985486', '29985486', '29977318', '29977318', '29977318', '29975496', '29975496', '29975496', '29985479', '29985479', '29985479', '29980524', '29980524', '29980524', '29984374', '29984374', '29984374', '29972754', '29972754', '29972754', '26416189', '26416189', '26416189', '29981164', '29981164', '29981164', '29980666', '29980666', '29980666', '29980527', '29980527', '29980527', '29969439', '29969439', '29969439', '29973711', '29973711', '29973711', '29973719', '29973719', '29973719', '29980686', '29980686', '29980686', '29973717', '29973717', '29973717', '29972753', '29972753', '29972753', '29976794', '29976794', '29976794', '29981256', '29981256', '29981256', '29980769', '29980769', '29980769']


In [21]:
# concat main_url with a slice of pubmed_merged before we loop
print(main_url + str(pubmed_merged[0]))

https://www.ncbi.nlm.nih.gov/pubmed/29985486


In [22]:
# append merged links to links_all
for i in range (len(pubmed_merged)):
    scrape_links.append(main_url + str(pubmed_merged[i]))

In [23]:
for i in scrape_links:
    print(i)

https://www.ncbi.nlm.nih.gov/pubmed/29985486
https://www.ncbi.nlm.nih.gov/pubmed/29985486
https://www.ncbi.nlm.nih.gov/pubmed/29985486
https://www.ncbi.nlm.nih.gov/pubmed/29977318
https://www.ncbi.nlm.nih.gov/pubmed/29977318
https://www.ncbi.nlm.nih.gov/pubmed/29977318
https://www.ncbi.nlm.nih.gov/pubmed/29975496
https://www.ncbi.nlm.nih.gov/pubmed/29975496
https://www.ncbi.nlm.nih.gov/pubmed/29975496
https://www.ncbi.nlm.nih.gov/pubmed/29985479
https://www.ncbi.nlm.nih.gov/pubmed/29985479
https://www.ncbi.nlm.nih.gov/pubmed/29985479
https://www.ncbi.nlm.nih.gov/pubmed/29980524
https://www.ncbi.nlm.nih.gov/pubmed/29980524
https://www.ncbi.nlm.nih.gov/pubmed/29980524
https://www.ncbi.nlm.nih.gov/pubmed/29984374
https://www.ncbi.nlm.nih.gov/pubmed/29984374
https://www.ncbi.nlm.nih.gov/pubmed/29984374
https://www.ncbi.nlm.nih.gov/pubmed/29972754
https://www.ncbi.nlm.nih.gov/pubmed/29972754
https://www.ncbi.nlm.nih.gov/pubmed/29972754
https://www.ncbi.nlm.nih.gov/pubmed/26416189
https://ww

# END TESTING SECTION------------------------------------------------------------
<br>
<br>

## Function to create array of links to scrape

In [None]:
# set empty links_all list to append to 
links_all = []

# set pubmed ids list to append to
pubmed_ids = []

# set empty list to append scrape_links to
scrape_links = []

# function to get links
def get_links(main_url):
    
    # use bs to scarpe p tags with class - title
    links = soup.find_all("p",attrs={'class':'title'})
      
    # testing to see how my links / journals to scrape
    articles_to_scrape = len(links)
    print(f"There are {articles_to_scrape} articles to scrape.")
    print("----------------------------------------------")
    
    # loop through links to convert to string
    for i in range (len(links)):
        links_all.append(str(links[i]))
        print(links[i])
        print("----------------------------------------------")
        
    # slice through links_all to test
    len(links_all)
    links_all[1]
    
    # loop through links all and use regex to grab the id numbers
    for i in range (len(links_all)):
        pubmed_ids.append(re.findall(r'\d{8}',links_all[i]))
    
    # print out info for pubmed_ids
    len(pubmed_ids)
    type(pubmed_ids)
    print(pubmed_ids)
    print("----------------------------------------------")
    
    # use itertools to transform pubmed ids from an array withn an array into one list
    import itertools
    pubmed_merged = list(itertools.chain.from_iterable(pubmed_ids))
    
    # slice through pubmed_merged to see what itertools did
    pubmed_merged[0]
    
    # concat main_url with a slice of pubmed_merged before we loop
    print(main_url + str(pubmed_merged[0]))
    
    # append merged links to links_all
    for i in range (len(pubmed_merged)):
        scrape_links.append(main_url + str(pubmed_merged[i]))

In [None]:
# RUN FUNCTION
get_links(main_url)

<br>
There are duplicates in our **scrape_links** array. Use `list` to delete the duplicates.

In [24]:
# delete duplicates in scrape_links and assign to new variable scrape_links_final
scrape_links_final = list(set(scrape_links))
len(scrape_links_final)
scrape_links_final

20

['https://www.ncbi.nlm.nih.gov/pubmed/26416189',
 'https://www.ncbi.nlm.nih.gov/pubmed/29981164',
 'https://www.ncbi.nlm.nih.gov/pubmed/29980527',
 'https://www.ncbi.nlm.nih.gov/pubmed/29976794',
 'https://www.ncbi.nlm.nih.gov/pubmed/29972753',
 'https://www.ncbi.nlm.nih.gov/pubmed/29973717',
 'https://www.ncbi.nlm.nih.gov/pubmed/29985479',
 'https://www.ncbi.nlm.nih.gov/pubmed/29980769',
 'https://www.ncbi.nlm.nih.gov/pubmed/29973711',
 'https://www.ncbi.nlm.nih.gov/pubmed/29985486',
 'https://www.ncbi.nlm.nih.gov/pubmed/29980524',
 'https://www.ncbi.nlm.nih.gov/pubmed/29981256',
 'https://www.ncbi.nlm.nih.gov/pubmed/29980686',
 'https://www.ncbi.nlm.nih.gov/pubmed/29977318',
 'https://www.ncbi.nlm.nih.gov/pubmed/29972754',
 'https://www.ncbi.nlm.nih.gov/pubmed/29980666',
 'https://www.ncbi.nlm.nih.gov/pubmed/29984374',
 'https://www.ncbi.nlm.nih.gov/pubmed/29975496',
 'https://www.ncbi.nlm.nih.gov/pubmed/29973719',
 'https://www.ncbi.nlm.nih.gov/pubmed/29969439']

## Main array of links to scrape:

Here we use selenium to iterate through these links. Seleium will click on each link then scrape the title and abstracts on each page. 

In [25]:
# testing scrape_links
for i in scrape_links_final:
    print(i)

https://www.ncbi.nlm.nih.gov/pubmed/26416189
https://www.ncbi.nlm.nih.gov/pubmed/29981164
https://www.ncbi.nlm.nih.gov/pubmed/29980527
https://www.ncbi.nlm.nih.gov/pubmed/29976794
https://www.ncbi.nlm.nih.gov/pubmed/29972753
https://www.ncbi.nlm.nih.gov/pubmed/29973717
https://www.ncbi.nlm.nih.gov/pubmed/29985479
https://www.ncbi.nlm.nih.gov/pubmed/29980769
https://www.ncbi.nlm.nih.gov/pubmed/29973711
https://www.ncbi.nlm.nih.gov/pubmed/29985486
https://www.ncbi.nlm.nih.gov/pubmed/29980524
https://www.ncbi.nlm.nih.gov/pubmed/29981256
https://www.ncbi.nlm.nih.gov/pubmed/29980686
https://www.ncbi.nlm.nih.gov/pubmed/29977318
https://www.ncbi.nlm.nih.gov/pubmed/29972754
https://www.ncbi.nlm.nih.gov/pubmed/29980666
https://www.ncbi.nlm.nih.gov/pubmed/29984374
https://www.ncbi.nlm.nih.gov/pubmed/29975496
https://www.ncbi.nlm.nih.gov/pubmed/29973719
https://www.ncbi.nlm.nih.gov/pubmed/29969439


### Add timer to main function when scraping

In [26]:
# slice out scrape_links_final so we can scrape 5 articles at a time
links_1 = scrape_links_final[0:5]
links_1

links_2 = scrape_links_final[5:10]
links_2

links_3 = scrape_links_final[10:15]
links_3

links_4 = scrape_links_final[15:20]
links_4

['https://www.ncbi.nlm.nih.gov/pubmed/26416189',
 'https://www.ncbi.nlm.nih.gov/pubmed/29981164',
 'https://www.ncbi.nlm.nih.gov/pubmed/29980527',
 'https://www.ncbi.nlm.nih.gov/pubmed/29976794',
 'https://www.ncbi.nlm.nih.gov/pubmed/29972753']

['https://www.ncbi.nlm.nih.gov/pubmed/29973717',
 'https://www.ncbi.nlm.nih.gov/pubmed/29985479',
 'https://www.ncbi.nlm.nih.gov/pubmed/29980769',
 'https://www.ncbi.nlm.nih.gov/pubmed/29973711',
 'https://www.ncbi.nlm.nih.gov/pubmed/29985486']

['https://www.ncbi.nlm.nih.gov/pubmed/29980524',
 'https://www.ncbi.nlm.nih.gov/pubmed/29981256',
 'https://www.ncbi.nlm.nih.gov/pubmed/29980686',
 'https://www.ncbi.nlm.nih.gov/pubmed/29977318',
 'https://www.ncbi.nlm.nih.gov/pubmed/29972754']

['https://www.ncbi.nlm.nih.gov/pubmed/29980666',
 'https://www.ncbi.nlm.nih.gov/pubmed/29984374',
 'https://www.ncbi.nlm.nih.gov/pubmed/29975496',
 'https://www.ncbi.nlm.nih.gov/pubmed/29973719',
 'https://www.ncbi.nlm.nih.gov/pubmed/29969439']

## Regex Notes

In [None]:
# Regex
# Identifiers:
# \d any number
# \D anything but a number
# \s space
# \S anything but a space
# \w any character
# \W anything but a character
# . any character, except for a newline
# \b the whitespace around words
# \. a period

# Modifiers:
# {1,3} we're expecting 1-3 \d{1-3}
# + Match 1 or more
# ? Match 0 or more
# * Match 0 or more
# $ Match the end of a string
# ^ matching the beginning of a string
# | either or
# [] range or "variance" [A-Za-z] [1-5a-qA-Z]
# {x} expecting "x" amount

# White Space Characters: 
# \n new line
# \s space
# \t tab
# \e escape
# \f form feed
# \r return

# DONT FORGET!:
# . + * [] $ ^ () {} | \

## Selenium
**Web Browser Automation**

In [27]:
from splinter import Browser
from selenium import webdriver

In [28]:
# make sure chrome browser exe is in current directory
# chrome browser exe is not necessary for MACS
executable_path = {'executable_path': 'chromedriver'}

## Set up dictionary to append data to

In [29]:
article_dict = {"title": [],
               "abstract": []}

## Create get_article_info function

In [30]:
title = []
abstract = []

def get_article_info(links_1):
    
    # iterate through articles
    for i in links_1:
        
        # sets up scraper
        browser = Browser('chrome', headless=False)
        html = browser.html
        response2 = requests.get(i)
        soup2 = bs(response2.text, 'html.parser')
    
        browser.visit(i)
    
        # there are two 'h1' tags on this page. slice out index 0
        title_one = soup2.find_all('h1')
        article_one_title = title_one[1].text.strip()
    
        # slice h1 at index 1 to grab article title
        title.append(article_one_title)
    
        # get abstract 
        # abstract.append(soup2.find("div", attrs={'class': 'rprt_all'}).text.strip())     
        abstract.append(soup2.find("div", attrs={'class': 'rprt abstract'}).text.strip())

## Run article_info function

In [31]:
get_article_info(links_1)

In [35]:
get_article_info(links_2)

In [38]:
get_article_info(links_3)

In [41]:
get_article_info(links_4)

In [42]:
for i in title:
    print(i + "\n")

len(title)

The Link between the Appendix and Ulcerative Colitis: Clinical Relevance and Potential Immunological Mechanisms.

Ketorolac for postoperative pain in children.

Validation of Plasma Biomarker Candidates for the Prediction of eGFR Decline in Patients With Type 2 Diabetes.

DNA-induced liquid phase condensation of cGAS activates innate immune signaling.

Patisiran, an RNAi Therapeutic, for Hereditary Transthyretin Amyloidosis.

CRISPR screens identify genomic ribonucleotides as a source of PARP-trapping lesions.

Enhancing T cell therapy through TCR-signaling-responsive nanoparticle drug delivery.

Publisher Correction: A naturally occurring antiviral ribonucleotide encoded by the human genome.

Author Correction: Reduced mutation rate in exons due to differential mismatch repair.

Author Correction: What models eat.

Cancer-associated thrombosis in patients with implanted ports: a prospective multicenter French cohort study (ONCOCIP).

A novel chemical compound SINCRO with dual function

20

In [43]:
for i in abstract:
    print(i)
    print("\n")

len(abstract)

Am J Gastroenterol. 2016 Feb;111(2):163-9. doi: 10.1038/ajg.2015.301. Epub  2015 Sep 29.The Link between the Appendix and Ulcerative Colitis: Clinical Relevance and Potential Immunological Mechanisms.Sahami S1, Kooij IA2, Meijer SL3, Van den Brink GR2,4, Buskens CJ1, Te Velde AA2.Author information1Department of Surgery, Academic Medical Centre, Amsterdam, The Netherlands.2Tytgat Institute for Liver and Intestinal Research, Academic Medical Center, Amsterdam, The Netherlands.3Department of Pathology, Academic Medical Center, Amsterdam, The Netherlands.4Department of Gastroenterology, Academic Medical Center, Amsterdam, The Netherlands.AbstractThe human appendix has long been considered as a vestigial organ, an organ that has lost its function during evolution. In recent years, however, reports have emerged that link the appendix to numerous immunological functions in humans. Evidence has been presented for an important role of the appendix in maintaining intestinal health. This theory 

20

## Add title and abstract to article_dict

In [44]:
article_dict["title"].append(title)
article_dict["abstract"].append(abstract)
print(article_dict)

{'title': [['The Link between the Appendix and Ulcerative Colitis: Clinical Relevance and Potential Immunological Mechanisms.', 'Ketorolac for postoperative pain in children.', 'Validation of Plasma Biomarker Candidates for the Prediction of eGFR Decline in Patients With Type 2 Diabetes.', 'DNA-induced liquid phase condensation of cGAS activates innate immune signaling.', 'Patisiran, an RNAi Therapeutic, for Hereditary Transthyretin Amyloidosis.', 'CRISPR screens identify genomic ribonucleotides as a source of PARP-trapping lesions.', 'Enhancing T cell therapy through TCR-signaling-responsive nanoparticle drug delivery.', 'Publisher Correction: A naturally occurring antiviral ribonucleotide encoded by the human genome.', 'Author Correction: Reduced mutation rate in exons due to differential mismatch repair.', 'Author Correction: What models eat.', 'Cancer-associated thrombosis in patients with implanted ports: a prospective multicenter French cohort study (ONCOCIP).', 'A novel chemical

## Save article_dict to json

In [45]:
import json

In [46]:
json = json.dumps(article_dict)
f = open("trending1.json","w")
f.write(json)
f.close()

63627

<br>
<br>
<br>