In [1]:
# default_exp pubmed

# pubmed

> Scrape PubMed search results and store each paper's metadata in MongoDB.

In [2]:
#hide
from nbdev.showdoc import *

In [3]:
# export
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import requests
import re
import random
from pymongo import MongoClient
import pandas as pd
import multiprocessing as mp

In [4]:
# Base URL of the NCBI site; search links and article links are built from it
URL = 'https://www.ncbi.nlm.nih.gov'

In [5]:
# Pool of browser User-Agent strings; request_headers() picks one at random per call.
# NOTE(review): the first and fourth entries are identical, so that string is drawn twice as often.
user_agent = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
        ]

### WebDriver

Start WebDriver

* [Install ChromeDriver](https://chromedriver.chromium.org/)


In [6]:
# Chrome launch options: no visible window, and the Chrome sandbox switched off
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")

In [7]:
# Shared Selenium driver used by the scraping code below; expects the chromedriver binary in the working directory.
# NOTE(review): the executable_path keyword is deprecated in Selenium 4 in favor of a Service object — confirm the installed Selenium version.
browser = webdriver.Chrome(options=options, executable_path='./chromedriver')

### Database

Create MongoDB container from [image](https://hub.docker.com/_/mongo) and expose to `localhost:32768`

In [8]:
# docker run --name research-mongo -d -p 32768:27017 mongo:latest

In [9]:
# Connection string for the local MongoDB container
MONGO_HOST = 'mongodb://localhost:32768'

In [10]:
# Client handle for the MongoDB server at MONGO_HOST
client = MongoClient(MONGO_HOST)

In [11]:
# create or load the PubMed database; write_db() stores records in its pubmed_meta collection
db = client.PubMed

### Utils

In [18]:
#export
def get_max_pages(keywords):
    """Return the number of result pages PubMed reports for a search.

    Loads the search page for `keywords` in the shared Selenium `browser`
    (which stays on that page afterwards) and reads the `last` attribute
    of the pager input with id ``pageno2``.

    Args:
        keywords: Raw (not yet URL-encoded) search phrase.

    Returns:
        The page count as an int, or 1 when the page has no pager element.
    """
    import time
    from urllib.parse import quote_plus

    # build search link; encode the phrase so spaces, '&' or '#' cannot break the query string
    url = f'{URL}/pubmed/?term={quote_plus(keywords)}'
    browser.get(url)
    # Pause to avoid hammering the site. implicitly_wait() only sets the
    # element-lookup timeout and never delayed anything here.
    time.sleep(1)

    s = BeautifulSoup(browser.page_source, 'html.parser')
    pager = s.find('input', {'id': 'pageno2'})
    last_page = pager.get('last') if pager is not None else None
    if last_page is None:
        # No pager on the page: treat it as a single page instead of crashing on None
        return 1
    return int(last_page)

In [19]:
# Search phrase used for the example crawl
search_keywords = 'electroencephalogram spectroscopy'

In [20]:
# Load the search in the shared browser and show how many result pages it has
max_pages = get_max_pages(search_keywords); max_pages

197

In [21]:
#export
def request_headers(url):
    """Build the HTTP headers to send when requesting `url`.

    The User-Agent is drawn at random from the module-level `user_agent`
    list and `url` itself is sent as the Referer.
    """
    headers = {}
    headers['User-Agent'] = random.choice(user_agent)
    headers['Referer'] = url
    headers['Connection'] = 'keep-alive'
    headers['Host'] = 'www.ncbi.nlm.nih.gov'
    return headers

In [22]:
#export
def write_db(data):
    """Insert `data` into the pubmed_meta collection unless its URL is already stored.

    Args:
        data: Record dict; must contain a 'url' key, which is used as the
            duplicate check.

    Returns:
        None. Prints a message instead of inserting when a record with the
        same URL already exists.
    """
    # check if already exists
    # find() returns a Cursor object, which is truthy even when nothing matches,
    # so the old test never reached the insert. find_one() returns None on no match.
    existing = db.pubmed_meta.find_one({'url': data['url']})
    if existing is not None:
        print(data['url'], 'already exists')
    else:
        db.pubmed_meta.insert_one(data)

In [42]:
def get_orgs(bs):
    """Return the text of every <dd> inside the 'ui-ncbi-toggler-slave' <dl>.

    Presumably these are the author organizations/affiliations of the
    article — verify against the page markup.

    Args:
        bs: Parsed page exposing BeautifulSoup's find()/find_all() API.

    Returns:
        A list of strings, or the sentinel string 'ORGANIZATION_NA' when
        the section is missing from the page.
    """
    try:
        section = bs.find('dl', {'class': 'ui-ncbi-toggler-slave'})
        return [entry.get_text() for entry in section.find_all('dd')]
    except (AttributeError, TypeError):
        # Only lookup failures (find() gave None, nothing to iterate) map to the
        # sentinel; a bare except would also swallow KeyboardInterrupt/SystemExit.
        return 'ORGANIZATION_NA'

In [43]:
def get_kwords(bs):
    """Return the whitespace-separated tokens of the page's keywords paragraph.

    Reads the first <p> inside the <div class="keywords"> element.

    Args:
        bs: Parsed page exposing BeautifulSoup's find() API.

    Returns:
        A list of strings, or the sentinel string 'KEYWORDS_NA' when the
        keywords section is missing from the page.
    """
    try:
        paragraph = bs.find('div', {'class': 'keywords'}).find('p')
        # NOTE(review): split() breaks on any whitespace, so a multi-word keyword
        # ends up as several tokens — confirm this is the intended shape.
        return paragraph.get_text().split()
    except (AttributeError, TypeError):
        # Only lookup failures (find() gave None) map to the sentinel; a bare
        # except would also swallow KeyboardInterrupt/SystemExit.
        return 'KEYWORDS_NA'

In [44]:
#export
def extract_page(url):
    """Download one article page and collect its metadata.

    Args:
        url: Absolute URL of the article page.

    Returns:
        dict with keys 'title', 'authors', 'organizations', 'keywords',
        'citation' and 'url'. 'organizations' and 'keywords' hold the
        sentinel strings from get_orgs()/get_kwords() when those sections
        are missing.

    Raises:
        AttributeError: if the page has no <title>, 'auths' div or 'cit' div.
        requests.exceptions.RequestException: on network failure or timeout.
    """
    import time

    # Throttle between requests. The page is fetched with requests, not through
    # Selenium, so browser.implicitly_wait() never delayed anything here.
    time.sleep(random.randint(2, 3))
    # Timeout so one stalled connection cannot hang the whole crawl
    html = requests.get(url, headers=request_headers(url), timeout=30)
    bs = BeautifulSoup(html.text, 'html.parser')

    title = bs.find('title').get_text()
    authors = bs.find('div', {'class': 'auths'}).get_text()
    orgs = get_orgs(bs)
    kwords = get_kwords(bs)
    cit = bs.find('div', {'class': 'cit'}).get_text()
    # TODO: extract the publication date from the citation, e.g.
    #     date = re.findall(r'\d{4}\s\w{3}\s\d{2}', cit)[0]
    return dict(title=title, authors=authors, organizations=orgs,
                keywords=kwords, citation=cit, url=url)

In [45]:
#export
def extract_and_write(url):
    """Scrape the article at `url` and pass the resulting record to write_db().

    Returns whatever write_db() returns.
    """
    return write_db(extract_page(url))

#### TODO: this should be multi-threaded, not multi-processing

In [46]:
#export
def process_many(urls):
    """Scrape and store every URL in `urls` using a pool of worker processes.

    Each URL is submitted to extract_and_write() asynchronously; the call
    returns once all submitted tasks have finished.

    Args:
        urls: Iterable of article page URLs.

    Returns:
        None. The AsyncResult objects are discarded, so an exception raised
        inside a worker is not reported to the caller.
    """
    p = mp.Pool()
    try:
        for u in urls:
            # Previously submitted `extract`, a name this notebook never defines,
            # which raised NameError on the first URL.
            p.apply_async(extract_and_write, args=(u,))
    finally:
        # Always release the worker processes, even if submission fails midway
        p.close()
        p.join()

## Extract Information

In [47]:
# Parse whatever page the shared browser currently shows. This relies on the earlier
# get_max_pages() call having loaded the search results page.
bs = BeautifulSoup(browser.page_source, 'html.parser')

In [48]:
# Collect the divs with class 'rslt' — presumably one per search result; verify against the page markup
divs = bs.find_all('div', {'class': 'rslt'})

Get author, title, journal and other metadata

In [49]:
# Absolute link to each paper: site root plus the href of the first anchor in each result div
urls = [URL + d.find('a').get('href') for d in divs]

In [54]:
# Number of paper links collected from the current results page
len(urls)

20

In [52]:
# Scrape the collected links one after another; each call downloads one article page
data = [extract_page(u) for u in urls]

In [53]:
# One record per scraped link, so this should equal len(urls)
len(data)

20

In [None]:
# NOTE(review): implicitly_wait() sets Selenium's element-lookup timeout; it does not pause
# execution — confirm whether a real delay (time.sleep) was intended here.
browser.implicitly_wait(1)

In [None]:
#     # Click next button to navigate to the next page
#     browser.find_element_by_xpath('//*[@title="Next page of results"]').click()

## TODO:

* crawl papers for mentions of g.tect products