In [None]:
# default_exp pubmed

# pubmed

> Crawl PubMed search results with Selenium and store each article's metadata in MongoDB.

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
# export
import random
import re
import time
from threading import Thread
from urllib.parse import quote_plus

import pandas as pd
import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from tqdm.notebook import tqdm

In [None]:
#export
# Base address of the NCBI site; search and article URLs are built from it.
# Exported because the exported crawler functions reference it as a global.
URL = 'https://www.ncbi.nlm.nih.gov'

In [None]:
#export
# Pool of browser user-agent strings; `request_headers` picks one at random
# for every request. Each entry appears once so that all agents are equally
# likely to be chosen.
user_agent = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]

### WebDriver

Start WebDriver

* [Install ChromeDriver](https://chromedriver.chromium.org/)


In [None]:
# Chrome options used when the WebDriver is started.
options = Options()
options.add_argument("--headless")  # pass the --headless flag to Chrome
options.add_argument("--no-sandbox")  # pass the --no-sandbox flag to Chrome

In [None]:
# Start Chrome with `options`, using the chromedriver binary located in the
# current working directory.
# NOTE(review): this cell has no `#export` flag, yet exported functions such as
# `get_max_pages` and `crawl_pubmed` use `browser` as a global — confirm how the
# generated module is expected to obtain it.
browser = webdriver.Chrome(options=options, executable_path='./chromedriver')

In [None]:
# Sanity check: display the HTML of the page currently loaded in the browser.
browser.page_source

'<html><head></head><body></body></html>'

### Database

Create a MongoDB container from the [mongo image](https://hub.docker.com/_/mongo) and expose it on `localhost:27017`.

In [None]:
# Connection URI of the MongoDB instance the crawler writes to.
MONGO_HOST = 'mongodb://localhost:27017'

In [None]:
# MongoDB client for MONGO_HOST; the `db` handle is obtained from it.
client = MongoClient(MONGO_HOST)

In [None]:
# create or load pubmed database
# (`write_db` inserts crawled records into its `pubmed_meta` collection)
# NOTE(review): not exported, although the exported `write_db` uses `db` as a
# global — confirm how the generated module is expected to obtain it.
db = client.PubMed

### Utils

In [None]:
#export
def get_max_pages(keywords):
    """Return the number of result pages PubMed reports for a search term.

    Loads the search results for `keywords` in the shared `browser` (which is
    left on the first results page) and reads the `last` attribute of the
    pager input with id `pageno2`.

    Args:
        keywords: search term as plain text; it is URL-encoded here.

    Returns:
        The page count as an int, or 1 when the page has no pager input
        (there is then nothing beyond the page that was loaded).
    """
    # Encode the term so characters such as '&', '#' or '+' cannot alter the query.
    url = f'{URL}/pubmed/?term={quote_plus(keywords)}'
    browser.get(url)
    # Real pause between requests. `implicitly_wait` only sets the element
    # lookup timeout and does not delay anything.
    time.sleep(1)

    s = BeautifulSoup(browser.page_source, 'html.parser')
    pager = s.find('input', {'id': 'pageno2'})
    last_page = pager.get('last') if pager is not None else None
    if last_page is None:
        return 1
    return int(last_page)

In [None]:
# Candidate PubMed search terms. A term is sent as the `term` query parameter
# and stored with every crawled record under the `keyword` field.
search_keywords = ['electroencephalogram',
                   'near infrared spectroscopy',
                   'transcranial electrical stimulation',
                   'transcranial magnetic stimulation',
                   'cognitive assessment',
                   'magnetic resonance']

In [None]:
# Number of result pages reported for each search term, in list order.
[get_max_pages(kw) for kw in search_keywords]

[8133, 1146, 340, 861, 2951, 38015]

#### Headers

In [None]:
#export
def request_headers(url):
    """Build the HTTP headers sent with a request for `url`.

    A user agent is drawn at random from `user_agent` on every call, and
    `url` itself is sent as the referer.
    """
    headers = {}
    headers['User-Agent'] = random.choice(user_agent)
    headers['Referer'] = url
    headers['Connection'] = 'keep-alive'
    headers['Host'] = 'www.ncbi.nlm.nih.gov'
    return headers

#### Mongo DB

In [None]:
#export
def write_db(data):
    """Insert `data` as one document into the `pubmed_meta` collection of `db`.

    No check is made for an existing document with the same url.
    Returns None.
    """
    collection = db.pubmed_meta
    collection.insert_one(data)

#### Extractors

In [None]:
#export
def get_orgs(bs):
    """Return the text of every `<dd>` in the page's `dl.ui-ncbi-toggler-slave`.

    Args:
        bs: parsed article page (a BeautifulSoup object).

    Returns:
        A list of strings, one per `<dd>`, or the placeholder string
        'ORGANIZATION_NA' when the list element is missing from the page.
    """
    try:
        entries = bs.find('dl', {'class': 'ui-ncbi-toggler-slave'}).find_all('dd')
    except AttributeError:
        # `find` returned None: the page has no such list. Only this case is
        # handled, so KeyboardInterrupt and real errors are not swallowed.
        return 'ORGANIZATION_NA'
    return [entry.get_text() for entry in entries]

In [None]:
#export
def get_kwords(bs):
    """Return the whitespace-separated tokens of the page's keywords paragraph.

    Reads the text of the first `<p>` inside `div.keywords`.

    Args:
        bs: parsed article page (a BeautifulSoup object).

    Returns:
        A list of tokens, or the placeholder string 'KEYWORDS_NA' when the
        keywords block (or its paragraph) is missing from the page.
    """
    try:
        keyword_text = bs.find('div', {'class': 'keywords'}).find('p').get_text()
    except AttributeError:
        # A `find` returned None: the page has no keywords. Only this case is
        # handled, so KeyboardInterrupt and real errors are not swallowed.
        return 'KEYWORDS_NA'
    # NOTE(review): splitting on whitespace breaks multi-word keywords into
    # separate tokens and keeps any separator punctuation — confirm against
    # the page format before changing the stored shape.
    return keyword_text.split()

In [None]:
#export
def _text_or_default(bs, name, attrs, default):
    """Return the text of the first matching element, or `default` if the page has none."""
    node = bs.find(name, attrs) if attrs else bs.find(name)
    return default if node is None else node.get_text()


def extract_page(url):
    """Download one article page and extract its metadata.

    Args:
        url: absolute URL of the article page.

    Returns:
        A dict with the keys `title`, `authors`, `organizations`, `keywords`,
        `citation` and `url`. A field whose element is missing from the page
        holds a `*_NA` placeholder string instead of raising.

    Raises:
        requests.HTTPError: the server answered with an error status.
        requests.RequestException: the request failed or timed out.
    """
    # Pause 2-3 s before each request. The previous `browser.implicitly_wait`
    # call did not delay anything and touched the shared WebDriver from the
    # worker threads this function runs in.
    time.sleep(random.uniform(2, 3))
    response = requests.get(url, headers=request_headers(url), timeout=30)
    # Fail loudly on error pages rather than storing a record full of placeholders.
    response.raise_for_status()
    bs = BeautifulSoup(response.text, 'html.parser')

    return dict(
        title=_text_or_default(bs, 'title', None, 'TITLE_NA'),
        authors=_text_or_default(bs, 'div', {'class': 'auths'}, 'AUTHORS_NA'),
        organizations=get_orgs(bs),
        keywords=get_kwords(bs),
        citation=_text_or_default(bs, 'div', {'class': 'cit'}, 'CITATION_NA'),
        url=url,
    )

In [None]:
#export
def extract_and_write(url, kw):
    """Extract the article at `url`, tag it with search term `kw`, and store it.

    The record returned by `extract_page` gets an extra `keyword` field and is
    handed to `write_db`, whose result is returned.
    """
    record = {**extract_page(url), 'keyword': kw}
    return write_db(record)

#### TODO: this should be multi-threaded, not multi-processing

## Crawler

In [None]:
#export
def crawl_list(page, kw):
    """Crawl every article linked from one search-results page.

    Each `div.rslt` block on the page is handled in its own thread, which
    extracts the linked article and writes it to the database. The call
    returns only after all of those threads have finished, so at most one
    results page worth of requests is in flight at a time.

    Args:
        page: HTML source of a search-results page.
        kw: search term the page belongs to; stored with every record.
    """
    bs = BeautifulSoup(page, 'html.parser')

    def crawl(href):
        extract_and_write(URL + href, kw)

    workers = []
    for result in bs.find_all('div', {'class': 'rslt'}):
        link = result.find('a')
        href = link.get('href') if link is not None else None
        if not href:
            continue  # result block without an article link: nothing to fetch
        worker = Thread(target=crawl, args=(href,))
        worker.start()
        workers.append(worker)

    # Wait for this page's workers. Without the join, threads pile up without
    # bound while the caller keeps paging through the results.
    for worker in workers:
        worker.join()


In [None]:
#export
def _click_next_page():
    """Click the "Next page of results" control; return False when the page has none."""
    next_links = browser.find_elements_by_xpath('//*[@title="Next page of results"]')
    if not next_links:
        return False
    next_links[0].click()
    return True


def crawl_pubmed(keywords, start_page=0):
    """Crawl all PubMed result pages for each search term.

    For every term the search is opened in the shared `browser`, the first
    `start_page` result pages are skipped, and every remaining page, including
    the last one, is passed to `crawl_list`.

    Args:
        keywords: iterable of search terms (plain text).
        start_page: number of result pages to skip before crawling starts.
    """
    for kw in tqdm(keywords):
        # Encode the term so characters such as '&' or '#' cannot alter the query.
        url = f'{URL}/pubmed/?term={quote_plus(kw)}'
        browser.get(url)
        max_pages = get_max_pages(kw)

        reached_start = True
        if start_page > 0:
            # if not starting from first page
            for _ in tqdm(range(start_page)):
                if not _click_next_page():
                    reached_start = False
                    break
        if not reached_start:
            continue  # fewer pages than `start_page`: nothing to crawl for this term

        for page_no in tqdm(range(start_page, max_pages)):
            crawl_list(browser.page_source, kw)
            if page_no == max_pages - 1:
                break  # last page has been crawled; there is no next page to open
            # Real pause between pages (`implicitly_wait` does not delay).
            time.sleep(1)
            # Click next button to navigate to the next page
            if not _click_next_page():
                break

In [None]:
# Page count for the last search term ('magnetic resonance').
get_max_pages(search_keywords[5])

38015

In [None]:
# Restrict this run to the last search term.
keywords = search_keywords[5:]; keywords

['magnetic resonance']

In [None]:
# start_page = 162_594 // 20; start_page

In [None]:
# Run the crawl for the selected keywords, starting from the first results page.
crawl_pubmed(keywords)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=38014.0), HTML(value='')))

Exception in thread Thread-8:
Traceback (most recent call last):
  File "/Users/markyousef/anaconda3/lib/python3.7/threading.py", line 917, in _bootstrap_inner
    self.run()
  File "/Users/markyousef/anaconda3/lib/python3.7/threading.py", line 865, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-21-a8b7c000dc57>", line 8, in crawl
    extract_and_write(u, kw)
  File "<ipython-input-20-9e0ff25acdd7>", line 3, in extract_and_write
    data = extract_page(url)
  File "<ipython-input-19-fb2f64b408a5>", line 11, in extract_page
    cit = bs.find('div', {'class': 'cit'}).get_text()
AttributeError: 'NoneType' object has no attribute 'get_text'
Exception in thread Thread-6:
Traceback (most recent call last):
  File "/Users/markyousef/anaconda3/lib/python3.7/threading.py", line 917, in _bootstrap_inner
    self.run()
  File "/Users/markyousef/anaconda3/lib/python3.7/threading.py", line 865, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-

NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//*[@title="Next page of results"]"}
  (Session info: headless chrome=80.0.3987.122)


## TODO:

* crawl papers for mentions of g.tect products