In [None]:
# default_exp core

# PubMed scraper

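> Scrape PubMed search results for article metadata (title, authors, affiliations, keywords, citation) and store it in MongoDB.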

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
# export
from selenium import webdriver
from bs4 import BeautifulSoup
import requests
import re
import random
from pymongo import MongoClient
import pandas as pd
import multiprocessing as mp

In [None]:
# Base URL of the NCBI site; search links and article links are built by
# appending a path to it.
URL = 'https://www.ncbi.nlm.nih.gov'

In [None]:
# Pool of browser User-Agent strings; one is picked at random for each HTTP
# request. Every entry is unique so that each agent is equally likely to be
# chosen.
user_agent = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
        ]

### WebDriver

Start WebDriver

* [Install ChromeDriver](https://chromedriver.chromium.org/)


In [None]:
# Start the Chrome WebDriver session shared by the rest of the notebook, using
# the chromedriver binary in the current working directory.
# NOTE(review): recent Selenium 4 releases no longer accept the driver path as
# a positional argument — confirm the pinned Selenium version.
browser = webdriver.Chrome('./chromedriver')

### Database

Create a MongoDB container from the official [image](https://hub.docker.com/_/mongo) and expose it on `localhost:32768`.

In [None]:
# Connection URI of the MongoDB instance (the container port published on localhost).
MONGO_HOST = 'mongodb://localhost:32768'

In [None]:
# Client for the MongoDB server at MONGO_HOST; reused for all database access below.
client = MongoClient(MONGO_HOST)

In [None]:
# create or load the PubMed database; scraped article records are stored in
# its `pubmed_meta` collection
db = client.PubMed

### Utils

In [None]:
def get_max_pages(keywords):
    """Return the number of result pages PubMed reports for a search.

    Loads the search page for `keywords` in the shared `browser` session and
    reads the `last` attribute of the `<input id="pageno2">` pager element.
    The browser is left on the first results page.

    Parameters
    ----------
    keywords : str
        Raw (not yet URL-encoded) search terms.

    Returns
    -------
    int
        Value of the pager's `last` attribute.

    Raises
    ------
    AttributeError
        If the page contains no `<input id="pageno2">` element.
    """
    import time
    from urllib.parse import quote_plus

    # build search link; encode the terms so spaces and reserved characters
    # such as '&' or '#' cannot truncate or alter the query string
    url = f'{URL}/pubmed/?term={quote_plus(keywords)}'
    browser.get(url)
    # Pause to avoid hammering the server. `implicitly_wait` only sets the
    # element-lookup timeout and does not delay anything, so sleep explicitly.
    time.sleep(1)

    s = BeautifulSoup(browser.page_source, 'lxml')
    max_pages = int(s.find('input', {'id': 'pageno2'}).get('last'))
    return max_pages

In [None]:
# Free-text search terms submitted to PubMed.
search_keywords = 'electroencephalogram spectroscopy'

In [None]:
# Run the search once and display how many result pages it produced.
max_pages = get_max_pages(search_keywords)
max_pages

197

In [None]:
def request_headers(url):
    """Build the HTTP headers for a request to NCBI.

    A User-Agent is drawn at random from `user_agent` on every call and `url`
    is sent back as the Referer.
    """
    headers = {}
    headers['User-Agent'] = random.choice(user_agent)
    headers['Referer'] = url
    headers['Connection'] = 'keep-alive'
    headers['Host'] = 'www.ncbi.nlm.nih.gov'
    return headers

In [None]:
def write_db(data, collection=None):
    """Insert an article record unless one with the same URL is already stored.

    Parameters
    ----------
    data : dict
        Article record; its 'url' value is used as the uniqueness key.
    collection : optional
        Object exposing `find_one` and `insert_one` (e.g. a PyMongo
        collection). Defaults to `db.pubmed_meta`.

    Returns
    -------
    None
    """
    if collection is None:
        collection = db.pubmed_meta

    # check if already exists. `find(...).limit(1)` returns a Cursor, which is
    # truthy even when it matches nothing, so every record used to be reported
    # as a duplicate and nothing was ever inserted. `find_one` returns None
    # when there is no match.
    if collection.find_one({'url': data['url']}) is not None:
        print(data['url'], 'already exists')
    else:
        collection.insert_one(data)

In [None]:
def extract_page(url):
    """Download one PubMed article page and extract its metadata.

    Parameters
    ----------
    url : str
        Absolute URL of the article page.

    Returns
    -------
    dict
        Keys `title`, `authors`, `citation` (str), `organizations`,
        `keywords` (list of str) and `url`. A section that is absent from the
        page yields an empty string or empty list instead of raising.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    import time

    # Throttle requests. `browser.implicitly_wait` only configures Selenium's
    # element-lookup timeout and never paused anything, so sleep explicitly.
    time.sleep(random.randint(2,3))
    # A timeout keeps one stalled connection from hanging the whole run.
    html = requests.get(url, headers=request_headers(url), timeout=30)
    # Do not parse (and later store) an error page as if it were an article.
    html.raise_for_status()
    bs = BeautifulSoup(html.text, 'lxml')

    def text_of(node):
        # `find` returns None when the element is missing.
        return node.get_text() if node is not None else ''

    title = text_of(bs.find('title'))
    authors = text_of(bs.find('div', {'class': 'auths'}))

    # Not every article lists affiliations; a missing <dl> used to raise
    # "'NoneType' object has no attribute 'find_all'".
    org_list = bs.find('dl', {'class': 'ui-ncbi-toggler-slave'})
    orgs = [o.get_text() for o in org_list.find_all('dd')] if org_list is not None else []

    # The keywords block is optional as well.
    kw_div = bs.find('div', {'class': 'keywords'})
    kw_par = kw_div.find('p') if kw_div is not None else None
    # NOTE(review): split() breaks on whitespace, so a multi-word keyword
    # becomes several entries — confirm this is the intended shape.
    kwords = kw_par.get_text().split() if kw_par is not None else []

    cit = text_of(bs.find('div', {'class': 'cit'}))
    # TODO: derive the publication date from `cit`, e.g.
    #     re.findall(r'\d{4}\s\w{3}\s\d{2}', cit)[0]
    return dict(title=title, authors=authors, organizations=orgs,
                keywords=kwords, citation=cit, url=url)

In [None]:
def extract_and_write(url):
    """Scrape the article at `url` and hand the record to `write_db`.

    Returns whatever `write_db` returns.
    """
    return write_db(extract_page(url))

### TODO: this should be multi-threaded, not multi-process

In [None]:
def process_many(urls):
    """Scrape and store many article pages concurrently.

    Each URL is passed to `extract_and_write` on a pool of worker threads.
    Threads are used rather than processes because the work is network-bound
    and the workers share the module-level MongoDB client.

    Parameters
    ----------
    urls : iterable of str
        Article page URLs.

    Returns
    -------
    None
        A failure for one URL is printed and does not stop the others.
    """
    from multiprocessing.pool import ThreadPool

    with ThreadPool() as pool:
        # The original submitted the undefined name `extract`, which raised
        # NameError before any work was scheduled.
        pending = [(u, pool.apply_async(extract_and_write, args=(u,)))
                   for u in urls]
        for u, result in pending:
            try:
                # `get` waits for the task and re-raises any worker
                # exception; without it, failures are silently discarded.
                result.get()
            except Exception as err:
                print(u, 'failed:', err)

## Extract Information

In [None]:
# Parse the page currently loaded in the browser — presumably the first search
# results page left open by get_max_pages; re-run after navigating elsewhere.
bs = BeautifulSoup(browser.page_source, 'lxml')

In [None]:
# Collect every <div class="rslt"> element; each one is treated as a single
# search result when the article links are extracted.
divs = bs.find_all('div', {'class': 'rslt'})

Get author, title, journal and other metadata

In [None]:
# Absolute link of each paper: the href of the first anchor in every result
# block, prefixed with the site base URL.
urls = [URL + d.find('a').get('href') for d in divs]

In [None]:
# process_many(urls)

In [None]:
# Scrape and store every article in turn. A plain loop is used because only
# the side effects matter; a list comprehension would build and display a
# throwaway list of None values.
for u in urls:
    extract_and_write(u)

https://www.ncbi.nlm.nih.gov/pubmed/32047606 already exists
https://www.ncbi.nlm.nih.gov/pubmed/32045572 already exists
https://www.ncbi.nlm.nih.gov/pubmed/32045022 already exists
https://www.ncbi.nlm.nih.gov/pubmed/32043133 already exists
https://www.ncbi.nlm.nih.gov/pubmed/32041316 already exists
https://www.ncbi.nlm.nih.gov/pubmed/32039117 already exists
https://www.ncbi.nlm.nih.gov/pubmed/32038138 already exists
https://www.ncbi.nlm.nih.gov/pubmed/32033231 already exists


AttributeError: 'NoneType' object has no attribute 'find_all'

In [None]:
# Set the driver's implicit-wait timeout to 1 second.
# NOTE(review): this configures how long element lookups may wait; it does not
# pause execution. Use time.sleep if a delay between requests was intended.
browser.implicitly_wait(1)

In [None]:
#     # Click the "Next" button to navigate to the next page of results
#     browser.find_element_by_xpath('//*[@title="Next page of results"]').click()