In [11]:
%load_ext autoreload
%autoreload 2
from bs4 import BeautifulSoup
import urllib3
import re
from urllib.request import Request
from urllib.parse import quote, unquote
from typing import Union, List
from time import sleep

USER_AGENT='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14'
HEADERS = {'User-Agent': USER_AGENT,'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'}
SLEEP_TIME_BETWEEN_SEARCHES = 1.5
SCHOLAR_URL = "https://scholar.google.com/scholar?" \
    + "hl=en" \
    + "&q={query}"

def encode_url(url, params_dict):
    for k, v in params_dict.items():
        params_dict[k] = quote(str(v))
    return url.format(**params_dict)

class GoogleScholarParser:
    
    def __init__(self, base_url=SCHOLAR_URL):
        self.base_url = base_url
        self.http = urllib3.PoolManager()
        self.soup = None
        
    def get_gs_paper_titles(self, page=None):
        if page is None:
            page = self.soup
        if page is None:
            return None
        paper_titles_h3 = page.find_all(lambda tag: tag.name == "h3" and
                                        tag.has_key("class") and "gs_rt" in tag["class"])
        paper_titles = []
        for h3_element in paper_titles_h3:
            paper_title_a = h3_element.find(lambda tag: tag.name == "a")
            paper_title = paper_title_a.text
            paper_titles.append(paper_title)
        return paper_titles

    def get_number_of_citations(self, paper_title: str) -> Union[int, None]:
        sleep(SLEEP_TIME_BETWEEN_SEARCHES)
        search_url = encode_url(self.base_url, {"query": paper_title})
        page = self.http.request('GET', search_url, headers=HEADERS)
        self.soup = BeautifulSoup(page.data, 'html.parser')
        print(self.soup)
        tag = self.soup.find_all(lambda tag: tag.name == "a" and "Cited by" in tag.text)[0]
        cited_regex_string = "Cited by (\d+)"
        regex_match = re.search(cited_regex_string, tag.text)
        if regex_match:
            return int(regex_match.group(1))
    
    def get_citation_urls(self, paper_title: str):
        try:
            search_url = encode_url(self.base_url, {"query": paper_title})
            page = self.http.request('GET', search_url)
            self.soup = BeautifulSoup(page.data, 'html.parser')
            tag = self.soup.find_all(lambda tag: tag.name == "a" and "Cited by" in tag.text)[0]
            cites_page = self.http.request('GET', "https://scholar.google.com"+tag["href"])
            cites_soup = BeautifulSoup(cites_page.data, 'html.parser')
            print(self.get_gs_paper_titles(cites_soup))
        except Exception as e:
            print("ERROR: " + repr(e))
            
class NeuripsTitleExtractor:
    def __init__(self, year):
        if year < 1987:
            raise ValueError("The first NeurIPS happened in 1987.")
        self.page_url = "https://papers.nips.cc/book/advances-in-neural-information-processing-systems-32-{year}"
        self.page_url = encode_url(self.page_url, {"year": year})
        self.http = urllib3.PoolManager()
        self.page = self.http.request("GET", self.page_url)
        self.soup = BeautifulSoup(self.page.data, 'html.parser')
    
    def get_titles(self):
        titles = []
        list_tags = self.soup.find_all(lambda tag: tag.name == "li")
        for tags in list_tags:
            children = list(tags.children)
            if len(children) > 1:
                if children[0].text is not None:
                    titles.append(children[0].text)
        return titles

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
titles = NeuripsTitleExtractor(2015).get_titles()
gsp = GoogleScholarParser()

for title in titles:
    print(title)
    print(gsp.get_number_of_citations(title))


Double or Nothing: Multiplicative Incentive Mechanisms for Crowdsourcing
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">

<html>
<head><meta content="text/html; charset=utf-8" http-equiv="content-type"/><meta content="initial-scale=1" name="viewport"/><title>https://scholar.google.com/scholar?hl=en&amp;q=Double%20or%20Nothing%3A%20Multiplicative%20Incentive%20Mechanisms%20for%20Crowdsourcing</title></head>
<body onload="e=document.getElementById('captcha');if(e){e.focus();}" style="font-family: arial, sans-serif; background-color: #fff; color: #000; padding:20px; font-size:18px;">
<div style="max-width:400px;">
<hr noshade="" size="1" style="color:#ccc; background-color:#ccc;"/><br/>
<form action="index" id="captcha-form" method="post">
<script async="" defer="" src="https://www.google.com/recaptcha/api.js"></script>
<script>var submitCallback = function(response) {document.getElementById('captcha-form').submit();};</script>
<div class="g-recaptcha" data-callback="submi

IndexError: list index out of range