In [1]:
import os
import shutil
import sys
import urllib.request
from datetime import datetime
from random import randint
from time import sleep
from urllib.error import HTTPError
from urllib.error import URLError

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# Base address of the proceedings site; hrefs scraped from its pages are appended to it
website_url = "https://papers.nips.cc"
# Landing page of the volume 32 (2019) proceedings, where the scrape starts
starting_url = website_url + "/book/advances-in-neural-information-processing-systems-32-2019"

In [3]:
# Download the proceedings landing page and parse it into `webpage`.
# A browser-like User-Agent is sent with the request.
uastring = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"
req = urllib.request.Request(
    starting_url,
    data=None,
    headers={'User-Agent': uastring}
)

try:
    # The context manager closes the HTTP response as soon as the body is read.
    with urllib.request.urlopen(req) as session:
        page_html = session.read()
except HTTPError as e:
    print('The server could not serve up the page!')
    print(e)
    sys.exit(1)
except URLError as e:
    print('The server could not be reached!')
    print(e)
    sys.exit(1)

webpage = BeautifulSoup(page_html, 'html5lib')

# Tag lookups return None (they do not raise) when the element is absent, so
# the missing-title case is tested explicitly instead of via AttributeError.
main_title = webpage.body.h2 if webpage.body is not None else None
if main_title is None:
    print('Page title could not be found - Might indicate problem!')
    sys.exit(1)
else:
    # get_text() always yields a str; .string is None when the heading
    # contains nested markup, which would break the concatenation.
    print('Successfully accessed the web page: ' + main_title.get_text())

Successfully accessed the web page: Advances in Neural Information Processing Systems 32 (NIPS 2019) pre-proceedings


In [4]:
# Visit every paper linked from the landing page and collect one row per paper
# (title, authors, links, abstract, event type) into the dataframe `df`.

# The first <li> on the landing page is discarded and every remaining <li> is
# treated as one paper entry. Slicing (rather than pop(0)) also tolerates an
# empty result.
collection = webpage.find_all('li')[1:]

record_columns = ['title', 'authors', 'doc_link', 'abstract', 'pdf_link', 'supplemental_link', 'event_type']
not_found = "[Not Found]"

# Rows are accumulated in a plain list and converted to a dataframe once;
# assigning df.loc[i] inside the loop enlarges the frame on every iteration.
records = []

try:
    for item in collection:
        if item.a is None:
            # No link to follow, so there is no paper page to scrape for this entry.
            continue

        # Fields that come from the paper's own page keep this placeholder
        # whenever the corresponding element is missing there.
        abstract = not_found
        pdf_link = not_found
        supplemental_link = not_found
        event_type = not_found

        # get_text() returns plain str objects. The NavigableString returned by
        # .string keeps a reference to its whole parse tree, and is None when
        # the tag has nested markup.
        doc_title = item.a.get_text()
        authors = ','.join(
            each_author.get_text()
            for each_author in item.find_all('a', {'class': 'author'})
        )
        doc_link = website_url + item.a['href']

        # Adding random wait time so we do not hammer the website needlessly
        waitTime = randint(3,8)
        sleep(waitTime)
        print("Waited " + str(waitTime) + " seconds to retrieve the next URL.")
        req = urllib.request.Request(
            doc_link,
            data=None,
            headers={'User-Agent': uastring}
        )

        try:
            # Close each response after reading so connections are not left open.
            with urllib.request.urlopen(req) as session:
                doc_html = session.read()
        except HTTPError as e:
            print('The server could not serve up the page!')
            print(e)
            sys.exit(1)
        except URLError as e:
            print('The server could not be reached!')
            print(e)
            sys.exit(1)

        docpage = BeautifulSoup(doc_html, 'html5lib')
        if docpage.body is None:
            print('Page title could not be found - Might indicate problem!')
            sys.exit(1)

        # Look the content container up once; find() returns None when it is
        # absent, in which case the link/event fields stay at the placeholder.
        main_section = docpage.find('div', class_="main wrapper clearfix")
        if main_section is not None:
            for artifact_item in main_section.find_all('a'):
                if artifact_item.string == "[PDF]":
                    pdf_link = website_url + artifact_item['href']
                elif artifact_item.string == "[Supplemental]":
                    supplemental_link = website_url + artifact_item['href']

            for header in main_section.find_all('h3'):
                if "Conference Event Type" in header.text:
                    # Keep the text between the first and second colon,
                    # lower-cased; a header without a colon is ignored.
                    parts = header.text.split(":")
                    if len(parts) > 1:
                        event_type = parts[1].lower()

        abstract_tag = docpage.find('p', class_="abstract")
        if abstract_tag is not None:
            abstract = abstract_tag.get_text()

        records.append([doc_title, authors, doc_link, abstract, pdf_link, supplemental_link, event_type])
finally:
    # Built in `finally` so the rows gathered so far are still available in
    # `df` when the run is aborted part-way through.
    df = pd.DataFrame(records, columns=record_columns)

Waited 3 seconds to retrieve the next URL.
Waited 3 seconds to retrieve the next URL.
Waited 5 seconds to retrieve the next URL.
Waited 3 seconds to retrieve the next URL.
Waited 4 seconds to retrieve the next URL.
Waited 3 seconds to retrieve the next URL.
Waited 7 seconds to retrieve the next URL.
Waited 5 seconds to retrieve the next URL.
Waited 6 seconds to retrieve the next URL.
Waited 6 seconds to retrieve the next URL.
Waited 7 seconds to retrieve the next URL.
Waited 3 seconds to retrieve the next URL.
Waited 7 seconds to retrieve the next URL.
Waited 5 seconds to retrieve the next URL.
Waited 4 seconds to retrieve the next URL.
Waited 8 seconds to retrieve the next URL.
Waited 7 seconds to retrieve the next URL.
Waited 4 seconds to retrieve the next URL.
Waited 3 seconds to retrieve the next URL.
Waited 3 seconds to retrieve the next URL.
Waited 8 seconds to retrieve the next URL.
Waited 3 seconds to retrieve the next URL.
Waited 4 seconds to retrieve the next URL.
Waited 3 se

Waited 8 seconds to retrieve the next URL.
Waited 3 seconds to retrieve the next URL.
Waited 3 seconds to retrieve the next URL.
Waited 5 seconds to retrieve the next URL.
Waited 7 seconds to retrieve the next URL.
Waited 6 seconds to retrieve the next URL.
Waited 3 seconds to retrieve the next URL.
Waited 3 seconds to retrieve the next URL.
Waited 4 seconds to retrieve the next URL.
Waited 5 seconds to retrieve the next URL.
Waited 4 seconds to retrieve the next URL.
Waited 3 seconds to retrieve the next URL.
Waited 8 seconds to retrieve the next URL.
Waited 4 seconds to retrieve the next URL.
Waited 3 seconds to retrieve the next URL.
Waited 4 seconds to retrieve the next URL.
Waited 8 seconds to retrieve the next URL.
Waited 5 seconds to retrieve the next URL.
Waited 8 seconds to retrieve the next URL.
Waited 5 seconds to retrieve the next URL.
Waited 6 seconds to retrieve the next URL.
Waited 5 seconds to retrieve the next URL.
Waited 7 seconds to retrieve the next URL.
Waited 6 se

Waited 6 seconds to retrieve the next URL.
Waited 4 seconds to retrieve the next URL.
Waited 5 seconds to retrieve the next URL.
Waited 3 seconds to retrieve the next URL.
Waited 7 seconds to retrieve the next URL.
Waited 7 seconds to retrieve the next URL.
Waited 6 seconds to retrieve the next URL.
Waited 3 seconds to retrieve the next URL.
Waited 4 seconds to retrieve the next URL.
Waited 3 seconds to retrieve the next URL.
Waited 5 seconds to retrieve the next URL.
Waited 8 seconds to retrieve the next URL.
Waited 5 seconds to retrieve the next URL.
Waited 4 seconds to retrieve the next URL.
Waited 8 seconds to retrieve the next URL.
Waited 6 seconds to retrieve the next URL.
Waited 8 seconds to retrieve the next URL.
Waited 7 seconds to retrieve the next URL.
Waited 8 seconds to retrieve the next URL.
Waited 4 seconds to retrieve the next URL.
Waited 6 seconds to retrieve the next URL.
Waited 3 seconds to retrieve the next URL.
Waited 3 seconds to retrieve the next URL.
Waited 5 se

Waited 7 seconds to retrieve the next URL.
Waited 7 seconds to retrieve the next URL.
Waited 7 seconds to retrieve the next URL.
Waited 8 seconds to retrieve the next URL.
Waited 5 seconds to retrieve the next URL.
Waited 8 seconds to retrieve the next URL.
Waited 7 seconds to retrieve the next URL.
Waited 3 seconds to retrieve the next URL.
Waited 5 seconds to retrieve the next URL.
Waited 3 seconds to retrieve the next URL.
Waited 6 seconds to retrieve the next URL.
Waited 7 seconds to retrieve the next URL.
Waited 3 seconds to retrieve the next URL.
Waited 7 seconds to retrieve the next URL.
Waited 5 seconds to retrieve the next URL.
Waited 3 seconds to retrieve the next URL.
Waited 6 seconds to retrieve the next URL.
Waited 5 seconds to retrieve the next URL.
Waited 6 seconds to retrieve the next URL.
Waited 6 seconds to retrieve the next URL.
Waited 7 seconds to retrieve the next URL.
Waited 5 seconds to retrieve the next URL.
Waited 4 seconds to retrieve the next URL.
Waited 8 se

Waited 8 seconds to retrieve the next URL.
Waited 7 seconds to retrieve the next URL.
Waited 5 seconds to retrieve the next URL.
Waited 7 seconds to retrieve the next URL.
Waited 6 seconds to retrieve the next URL.
Waited 7 seconds to retrieve the next URL.
Waited 5 seconds to retrieve the next URL.
Waited 8 seconds to retrieve the next URL.
Waited 3 seconds to retrieve the next URL.
Waited 7 seconds to retrieve the next URL.
Waited 8 seconds to retrieve the next URL.
Waited 8 seconds to retrieve the next URL.
Waited 3 seconds to retrieve the next URL.
Waited 8 seconds to retrieve the next URL.
Waited 6 seconds to retrieve the next URL.
Waited 6 seconds to retrieve the next URL.
Waited 6 seconds to retrieve the next URL.
Waited 7 seconds to retrieve the next URL.
Waited 6 seconds to retrieve the next URL.
Waited 4 seconds to retrieve the next URL.
Waited 8 seconds to retrieve the next URL.
Waited 3 seconds to retrieve the next URL.
Waited 8 seconds to retrieve the next URL.
Waited 8 se

Waited 6 seconds to retrieve the next URL.
Waited 7 seconds to retrieve the next URL.
Waited 4 seconds to retrieve the next URL.
Waited 3 seconds to retrieve the next URL.
Waited 7 seconds to retrieve the next URL.
Waited 3 seconds to retrieve the next URL.
Waited 3 seconds to retrieve the next URL.
Waited 7 seconds to retrieve the next URL.
Waited 6 seconds to retrieve the next URL.
Waited 5 seconds to retrieve the next URL.
Waited 3 seconds to retrieve the next URL.
Waited 5 seconds to retrieve the next URL.
Waited 3 seconds to retrieve the next URL.
Waited 3 seconds to retrieve the next URL.
Waited 5 seconds to retrieve the next URL.
Waited 4 seconds to retrieve the next URL.
Waited 6 seconds to retrieve the next URL.
Waited 7 seconds to retrieve the next URL.
Waited 4 seconds to retrieve the next URL.
Waited 6 seconds to retrieve the next URL.
Waited 7 seconds to retrieve the next URL.
Waited 6 seconds to retrieve the next URL.
Waited 8 seconds to retrieve the next URL.
Waited 4 se

Waited 3 seconds to retrieve the next URL.
Waited 7 seconds to retrieve the next URL.
Waited 5 seconds to retrieve the next URL.
Waited 3 seconds to retrieve the next URL.
Waited 4 seconds to retrieve the next URL.
Waited 4 seconds to retrieve the next URL.
Waited 6 seconds to retrieve the next URL.
Waited 5 seconds to retrieve the next URL.
Waited 7 seconds to retrieve the next URL.
Waited 7 seconds to retrieve the next URL.
Waited 4 seconds to retrieve the next URL.
Waited 5 seconds to retrieve the next URL.
Waited 3 seconds to retrieve the next URL.
Waited 3 seconds to retrieve the next URL.
Waited 8 seconds to retrieve the next URL.
Waited 4 seconds to retrieve the next URL.
Waited 7 seconds to retrieve the next URL.
Waited 4 seconds to retrieve the next URL.
Waited 7 seconds to retrieve the next URL.
Waited 3 seconds to retrieve the next URL.
Waited 3 seconds to retrieve the next URL.
Waited 3 seconds to retrieve the next URL.
Waited 3 seconds to retrieve the next URL.
Waited 6 se

Waited 3 seconds to retrieve the next URL.
Waited 5 seconds to retrieve the next URL.
Waited 5 seconds to retrieve the next URL.
Waited 7 seconds to retrieve the next URL.
Waited 8 seconds to retrieve the next URL.
Waited 4 seconds to retrieve the next URL.
Waited 8 seconds to retrieve the next URL.
Waited 6 seconds to retrieve the next URL.
Waited 3 seconds to retrieve the next URL.
Waited 5 seconds to retrieve the next URL.
Waited 3 seconds to retrieve the next URL.
Waited 3 seconds to retrieve the next URL.
Waited 5 seconds to retrieve the next URL.
Waited 8 seconds to retrieve the next URL.
Waited 8 seconds to retrieve the next URL.
Waited 7 seconds to retrieve the next URL.
Waited 8 seconds to retrieve the next URL.
Waited 5 seconds to retrieve the next URL.
Waited 6 seconds to retrieve the next URL.
Waited 8 seconds to retrieve the next URL.
Waited 7 seconds to retrieve the next URL.
Waited 3 seconds to retrieve the next URL.
Waited 3 seconds to retrieve the next URL.
Waited 4 se

In [5]:
# Persist the scraped records as CSV in the current working directory
# (with pandas' defaults the row index is written as the first column).
df.to_csv(path_or_buf="neurips2019.csv")