## Scrape NeurIPS Workshops

In [None]:
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm
import re

In [None]:
# Root URL of the NeurIPS 2019 schedule; the per-workshop and per-speaker pages
# are the same URL with a query parameter, to which the numeric ID is appended.
neurIPS_base = 'https://nips.cc/Conferences/2019/Schedule'
workshop_base = f'{neurIPS_base}?showEvent='
speaker_base = f'{neurIPS_base}?showSpeaker='

In [None]:
# Download the conference schedule. The timeout keeps a stalled connection from
# hanging the notebook forever (requests waits indefinitely by default).
schedule_page = requests.get(neurIPS_base, timeout=30)
schedule_page = BeautifulSoup(schedule_page.content, "lxml")

# Collect the workshop cards found inside the <main> element of the schedule.
workshops = schedule_page.find('main').find_all(class_='maincard narrower Workshop')

In [None]:
# One dict per workshop: id, name, link, abstract and a list of speakers.
workshop_infos = []

# Raw-string patterns, compiled once outside the loop. Written as plain
# (non-raw) literals, '\d' and '\(' are invalid string escapes that newer
# Python versions warn about.
workshop_id_pattern = re.compile(r'(\d+)')
speaker_id_pattern = re.compile(r"\('(.+)'\)")

# Without a timeout, requests waits indefinitely on a stalled connection.
request_timeout = 30  # seconds


def _text_or_empty(tag):
    """Return the text of a parsed tag, or '' when the lookup found nothing."""
    return tag.text if tag is not None else ''


for workshop in tqdm(workshops):

    # The workshop ID is the first run of digits in the parent's onclick handler
    script_parent = workshop.find_parent()['onclick']
    workshop_id = workshop_id_pattern.search(script_parent).group(1)
    workshop_name = workshop.find(class_="maincardBody").text

    # Pull the detailed information from the workshop's own page
    workshop_page = requests.get(f'{workshop_base}{workshop_id}', timeout=request_timeout)
    workshop_page = BeautifulSoup(workshop_page.content, "lxml")
    workshop_main = workshop_page.find(id='main')

    # The workshop link and the speakers both appear as buttons;
    # speaker buttons carry an onclick handler, the link button does not
    buttons = workshop_main.find_all(class_='btn btn-default')

    # Obtain the workshop link
    # Some workshops have no link button, so fall back to an empty string
    link_buttons = [button for button in buttons if not button.has_attr('onclick')]
    workshop_link = link_buttons[0].get('href', '') if link_buttons else ''

    # Obtain the workshop abstract (empty when the page has no abstract container)
    abstract_container = workshop_main.find(class_='abstractContainer')
    if abstract_container is not None:
        workshop_abstract = abstract_container.get_text("\n")
    else:
        workshop_abstract = ''

    # Obtain the workshop speakers, one request per speaker page
    speaker_infos = []
    for button in buttons:
        if not button.has_attr('onclick'):
            continue
        # The speaker ID is the quoted argument of the onclick call;
        # skip buttons whose handler does not have that shape
        id_match = speaker_id_pattern.search(button['onclick'])
        if id_match is None:
            continue

        speaker_page = requests.get(f'{speaker_base}{id_match.group(1)}', timeout=request_timeout)
        speaker_page = BeautifulSoup(speaker_page.content, "lxml")
        speaker_infos.append({
            'name': _text_or_empty(speaker_page.find('h3')),
            # Key spelling kept as-is: the Notion export cell reads 'affilation'
            'affilation': _text_or_empty(speaker_page.find('h4')),
        })

    workshop_infos.append({
        'workshop_id': workshop_id,
        'workshop_name': workshop_name,
        'workshop_link': workshop_link,
        'workshop_abstract': workshop_abstract,
        'speaker_info': speaker_infos,
    })

In [None]:
import json

# Persist the scraped workshop records as a single JSON document.
with open('workshop_info.json', 'w') as output_file:
    output_file.write(json.dumps(workshop_infos))

---

## (Optional) Organize the Data in Notion

With the help of the [Notion-Py](https://github.com/jamalex/notion-py) library, we can easily access our Notion pages and organize the information we just scraped.

In [None]:
# Placeholders to fill in before running the cells below:
# `token` is passed to NotionClient as `token_v2`, and `page_link` is the
# page handed to `client.get_block`.
token, page_link = '<Your Token>', 'Your Page'

In [None]:
from notion.client import NotionClient
from notion.block import *

In [None]:
# Obtain the `token_v2` value by inspecting your browser cookies on a logged-in session on Notion.so
client = NotionClient(token_v2=token)

# Replace this URL with the URL of the page you want to edit
page = client.get_block(page_link)

In [None]:
# Index of the first workshop to export. Leave at 0 to export everything; raise
# it only to resume an export that was interrupted part-way. (A hard-coded
# offset of 34 here silently skipped the first 34 workshops.)
start_index = 0

for item in tqdm(workshop_infos[start_index:]):
    # Use a markdown link as the toggle title when the workshop has a link
    if item['workshop_link']:
        title = f"[{item['workshop_name']}]({item['workshop_link']})"
    else:
        title = f"{item['workshop_name']}"

    # One toggle per workshop, containing the link, the speakers and the abstract
    block = page.children.add_new(ToggleBlock, title=title)
    block.children.add_new(TextBlock, title=f"Link: {item['workshop_link']}", color='gray')

    # Nested toggle listing every speaker as "**name** - affiliation"
    speaker_block = block.children.add_new(ToggleBlock, title='Speakers', color='gray')
    for speaker in item['speaker_info']:
        # 'affilation' is the key spelling used when the records were scraped
        speaker_title = f"**{speaker['name']}** - {speaker['affilation']}"
        speaker_block.children.add_new(TextBlock, title=speaker_title, color='gray')

    block.children.add_new(TextBlock, title=item['workshop_abstract'])