# Lab: Web Scraping + File I/O

##### Instructions: 

1. Go to https://polisci.wustl.edu/people/88/all OR https://polisci.wustl.edu/people/list/88/all
2. Go to the page for each of the professors.
3. Create a `.csv` file with the following information for each professor:
	- Name
	- Title
	- E-mail
	- Web page
	- Specialization  
		- If they do not have a specialization, you can leave it blank. 
		- An example from Deniz's page: https://polisci.wustl.edu/people/deniz-aksoy
		- Professor Aksoy's research is motivated by an interest in comparative political institutions and political violence. 

In [1]:
from bs4 import BeautifulSoup
import urllib
import urllib.request
import csv
import time
import re
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

# Accumulators for the scraped fields: one entry per professor, appended in
# lockstep so that index i refers to the same person in every list.
names, emails, titles, web_pages, specializations = [], [], [], [], []
url = 'https://polisci.wustl.edu/people/88/all'

# Load the listing page in Chrome via Selenium and grab the rendered HTML.
service = Service('/usr/bin/chromedriver')
driver = webdriver.Chrome(service=service)
try:
    driver.get(url)
    # Scroll to the bottom and wait before reading the page source
    # (presumably so lazily loaded entries have time to appear).
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(5)
    page_source = driver.page_source
finally:
    # Always shut the browser down, even if loading the page failed;
    # otherwise the Chrome/chromedriver processes are left running.
    driver.quit()

soup = BeautifulSoup(page_source, 'html.parser')

# Keep only hrefs that look like an individual profile page
# ("/people/<word>-<word>").
links = soup.find_all('a', href=True)
profile_pattern = re.compile(r'/people/\b[a-z]+-[a-z]+\b')
# dict.fromkeys drops repeated hrefs while preserving first-seen order, so a
# profile that is linked more than once on the page is only scraped once.
prof_links = list(dict.fromkeys(
    link['href'] for link in links if profile_pattern.search(link['href'])
))

# Visit each profile page and extract the fields that go into the CSV.
# A field that cannot be found on a page is recorded as blank instead of
# aborting the whole scrape.
for prof_link in prof_links:
    prof_url = 'https://polisci.wustl.edu' + prof_link
    # The context manager closes the HTTP response once it has been read.
    with urllib.request.urlopen(prof_url) as prof_web_page:
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        prof_soup = BeautifulSoup(prof_web_page.read(), 'html.parser')

    # Name: taken from the page's og:title <meta> tag.
    name_tag = prof_soup.find('meta', property='og:title')
    name = name_tag.get('content', '') if name_tag is not None else ''

    # Title: text of the first <div class="title">, trimmed of surrounding
    # whitespace.
    title_tag = prof_soup.find('div', class_='title')
    title = title_tag.get_text().strip() if title_tag is not None else ''

    # E-mail: first mailto: link on the page, with the scheme prefix removed.
    email_tag = prof_soup.find(
        'a', href=lambda href: href and href.startswith('mailto:'))
    email = email_tag['href'][len('mailto:'):] if email_tag is not None else ''

    # Web page: href of the first link inside <ul class="links">, if any.
    web_page_url = ""
    links_tag = prof_soup.find('ul', class_='links')
    first_item = links_tag.find('li') if links_tag is not None else None
    anchor = first_item.find('a') if first_item is not None else None
    if anchor is not None:
        web_page_url = anchor.get('href', '')

    # Specialization: one cleaned string per <li> in <ul class="interests">;
    # stays an empty list when the page has no such section.
    interests_tag = prof_soup.find('ul', class_="interests")
    interest_items = (
        interests_tag.find_all('li') if interests_tag is not None else []
    )
    specialization = [
        re.sub(r'[\n]+', '', item.get_text()).strip()
        for item in interest_items
    ]

    names.append(name)
    titles.append(title)
    emails.append(email)
    web_pages.append(web_page_url)
    specializations.append(specialization)

# Assemble the per-professor lists into a table, one row per professor.
df = pd.DataFrame({'name': names,
                   'title': titles,
                   'emails': emails,
                   'web_page': web_pages,
                   # Each entry is a list of interest strings; flatten it to
                   # plain text so the cell is readable and is blank (not
                   # "[]") when a professor has no specialization.
                   'specialization': ['; '.join(items)
                                      for items in specializations]})

# index=False: write only the requested columns, without the DataFrame's
# automatic row-number column.
df.to_csv("wustl.csv", index=False)