In [1]:
# Import required modules
import pandas as pd
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
import itertools
from IPython.display import clear_output
from datetime import datetime
# Day_month stamp (e.g. "05_Mar") used to tag the exported CSV file name
today = f"{datetime.today():%d_%b}"

In [2]:
def scrapeIndvidualStaffLinkAndDept(url, timeout=30):
    """Scrape staff profile links and departments from a directory page.

    Parameters
    ----------
    url : str
        Address of the directory page to download.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed to ``requests.get``.
        Defaults to 30 so a stalled connection cannot hang the notebook.

    Returns
    -------
    pandas.DataFrame
        Columns ``staffLink`` (profile URL resolved against the site root)
        and ``dept`` (stripped department text), paired by their order of
        appearance on the page.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    ValueError
        If the page yields a different number of links and departments, in
        which case they cannot be paired row by row.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import urljoin

    baseUrl = "https://www.cubsucc.com/"
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"}

    r = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly instead of parsing an error page into an empty table.
    r.raise_for_status()
    s = BeautifulSoup(r.text, "lxml")

    # Individual staff links; urljoin copes with "x/y", "/x/y" and absolute hrefs.
    staffLink = [
        urljoin(baseUrl, lnk.find("a").get("href"))
        for lnk in s.find_all("span", class_="name")
    ]

    # Departments, collected in page order.
    dept = [dpt.text.strip() for dpt in s.find_all("span", class_="department")]

    if len(staffLink) != len(dept):
        raise ValueError(
            f"Found {len(staffLink)} staff links but {len(dept)} departments "
            f"on {url}; cannot pair them."
        )

    return pd.DataFrame({
        "staffLink": staffLink,
        "dept": dept
    })

def scrapeStaffInfo(url, timeout=30):
    """Scrape the name and profile text from one staff profile page.

    Parameters
    ----------
    url : str
        Address of the staff profile page.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed to ``requests.get``.
        Defaults to 30.

    Returns
    -------
    pandas.DataFrame
        A single row with columns ``name``, ``staffLink`` (the ``url``
        argument) and ``details``. ``name`` / ``details`` are ``"na"`` when
        the corresponding element is not present in the page.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"}
    # Send the browser-like User-Agent (it was built but never passed before)
    # and bound the wait so one stalled page cannot block a worker forever.
    r = requests.get(url, headers=headers, timeout=timeout)
    s = BeautifulSoup(r.text, "lxml")

    # Name: find() returns None when the element is missing, so .text raises
    # AttributeError; only that case falls back to "na".
    try:
        name = s.find(id="pageTitle").text.strip()
    except AttributeError:
        name = "na"

    # Details
    try:
        details = s.find("div", class_="profileInfoText").text.strip()
    except AttributeError:
        details = "na"

    # See the progress: show only the most recently scraped URL
    print(url)
    clear_output(wait=True)

    # Create a one-row df from the scraped values
    return pd.DataFrame({
        "name": [name],
        "staffLink": [url],
        "details": [details]
    })

In [3]:
# Wrap all the functions inside main
def main(url):
    """Scrape a directory page and every staff profile it links to.

    Parameters
    ----------
    url : str
        Address of the directory page listing staff links and departments.

    Returns
    -------
    pandas.DataFrame
        Columns ``name``, ``staffLink``, ``details`` and ``dept``; one row
        per (staff link, department) pair found on the directory page.
        Empty (with the same columns) when the page lists no staff links.
    """
    # This returns a df of staff link and dept.
    staffLinkAndDept = scrapeIndvidualStaffLinkAndDept(url)

    # Scrape each profile once, keeping first-seen order. A link listed under
    # several departments would otherwise be downloaded repeatedly and make
    # the merge below many-to-many, multiplying its rows.
    uniqueLinks = list(dict.fromkeys(staffLinkAndDept.staffLink))
    if not uniqueLinks:
        # pd.concat refuses an empty list; return an empty, well-formed frame.
        return pd.DataFrame(columns=["name", "staffLink", "details", "dept"])

    # Scrape staff info concurrently (the work is network-bound)
    with ThreadPoolExecutor() as executor:
        df = pd.concat(
            list(executor.map(scrapeStaffInfo, uniqueLinks)),
            ignore_index=True,
        )

    # Insert dept by merging the two dfs on staff links
    mergedDept = pd.merge(df, staffLinkAndDept, on="staffLink", how="left")

    return mergedDept

In [None]:
%%time
# The %%time cell magic above must stay on the first line; it reports how long this cell takes.
# Call the main function: scrapes the departments directory page and every staff profile linked from it.
masterDf = main("https://www.cubsucc.com/faculty-directory/departments/")

# Let's see what we have: a bare last expression is rendered as the cell output (first 10 rows).
masterDf.head(10)

https://www.cubsucc.com/faculty-directory/ms-claire-fennell/


In [None]:
# Create a csv file named with today's day_month stamp; index=False (the
# documented boolean form) leaves the row index out of the file.
masterDf.to_csv(f"corkUniversityStaffs_{today}.csv", index=False)