In [1]:
# Import required modules
import itertools
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from IPython.display import clear_output
# Day of month and abbreviated month name joined by "_" (strftime "%d_%b"),
# captured once when this cell runs.
today = datetime.now().strftime("%d_%b")

In [2]:
def generateCoverPageLink(url, timeout=30):
    """Return the absolute link of every department directory page.

    Parameters
    ----------
    url : str
        Address of the directory cover page.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so a
        stalled connection cannot block the notebook forever.

    Returns
    -------
    list of str
        One absolute URL per anchor found inside the ``list1`` / ``list2``
        ``<div>`` blocks, in document order. Anchors without an ``href``
        attribute are skipped.

    Raises
    ------
    requests.RequestException
        If the page cannot be fetched in time or the server answers with an
        HTTP error status.
    """
    r = requests.get(url, timeout=timeout)
    # Fail loudly here: parsing an error page would just yield an empty list
    # and push the failure to a far less obvious place downstream.
    r.raise_for_status()
    s = BeautifulSoup(r.text, "lxml")

    # To store all department cover page links
    coverPageLink = []

    # Individual department links live in these two classes
    for block in s.find_all("div", class_=["list1", "list2"]):
        for anchor in block.find_all("a"):
            href = anchor.get("href")
            if not href:
                # Named anchors / placeholders carry no target to follow.
                continue
            # urljoin resolves relative, root-relative and absolute hrefs
            # against the page address instead of gluing strings together.
            coverPageLink.append(urljoin(url, href))
    return coverPageLink

def scrapeIndvidualStaffLink(url, baseUrl="http://staff.southwales.ac.uk", timeout=30):
    """Return the profile link of every staff member listed on a department page.

    Parameters
    ----------
    url : str
        Address of one department directory page.
    baseUrl : str, optional
        Site root that the anchors' ``href`` values are resolved against.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so a
        stalled connection cannot pin a worker thread forever.

    Returns
    -------
    list of str
        One absolute URL per anchor found under ``div.staff_list`` -> ``ul``,
        in document order. Anchors without an ``href`` attribute are skipped.

    Raises
    ------
    requests.RequestException
        If the page cannot be fetched in time.
    """
    r = requests.get(url, timeout=timeout)
    s = BeautifulSoup(r.text, "lxml")

    # To store individual staff links
    staffLink = []

    for block in s.find_all("div", class_="staff_list"):
        for listing in block.find_all("ul"):
            for anchor in listing.find_all("a"):
                href = anchor.get("href")
                if not href:
                    # Concatenating None would raise TypeError; nothing to follow anyway.
                    continue
                # Root-relative hrefs give the same result as plain
                # concatenation; relative or absolute ones are handled too.
                staffLink.append(urljoin(baseUrl, href))
    return staffLink

def scrapeStaffInfo(url, timeout=30):
    """Return a one-row DataFrame describing the staff member at ``url``.

    Parameters
    ----------
    url : str
        Address of an individual staff profile page.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so a
        stalled connection cannot pin a worker thread forever.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns ``profileLink``, ``name``,
        ``department``, ``designation`` and ``email``. Any field whose
        element is missing from the page is recorded as ``"na"``.

    Raises
    ------
    requests.RequestException
        If the page cannot be fetched in time.
    """
    r = requests.get(url, timeout=timeout)
    s = BeautifulSoup(r.text, "lxml")

    def _textOrNa(tag, cssClass):
        """Return the stripped text of the first matching element, or "na"."""
        node = s.find(tag, class_=cssClass)
        # A missing element is the only expected failure, so test for it
        # directly rather than hiding every exception behind a bare except.
        if node is None:
            return "na"
        return node.text.strip()

    row = {
        "profileLink": url,
        "name": _textOrNa("span", "fn"),
        "department": _textOrNa("li", "user_department"),
        "designation": _textOrNa("li", "user_job_title"),
        "email": _textOrNa("li", "user_email"),
    }

    # See the progress: show the current URL, replacing the previous one
    print(f"{url}")
    clear_output(wait=True)

    # Create a one-row dataframe with the scraped info
    return pd.DataFrame([row])

In [3]:
# Wrap all the scraping functions inside a single main() entry point
def main(url, s1=0, s2=None):
    """Scrape one slice of the staff directory into a DataFrame.

    The cover page and every department page are fetched on each call to
    rebuild the full list of profile links; only the profiles at positions
    ``s1:s2`` of that list are then scraped.

    Parameters
    ----------
    url : str
        Address of the directory cover page.
    s1 : int or None, optional
        Start index into the list of profile links (default 0).
    s2 : int or None, optional
        End index, exclusive; ``None`` (default) means "to the end".

    Returns
    -------
    pandas.DataFrame
        One row per scraped profile with a fresh 0..n-1 index. If the slice
        selects no profiles, an empty frame with the usual columns is
        returned.
    """
    # Cover page links
    coverPageLink = generateCoverPageLink(url)

    # Individual staff links: one list per department, flattened in order
    with ThreadPoolExecutor() as executor:
        perDepartment = executor.map(scrapeIndvidualStaffLink, coverPageLink)
        staffLink = list(itertools.chain.from_iterable(perDepartment))

    batch = staffLink[s1:s2]
    if not batch:
        # pd.concat raises "No objects to concatenate" on an empty list, so
        # hand back an empty frame with the columns scrapeStaffInfo produces.
        return pd.DataFrame(
            columns=["profileLink", "name", "department", "designation", "email"]
        )

    # Scrape staff info
    with ThreadPoolExecutor() as executor:
        frames = list(executor.map(scrapeStaffInfo, batch))

    return pd.concat(frames).reset_index(drop=True)

In [4]:
%%time
# We will scrape in 3 chunks
df1k = main("http://staff.southwales.ac.uk/", 0, 1000)

CPU times: user 35.7 s, sys: 4.25 s, total: 40 s
Wall time: 1min 37s


In [5]:
%%time
df2k = main("http://staff.southwales.ac.uk/", 1000, 2000)

CPU times: user 34.2 s, sys: 3.4 s, total: 37.7 s
Wall time: 1min 45s


In [8]:
%%time
df3k = main("http://staff.southwales.ac.uk/", 2000, None)

CPU times: user 41.4 s, sys: 7.36 s, total: 48.8 s
Wall time: 56.1 s


In [9]:
# Stack the three batches into one frame with a fresh 0..n-1 index
masterDf = pd.concat([df1k, df2k, df3k], ignore_index=True)

# Show the first ten rows as a sanity check
masterDf.head(10)

Unnamed: 0,profileLink,name,department,designation,email
0,http://staff.southwales.ac.uk/users/11715-zabawe/,Zulfia Abawe,Academic Registry,Agency Staff,Zulfia.Abawe@southwales.ac.uk
1,http://staff.southwales.ac.uk/users/11614-madams/,Margaret Adams,Academic Registry,Examination Invigilator,margaret.adams@southwales.ac.uk
2,http://staff.southwales.ac.uk/users/11580-kadams/,Kathryn Adams,Academic Registry,Examination Invigilator,kathryn.adams@southwales.ac.uk
3,http://staff.southwales.ac.uk/users/11790-malh...,Murad Al-Hamwi,Academic Registry,Digital Content Coordinator,murad.al-hamwi1@southwales.ac.uk
4,http://staff.southwales.ac.uk/users/5382-mandr...,Mark Andrews,Academic Registry,Exam Invigilator and Graduations Assistant,mark.andrews@southwales.ac.uk
5,http://staff.southwales.ac.uk/users/6398-matye...,Michelle Atyeo-Thomas,Academic Registry,Senior Registry Officer (SA),michelle.atyeo-thomas@southwales.ac.uk
6,http://staff.southwales.ac.uk/users/9186-lbarnes/,Lionel Barnes,Academic Registry,Exam Invigilator,lionel.barnes@southwales.ac.uk
7,http://staff.southwales.ac.uk/users/11584-kbate/,Kim Bate,Academic Registry,Examination Invigilator,kim.bate@southwales.ac.uk
8,http://staff.southwales.ac.uk/users/10717-pbay...,Patricia Bayley,Academic Registry,Exam Invigilator,patricia.bayley@southwales.ac.uk
9,http://staff.southwales.ac.uk/users/1571-gbeach1/,Gareth Beach,Academic Registry,Principal QAE Officer (FCI),gareth.beach@southwales.ac.uk


In [10]:
# Save the combined table as a date-stamped CSV file.
# `index` is documented as a bool; pass False explicitly to leave the row
# index out instead of relying on None being treated as falsy.
masterDf.to_csv(f"newSouthWalesStaffs_{today}.csv", index=False)