In [1]:
# Import required modules
import requests
from IPython.display import clear_output
from bs4 import BeautifulSoup
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
import itertools
import re
from datetime import datetime
# Day-of-month and abbreviated month name joined by "_", computed once at import time
today = datetime.today().strftime("%d_%b")

In [2]:
def generateCoverPageLinkAndDept(url, timeout=30):
    """Return each department's name and the link to its staff directory.

    Fetches ``url``, locates the element with id ``content_163819`` and
    collects every ``<a>`` found inside its ``<li>`` items.

    Parameters
    ----------
    url : str
        Page that lists the departments.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up
        (default 30). Without a timeout a stalled connection blocks forever.

    Returns
    -------
    pandas.DataFrame
        One row per anchor with the columns ``coverPageLink``
        (``"https://www.gla.ac.uk" + href + "staff"``) and ``dept``
        (the stripped anchor text).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page contains no element with id ``content_163819``.
    """
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    s = BeautifulSoup(r.text, "lxml")

    listing = s.find(id="content_163819")
    if listing is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError(f"No element with id 'content_163819' found at {url}")

    records = []
    for item in listing.find_all("li"):
        for anchor in item.find_all("a"):
            href = anchor.get("href")
            if not href:
                # An anchor without a target cannot be turned into a link.
                continue
            records.append({
                # Make the link absolute and navigate to the staff directory.
                "coverPageLink": "https://www.gla.ac.uk" + href + "staff",
                "dept": anchor.text.strip(),
            })

    # Passing the column names keeps the schema stable even with zero rows.
    return pd.DataFrame(records, columns=["coverPageLink", "dept"])

def scrapeIndividualStaffLink(url, timeout=30):
    """Collect the absolute links of the staff pages listed on a directory page.

    Parameters
    ----------
    url : str
        Staff directory page of one department.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up
        (default 30).

    Returns
    -------
    list of str
        ``"https://www.gla.ac.uk" + href`` for every matching anchor, in
        document order. Elements without an anchor or without an ``href``
        are skipped.
    """
    r = requests.get(url, timeout=timeout)
    s = BeautifulSoup(r.text, "lxml")

    # Everything after the host part of ``url`` (for "https://host/a/b/c"
    # this is "a/b/c"). It does not change inside the loop, so build it once.
    directoryPath = "/".join(url.split("/")[3:])

    staffLink = []
    # All the links reside in elements carrying a non-empty "data-location".
    for tag in s.find_all(lambda tag: tag.get("data-location")):
        anchor = tag.find("a")
        if anchor is None:
            continue
        tempLink = anchor.get("href")  # May still be an unrelated link
        # Keep only links that live under this directory's own path.
        if tempLink and directoryPath in tempLink:
            staffLink.append("https://www.gla.ac.uk" + tempLink)  # Make it absolute
    return staffLink

def scrapeStaffInfo(url, timeout=30):
    """Scrape one staff profile page into a single-row DataFrame.

    Parameters
    ----------
    url : str
        Profile page of one staff member.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up
        (default 30).

    Returns
    -------
    pandas.DataFrame
        One row with the columns ``name``, ``link``, ``desigAndDept`` and
        ``phoneAndEmail``. A field whose element is missing from the page
        is filled with the string ``"na"``.
    """

    def _textOrNa(locate):
        """Return the stripped text of the element ``locate()`` finds, or "na"."""
        try:
            return locate().text.strip()
        except AttributeError:
            # A missing element shows up as None somewhere along the lookup
            # chain. Only that case is swallowed, so KeyboardInterrupt and
            # genuine errors still propagate.
            return "na"

    r = requests.get(url, timeout=timeout)
    s = BeautifulSoup(r.text, "lxml")
    # Main container
    cont = s.find(class_="maincontent sp_content columns content eight large-8 medium-8")

    record = {
        "name": _textOrNa(lambda: cont.find("h1")),
        "link": url,
        # Designation and department share one list item
        "desigAndDept": _textOrNa(lambda: cont.find(class_="sp_appointments").li),
        # Phone number and e-mail share one paragraph
        "phoneAndEmail": _textOrNa(lambda: cont.find(id="sp_contactInfo").p),
    }

    # See the progress: show the current URL, replaced by the next one
    print(f"{url}")
    clear_output(wait=True)

    return pd.DataFrame([record])

def cleanAndEngineerFeatures(df, df2):
    """Derive clean columns from the scraped staff rows and attach the department.

    Parameters
    ----------
    df : pandas.DataFrame
        Scraped staff rows with the columns ``link``, ``desigAndDept`` and
        ``phoneAndEmail``. The frame is copied first, so the caller's object
        is left untouched.
    df2 : pandas.DataFrame
        Department table with the columns ``coverPageLink`` and ``dept``.

    Returns
    -------
    pandas.DataFrame
        ``df2`` left-merged with the enriched staff rows on ``coverPageLink``,
        restricted to rows found in both frames. Added columns:
        ``designation``, ``subject``, ``phone``, ``email`` and the merge
        indicator ``_merge``.
    """
    emailPattern = r"[\w\.-]+@[\w\.-]+"

    # Work on a copy so the new columns do not leak into the caller's frame.
    df = df.copy()

    # "<designation> (<subject>)" -> text before / after the first "("
    desigParts = df.desigAndDept.astype(str).str.split("(")
    df["designation"] = desigParts.str[0].str.strip()
    # regex=False: ")" is meant literally and the default of ``regex``
    # differs between pandas versions.
    df["subject"] = desigParts.str[1].str.strip().str.replace(")", "", regex=False)

    contact = df.phoneAndEmail.astype(str)

    # Blank out e-mail addresses before collecting digits, otherwise digits
    # that are part of an address end up appended to the phone number.
    df["phone"] = (
        contact.str.replace(emailPattern, " ", regex=True)
        .str.findall(r"[0-9\+]")
        .str.join("")
    )

    df["email"] = contact.str.findall(emailPattern).str.join("")

    # Temporary key: the first six "/"-separated parts of the staff link
    df["coverPageLink"] = df.link.astype(str).str.split("/").str[0:6].str.join("/")
    merged = pd.merge(df2, df, on="coverPageLink", how="left", indicator=True)
    mergedBoth = merged[merged._merge == "both"].reset_index(drop=True)
    return mergedBoth

##### Or we could use the following code but it's really hectic to type all the data-location attributes
def scrapeStaffLinkByDataLocation(s, locations=None, pathFragment="/schools/business/staff/"):
    """Alternative to the lambda filter used in scrapeIndividualStaffLink.

    Instead of accepting any element with a ``data-location`` attribute, this
    variant asks for an explicit list of attribute values. It is wrapped in a
    function because it needs a parsed page; run as bare cell code it would
    reference a name that only exists inside the scraping functions.

    Parameters
    ----------
    s : object
        Parsed page exposing ``find_all(attrs=...)`` (e.g. a BeautifulSoup
        document).
    locations : iterable of str, optional
        ``data-location`` values to look for. Defaults to the hand-collected
        list below.
    pathFragment : str, optional
        Only hrefs containing this fragment are kept.

    Returns
    -------
    list of str
        The matching hrefs exactly as they appear in the page (they are not
        made absolute). Elements without an anchor or ``href`` are skipped.
    """
    if locations is None:
        locations = ["Glasgow",
                     " ",
                     "West Quadrangle",
                     "200 Renfield Street",
                     "Main Building",
                     "Level 2, Administration Suite",
                     "University Avenue",
                     "West Quadrangle, Gilbert Scott Building",
                     "ASBS North",
                     "Glasgow G12 8QQ",
                     "Gilbert Scott Building",
                     "glasgow",
                     "Glasgow g12 8qq",
                     "G12 8qq",
                     "Florentine House, 53 Hillhead Street",
                     "College of Social Sciences",
                     "Room 608",
                     "463 Gilbert Scott Building"
                     ]

    staffLink = []
    for loc in s.find_all(attrs={"data-location": list(locations)}):
        anchor = loc.a
        if anchor is None:
            continue
        tempLink = anchor.get("href")
        if tempLink and pathFragment in tempLink:
            staffLink.append(tempLink)
    return staffLink

In [3]:
# Wrap all the function inside main
def main(url, s1=0, s2=None):
    """Run the whole scraping pipeline for one slice of the staff links.

    Every call fetches the department list and all staff directories again
    before slicing, so the slice positions refer to the complete link list
    as it is collected during that call.

    Parameters
    ----------
    url : str
        Page that lists the departments.
    s1 : int, optional
        Start index into the list of staff links (default 0).
    s2 : int or None, optional
        Stop index into the list of staff links; ``None`` (default) means
        "up to the end".

    Returns
    -------
    pandas.DataFrame
        Columns ``dept``, ``name``, ``link``, ``designation``, ``subject``,
        ``phone`` and ``email``. Empty, with the same columns, when the
        requested slice contains no links.
    """
    # Scrape cover page links and departments
    coverPageLinkAndDept = generateCoverPageLinkAndDept(url)

    # Scrape the individual staff links, one directory page per task
    with ThreadPoolExecutor() as executor:
        linksPerDept = list(executor.map(scrapeIndividualStaffLink, coverPageLinkAndDept.coverPageLink))
    staffLink = list(itertools.chain.from_iterable(linksPerDept))

    chunk = staffLink[s1:s2]
    if not chunk:
        # pd.concat refuses an empty list; hand back an empty, well-formed
        # frame so chunked runs can still be concatenated by the caller.
        return pd.DataFrame(columns=["dept", "name", "link", "designation", "subject", "phone", "email"])

    # Scrape the individual staff info, one profile page per task
    with ThreadPoolExecutor() as executor:
        frames = list(executor.map(scrapeStaffInfo, chunk))
    df = pd.concat(frames).reset_index(drop=True)

    # Clean and engineer new features
    finalDf = cleanAndEngineerFeatures(df, coverPageLinkAndDept)

    # Drop the helper columns; returns a new frame instead of mutating in place
    return finalDf.drop(columns=["coverPageLink", "desigAndDept", "phoneAndEmail", "_merge"])

In [4]:
%%time
# Scrape in chunks: this cell covers the staff links at list positions 0-999
# (main slices the collected link list with [0:1000]). Note that main collects
# the department and directory pages again on every call.
df1k = main("https://www.gla.ac.uk/schools/", 0, 1000)

CPU times: user 2min 29s, sys: 18.7 s, total: 2min 48s
Wall time: 8min 27s


In [5]:
%%time
# Second chunk: staff links at list positions 1000-1999 (slice [1000:2000])
df2k = main("https://www.gla.ac.uk/schools/", 1000, 2000)

CPU times: user 2min 13s, sys: 10.9 s, total: 2min 24s
Wall time: 11min 20s


In [6]:
%%time
# Third chunk: staff links at list positions 2000-2999 (slice [2000:3000])
df3k = main("https://www.gla.ac.uk/schools/", 2000, 3000)

CPU times: user 2min 9s, sys: 15.1 s, total: 2min 24s
Wall time: 7min 17s


In [11]:
%%time
# Last chunk: everything from list position 3000 onwards (a stop index of
# None leaves the slice open-ended)
df4k = main("https://www.gla.ac.uk/schools/", 3000, None)

CPU times: user 2min 23s, sys: 16.1 s, total: 2min 40s
Wall time: 8min 2s


In [12]:
# Stack the four chunk frames into one table with a fresh 0..n-1 index
masterDf = pd.concat([df1k, df2k, df3k, df4k], ignore_index=True)

# Show the first ten rows as a quick sanity check
masterDf.head(10)

Unnamed: 0,dept,name,link,designation,subject,phone,email
0,Adam Smith Business School,Professor Nuran Acur,https://www.gla.ac.uk/schools/business/staff/n...,Professor of Innovation Management,Management,1413306367.0,Nuran.Acur@glasgow.ac.uk
1,Adam Smith Business School,Dr Ankush Agarwal,https://www.gla.ac.uk/schools/business/staff/a...,Lecturer in Quantitative Finance and Risk Mana...,Economics,1413306317.0,Ankush.Agarwal@glasgow.ac.uk
2,Adam Smith Business School,Dr Paul Ahn,https://www.gla.ac.uk/schools/business/staff/p...,Lecturer in Accounting and Finance,Accounting & Finance,1413305986.0,Paul.Ahn@glasgow.ac.uk
3,Adam Smith Business School,Professor Chandana Alawattage,https://www.gla.ac.uk/schools/business/staff/c...,"Professor in Accounting, Tax and Audit",Accounting & Finance,,Chandana.Alawattage@glasgow.ac.uk
4,Adam Smith Business School,Dr Mark Aleksanyan,https://www.gla.ac.uk/schools/business/staff/m...,Senior Lecturer,Accounting & Finance,1413303061.0,Mark.Aleksanyan@glasgow.ac.uk
5,Adam Smith Business School,Mr Rami Alsharif,https://www.gla.ac.uk/schools/business/staff/r...,Lecturer,Adam Smith Business School,,Rami.Alsharif@glasgow.ac.uk
6,Adam Smith Business School,Dr Stephanie Anderson,https://www.gla.ac.uk/schools/business/staff/s...,Lecturer in Marketing,Management,1413302950.0,Stephanie.Anderson@glasgow.ac.uk
7,Adam Smith Business School,Dr Aurelie Andry,https://www.gla.ac.uk/schools/business/staff/a...,Research Associate in International Economic H...,Management,4401413302077.0,Aurelie.Andry@glasgow.ac.uk
8,Adam Smith Business School,Dr Luis Angeles,https://www.gla.ac.uk/schools/business/staff/l...,Senior Lecturer,Economics,1413308517.0,Luis.Angeles@glasgow.ac.uk
9,Adam Smith Business School,Dr Konstantinos Angelopoulos,https://www.gla.ac.uk/schools/business/staff/k...,Reader,Economics,1413305273.0,Konstantinos.Angelopoulos@glasgow.ac.uk


In [14]:
# Write the combined table to a date-stamped CSV file. ``index`` is documented
# as a bool, so pass False explicitly to leave the row index out of the file.
masterDf.to_csv(f"glasgowUniversityStaffs_{today}.csv", index=False)