In [1]:
# Import required modules
import pandas as pd
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
import itertools
from datetime import datetime
# Day_month stamp for the current date (zero-padded day, abbreviated month name),
# used further down to tag the exported CSV file name.
today = f"{datetime.today().date():%d_%b}"

In [2]:
def generateCoverPageLink(url, firstPage=1, lastPage=19):
    """Build the list of paginated cover-page URLs.

    Each link is ``url`` with a page number appended directly to it, so
    ``url`` is expected to end with the page query parameter
    (for example ``"...&page="``).

    Args:
        url: Prefix to which every page number is appended.
        firstPage: First page number to generate (inclusive). Defaults to 1.
        lastPage: Last page number to generate (inclusive). Defaults to 19,
            which reproduces the previously hard-coded ``range(1, 20)``.

    Returns:
        list[str]: One URL per page in ascending page order; an empty list
        when ``lastPage`` is smaller than ``firstPage``.
    """
    return [f"{url}{page}" for page in range(firstPage, lastPage + 1)]


def scrapeInstructorInfo(url):
    """Scrape staff details from a single search-result page.

    Every ``div.col-sm-12`` whose text mentions "Email" is treated as one
    staff card. The name is read from the card's first ``h5.mtm`` heading and
    the remaining fields from the card's second ``<p>``, which is split on
    ``"|"`` into designation, department and email.

    All fields of a card are collected into one record, so the columns stay
    aligned even when a card has no heading or several of them. Cards with
    fewer than two paragraphs are skipped because they carry no details line.
    Fields that are absent from the details line are stored as ``None``.

    Args:
        url: Address of the search-result page to download.

    Returns:
        pandas.DataFrame: Columns ``name``, ``designation``, ``dept`` and
        ``email``, one row per staff card (zero rows when none are found).

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched,
            including when the request times out.
    """
    columns = ["name", "designation", "dept", "email"]

    # A timeout keeps one stalled connection from blocking its worker forever.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, "lxml")

    records = []
    for card in soup.find_all("div", class_="col-sm-12"):
        # Filter out contents without email.
        if "Email" not in card.text:
            continue

        # The details line lives in the second paragraph of the card.
        paragraphs = card.find_all("p")
        if len(paragraphs) < 2:
            continue

        heading = card.find("h5", class_="mtm")
        name = heading.text.strip() if heading is not None else None

        # Details look like "designation | department | Email: address".
        parts = [part.strip() for part in paragraphs[1].text.split("|")]
        designation = parts[0]
        # With only two fields the second one is the email, not a department.
        dept = parts[1] if len(parts) > 2 else None
        email = None
        if len(parts) > 1:
            email = parts[-1].replace("Email:", "").replace("\xa0", "").strip()

        records.append({
            "name": name,
            "designation": designation,
            "dept": dept,
            "email": email,
        })

    # Passing the columns explicitly keeps the schema stable for empty pages.
    return pd.DataFrame(records, columns=columns)

In [3]:
# Wrap all the functions inside main
def main(url):
    """Scrape all cover pages with a thread pool and merge the results.

    Args:
        url: URL prefix passed to ``generateCoverPageLink`` to build the
            list of pages to scrape.

    Returns:
        pandas.DataFrame: The per-page frames returned by
        ``scrapeInstructorInfo`` concatenated in page order, with a fresh
        0..n-1 index.
    """
    pageLinks = generateCoverPageLink(url)

    # One scraping task per page; map() returns results in input order.
    with ThreadPoolExecutor() as pool:
        pageFrames = list(pool.map(scrapeInstructorInfo, pageLinks))

    combined = pd.concat(pageFrames)
    return combined.reset_index(drop=True)

In [4]:
%%time
# Call the main function
masterDf = main("https://www.coventry.ac.uk/search/?searchText=faculty&PageSize=10&page=")

# Preview the scraped data
masterDf.head(10)

CPU times: user 4.12 s, sys: 955 ms, total: 5.08 s
Wall time: 7.14 s


Unnamed: 0,name,designation,dept,email
0,Aqueela Ahmed,Lecturer in Oil and Gas Management (Finance),"Faculty of Engineering, Environment and Computing",ab5073@coventry.ac.uk
1,Les Duckers,Principal Lecturer in Natural & Environmental ...,"School of Energy, Construction and Environment",byx919@coventry.ac.uk
2,Ian Farrin,Employment Tutor,"Faculty of Engineering, Environment and Computing",ab1978@coventry.ac.uk
3,Wayne Harrop,Senior Lecturer in Business Continuity Managem...,"Faculty of Engineering, Environment and Computing",aa0530@coventry.ac.uk
4,Lisa Payne,Honorary Lecturer in Computing,"Faculty of Engineering, Environment and Computing",csx067@coventry.ac.uk
5,Dr James Shuttleworth,Associate Head of Computing and the Digital En...,"Faculty of Engineering, Environment and Computing",csx239@coventry.ac.uk
6,Xiang Fei,Senior Lecturer in Computing,"Faculty of Engineering, Environment and Computing",aa5861@coventry.ac.uk
7,Kamal Bentahar,Assistant Lecturer in Computing,"Faculty of Engineering, Environment and Computing",ab3735@coventry.ac.uk
8,Mahmoud Awad,Assistant Lecturer at Games Tech and Multimedi...,"Faculty of Engineering, Environment and Computing",ab8505@coventry.ac.uk
9,Antal Goldschmidt,Assistant Lecturer in Computing,"Faculty of Engineering, Environment and Computing",ab2216@coventry.ac.uk


In [5]:
# Export the scraped table to a date-stamped CSV file, without the row index
masterDf.to_csv(f"coventryUniversityStaffs_{today}.csv", index=False)