In [1]:
from bs4 import BeautifulSoup
import requests
import concurrent.futures


class FacultyScraper:
    """
    A web scraping tool to extract data from a faculty directory website.

    Attributes:
        url (str): The URL of the faculty directory website.
        session (requests.Session): A session object to handle HTTP requests.
        soup (BeautifulSoup): A BeautifulSoup object to parse HTML content.
        list_of_dicts (list): A list of dictionaries containing faculty information.

    Methods:
        scrape_data(): Scrapes the data from the faculty directory website.
        make_request(): Sends an HTTP request to the specified URL.
        parse_html(): Parses the HTML content of the response.
        find_email_addresses(): Finds and stores the email addresses of faculty members.
        find_professors(): Finds and stores the names and colleges of faculty members.
        check_length(): Checks if the number of unique emails matches the number of faculty names.
        find_links(): Finds and stores the profile links of faculty members.
        create_faculty_dicts(): Creates faculty dictionaries with basic information.
        extract_subjects(): Extracts and stores the subjects taught by each faculty member.
        extract_research_topics(): Extracts and stores the research topics of each faculty member.
    """

    def __init__(self, url):
        """
        Initializes a FacultyScraper object.

        Args:
            url (str): The URL of the faculty directory website.
        """
        self.url = url
        self.session = requests.Session()
        self.soup = None
        self.list_of_dicts = []

    def scrape_data(self):
        """
        Scrapes the data from the faculty directory website.

        Returns:
            list: A list of dictionaries containing faculty information.
        """
        self.make_request()
        self.parse_html()
        self.find_email_addresses()
        self.find_professors()
        self.check_length()
        self.find_links()
        self.create_faculty_dicts()

        # Create a ThreadPoolExecutor with maximum threads
        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
            # Execute extract_subjects and extract_research_topics concurrently
            subject_futures = [executor.submit(
                self.extract_subjects, url) for url in self.links]
            research_futures = [executor.submit(
                self.extract_research_topics, url) for url in self.links]

            # Retrieve the results from the futures
            subjects_list = [
                future.result() for future in concurrent.futures.as_completed(subject_futures)]
            research_list = [
                future.result() for future in concurrent.futures.as_completed(research_futures)]

        # Update the subjects and research topics in the faculty dictionaries
        for i in range(len(self.list_of_dicts)):
            self.list_of_dicts[i]["Subjects"] = subjects_list[i]
            self.list_of_dicts[i]["Research"] = research_list[i]

        return self.list_of_dicts

    def make_request(self):
        """
        Sends an HTTP request to the specified URL.
        """
        self.response = self.session.get(self.url)

    def parse_html(self):
        """
        Parses the HTML content of the response.
        """
        self.soup = BeautifulSoup(self.response.content, "html.parser")

    def find_email_addresses(self):
        """
        Finds and stores the email addresses of faculty members.
        """
        email_addresses = [
            link.get("href").replace("mailto:", "")
            for link in self.soup.find_all("a", href=lambda href: href and href.startswith("mailto:"))
        ]
        self.unique_emails = []
        for mail in email_addresses:
            if mail not in self.unique_emails:
                self.unique_emails.append(mail)

    def find_professors(self):
        """
        Finds and stores the names and colleges of faculty members.
        """
        professors = self.soup.find_all(
            "div", class_="profileinfo-teaser-name")
        self.names = []
        self.prof_college = []
        for div in professors:
            name_parts = div.text.split(',')
            name = name_parts[0].strip()
            self.prof_college.append(name_parts[2].strip())
            if "PhD" in name_parts[1]:
                new_name = "Dr. " + name
                self.names.append(new_name)
            else:
                self.names.append(name)

    def check_length(self):
        """
        Checks if the number of unique emails matches the number of faculty names.
        """
        self.is_length_equal = len(self.unique_emails) == len(self.names)

    def find_links(self):
        """
        Finds and stores the profile links of faculty members.
        """
        professors = self.soup.find_all(
            "div", class_="profileinfo-teaser-name")
        self.links = [
            "https://engineering.buffalo.edu/" +
            div.find('a', class_='title')['href'][:-4] + "teaching.html"
            for div in professors
        ]

    def create_faculty_dicts(self):
        """
        Creates faculty dictionaries with basic information.
        """
        for name, college, email, profile_url in zip(self.names, self.prof_college, self.unique_emails, self.links):
            faculty_dict = {
                "Name": name,
                "College": college,
                "Email": email,
                "Subjects": [],
                "Research": [],
                "Profile": profile_url  # Add the 'Profile' key with the URL
            }
            self.list_of_dicts.append(faculty_dict)

    def extract_subjects(self, url):
        """
        Extracts and returns the subjects taught by a faculty member.

        Args:
            url (str): The URL of the faculty member's profile page.

        Returns:
            list: A list of subjects taught by the faculty member.
        """
        response = self.session.get(url)
        soup = BeautifulSoup(response.content, "html.parser")
        div = soup.find("div", class_="text parbase section")
        ul = div.find("ul")
        subjects = []
        try:
            for li in ul.find_all("li"):
                subjects.append(li.text)
        except:
            pass
        return subjects

    def extract_research_topics(self, url):
        """
        Extracts and returns the research topics of a faculty member.

        Args:
            url (str): The URL of the faculty member's profile page.

        Returns:
            list: A list of research topics of the faculty member.
        """
        response = self.session.get(url)
        soup = BeautifulSoup(response.content, "html.parser")
        research_topics = []
        try:
            for div in soup.find_all("div", class_="profileinfo-interest title"):
                raw_string = div.text[15:].strip()
                research_topics.extend(raw_string.strip().split("; "))
        except:
            pass
        return research_topics


In [2]:
url = "https://engineering.buffalo.edu/computer-science-engineering/people/faculty-directory/full-time.html"
scraper = FacultyScraper(url)
data = scraper.scrape_data()


In [3]:
import pandas as pd 

In [4]:
df = pd.DataFrame(data)

In [5]:
df

Unnamed: 0,Name,College,Email,Subjects,Research,Profile
0,Dr. Nasrin Akhter,George Mason University,nasrinak@buffalo.edu,"[CSE 220—Systems Programming (Fall 2023, Sprin...","[Applied cryptography, information security, p...",https://engineering.buffalo.edu//computer-scie...
1,Dr. Carl Alphonce,University of British Columbia,alphonce@buffalo.edu,[CDA 501—Introduction to Data Driven Analysis ...,[Computer science education],https://engineering.buffalo.edu//computer-scie...
2,Dr. Ethan Blanton,Purdue University,eblanton@buffalo.edu,[CSE 368—Introduction to Artificial Intelligen...,"[Big data analytics, anomaly detection]",https://engineering.buffalo.edu//computer-scie...
3,Dr. Marina Blanton,Purdue University,mblanton@buffalo.edu,"[CSE 486—Distributed Systems (Fall 2020, Fall ...","[Operating systems, networking, compilers and ...",https://engineering.buffalo.edu//computer-scie...
4,Dr. Varun Chandola,University of Minnesota,chandola@buffalo.edu,"[CSE 220—Systems Programming (Fall 2019), CSE ...","[Machine learning, computational biology]",https://engineering.buffalo.edu//computer-scie...
5,Dr. Changyou Chen,Australian National University,changyou@buffalo.edu,"[CSE 410—Special Topics (Fall 2021, Fall 2020)...","[Large-scale Bayesian sampling and inference, ...",https://engineering.buffalo.edu//computer-scie...
6,Dr. Karthik Dantu,University of Southern California,kdantu@buffalo.edu,"[CSE 410—Special Topics (Spring 2022, Spring 2...","[Computer vision, machine learning, multimodal...",https://engineering.buffalo.edu//computer-scie...
7,Dr. Sreyasee Das Bhattacharjee,Indian Institute of Technology,sreyasee@buffalo.edu,[CSE 115—Introduction to Computer Science I (F...,"[Document image understanding, video analysis,...",https://engineering.buffalo.edu//computer-scie...
8,Dr. Murat Demirbas,The Ohio State University,demirbas@buffalo.edu,[CSE 116—Introduction to Computer Science II (...,"[Mobile sensor networks, robot networks, netwo...",https://engineering.buffalo.edu//computer-scie...
9,Dr. Paul Dickson,University of Massachusetts Amherst,pauldick@buffalo.edu,[CSE 473—Introduction to Computer Vision and I...,"[Pattern recognition, digital libraries, biome...",https://engineering.buffalo.edu//computer-scie...
