In [17]:
from bs4 import BeautifulSoup
import requests


class FacultyScraper:
    """
    A web scraping tool to extract data from a faculty directory website.

    Attributes:
        url (str): The URL of the faculty directory website.
        session (requests.Session): A session object to handle HTTP requests.
        soup (BeautifulSoup): A BeautifulSoup object to parse HTML content.
        list_of_dicts (list): A list of dictionaries containing faculty information.

    Methods:
        scrape_data(): Scrapes the data from the faculty directory website.
        make_request(): Sends an HTTP request to the specified URL.
        parse_html(): Parses the HTML content of the response.
        find_email_addresses(): Finds and stores the email addresses of faculty members.
        find_professors(): Finds and stores the names and colleges of faculty members.
        check_length(): Checks if the number of unique emails matches the number of faculty names.
        find_links(): Finds and stores the profile links of faculty members.
        create_faculty_dicts(): Creates faculty dictionaries with basic information.
        extract_subjects(): Extracts and stores the subjects taught by each faculty member.
        extract_research_topics(): Extracts and stores the research topics of each faculty member.
    """
    def __init__(self, url):
        """
        Initializes a FacultyScraper object.

        Args:
            url (str): The URL of the faculty directory website.
        """
        self.url = url
        self.session = requests.Session()
        self.soup = None
        self.list_of_dicts = []

    def scrape_data(self):
        """
        Scrapes the data from the faculty directory website.

        Returns:
            list: A list of dictionaries containing faculty information.
        """
        self.make_request()
        self.parse_html()
        self.find_email_addresses()
        self.find_professors()
        self.check_length()
        self.find_links()
        self.create_faculty_dicts()
        self.extract_subjects()
        self.extract_research_topics()
        return self.list_of_dicts

    def make_request(self):
        """
        Sends an HTTP request to the specified URL.
        """
        self.response = self.session.get(self.url)

    def parse_html(self):
        """
        Parses the HTML content of the response.
        """
        self.soup = BeautifulSoup(self.response.content, "html.parser")

    def find_email_addresses(self):
        """
        Finds and stores the email addresses of faculty members.
        """
        email_addresses = [
            link.get("href").replace("mailto:", "")
            for link in self.soup.find_all("a", href=lambda href: href and href.startswith("mailto:"))
        ]
        self.unique_emails = list(set(email_addresses))

    def find_professors(self):
        """
        Finds and stores the names and colleges of faculty members.
        """
        professors = self.soup.find_all(
            "div", class_="profileinfo-teaser-name")
        self.names = []
        self.prof_college = []
        for div in professors:
            name_parts = div.text.split(',')
            name = name_parts[0].strip()
            self.prof_college.append(name_parts[2].strip())
            if "PhD" in name_parts[1]:
                new_name = "Dr. " + name
                self.names.append(new_name)
            else:
                self.names.append(name)

    def check_length(self):
        """
        Checks if the number of unique emails matches the number of faculty names.
        """
        self.is_length_equal = len(self.unique_emails) == len(self.names)

    def find_links(self):
        """
        Finds and stores the profile links of faculty members.
        """
        professors = self.soup.find_all(
            "div", class_="profileinfo-teaser-name")
        self.links = [
            "https://engineering.buffalo.edu/" +
            div.find('a', class_='title')['href'][:-4] + "teaching.html"
            for div in professors
        ]

    def create_faculty_dicts(self):
        """
        Creates faculty dictionaries with basic information.
        """
        for name, college, email, profile_url in zip(self.names, self.prof_college, self.unique_emails, self.links):
            faculty_dict = {
                "Name": name,
                "College": college,
                "Email": email,
                "Subjects": [],
                "Research": [],
                "Profile": profile_url  # Add the 'Profile' key with the URL
            }
            self.list_of_dicts.append(faculty_dict)

    def extract_subjects(self):
        """
        Extracts and stores the subjects taught by each faculty member.
        """
        for i in range(len(self.names)):
            url = self.links[i]
            response = self.session.get(url)
            soup = BeautifulSoup(response.content, "html.parser")
            div = soup.find("div", class_="text parbase section")
            ul = div.find("ul")
            subjects = []
            faculty_dict = {}  # Initialize faculty_dict here
            try:
                for li in ul.find_all("li"):
                    subjects.append(li.text)
                faculty_dict = self.list_of_dicts[i]
                faculty_dict["Subjects"] = subjects
            except:
                faculty_dict["Subjects"] = []

    def extract_research_topics(self):
        """
        Extracts and stores the research topics of each faculty member.
        """
        for i in range(len(self.list_of_dicts)):
            url = self.list_of_dicts[i]["Profile"]
            response = self.session.get(url)
            soup = BeautifulSoup(response.content, "html.parser")
            research_topics = []
            try:
                for div in soup.find_all("div", class_="profileinfo-interest title"):
                    raw_string = div.text[15:].strip()
                    research_topics.extend(raw_string.strip().split("; "))
                self.list_of_dicts[i]["Research"] = research_topics
            except:
                self.list_of_dicts[i]["Research"] = []


In [18]:
url = "https://engineering.buffalo.edu/computer-science-engineering/people/faculty-directory/full-time.html"
scraper = FacultyScraper(url)
data = scraper.scrape_data()


In [16]:
data

[{'Name': 'Dr. Nasrin Akhter',
  'College': 'George Mason University',
  'Email': 'avereshc@buffalo.edu',
  'Subjects': ['CSE 116—Introduction to Computer Science II (Spring 2021)',
   'CSE 191—Introduction to Discrete Structures (Fall 2023, Summer 2023, Spring 2023, Fall 2022, Summer 2022, Spring 2022, Fall 2021, Summer 2021, Spring 2021)',
   'CSE 331—Algorithms and Complexity (Spring 2023, Spring 2022)',
   'CSE 469—Introduction to Data Mining (Fall 2021)'],
  'Research': ['Machine learning', 'computational biology'],
  'Profile': 'https://engineering.buffalo.edu//computer-science-engineering/people/faculty-directory/full-time.host.html/content/shared/engineering/computer-science-engineering/profiles/faculty/teaching/akhter-nasrin.teaching.html'},
 {'Name': 'Dr. Carl Alphonce',
  'College': 'University of British Columbia',
  'Email': 'pauldick@buffalo.edu',
  'Subjects': ['CSE 115—Introduction to Computer Science I (Fall 2023, Fall 2022, Fall 2021, Spring 2021, Fall 2020, Summer 20