# Developing and the PhD Portal Scraper

**Scrape phdportal.com to find new PhD opportunties**

* Data Collection: Selenium and Chrome webdriver for webscraping
* **This only works on one 'field' at a time, so the link has to be manually input**
* **PhD Portal has extensive webscraping blocking**
* Data Extraction: Beautiful soup is used to parse HTML elements saved as lists
* Data Storage: Lists are combined into a dataframe for further processing. 
* Data Cleaning: Data is reformated/cleaned to ensure correct data types for later extraction

In [1]:
import time
import random
import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.service import Service

In [2]:
# Create a dictionary calss to store the elements extracted
class PhDOpportunity:
    def __init__(self, study_name, organisation_name, organisation_location, tuition_fee, duration, summary):
        self.study_name = study_name
        self.organisation_name = organisation_name
        self.organisation_location = organisation_location
        self.tuition_fee = tuition_fee
        self.duration = duration
        self.summary = summary

    def to_dict(self):
        return {
            'study_name': self.study_name,
            'organisation_name': self.organisation_name,
            'organisation_location': self.organisation_location,
            'tuition_fee': self.tuition_fee,
            'duration': self.duration,
            'summary': self.summary
        }

In [6]:
# This is better because it determines the number of phd opportunities and only clicks 
# the next button that many times

# Set the location of the Chrome webdriver (this will be differnt on each machine)
webdriver_location = r'C:\Users\Lenovo V15\Downloads\chromedriver_win32.exe'

# Set the base URL of the webpage 
base_url = 'https://www.phdportal.com/search/phd/engineering-technology/united-states'

# Initialize a list to store the parsed data
scraped_pages = []

chrome_service = Service(executable_path=webdriver_location)
driver = webdriver.Chrome(service=chrome_service)


driver.get(base_url)

try:
# Find the element with the ID 'CookieButton' and click it
    cookie_button = driver.find_element(By.ID, 'CookieButton')
    cookie_button.click()
except:
    pass
    
# Wait for the page to load completely
wait = WebDriverWait(driver, 5)

# Check if the OnboardingDismiss Close NotificationButton button is present and click it
try:
    onboarding_dismiss_button = wait.until(EC.presence_of_element_located((By.XPATH, '//button[contains(@class, "OnboardingDismiss") and contains(@class, "Close") and contains(@class, "NotificationButton")]')))
    onboarding_dismiss_button.click()
except:
    pass

# Parse the first page HTML
page_html = driver.page_source
soup = BeautifulSoup(page_html, 'html.parser')
scraped_pages.append(soup)

# Extract the total number of PhDs from the first page
search_summary = soup.find("span", class_="SearchSummary")
first = soup.find("div", class_="ShowResultsButtonContainer")
second = first.find("button", class_="ShowResults").text
process = second.replace(' Show ', '').replace(' results ', '')
total_phds = int(process)
                 
# Calculate the number of times to click the NextButton
phds_per_page = 20
num_clicks = (total_phds // phds_per_page) - 1

# Click the NextButton 'num_clicks' times
for _ in range(num_clicks):
    try:
        next_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//button[contains(@class, "NextButton") and contains(@class, "NavigatorButton")]')))
        next_button.click()

        # Collect the webpage HTML after its loading
        page_html = driver.page_source

        # Parse the HTML using BeautifulSoup
        soup = BeautifulSoup(page_html, 'html.parser')

            # Extract the data you need from the HTML using BeautifulSoup
            # And append it to the list
        scraped_pages.append(soup)

    except:
        break

# Close the webdriver after the loop is done
driver.quit()


In [7]:
# Check how many pages were scraped
len(scraped_pages)

28

In [9]:
# Initialize an empty DataFrame to store the final results
df_final = pd.DataFrame()

for i in range(len(scraped_pages)):
    soup = scraped_pages[i]
    # Extract the search result items
    search_result_items = soup.find_all('li', class_='HoverEffect SearchResultItem')
    phd_opportunities = []

    # Iterate through the search result items and extract the relevant information
    for item in search_result_items:
        study_name = item.find('h2', class_='StudyName').text if item.find('h2', class_='StudyName') else None

        name_location = item.find('div', class_='NameLocation')
        organisation_name = name_location.find('strong', class_='OrganisationName').text if name_location else None
        organisation_location = name_location.find('strong', class_='OrganisationLocation').text if name_location else None

        study_info = item.find('div', class_='StudyInfo')
        tuition_fee = study_info.find('div', class_='TuitionFee').text.strip() if study_info and study_info.find('div', class_='TuitionFee') else None
        duration = study_info.find('div', class_='Duration').text.strip() if study_info and study_info.find('div', class_='Duration') else None

        summary = item.find('p', class_='Summary is-collapsed').text.strip() if item.find('p', class_='Summary is-collapsed') else None

        phd_opportunity = PhDOpportunity(study_name, organisation_name, organisation_location, tuition_fee, duration, summary)
        phd_opportunities.append(phd_opportunity)

    # Convert the list of PhDOpportunity objects to a list of dictionaries
    opportunity_data = [opp.to_dict() for opp in phd_opportunities]

    # Create a DataFrame using the list of dictionaries
    df = pd.DataFrame(opportunity_data)

    # Concatenate the DataFrame with the previous results
    df_final = pd.concat([df_final, df])

# Display the final DataFrame
df_final.shape

(644, 6)

In [10]:
df_final.head(10)

Unnamed: 0,study_name,organisation_name,organisation_location,tuition_fee,duration,summary
0,Computer Science and Engineering,University at Buffalo SUNY - School of Enginee...,"Buffalo, United States","21,572 EUR / year",5 years,The PhD in Computer Science and Engineering fr...
1,Mechanical Engineering,University at Buffalo SUNY - School of Enginee...,"Buffalo, United States","21,572 EUR / year",5 years,Mechanical Engineering from the University at ...
2,Computational and Data-Enabled Sciences,University at Buffalo SUNY - School of Enginee...,"Buffalo, United States","21,572 EUR / year",5 years,The integration of large computing and big dat...
3,Electrical Engineering,University at Buffalo SUNY - School of Enginee...,"Buffalo, United States","21,572 EUR / year",5 years,The Electrical Engineering program at the Univ...
4,Aerospace Engineering,University at Buffalo SUNY - School of Enginee...,"Buffalo, United States","21,572 EUR / year",5 years,The Aerospace Engineering degree from the Univ...
5,Industrial Engineering,University at Buffalo SUNY - School of Enginee...,"Buffalo, United States","21,572 EUR / year",5 years,The Industrial Engineering PhD program from th...
6,,,,,,
7,Civil Engineering,University at Buffalo SUNY - School of Enginee...,"Buffalo, United States","21,572 EUR / year",5 years,The Department offers two doctor of philosophy...
8,Biomedical Engineering,University at Buffalo SUNY - School of Enginee...,"Buffalo, United States","21,572 EUR / year",5 years,The PhD degree in Biomedical Engineering from ...
9,Materials Design and Innovation,University at Buffalo SUNY - School of Enginee...,"Buffalo, United States","21,572 EUR / year",5 years,The Materials Design and Innovation department...


In [11]:
# Function to format the duration column

def format_duration(duration_str):
    if duration_str == "Duration unknown":
        return None
    
    duration_str = duration_str.replace("½", ".5")
    duration_parts = duration_str.split()
    
    if len(duration_parts) > 0:
        return float(duration_parts[0])
    else:
        return None

In [14]:
# Parsing the collected data

df_processed = df_final.dropna(thresh=len(df.columns) - 2).copy()

base = base_url.replace('https://www.phdportal.com/search/phd/', '')
field = base.replace('/united-states', '')

df_processed['field'] = field
df_processed['tuition_fee'] = df_processed['tuition_fee'].apply(lambda x: int(x.split()[0].replace(',', '')) if x and 'EUR' in x and x.split()[0].replace(',', '').isdigit() else None)
df_processed['city'] = df_processed['organisation_location'].apply(lambda x: x.replace(', United States', ''))
df_processed['duration'] = df_processed['duration'].apply(format_duration)

In [15]:
df_processed.head()

Unnamed: 0,study_name,organisation_name,organisation_location,tuition_fee,duration,summary,field,city
0,Computer Science and Engineering,University at Buffalo SUNY - School of Enginee...,"Buffalo, United States",21572.0,5.0,The PhD in Computer Science and Engineering fr...,engineering-technology,Buffalo
1,Mechanical Engineering,University at Buffalo SUNY - School of Enginee...,"Buffalo, United States",21572.0,5.0,Mechanical Engineering from the University at ...,engineering-technology,Buffalo
2,Computational and Data-Enabled Sciences,University at Buffalo SUNY - School of Enginee...,"Buffalo, United States",21572.0,5.0,The integration of large computing and big dat...,engineering-technology,Buffalo
3,Electrical Engineering,University at Buffalo SUNY - School of Enginee...,"Buffalo, United States",21572.0,5.0,The Electrical Engineering program at the Univ...,engineering-technology,Buffalo
4,Aerospace Engineering,University at Buffalo SUNY - School of Enginee...,"Buffalo, United States",21572.0,5.0,The Aerospace Engineering degree from the Univ...,engineering-technology,Buffalo


In [None]:
folder_path = r'C:/Users/Lenovo V15/Documents/0.0 TDI Data Science Fellowship/Capstone/phd_portal_results'
df_processed.to_csv(f'{folder_path}/PhDPortal_{field}.csv', index=False)