# Notebook to use the data got from the PhD scraper

## Define keywords of interest and countries to exclude

In [21]:
keywords = ['computer vision', 'deep learning', 'machine learning', 'ML', 'AI', 'artificial intelligence']  # <- CUSTOMIZE THIS
excluded_countries = ['England']  # <- CUSTOMIZE THIS
included_countries = ['Switzerland']  # <- CUSTOMIZE THIS (Overrides excluded_countries)

## Imports and data loading

In [22]:
# Imports
import pandas as pd
import textwrap
import re
import os

from dateutil import parser

In [23]:
# Loading Data
phd_positions = pd.read_csv("data/phd_positions.csv")
phd_positions.head()

Unnamed: 0,university,title,description,application_deadline,application_link,source_url
0,10 Fully Funded PhD Programs at University of ...,1. Fully Funded PhD Position in Systems pathol...,Being overweight is a known risk factor for de...,17 August 2025,https://www.jobs.cam.ac.uk/job/51872/,https://fellowshipbard.com/10-fully-funded-phd...
1,10 Fully Funded PhD Programs at University of ...,2.Fully Funded PhD Position in Small Animal Su...,You will help to facilitate the smooth running...,31 July 2025,https://www.jobs.cam.ac.uk/job/51451/,https://fellowshipbard.com/10-fully-funded-phd...
2,10 Fully Funded PhD Programs at University of ...,3. 06Fully Funded PhD Position in Small Animal...,Junior Clinical Training Scholars will receive...,16 July 2025,https://www.jobs.cam.ac.uk/job/51450/,https://fellowshipbard.com/10-fully-funded-phd...
3,10 Fully Funded PhD Programs at University of ...,4.Fully Funded PhD Position in Identifying nov...,"Malaria, caused by Plasmodium parasites and tr...",13 July 2025,https://www.jobs.cam.ac.uk/job/51587/,https://fellowshipbard.com/10-fully-funded-phd...
4,14 Fully Funded PhD Programs at Aalto Universi...,1. Fully Funded PhD Position in sustainable AI...,The rapid growth of artificial intelligence (A...,31.10.2025,https://www.aalto.fi/en/open-positions/doctora...,https://fellowshipbard.com/14-fully-funded-phd...


## Basic cleaning

### Split the 'university' column into 'university', 'town', and 'country'

In [24]:
# Extract the University after "at" the last two comma-separated parts as town and country
phd_positions[['university_clean', 'town', 'country']] = phd_positions['university'].str.extract(r'at (.*),\s*([^,]+),\s*([^,]+)$')
phd_positions.drop(columns=['university'], inplace=True)

# Keep only the university name in the 'university_clean' column
phd_positions = phd_positions.rename(columns={'university_clean': 'university'})

### Delete rows where the 'application_deadline' is in the past

In [25]:
phd_positions.shape

(930, 8)

In [26]:
# function to parse the application deadline
def parse_deadline(val):
    if isinstance(val, str) and 'open until filled' in val.lower():
        return 'Open until filled'
    try:
        # Use dateutil.parser to handle various date formats
        return parser.parse(val, fuzzy=True).date()
    except:
        return pd.NaT

In [27]:
# Today's date
today = pd.Timestamp('today').normalize()

# Apply parsing
phd_positions['parsed_deadline'] = phd_positions['application_deadline'].apply(parse_deadline)

# Filter out past deadlines (excluding "Open until filled")
phd_positions_future = phd_positions[
    (phd_positions['parsed_deadline'] == 'Open until filled') |
    (pd.to_datetime(phd_positions['parsed_deadline'], errors='coerce') >= today)
].copy()

# Convert to consistent string format (e.g., YYYY-MM-DD)
phd_positions_future['parsed_deadline'] = phd_positions_future['parsed_deadline'].apply(
    lambda x: x if x == 'Open until filled' else pd.to_datetime(x).strftime('%Y-%m-%d')
)

phd_positions_future.shape

(755, 9)

### Delete all rows that are double entries

In [28]:
# delete double entries
phd_positions_future = phd_positions_future.drop_duplicates(subset=['title', 'university', 'parsed_deadline', 'description'])

phd_positions_future.shape

(151, 9)

## Filtering out the positions one is interested in

In [29]:
# Combine title and description, convert to lowercase
phd_positions_future['search_text'] = (phd_positions_future['title'] + ' ' + phd_positions_future['description']).str.lower()

# Create keyword match mask
keyword_mask = phd_positions_future['search_text'].apply(lambda x: any(kw.lower() in x for kw in keywords))

# Determine country mask based on inclusion or exclusion
if included_countries:
    country_mask = phd_positions_future['country'].isin(included_countries)
else:
    country_mask = ~phd_positions_future['country'].isin(excluded_countries)

# Apply both filters
phd_positions_filtered = phd_positions_future[keyword_mask & country_mask]

phd_positions_filtered.head()

Unnamed: 0,title,description,application_deadline,application_link,source_url,university,town,country,parsed_deadline,search_text
138,2.Fully Funded PhD Position in modeling fractu...,The Computational Mechanics of Building Materi...,Open until filled,https://jobs.ethz.ch/job/view/JOPG_ethz_Npzgf4...,https://fellowshipbard.com/12-fully-funded-phd...,ETH Zurich,Zürich,Switzerland,Open until filled,2.fully funded phd position in modeling fractu...
139,3.Fully Funded PhD Position in Decentralized R...,The Distributed Computing group at ETH Zurich ...,Open until filled,https://jobs.ethz.ch/job/view/JOPG_ethz_WY2twh...,https://fellowshipbard.com/12-fully-funded-phd...,ETH Zurich,Zürich,Switzerland,Open until filled,3.fully funded phd position in decentralized r...
140,4.Fully Funded PhD Position in Biomedical Robo...,Hydrocephalus is a severe and common neurologi...,Open until filled,https://jobs.ethz.ch/job/view/JOPG_ethz_iBLZIr...,https://fellowshipbard.com/12-fully-funded-phd...,ETH Zurich,Zürich,Switzerland,Open until filled,4.fully funded phd position in biomedical robo...
141,5.Fully Funded PhD Position in Seed Pathogens ...,The reproduction of many plants is characteriz...,Open until filled,https://jobs.ethz.ch/job/view/JOPG_ethz_XLdEZk...,https://fellowshipbard.com/12-fully-funded-phd...,ETH Zurich,Zürich,Switzerland,Open until filled,5.fully funded phd position in seed pathogens ...
144,8.Fully Funded PhD Position in quantum metamat...,The PhD project aims to develop a solution-der...,August 10th 2025,https://jobs.ethz.ch/job/view/JOPG_ethz_tVhBwH...,https://fellowshipbard.com/12-fully-funded-phd...,ETH Zurich,Zürich,Switzerland,2025-08-10,8.fully funded phd position in quantum metamat...


In [30]:
# Define the output file
output_file = 'output/filtered_phd_positions.txt'
index = 1

# make the output directory if it does not exist
os.makedirs("output", exist_ok=True)

# Open file for writing
with open(output_file, 'w', encoding='utf-8') as f:
    for idx, row in phd_positions_filtered.iterrows():
        # remove leading numbers in title
        clean_title = re.sub(r'^\s*\d+\.\s*', '', row['title'])
        # Wrap title and description
        wrapped_title = textwrap.fill(clean_title, width=80)
        wrapped_description = textwrap.fill(row['description'], width=80)

        # Build the formatted entry
        entry = (
            f"{'='*100}\n"
            f"PhD Position {index}\n"
            f"Title:\n{wrapped_title}\n\n"
            f"University: {row['university']}\n"
            f"Country: {row['country']}\n"
            f"Town: {row['town']}\n"
            f"Deadline: {row['parsed_deadline']}\n"
            f"Application Link: {row.get('application_link', 'N/A')}\n\n"
            f"Description:\n{wrapped_description}\n"
        )

        # Write to file
        f.write(entry + "\n")
        index += 1

print(f"Saved {index-1} PhD positions to: {output_file}")

Saved 5 PhD positions to: output/filtered_phd_positions.txt
