# Notebook for exploring the data collected by the PhD scraper

## Define keywords of interest and countries to exclude

In [221]:
# Search terms, matched case-insensitively against each position's title and description
keywords = ["Computer Vision"]  # <- CUSTOMIZE THIS
# Positions located in these countries are dropped
excluded_countries = ["England"]  # <- CUSTOMIZE THIS
# When non-empty, only positions in these countries are kept
included_countries = []  # <- CUSTOMIZE THIS (Overrides excluded_countries)

## Imports and data loading

In [222]:
# Imports
import pandas as pd
import textwrap
import re
import os

from dateutil import parser

In [223]:
# Read the scraped positions from disk and preview the first rows
phd_positions = pd.read_csv(filepath_or_buffer="data/phd_positions.csv")
phd_positions.head()

Unnamed: 0,university,title,description,application_deadline,application_link,source_url
0,05 Fully Funded PhD Programs at Tampere Univer...,1. Fully Funded PhD Position in Robotics and AI,The Cognitive Robotics Group focuses on roboti...,2025-08-06,https://tuni.rekrytointi.com/paikat/?o=A_RJ&jg...,https://fellowshipbard.com/05-fully-funded-phd...
1,05 Fully Funded PhD Programs at Tampere Univer...,2.Fully Funded PhD Position in speech technolo...,We are inviting applications for a doctoral re...,2025-08-26,https://tuni.rekrytointi.com/paikat/?o=A_RJ&jg...,https://fellowshipbard.com/05-fully-funded-phd...
2,05 Fully Funded PhD Programs at Tampere Univer...,3. 02Fully Funded PhD Position in Power electr...,The research group of Power Electronics at the...,2025-09-01,https://tuni.rekrytointi.com/paikat/?o=A_RJ&jg...,https://fellowshipbard.com/05-fully-funded-phd...
3,10 Fully Funded PhD Programs at University of ...,1. Fully Funded PhD Position in Systems pathol...,Being overweight is a known risk factor for de...,17 August 2025,https://www.jobs.cam.ac.uk/job/51872/,https://fellowshipbard.com/10-fully-funded-phd...
4,10 Fully Funded PhD Programs at University of ...,2.Fully Funded PhD Position in Small Animal Su...,You will help to facilitate the smooth running...,31 July 2025,https://www.jobs.cam.ac.uk/job/51451/,https://fellowshipbard.com/10-fully-funded-phd...


## Basic cleaning

### Split the 'university' column into 'university', 'town', and 'country'

In [224]:
# The scraped 'university' value is a headline of the form
# "<N> Fully Funded PhD Programs at <University>, <Town>, <Country>".
# Guard on 'town' so that re-running this cell does not re-parse the already
# cleaned 'university' column, which would overwrite every location with NaN.
if 'town' not in phd_positions.columns:
    # \bat\s+ anchors on the standalone word "at" (not the tail of e.g. "Great ");
    # the last two comma-separated parts are taken as town and country.
    location_parts = phd_positions['university'].str.extract(r'\bat\s+(.*),\s*([^,]+),\s*([^,]+)$')
    location_parts.columns = ['university', 'town', 'country']

    # Trim stray whitespace so country names compare equal to the configured lists
    location_parts = location_parts.apply(lambda col: col.str.strip())

    # Replace the raw headline column with the three cleaned columns
    phd_positions = phd_positions.drop(columns=['university']).join(location_parts)

### Delete rows where the 'application_deadline' is in the past

In [225]:
# Number of rows and columns before positions with a past deadline are removed
phd_positions.shape

(880, 8)

In [226]:
# function to parse the application deadline
def parse_deadline(val):
    """Parse one raw application-deadline value.

    Parameters
    ----------
    val : object
        Raw cell value of the 'application_deadline' column.

    Returns
    -------
    str, datetime.date or NaT
        'Open until filled' when the text contains that phrase (any casing),
        a ``datetime.date`` when dateutil finds a date in the text, and
        ``pd.NaT`` for missing, non-string, blank or unparseable values.
    """
    # Missing cells (None/NaN) and other non-text values cannot hold a date string
    if not isinstance(val, str) or not val.strip():
        return pd.NaT
    if 'open until filled' in val.lower():
        return 'Open until filled'
    try:
        # Use dateutil.parser to handle various date formats; fuzzy=True lets it
        # skip surrounding words it does not recognise
        return parser.parse(val, fuzzy=True).date()
    except (ValueError, OverflowError):
        # Only parse failures are treated as "no deadline"; anything else
        # (e.g. KeyboardInterrupt or a NameError) must still surface
        return pd.NaT

In [227]:
# Midnight of the current day is the cut-off for "still open"
today = pd.Timestamp('today').normalize()

# Add a parsed version of every raw deadline
phd_positions['parsed_deadline'] = phd_positions['application_deadline'].apply(parse_deadline)

# Keep a position when it has no fixed closing date or closes today or later.
# Values that are not dates become NaT under errors='coerce' and never pass the comparison.
is_open_ended = phd_positions['parsed_deadline'] == 'Open until filled'
closes_on = pd.to_datetime(phd_positions['parsed_deadline'], errors='coerce')
phd_positions_future = phd_positions[is_open_ended | (closes_on >= today)].copy()

# Render every kept deadline as text: the marker stays as-is, dates become YYYY-MM-DD
phd_positions_future['parsed_deadline'] = phd_positions_future['parsed_deadline'].apply(
    lambda deadline: (
        deadline
        if deadline == 'Open until filled'
        else pd.to_datetime(deadline).strftime('%Y-%m-%d')
    )
)

phd_positions_future.shape

(725, 9)

### Delete all duplicate rows

In [228]:
# Rows that agree on all of these fields are treated as the same position;
# drop_duplicates keeps the first occurrence by default
duplicate_key = ['title', 'university', 'parsed_deadline', 'description']
phd_positions_future = phd_positions_future.drop_duplicates(subset=duplicate_key)

phd_positions_future.shape

(145, 9)

## Selecting the positions of interest

In [229]:
# Combine title and description into one lowercase search text. Missing values
# are treated as empty text: otherwise NaN + str yields NaN and the keyword
# search below would fail on that row.
phd_positions_future['search_text'] = (
    phd_positions_future['title'].fillna('') + ' ' + phd_positions_future['description'].fillna('')
).str.lower()

# Create keyword match mask: a row matches when it contains ANY keyword as a
# case-insensitive literal substring (re.escape keeps regex metacharacters literal)
if keywords:
    keyword_pattern = '|'.join(re.escape(kw.lower()) for kw in keywords)
    keyword_mask = phd_positions_future['search_text'].str.contains(keyword_pattern, regex=True)
else:
    # include all rows if no keywords provided
    keyword_mask = pd.Series(True, index=phd_positions_future.index)

# Compare countries ignoring case and surrounding whitespace, so that a config
# entry like 'england' still matches 'England'. Rows without a country stay NaN
# and therefore never match either list.
country_key = phd_positions_future['country'].str.strip().str.casefold()

# Determine country mask based on inclusion or exclusion
if included_countries:
    country_mask = country_key.isin([c.strip().casefold() for c in included_countries])
else:
    country_mask = ~country_key.isin([c.strip().casefold() for c in excluded_countries])

# Apply both filters
phd_positions_filtered = phd_positions_future[keyword_mask & country_mask]

phd_positions_filtered.head()

Unnamed: 0,title,description,application_deadline,application_link,source_url,university,town,country,parsed_deadline,search_text
71,11.Fully Funded PhD Position in Applied Mathem...,"Within our division, we have extensive experie...",27.Jul.2025,https://lu.varbi.com/en/what:job/jobID:825466/...,https://fellowshipbard.com/16-fully-funded-phd...,Lund University,Lund,Sweden,2025-07-27,11.fully funded phd position in applied mathem...


In [230]:
# Define the output file
output_file = 'output/filtered_phd_positions.txt'

# Make the output directory if it does not exist. It is derived from output_file
# so that changing the path above is the only edit needed.
os.makedirs(os.path.dirname(output_file) or '.', exist_ok=True)


def _text_or_na(value):
    """Return `value` as text, or 'N/A' when it is missing (None/NaN/NaT)."""
    return 'N/A' if pd.isna(value) else str(value)


# Number of entries written; stays 0 when no position passed the filters
saved_count = 0

# Open file for writing
with open(output_file, 'w', encoding='utf-8') as f:
    for saved_count, (_, row) in enumerate(phd_positions_filtered.iterrows(), start=1):
        # Remove the leading list number (e.g. "11.") from the title
        clean_title = re.sub(r'^\s*\d+\.\s*', '', _text_or_na(row['title']))
        # Wrap title and description; missing text is written as 'N/A' because
        # re.sub and textwrap.fill both fail on NaN
        wrapped_title = textwrap.fill(clean_title, width=80)
        wrapped_description = textwrap.fill(_text_or_na(row['description']), width=80)

        # Build the formatted entry
        entry = (
            f"{'='*100}\n"
            f"PhD Position {saved_count}\n"
            f"Title:\n{wrapped_title}\n\n"
            f"University: {_text_or_na(row['university'])}\n"
            f"Country: {_text_or_na(row['country'])}\n"
            f"Town: {_text_or_na(row['town'])}\n"
            f"Deadline: {_text_or_na(row['parsed_deadline'])}\n"
            f"Application Link: {_text_or_na(row.get('application_link'))}\n\n"
            f"Description:\n{wrapped_description}\n"
        )

        # Write to file, separated from the next entry by a blank line
        f.write(entry + "\n")

print(f"Saved {saved_count} PhD positions to: {output_file}")

Saved 1 PhD positions to: output/filtered_phd_positions.txt
