# Notebook for exploring the data collected by the PhD scraper

## Define keywords of interest and countries to exclude

In [None]:
# Search terms: a position is kept when its title or description contains
# at least one of them (the match ignores case).
keywords = [  # <- CUSTOMIZE THIS
    'computer vision',
    'deep learning',
    'machine learning',
]

# Positions whose country is listed here are removed from the results.
excluded_countries = [  # <- CUSTOMIZE THIS
    'England',
]

## Imports and data loading

In [None]:
# Imports
import pandas as pd
import datetime
import textwrap
import re

from dateutil import parser

In [None]:
# Read the positions CSV into a DataFrame and preview the first rows
csv_path = "phd_positions.csv"
phd_positions = pd.read_csv(csv_path)
phd_positions.head()

## Basic cleaning

### Split the 'university' column into 'university', 'town', and 'country'

In [None]:
# From the raw 'university' text, capture everything after "at" as the
# university name and the last two comma-separated parts as town and country.
# Rows that do not match the pattern get NaN in all three columns.
location_pattern = r'at (.*),\s*([^,]+),\s*([^,]+)$'
location_parts = phd_positions['university'].str.extract(location_pattern)

# Replace the raw column with the cleaned name plus the two new location columns
phd_positions = phd_positions.drop(columns=['university'])
phd_positions[['university', 'town', 'country']] = location_parts

### Delete rows where the 'application_deadline' is in the past

In [None]:
# (rows, columns) of the full data set, before the deadline filter is applied
phd_positions.shape

In [None]:
def parse_deadline(val):
    """Normalise one raw ``application_deadline`` value.

    Args:
        val: The raw cell value. Anything that is not a string (for example
            the NaN pandas uses for an empty CSV cell) is treated as missing.

    Returns:
        ``'Open until filled'`` when the text contains that phrase in any
        letter case, a ``datetime.date`` when a date can be extracted from
        the text, and ``pd.NaT`` otherwise.
    """
    # Nothing to parse for missing / non-text cells.
    if not isinstance(val, str):
        return pd.NaT

    if 'open until filled' in val.lower():
        return 'Open until filled'

    try:
        # fuzzy=True lets dateutil ignore words surrounding the date
        return parser.parse(val, fuzzy=True).date()
    except (ValueError, OverflowError):
        # dateutil's ParserError is a ValueError; OverflowError is raised for
        # dates that cannot be represented. Anything else (KeyboardInterrupt,
        # genuine bugs) is deliberately allowed to propagate.
        return pd.NaT

In [None]:
# Midnight today, so a deadline falling on today's date still counts as open
today = pd.Timestamp('today').normalize()

# Normalise every raw deadline via parse_deadline
phd_positions['parsed_deadline'] = phd_positions['application_deadline'].apply(parse_deadline)

# Keep rows that are open until filled or whose deadline has not passed yet.
# errors='coerce' turns everything that is not a date (including the
# 'Open until filled' marker) into NaT, and NaT >= today is False, so rows
# without a usable deadline are dropped here.
is_open_ended = phd_positions['parsed_deadline'] == 'Open until filled'
deadline_dates = pd.to_datetime(phd_positions['parsed_deadline'], errors='coerce')

# .copy() makes the result an independent frame; assigning a column to the
# bare boolean-indexed slice would raise SettingWithCopyWarning.
phd_positions_future = phd_positions[is_open_ended | (deadline_dates >= today)].copy()

# Convert to a consistent string format (YYYY-MM-DD)
phd_positions_future['parsed_deadline'] = phd_positions_future['parsed_deadline'].apply(
    lambda x: x if x == 'Open until filled' else pd.to_datetime(x).strftime('%Y-%m-%d')
)

phd_positions_future.shape

### Remove duplicate rows

In [None]:
# Rows count as the same posting when all of these columns match;
# drop_duplicates keeps one row per such group.
duplicate_key_columns = ['title', 'university', 'parsed_deadline', 'description']
phd_positions_future = phd_positions_future.drop_duplicates(subset=duplicate_key_columns)

phd_positions_future.shape

## Selecting the positions of interest

In [None]:
# Combine title and description into one lowercase search string.
# Missing values are replaced by '' first: otherwise a NaN title or
# description makes the whole string NaN and the substring test below
# fails with "argument of type 'float' is not iterable".
search_text = (
    phd_positions_future['title'].fillna('')
    + ' '
    + phd_positions_future['description'].fillna('')
).str.lower()

# assign() returns a new frame, so the column is never written to a slice
phd_positions_future = phd_positions_future.assign(search_text=search_text)

# Lowercase the keywords once instead of once per row
lowered_keywords = [kw.lower() for kw in keywords]

# Keyword match mask: True when at least one keyword occurs in the text.
# astype(bool) keeps the mask boolean even when there are no rows.
keyword_mask = search_text.apply(
    lambda text: any(kw in text for kw in lowered_keywords)
).astype(bool)

# Country exclusion mask: True for rows whose country is NOT excluded
country_mask = ~phd_positions_future['country'].isin(excluded_countries)

# Apply both filters
phd_positions_filtered = phd_positions_future[keyword_mask & country_mask]

phd_positions_filtered.head()

In [None]:
# Define the output file
output_file = 'filtered_phd_positions.txt'


def _display_text(value):
    """Return *value* as text, or 'N/A' when it is missing (None/NaN/NaT)."""
    return 'N/A' if pd.isna(value) else str(value)


def _format_position(number, row):
    """Build the plain-text report entry for one position.

    Args:
        number: 1-based running number shown in the entry header.
        row: One row of the filtered positions frame.

    Returns:
        The formatted entry, ending with a newline.
    """
    # Remove a leading enumeration such as "12. " from the title
    clean_title = re.sub(r'^\s*\d+\.\s*', '', _display_text(row['title']))
    # Wrap title and description to 80 columns; _display_text guarantees a
    # string, so a missing value cannot crash re.sub / textwrap.fill
    wrapped_title = textwrap.fill(clean_title, width=80)
    wrapped_description = textwrap.fill(_display_text(row['description']), width=80)

    return (
        f"{'='*100}\n"
        f"PhD Position {number}\n"
        f"Title:\n{wrapped_title}\n\n"
        f"University: {_display_text(row['university'])}\n"
        f"Country: {_display_text(row['country'])}\n"
        f"Town: {_display_text(row['town'])}\n"
        f"Deadline: {_display_text(row['parsed_deadline'])}\n"
        f"Application Link: {_display_text(row.get('application_link', 'N/A'))}\n\n"
        f"Description:\n{wrapped_description}\n"
    )


# Write one entry per filtered position, numbered from 1
with open(output_file, 'w', encoding='utf-8') as f:
    for position_number, (_, row) in enumerate(phd_positions_filtered.iterrows(), start=1):
        f.write(_format_position(position_number, row) + "\n")

print(f"Saved {len(phd_positions_filtered)} PhD positions to: {output_file}")