In [None]:
from util import linkedin_scraper, jobnet_scraper, jobindex_scraper
import pandas as pd
import os

In [None]:
# --- What to search for ---
# Each job board's own search also returns similar positions, e.g. "Data Analyst".
title = "Data Scientist"

# --- Where to search ---
street = "Ryesgade 1"  # Arbitrary address in Aarhus C
postal = "8000"
city = "Aarhus C"
country = "Denmark"

# --- How much to fetch ---
km_dist = 70  # Search radius in kilometers (Jobnet and JobIndex only)
num_jobs = 50  # Upper bound on the number of jobs each scraper should fetch

# Scrape new job listings

In [None]:
# Run each scraper with the search parameters from the configuration cell.
# Only the Jobnet and JobIndex scrapers receive the postal code and search radius;
# JobIndex additionally receives the street address.
linkedin_df = linkedin_scraper(title, city, num_jobs)
jobnet_df = jobnet_scraper(title, city, postal, km_dist, num_jobs)
jobindex_df = jobindex_scraper(title, city, postal, street, km_dist, num_jobs)

# Merge all new job listings into one frame with a fresh 0..n-1 index
df = pd.concat([linkedin_df, jobnet_df, jobindex_df], ignore_index=True)

# Columns that start out empty for every freshly scraped job
# (presumably filled in by hand later while tracking applications).
# Listed once, in the order they should appear in the frame.
tracking_columns = [
    "applied_date",
    "reply",
    "cover_letter",
    "decision",
    "decision_reason",
    "last_updated",
    "cv",
]
for column in tracking_columns:
    df[column] = None
df

# Filter the data

In [None]:
# Optional: apply custom filtering to `df` here if the scrapers have fetched too many irrelevant jobs

# Merge with existing jobs.csv and deduplicate

In [None]:
# Load the previously saved jobs. On the very first run there is no jobs.csv yet,
# so create an empty one with the current columns; the read below then always works.
if not os.path.exists("jobs.csv"):
    pd.DataFrame(columns=df.columns).to_csv("jobs.csv", index=False)
job_df = pd.read_csv("jobs.csv")

# Freshly scraped rows come first, so when both a new and a saved row have a
# non-null value for the same column, the freshly scraped value is the one kept.
df = pd.concat([df, job_df], ignore_index=True)
len_before = len(df)

# Two rows describe the same job when they share company and title.
dedup_keys = ['company', 'title']

# Snapshot of every row that belongs to a duplicate group (keep=False marks all
# members, not just the repeats), sorted so group members sit next to each other.
df_dups = df[df.duplicated(subset=dedup_keys, keep=False)].sort_values(by=dedup_keys)

# A plain drop_duplicates would throw away information, e.g. if one row has a
# non-null description but the other has null. GroupBy.first() instead keeps the
# first non-null value of every column within each (company, title) group.
# dropna=False (pandas >= 1.1) keeps rows whose company or title is missing;
# the default would silently delete them and count them as removed duplicates.
# It also matches DataFrame.duplicated above, which treats missing keys as equal.
df = df.groupby(dedup_keys, as_index=False, dropna=False).first()

len_after = len(df)
if len_before != len_after:
    print(f"Removed {len_before-len_after} duplicate rows, {len_before} -> {len_after}")

In [None]:
# Display every row that shared its (company, title) pair with at least one other
# row before de-duplication, sorted by company and title, to double check that the
# duplicates were handled correctly. The frame is empty when none were found.
df_dups

In [None]:
# Show the resulting rows that will be kept from the duplicates.
# An inner merge on the distinct duplicated (company, title) pairs selects the
# matching rows of df in a single step. When no duplicates were found the result
# is simply an empty frame, whereas concatenating an empty list of per-pair
# slices raises "ValueError: No objects to concatenate".
dup_keys = df_dups[['company', 'title']].drop_duplicates()
df_dups_new = df.merge(dup_keys, on=['company', 'title'], how='inner')
df_dups_new

In [None]:
# Save updated jobs.csv: overwrite the file with the current contents of df,
# leaving out the DataFrame index so it is not stored as an extra column.
df.to_csv("jobs.csv", index=False)