In [25]:
import csv
import datetime
import glob
import json
import os
import random
import time
import webbrowser
from pprint import pprint

import pandas as pd
from IPython.display import display, HTML
from jobspy import scrape_jobs

# Hybrid(office) proximity
# --- Later ---
# LLM ethos check



In [2]:
# Notebook-wide pandas display settings: no limit on columns, rows or total
# width, but clip each individual cell to 50 characters.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.width = None
pd.options.display.max_colwidth = 50

In [28]:
# Import user configs

def insert_user(user, email):
    """Register a new user by writing ``users/<user>/<user>_config.json``.

    The config file is a JSON object with the keys ``user`` and ``email``.
    If the directory ``users/<user>`` already exists nothing is written and a
    notice is printed instead.

    Args:
        user: User name; used as directory name and as file-name prefix.
        email: E-mail address stored in the config.
    """
    user_dir = f'users/{user}'

    if os.path.exists(user_dir):
        print(f'User {user} already exists')
        return

    # makedirs (not mkdir) so the very first user also creates the parent
    # 'users' directory instead of failing with FileNotFoundError.
    os.makedirs(user_dir)

    user_dict = {'user': user, 'email': email}
    config_path = os.path.join(user_dir, f'{user}_config.json')
    with open(config_path, 'w') as f:
        json.dump(user_dict, f, indent=4)


#insert_user('tdawg', 'test@test.de')


# Template of a basic "PJ" entry: platform, location, search keywords,
# remaining days and run interval.
# NOTE(review): not referenced anywhere else in this notebook yet.
basic_PJ = {
    "platform": "indeed",
    "location": "England-remote",
    "keywords": ["python", "data"],  # key was misspelled "kewwords"
    "days_left": 7,
    "interval": "daily",
}


# Find every JSON config that lives in a sub-directory of users/.
# (glob is called without recursive=True here.)
user_config_paths = glob.glob('users/**/*.json')
print(user_config_paths)

# Parse each config file into a dict, keeping the order glob returned.
user_configs = []
for config_file in user_config_paths:
    with open(config_file) as handle:
        user_configs.append(json.load(handle))

# Echo the loaded configs, one per line.
for loaded_config in user_configs:
    print(loaded_config)



def order_PJ(user, P):
    """Placeholder - not implemented yet; does nothing and returns None."""
    return None

def show_stats():
    """Placeholder - not implemented yet; does nothing and returns None."""
    return None


User tdawg already exists
['users/tdawg/tdawg_config.json', 'users/lucky/lucky_config.json']
{'user': 'tdawg', 'email': 'test@test.de'}
{'user': 'lucky', 'email': 's4mipojo@uni-trier.de'}


In [None]:


# Search terms passed one at a time to the scraper.
keywords = [
    # general tech
    'python',
    'database',
    # language / signal processing
    'nlp',
    'natural language',
    'signal processing',
    # generative models
    'prompt engineer',
    'openai',
    'language model',
    'generative model',
    'genai',
    # AI (German and English spellings)
    'ki',
    'künstliche intelligenz',
    'ai',
    'artificial intelligence',
]

def update_keywords_left(keywords: list, date: str) -> list:
    """Split ``keywords`` into done / pending for ``date`` and return the pending ones.

    A keyword is done when ``results/<keyword>_<date>.csv`` exists. Both
    groups are printed with their sizes.

    Args:
        keywords: Search terms to check, in the order they should be reported.
        date: Date stamp as it appears in the result file names.

    Returns:
        The keywords (original order) that have no result file for ``date``.
    """
    existing = {os.path.basename(path) for path in glob.glob("results/*.csv")}

    keywords_done = []
    keywords_left = []
    for term in keywords:
        bucket = keywords_done if f"{term}_{date}.csv" in existing else keywords_left
        bucket.append(term)

    print(f"Date: {date}")
    print(f"Keywords done: {len(keywords_done)} : {keywords_done}")
    print(f"Keywords left: {len(keywords_left)} : {keywords_left}")
    return keywords_left

def get_jobs(keywords: list, max_retries: int = 3) -> None:
    """Scrape Indeed once per keyword and store each result as a CSV.

    For today's date, every keyword that has no ``results/<keyword>_<date>.csv``
    yet is searched (in random order) with ``scrape_jobs``. The de-duplicated
    result is written to that CSV. A search that reports no results writes an
    empty CSV with just the header, so the keyword counts as done.

    The proxy is read from ``results/proxy.txt``.

    Args:
        keywords: Search terms to process.
        max_retries: How many failed attempts are tolerated per keyword before
            it is skipped for this run. Without this cap, a keyword that keeps
            failing would be retried forever.

    Raises:
        OSError: If ``results/proxy.txt`` cannot be read.
    """
    now = datetime.datetime.now()
    date = now.strftime("%Y%m%d")

    proxy_path = os.path.join("results", "proxy.txt")
    with open(proxy_path, "r") as f:
        proxy = f.read().strip()

    empty_jobs = pd.DataFrame(columns=[ "job_url",
    "site", "title", "company", "company_url", "location", "job_type",
    "date_posted", "interval", "min_amount", "max_amount", "currency",
    "is_remote", "num_urgent_words", "benefits", "emails", "description"])

    failed_attempts = {}   # keyword -> number of unexpected errors so far
    given_up = set()       # keywords skipped for the rest of this run

    def pending():
        # Re-derive progress from the files on disk, minus abandoned keywords.
        return [kw for kw in update_keywords_left(keywords, date) if kw not in given_up]

    keywords_left = pending()

    while keywords_left:

        keyword = random.choice(keywords_left)
        kw_path = os.path.join("results", f"{keyword}_{date}.csv")

        print(f"Keyword: {keyword} -> Starting search...")

        try:
            jobs = scrape_jobs(
                site_name=["indeed"],

                search_term=keyword,
                proxy=proxy,

                hours_old=72,
                is_remote=True,
                results_wanted=500,
                country_indeed='germany'  # only needed for indeed / glassdoor
            )

        except Exception as e:
            print(f"Error with keyword: {keyword}")
            print(e)
            message = str(e)

            if "Bad proxy" in message:
                # stop the program if the proxy is bad
                print("Bad proxy, stopping the program")
                break

            if "Could not find any results for the search" in message:
                # Record the empty result so the keyword is not searched again today.
                empty_jobs.to_csv(kw_path, quoting=csv.QUOTE_NONNUMERIC, escapechar="\\", index=False)
                keywords_left = pending()
                continue

            # Any other error: retry later, but only a bounded number of times.
            failed_attempts[keyword] = failed_attempts.get(keyword, 0) + 1
            if failed_attempts[keyword] >= max_retries:
                print(f"Giving up on keyword: {keyword} after {failed_attempts[keyword]} failed attempts")
                given_up.add(keyword)
                keywords_left = [kw for kw in keywords_left if kw != keyword]
            continue

        jobs = jobs.drop_duplicates(subset=["job_url"], keep="first")
        # listing with the same title from the same company are considered duplicates, because
        # they are usually the same job offer, just posted multiple times for different locations
        jobs = jobs.drop_duplicates(subset=["title", "company"], keep="first")

        print(f"Number of (unique) jobs found: {len(jobs)}")

        jobs.to_csv(kw_path, quoting=csv.QUOTE_NONNUMERIC, escapechar="\\", index=False)
        keywords_left = pending()

# Run the scrape for every keyword that has no result file for today yet.
# Needs results/proxy.txt; writes one CSV per keyword into results/.
get_jobs(keywords)

In [5]:
# check dates and show results - meta table

file_paths = glob.glob("results/*.csv")

file_names = [os.path.basename(file) for file in file_paths]
file_names = sorted(file_names, reverse=True)

dates = set([file.split("_")[1].split(".")[0] for file in file_names])
print(f"Dates: {dates}")

for input_date in dates:

    keywords_date = [file for file in file_names if input_date in file]
    length = len(keywords_date)
    lengths = []

    print(f" --- {length} Keywords for {input_date} : {keywords_date}")

    # print the length of the dataframes vertically in table format

    for file in keywords_date:
        df = pd.read_csv(os.path.join("results", file))
        lengths.append(len(df))
    
    # Printing the ASCII table
    print("+"+"-"*52+"+"+"--------+")
    print("| File"+" "*47+"| Length |")
    print("+"+"-"*52+"+"+"--------+")
    for file, length in zip(keywords_date, lengths):
        print(f"| {file:50} | {length:6} |")
    print("+"+"-"*52+"+"+"--------+")


Dates: {'20240218', '20240217', '20240220'}
 --- 10 Keywords for 20240218 : ['signal processing_20240218.csv', 'sigma_20240218.csv', 'python_20240218.csv', 'prompt engineer_20240218.csv', 'openai_20240218.csv', 'natural language_20240218.csv', 'künstliche intelligenz_20240218.csv', 'database_20240218.csv', 'artificial intelligence_20240218.csv', 'ai_20240218.csv']
+----------------------------------------------------+--------+
| File                                               | Length |
+----------------------------------------------------+--------+
| signal processing_20240218.csv                     |      1 |
| sigma_20240218.csv                                 |     95 |
| python_20240218.csv                                |     42 |
| prompt engineer_20240218.csv                       |      0 |
| openai_20240218.csv                                |      0 |
| natural language_20240218.csv                      |      3 |
| künstliche intelligenz_20240218.csv                |   

In [None]:
# Create SIGMA for date - concatenate all csv files into one by date

now = datetime.datetime.now()
date = now.strftime("%Y%m%d")

# Make sure the output directory exists before reading from / writing to it.
os.makedirs("results", exist_ok=True)

# Only today's per-keyword files. An already existing sigma file for today is
# excluded so that re-running this cell does not feed its own output back in.
search_str = f"results/*_{date}.csv"
file_names = [
    file for file in sorted(glob.glob(search_str))
    if not os.path.basename(file).startswith("sigma_")
]

sigma_columns = [ "job_url",
    "site", "title", "company", "company_url", "location", "job_type",
    "date_posted", "interval", "min_amount", "max_amount", "currency",
    "is_remote", "num_urgent_words", "benefits", "emails", "description"]

# Read everything first and concatenate once instead of growing a frame in a loop.
frames = [pd.read_csv(file) for file in file_names]
if frames:
    sigma_date = pd.concat(frames, ignore_index=True)
    # expected columns first (created if missing), then any extra columns
    extra_columns = [col for col in sigma_date.columns if col not in sigma_columns]
    sigma_date = sigma_date.reindex(columns=sigma_columns + extra_columns)
else:
    sigma_date = pd.DataFrame(columns=sigma_columns)

# remove duplicates
sigma_date = sigma_date.drop_duplicates(subset=["job_url"], keep="first")
sigma_date = sigma_date.drop_duplicates(subset=["title", "company"], keep="first")

# drop description column
sigma_date = sigma_date.drop(columns=["description"])

sigma_date.to_csv(f"results/sigma_{date}.csv", quoting=csv.QUOTE_NONNUMERIC, escapechar="\\", index=False)
# info() prints its report itself and returns None, so it is not wrapped in print()
sigma_date.info()



In [None]:

# determine the latest sigma file, remove it from the list, concatenate the rest

# Map date stamp -> path for every results/sigma_<digits>.csv file.
# Files whose stamp is not numeric are ignored instead of crashing int().
sigma_by_date = {}
for file in glob.glob("results/sigma_*.csv"):
    stamp = os.path.splitext(os.path.basename(file))[0][len("sigma_"):]
    if stamp.isdigit():
        sigma_by_date[int(stamp)] = file

sigma_columns = [ "job_url",
    "site", "title", "company", "company_url", "location", "job_type",
    "date_posted", "interval", "min_amount", "max_amount", "currency",
    "is_remote", "num_urgent_words", "benefits", "emails", "description"]

if sigma_by_date:
    latest_sigma_date = max(sigma_by_date)
    latest_sigma_path = sigma_by_date[latest_sigma_date]
    # Select the older files by date rather than by comparing path strings,
    # which depends on the path separator glob returns.
    sigma_files = [path for stamp, path in sorted(sigma_by_date.items()) if stamp != latest_sigma_date]
    sigma_latest = pd.read_csv(latest_sigma_path)
else:
    print("No sigma files found in results/ - nothing to compare")
    sigma_files = []
    sigma_latest = pd.DataFrame(columns=sigma_columns)

prev_frames = [pd.read_csv(file) for file in sigma_files]
if prev_frames:
    sigma_prev = pd.concat(prev_frames, ignore_index=True)
else:
    sigma_prev = pd.DataFrame(columns=sigma_columns)

# delete all entries from sigma latest that are also in sigma prev
sigma = sigma_latest[~sigma_latest["job_url"].isin(sigma_prev["job_url"])]

print(f"Latest sigma entries: {len(sigma_latest)}")
print(f"New jobs found: {len(sigma)}")

# Only ask when there is something to open.
if len(sigma) > 0:
    cli_in = input("Open all job urls? (y/n): ")
    if cli_in.strip().lower() == "y":
        for url in sigma["job_url"].dropna():
            # webbrowser.open hands the URL to the browser without a shell, so
            # characters such as '&' in scraped URLs are neither interpreted as
            # shell syntax nor able to inject commands.
            webbrowser.open(str(url))
    else:
        print("Not opening job urls")


In [None]:
# Show the new jobs (`sigma`) as an HTML table.
# NOTE(review): this cell used `jobs`, which only exists inside get_jobs() and
# is undefined at notebook level; `sigma` is assumed to be the intended frame - confirm.
# render_links=True makes the URLs clickable while all other cell text stays
# HTML-escaped (scraped titles/companies are not rendered as raw HTML).
html = sigma.to_html(render_links=True)
# change max-width: 200px to show more or less of the content
truncate_width = f'<style>.dataframe td {{ max-width: 200px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }}</style>{html}'
display(HTML(truncate_width))

In [None]:
#Dough, Sauce ,Toppings - 3 Pizzaz