In [2]:
from jobspy import scrape_jobs
import pandas as pd
from IPython.display import display, HTML
import csv, datetime, time, os
import random, glob

# Hybrid(office) proximity
# --- Later ---
# LLM ethos check


In [29]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 50)

In [30]:


keywords = ['python', 'database', 'nlp','natural language',
            'signal processing', 'prompt engineer', 'openai', 'language model', 'generative model',
            'genai','ki', 'künstliche intelligenz', 'ai', 'artificial intelligence']

def update_keywords_left(keywords: list, date: str) -> list:
    
    files = glob.glob("results/*.csv")
    files = [os.path.basename(file) for file in files]
    keywords_done = [keyword for keyword in keywords if f"{keyword}_{date}.csv" in files]
    keywords_left = [keyword for keyword in keywords if keyword not in keywords_done]

    print(f"Date: {date}")
    print(f"Keywords done: {len(keywords_done)} : {keywords_done}")
    print(f"Keywords left: {len(keywords_left)} : {keywords_left}")
    return keywords_left

def get_jobs(keywords: list) -> None:

    now = datetime.datetime.now()
    date = now.strftime("%Y%m%d")

    keywords_left = update_keywords_left(keywords, date)
    
    proxy_path = os.path.join("results", "proxy.txt")
    with open(proxy_path, "r") as f:
        proxy = f.read().strip()
    
    empty_jobs = pd.DataFrame(columns=[ "job_url",
    "site", "title", "company", "company_url", "location", "job_type",
    "date_posted", "interval", "min_amount", "max_amount", "currency",
    "is_remote", "num_urgent_words", "benefits", "emails", "description"])
    
    while keywords_left:
    
        keyword = random.choice(keywords_left)
        kw_path = os.path.join("results", f"{keyword}_{date}.csv")

        print(f"Keyword: {keyword} -> Starting search...")

        try:
            jobs = scrape_jobs(
                site_name=["indeed"],

                search_term=keyword,
                proxy=proxy,

                hours_old=72,
                is_remote=True,
                results_wanted=500,
                country_indeed='germany'  # only needed for indeed / glassdoor
            )

        except Exception as e:
            print(f"Error with keyword: {keyword}")
            print(e)

            if "Bad proxy" in str(e):
                # stop the program if the proxy is bad
                print("Bad proxy, stopping the program")
                break

            if "Could not find any results for the search" in str(e):
                empty_jobs.to_csv(kw_path, quoting=csv.QUOTE_NONNUMERIC, escapechar="\\", index=False)
                keywords_left = update_keywords_left(keywords, date)
            continue

        jobs=jobs.drop_duplicates(subset=["job_url"], keep="first")
        # listing with the same title from the same company are considered duplicates, because
        # they are usually the same job offer, just posted multiple times for different locations
        jobs=jobs.drop_duplicates(subset=["title", "company"], keep="first")

        print(f"Number of (unique) jobs found: {len(jobs)}")

        
        jobs.to_csv(kw_path, quoting=csv.QUOTE_NONNUMERIC, escapechar="\\", index=False)
        keywords_left = update_keywords_left(keywords, date)

get_jobs(keywords)

Date: 20240220
Keywords done: 0 : []
Keywords left: 14 : ['python', 'database', 'nlp', 'natural language', 'signal processing', 'prompt engineer', 'openai', 'language model', 'generative model', 'genai', 'ki', 'künstliche intelligenz', 'ai', 'artificial intelligence']
Keyword: database -> Starting search...


  jobs_df = pd.concat(jobs_dfs, ignore_index=True)


Number of (unique) jobs found: 12
Date: 20240220
Keywords done: 1 : ['database']
Keywords left: 13 : ['python', 'nlp', 'natural language', 'signal processing', 'prompt engineer', 'openai', 'language model', 'generative model', 'genai', 'ki', 'künstliche intelligenz', 'ai', 'artificial intelligence']
Keyword: natural language -> Starting search...
Error with keyword: natural language
Could not find any results for the search
Date: 20240220
Keywords done: 2 : ['database', 'natural language']
Keywords left: 12 : ['python', 'nlp', 'signal processing', 'prompt engineer', 'openai', 'language model', 'generative model', 'genai', 'ki', 'künstliche intelligenz', 'ai', 'artificial intelligence']
Keyword: generative model -> Starting search...
Error with keyword: generative model
Could not find any results for the search
Date: 20240220
Keywords done: 3 : ['database', 'natural language', 'generative model']
Keywords left: 11 : ['python', 'nlp', 'signal processing', 'prompt engineer', 'openai', 'la

  jobs_df = pd.concat(jobs_dfs, ignore_index=True)


Number of (unique) jobs found: 27
Date: 20240220
Keywords done: 9 : ['python', 'database', 'nlp', 'natural language', 'signal processing', 'openai', 'language model', 'generative model', 'ai']
Keywords left: 5 : ['prompt engineer', 'genai', 'ki', 'künstliche intelligenz', 'artificial intelligence']
Keyword: genai -> Starting search...
Error with keyword: genai
Could not find any results for the search
Date: 20240220
Keywords done: 10 : ['python', 'database', 'nlp', 'natural language', 'signal processing', 'openai', 'language model', 'generative model', 'genai', 'ai']
Keywords left: 4 : ['prompt engineer', 'ki', 'künstliche intelligenz', 'artificial intelligence']
Keyword: künstliche intelligenz -> Starting search...


  jobs_df = pd.concat(jobs_dfs, ignore_index=True)


Number of (unique) jobs found: 16
Date: 20240220
Keywords done: 11 : ['python', 'database', 'nlp', 'natural language', 'signal processing', 'openai', 'language model', 'generative model', 'genai', 'künstliche intelligenz', 'ai']
Keywords left: 3 : ['prompt engineer', 'ki', 'artificial intelligence']
Keyword: artificial intelligence -> Starting search...
Number of (unique) jobs found: 2
Date: 20240220
Keywords done: 12 : ['python', 'database', 'nlp', 'natural language', 'signal processing', 'openai', 'language model', 'generative model', 'genai', 'künstliche intelligenz', 'ai', 'artificial intelligence']
Keywords left: 2 : ['prompt engineer', 'ki']
Keyword: prompt engineer -> Starting search...
Error with keyword: prompt engineer
Could not find any results for the search
Date: 20240220
Keywords done: 13 : ['python', 'database', 'nlp', 'natural language', 'signal processing', 'prompt engineer', 'openai', 'language model', 'generative model', 'genai', 'künstliche intelligenz', 'ai', 'arti

  jobs_df = pd.concat(jobs_dfs, ignore_index=True)


In [33]:
# check dates and show results - meta table

file_paths = glob.glob("results/*.csv")

file_names = [os.path.basename(file) for file in file_paths]
file_names = sorted(file_names, reverse=True)

dates = set([file.split("_")[1].split(".")[0] for file in file_names])
print(f"Dates: {dates}")

for input_date in dates:

    keywords_date = [file for file in file_names if input_date in file]
    length = len(keywords_date)
    lengths = []

    print(f" --- {length} Keywords for {input_date} : {keywords_date}")

    # print the length of the dataframes vertically in table format

    for file in keywords_date:
        df = pd.read_csv(os.path.join("results", file))
        lengths.append(len(df))
    
    # Printing the ASCII table
    print("+"+"-"*52+"+"+"--------+")
    print("| File"+" "*47+"| Length |")
    print("+"+"-"*52+"+"+"-------+")
    for file, length in zip(keywords_date, lengths):
        print(f"| {file:50} | {length:5} |")
    print("+"+"-"*52+"+"+"--------+")


"""
# Assuming file_names and dates are defined

file_names = ["example_20200101.csv", "sample_20200102.csv"]  # Example file names
dates = ["20200101", "20200102"]  # Example dates

for input_date in dates:
    keywords_date = [file for file in file_names if input_date in file]
    lengths = []

    for file in keywords_date:
        df = pd.read_csv(os.path.join("results", file))
        lengths.append(len(df))
    
    # Printing the ASCII table
    print(f" --- {len(keywords_date)} Keywords for {input_date} : {keywords_date}")
    print("+------------+--------+")
    print("| File       | Length |")
    print("+------------+--------+")
    for file, length in zip(keywords_date, lengths):
        print(f"| {file:} | {length:6} |")
    print("+------------+--------+")

"""

Dates: {'20240217', '20240220', '20240218'}
 --- 15 Keywords for 20240217 : ['signal processing_20240217.csv', 'sigma_20240217.csv', 'python_20240217.csv', 'prompt engineer_20240217.csv', 'openai_20240217.csv', 'nlp_20240217.csv', 'natural language_20240217.csv', 'language model_20240217.csv', 'künstliche intelligenz_20240217.csv', 'ki_20240217.csv', 'generative model_20240217.csv', 'genai_20240217.csv', 'database_20240217.csv', 'artificial intelligence_20240217.csv', 'ai_20240217.csv']
+----------------------------------------------------+--------+
| File                                               | Length |
+----------------------------------------------------+-------+
| signal processing_20240217.csv                     |     1 |
| sigma_20240217.csv                                 |   143 |
| python_20240217.csv                                |    61 |
| prompt engineer_20240217.csv                       |     1 |
| openai_20240217.csv                                |     0 |
| 

'\n# Assuming file_names and dates are defined\n\nfile_names = ["example_20200101.csv", "sample_20200102.csv"]  # Example file names\ndates = ["20200101", "20200102"]  # Example dates\n\nfor input_date in dates:\n    keywords_date = [file for file in file_names if input_date in file]\n    lengths = []\n\n    for file in keywords_date:\n        df = pd.read_csv(os.path.join("results", file))\n        lengths.append(len(df))\n    \n    # Printing the ASCII table\n    print(f" --- {len(keywords_date)} Keywords for {input_date} : {keywords_date}")\n    print("+------------+--------+")\n    print("| File       | Length |")\n    print("+------------+--------+")\n    for file, length in zip(keywords_date, lengths):\n        print(f"| {file:} | {length:6} |")\n    print("+------------+--------+")\n\n'

In [31]:
# Create SIGMA for date - concatenate all csv files into one by date    

now = datetime.datetime.now()   
date = now.strftime("%Y%m%d")


search_str = f"results/*{date}.csv"
file_names = glob.glob("results/*.csv")

sigma_date = pd.DataFrame(columns=[ "job_url",
    "site", "title", "company", "company_url", "location", "job_type",
    "date_posted", "interval", "min_amount", "max_amount", "currency",
    "is_remote", "num_urgent_words", "benefits", "emails", "description"])

for file in file_names:
    if date in file:
        df = pd.read_csv(file)
        sigma_date = pd.concat([sigma_date, df], ignore_index=True)

# remove duplicates
sigma_date = sigma_date.drop_duplicates(subset=["job_url"], keep="first")
sigma_date = sigma_date.drop_duplicates(subset=["title", "company"], keep="first")


# drop description column

sigma_date = sigma_date.drop(columns=["description"])

sigma_date.to_csv(f"results/sigma_{date}.csv", quoting=csv.QUOTE_NONNUMERIC, escapechar="\\", index=False)
print(sigma_date.info())



<class 'pandas.core.frame.DataFrame'>
Index: 96 entries, 0 to 117
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   job_url           96 non-null     object 
 1   site              96 non-null     object 
 2   title             96 non-null     object 
 3   company           96 non-null     object 
 4   company_url       92 non-null     object 
 5   location          96 non-null     object 
 6   job_type          62 non-null     object 
 7   date_posted       96 non-null     object 
 8   interval          13 non-null     object 
 9   min_amount        13 non-null     float64
 10  max_amount        13 non-null     float64
 11  currency          13 non-null     object 
 12  is_remote         96 non-null     object 
 13  num_urgent_words  61 non-null     float64
 14  benefits          0 non-null      float64
 15  emails            13 non-null     object 
dtypes: float64(4), object(12)
memory usage: 12.8+ KB
N

  sigma_date = pd.concat([sigma_date, df], ignore_index=True)
  sigma_date = pd.concat([sigma_date, df], ignore_index=True)
  sigma_date = pd.concat([sigma_date, df], ignore_index=True)
  sigma_date = pd.concat([sigma_date, df], ignore_index=True)
  sigma_date = pd.concat([sigma_date, df], ignore_index=True)
  sigma_date = pd.concat([sigma_date, df], ignore_index=True)
  sigma_date = pd.concat([sigma_date, df], ignore_index=True)
  sigma_date = pd.concat([sigma_date, df], ignore_index=True)


In [32]:

# determine the latest sigma file, remove it from the list, concatenate the rest
sigma_files = glob.glob("results/sigma_*.csv")
sigma_dates = [file.split("_")[1].split(".")[0] for file in sigma_files]
sigma_dates = [int(date) for date in sigma_dates]

latest_sigma_date = max(sigma_dates)
latest_sigma_path = f"results/sigma_{latest_sigma_date}.csv"
sigma_files = [file for file in sigma_files if file != latest_sigma_path]

sigma_latest = pd.read_csv(latest_sigma_path)
sigma_prev = pd.DataFrame(columns=[ "job_url",
    "site", "title", "company", "company_url", "location", "job_type",
    "date_posted", "interval", "min_amount", "max_amount", "currency",
    "is_remote", "num_urgent_words", "benefits", "emails", "description"])

for file in sigma_files:
    df = pd.read_csv(file)
    sigma_prev = pd.concat([sigma_prev, df], ignore_index=True)

# delete all entries from sigma latest that are also in sigma prev
sigma = sigma_latest[~sigma_latest["job_url"].isin(sigma_prev["job_url"])]

print(f"Latest sigma entries: {len(sigma_latest)}")
print(f"New jobs found: {len(sigma)}")

input = input("Open all job urls? (y/n): ")
if input == "y":

    for url in sigma["job_url"]:
        os.system(f"open {url}")
else:
    print("Not opening job urls")


  sigma_prev = pd.concat([sigma_prev, df], ignore_index=True)


Latest sigma entries: 96
New jobs found: 86
Not opening job urls


In [None]:
# use if hyperlinks=True
html = jobs.to_html(escape=False)
# change max-width: 200px to show more or less of the content
truncate_width = f'<style>.dataframe td {{ max-width: 200px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }}</style>{html}'
display(HTML(truncate_width))

In [None]:
#Dough, Sauce ,Toppings - 3 Pizzaz