In [1]:
from jobspy import scrape_jobs
import pandas as pd
from IPython.display import display, HTML
import csv, datetime, time, os, json
import random, glob
from pprint import pprint

# Hybrid(office) proximity
# --- Later ---
# LLM ethos check
# logging

"""
 #  
    usig date folders with kw as csv files to denote states (ACID-drops) and progress
    search config kw might change over time which is why we will store the kw in csv files

    - users
        - name and email
        - jobs_configs
        - result_jobs - dates[keywords], sigma_raw (csv), sigma_quo(function) 
        - result_stats - eg. ger_nlp_remote, lux_bwl, sigma
        
    

        
-> Stats - Per job_config & Per User
         - 

        kw1     kw2     ...    kwN     

date1

date2

...

dateN

    
"""



In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 50)

In [None]:
# Import user configs

def insert_user(user, email):
    
    if not os.path.exists(f'users/{user}'):

        os.mkdir(f'users/{user}')
        user_dict = {'user': user, 'email': email}
        config_path = os.path.join(f'users/{user}', f'{user}_config.json')
        with open(config_path, 'w') as f:
            json.dump(user_dict, f, indent=4)

    else :
        print(f'User {user} already exists')


#insert_user('tdawg', 'test@test.de')


basic_PJ = {
    "platform":"indeed", 
    "location": "England-remote", 
    "kewwords": ["python", "data"],
            
    "days_left": 7,
    "interval": "daily"
    }


user_config_paths = glob.glob('users/**/*.json')
print(user_config_paths)

user_configs = []
for user_config_path in user_config_paths:
    with open(user_config_path) as f:
        tmp_dict = json.load(f)
        user_configs.append(tmp_dict)

for user in user_configs:
   print(user)



def order_PJ(user, P):
    pass

def show_stats():
    pass


In [6]:


keywords = ['python', 'database', 'nlp','natural language',
            'signal processing', 'prompt engineer', 'openai', 'language model', 'generative model',
            'genai','ki', 'künstliche intelligenz', 'ai', 'artificial intelligence']

def update_keywords_left(keywords: list, date: str) -> list:
    
    files = glob.glob("results/*.csv")
    files = [os.path.basename(file) for file in files]
    keywords_done = [keyword for keyword in keywords if f"{keyword}_{date}.csv" in files]
    keywords_left = [keyword for keyword in keywords if keyword not in keywords_done]

    print(f"--- Date: {date} ---")
    print(f"Keywords done: {len(keywords_done)} : {keywords_done}")
    print(f"Keywords left: {len(keywords_left)} : {keywords_left}")
    return keywords_left

def get_jobs(keywords: list) -> None:

    now = datetime.datetime.now()
    date = now.strftime("%Y%m%d")

    keywords_left = update_keywords_left(keywords, date)
    
    proxy_path = os.path.join("results", "proxy.txt")
    with open(proxy_path, "r") as f:
        proxy = f.read().strip()
    
    empty_jobs = pd.DataFrame(columns=[ "job_url",
    "site", "title", "company", "company_url", "location", "job_type",
    "date_posted", "interval", "min_amount", "max_amount", "currency",
    "is_remote", "num_urgent_words", "benefits", "emails", "description"])
    jobs = empty_jobs

    while keywords_left:
    
        keyword = random.choice(keywords_left)
        kw_path = os.path.join("results", f"{keyword}_{date}.csv")

        print(f"Keyword: *** {keyword} *** -> Starting search...")

        try:
            jobs = scrape_jobs(
                site_name=["indeed"],

                search_term=keyword,
                proxy=proxy,

                hours_old=72,
                #is_remote=True, # Lux does not seem to work with remote
                results_wanted=500,
                country_indeed='luxembourg'#'germany'  # only needed for indeed / glassdoor
            )

        except Exception as e:
            print(f"Error with keyword: {keyword}")
            print(e)

            if "Bad proxy" in str(e):
                # if the proxy is not working, the program will never advance, so we stop it.
                print("Bad proxy, stopping the program")

                " TODO: Outbound email to notify the admin, maybe change proxy"
                break

            # There is also : "HTTPSConnectionPool(host='apis.indeed.com', port=443): Max retries 
            # exceeded with url: /graphql (Caused by ProxyError('Unable to connect to proxy', 
            # RemoteDisconnected('Remote end closed connection without response')))" Error but
            # but the file being written is not necessarily empty.

            continue
        
        
        print(f"Number of jobs found: {len(jobs)}")
        jobs = jobs.drop_duplicates(subset=["job_url"], keep="first")
        # listing with the same title from the same company are considered duplicates, because
        # they are usually the same job offer, just posted multiple times for different locations
        jobs = jobs.drop_duplicates(subset=["title", "company"], keep="first")
        print(f"Number of (unique) jobs found: {len(jobs)}")

        if jobs.empty:
            print(f"No jobs found for keyword: {keyword}. Writing empty file...")
            empty_jobs.to_csv(kw_path, quoting=csv.QUOTE_NONNUMERIC, escapechar="\\", index=False)
            
        else:
            print(f"Writing jobs to file: {kw_path}")
            jobs.to_csv(kw_path, quoting=csv.QUOTE_NONNUMERIC, escapechar="\\", index=False)
            jobs = empty_jobs
        
        keywords_left = update_keywords_left(keywords, date)

get_jobs(keywords)

--- Date: 20240227
Keywords done: 0 : []
Keywords left: 14 : ['python', 'database', 'nlp', 'natural language', 'signal processing', 'prompt engineer', 'openai', 'language model', 'generative model', 'genai', 'ki', 'künstliche intelligenz', 'ai', 'artificial intelligence']
Keyword: *** language model *** -> Starting search...
Number of jobs found: 13
Number of (unique) jobs found: 12
Writing jobs to file: results/language model_20240227.csv
--- Date: 20240227
Keywords done: 1 : ['language model']
Keywords left: 13 : ['python', 'database', 'nlp', 'natural language', 'signal processing', 'prompt engineer', 'openai', 'generative model', 'genai', 'ki', 'künstliche intelligenz', 'ai', 'artificial intelligence']
Keyword: *** ki *** -> Starting search...
Number of jobs found: 0
Number of (unique) jobs found: 0
No jobs found for keyword: ki. Writing empty file...
--- Date: 20240227
Keywords done: 2 : ['language model', 'ki']
Keywords left: 12 : ['python', 'database', 'nlp', 'natural language', 

2024-02-27 12:29:04,301 - JobSpy - ERROR - Indeed response status code 403
  jobs_df = pd.concat(jobs_dfs, ignore_index=True)


Number of jobs found: 12
Number of (unique) jobs found: 11
Writing jobs to file: results/python_20240227.csv
--- Date: 20240227
Keywords done: 9 : ['python', 'nlp', 'signal processing', 'openai', 'language model', 'generative model', 'genai', 'ki', 'künstliche intelligenz']
Keywords left: 5 : ['database', 'natural language', 'prompt engineer', 'ai', 'artificial intelligence']
Keyword: *** natural language *** -> Starting search...
Number of jobs found: 1
Number of (unique) jobs found: 1
Writing jobs to file: results/natural language_20240227.csv
--- Date: 20240227
Keywords done: 10 : ['python', 'nlp', 'natural language', 'signal processing', 'openai', 'language model', 'generative model', 'genai', 'ki', 'künstliche intelligenz']
Keywords left: 4 : ['database', 'prompt engineer', 'ai', 'artificial intelligence']
Keyword: *** artificial intelligence *** -> Starting search...
Error with keyword: artificial intelligence
HTTPSConnectionPool(host='apis.indeed.com', port=443): Max retries exc

2024-02-27 12:30:59,952 - JobSpy - ERROR - Indeed response status code 403
  jobs_df = pd.concat(jobs_dfs, ignore_index=True)


Number of jobs found: 4
Number of (unique) jobs found: 4
Writing jobs to file: results/artificial intelligence_20240227.csv
--- Date: 20240227
Keywords done: 13 : ['python', 'nlp', 'natural language', 'signal processing', 'prompt engineer', 'openai', 'language model', 'generative model', 'genai', 'ki', 'künstliche intelligenz', 'ai', 'artificial intelligence']
Keywords left: 1 : ['database']
Keyword: *** database *** -> Starting search...
Error with keyword: database
HTTPSConnectionPool(host='apis.indeed.com', port=443): Max retries exceeded with url: /graphql (Caused by SSLError(SSLError(1, '[SSL: DECRYPTION_FAILED_OR_BAD_RECORD_MAC] decryption failed or bad record mac (_ssl.c:1007)')))
Keyword: *** database *** -> Starting search...
Number of jobs found: 7
Number of (unique) jobs found: 7
Writing jobs to file: results/database_20240227.csv
--- Date: 20240227
Keywords done: 14 : ['python', 'database', 'nlp', 'natural language', 'signal processing', 'prompt engineer', 'openai', 'langua

  jobs_df = pd.concat(jobs_dfs, ignore_index=True)


In [8]:
# check dates and show results - meta table

file_paths = glob.glob("results/*.csv")

file_names = [os.path.basename(file) for file in file_paths]
file_names = sorted(file_names, reverse=True)

dates = set([file.split("_")[1].split(".")[0] for file in file_names])
print(f"Dates: {dates}")

for input_date in dates:

    keywords_date = [file for file in file_names if input_date in file]
    length = len(keywords_date)
    lengths = []

    print(f" --- {length} Keywords for {input_date} : {keywords_date}")

    # print the length of the dataframes vertically in table format

    for file in keywords_date:
        df = pd.read_csv(os.path.join("results", file))
        lengths.append(len(df))
    
    # Printing the ASCII table
    print("+"+"-"*52+"+"+"--------+")
    print("| File"+" "*47+"| Length |")
    print("+"+"-"*52+"+"+"--------+")
    for file, length in zip(keywords_date, lengths):
        print(f"| {file:50} | {length:6} |")
    print("+"+"-"*52+"+"+"--------+")


Dates: {'20240227'}
 --- 15 Keywords for 20240227 : ['signal processing_20240227.csv', 'sigma_20240227.csv', 'python_20240227.csv', 'prompt engineer_20240227.csv', 'openai_20240227.csv', 'nlp_20240227.csv', 'natural language_20240227.csv', 'language model_20240227.csv', 'künstliche intelligenz_20240227.csv', 'ki_20240227.csv', 'generative model_20240227.csv', 'genai_20240227.csv', 'database_20240227.csv', 'artificial intelligence_20240227.csv', 'ai_20240227.csv']
+----------------------------------------------------+--------+
| File                                               | Length |
+----------------------------------------------------+--------+
| signal processing_20240227.csv                     |      0 |
| sigma_20240227.csv                                 |     31 |
| python_20240227.csv                                |     11 |
| prompt engineer_20240227.csv                       |      0 |
| openai_20240227.csv                                |      0 |
| nlp_20240227.csv  

In [7]:
# Create SIGMA for date - concatenate all csv files into one by date    

now = datetime.datetime.now()   
date = now.strftime("%Y%m%d")


search_str = f"results/*{date}.csv"
file_names = glob.glob("results/*.csv")

sigma_date = pd.DataFrame(columns=[ "job_url",
    "site", "title", "company", "company_url", "location", "job_type",
    "date_posted", "interval", "min_amount", "max_amount", "currency",
    "is_remote", "num_urgent_words", "benefits", "emails", "description"])

for file in file_names:
    if date in file:
        df = pd.read_csv(file)
        sigma_date = pd.concat([sigma_date, df], ignore_index=True)

# remove duplicates
sigma_date = sigma_date.drop_duplicates(subset=["job_url"], keep="first")
sigma_date = sigma_date.drop_duplicates(subset=["title", "company"], keep="first")


# drop description column

sigma_date = sigma_date.drop(columns=["description"])

sigma_date.to_csv(f"results/sigma_{date}.csv", quoting=csv.QUOTE_NONNUMERIC, escapechar="\\", index=False)
print(sigma_date.info())



<class 'pandas.core.frame.DataFrame'>
Index: 31 entries, 0 to 33
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   job_url           31 non-null     object 
 1   site              31 non-null     object 
 2   title             31 non-null     object 
 3   company           31 non-null     object 
 4   company_url       31 non-null     object 
 5   location          31 non-null     object 
 6   job_type          15 non-null     object 
 7   date_posted       31 non-null     object 
 8   interval          2 non-null      object 
 9   min_amount        2 non-null      float64
 10  max_amount        2 non-null      float64
 11  currency          2 non-null      object 
 12  is_remote         31 non-null     object 
 13  num_urgent_words  20 non-null     float64
 14  benefits          0 non-null      object 
 15  emails            1 non-null      object 
dtypes: float64(3), object(13)
memory usage: 4.1+ KB
Non

  sigma_date = pd.concat([sigma_date, df], ignore_index=True)
  sigma_date = pd.concat([sigma_date, df], ignore_index=True)
  sigma_date = pd.concat([sigma_date, df], ignore_index=True)
  sigma_date = pd.concat([sigma_date, df], ignore_index=True)
  sigma_date = pd.concat([sigma_date, df], ignore_index=True)
  sigma_date = pd.concat([sigma_date, df], ignore_index=True)
  sigma_date = pd.concat([sigma_date, df], ignore_index=True)
  sigma_date = pd.concat([sigma_date, df], ignore_index=True)


In [9]:

# determine the latest sigma file, remove it from the list, concatenate the rest
sigma_files = glob.glob("results/sigma_*.csv")
sigma_dates = [file.split("_")[1].split(".")[0] for file in sigma_files]
sigma_dates = [int(date) for date in sigma_dates]

latest_sigma_date = max(sigma_dates)
latest_sigma_path = f"results/sigma_{latest_sigma_date}.csv"
sigma_files = [file for file in sigma_files if file != latest_sigma_path]

sigma_latest = pd.read_csv(latest_sigma_path)
sigma_prev = pd.DataFrame(columns=[ "job_url",
    "site", "title", "company", "company_url", "location", "job_type",
    "date_posted", "interval", "min_amount", "max_amount", "currency",
    "is_remote", "num_urgent_words", "benefits", "emails", "description"])

for file in sigma_files:
    df = pd.read_csv(file)
    sigma_prev = pd.concat([sigma_prev, df], ignore_index=True)

# delete all entries from sigma latest that are also in sigma prev
sigma = sigma_latest[~sigma_latest["job_url"].isin(sigma_prev["job_url"])]

print(f"Latest sigma entries: {len(sigma_latest)}")
print(f"New jobs found: {len(sigma)}")

cli_in = input("Open all job urls? (y/n): ")
if cli_in == "y":

    for url in sigma["job_url"]:
        os.system(f"open {url}")
else:
    print("Not opening job urls")


Latest sigma entries: 31
New jobs found: 31
Opening in existing browser session.
Opening in existing browser session.
Opening in existing browser session.
Opening in existing browser session.
Opening in existing browser session.
Opening in existing browser session.
Opening in existing browser session.
Opening in existing browser session.
Opening in existing browser session.
Opening in existing browser session.
Opening in existing browser session.
Opening in existing browser session.
Opening in existing browser session.
Opening in existing browser session.
Opening in existing browser session.
Opening in existing browser session.
Opening in existing browser session.
Opening in existing browser session.
Opening in existing browser session.
Opening in existing browser session.
Opening in existing browser session.
Opening in existing browser session.
Opening in existing browser session.
Opening in existing browser session.


Opening in existing browser session.
Opening in existing browser session.
Opening in existing browser session.
Opening in existing browser session.
Opening in existing browser session.
Opening in existing browser session.
Opening in existing browser session.


In [None]:
# use if hyperlinks=True
html = jobs.to_html(escape=False)
# change max-width: 200px to show more or less of the content
truncate_width = f'<style>.dataframe td {{ max-width: 200px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }}</style>{html}'
display(HTML(truncate_width))

In [None]:
#Dough, Sauce ,Toppings - 3 Pizzaz