In [12]:
import json, glob, os
import random, datetime

import logging
import pandas as pd
import csv

from jobspy import scrape_jobs


In [None]:
"""

ids are unique, global static

RUN
 -  GET USER CONFIGS
 -  GET DATE

 - FOR EACH USER CONFIG

    - For each search_setting

        - For each KW

            - Find Jobs


Agent Fisher
    - run through all users, their settings, and their keywords
    - save the results in their corresponding folders

Agent Perry
    - ELT the data - save to the correct folder
    - save the results in their corresponding folders
    
    
    - Generate Reports and Stats


    LOGGING

    - folder structure does a lot for logging
    - log all errors and warnings - and the files that triggered them
    - saving files (advancing state) is very permissive as of now.
    - Empty files are not saved (they probably should be though)

    -----------

    - Log all errors and warnings
    - Log all successful runs
    - Log all failed runs

     [RUN-LOGS-(package)-E-Mail]

     encrypted zip - stats.csv, sigma_quo.txt, 

     - polymorphic/other dp, encap, graph inherit


"""

In [13]:
class Agent_Perry():
    """
    Post-processing agent for scraped job listings (work in progress).

    Planned responsibility: in-file redundancy handling - duplicates, also
    across time.

    Sketch of the intended de-duplication step (not wired in yet):

        # listings with the same title from the same company are considered
        # duplicates, because they are usually the same job offer, just
        # posted multiple times for different locations
        jobs = jobs.drop_duplicates(subset=["title", "company"], keep="first")
        print(f"Number of (unique) jobs found: {len(jobs)}")
    """

    @staticmethod
    def filter_jobs(jobs: pd.DataFrame, search_setting: dict) -> pd.DataFrame:
        """
        Filter the scraped ``jobs`` according to ``search_setting``.

        Declared as a static method because it takes no ``self``; this makes
        both ``Agent_Perry.filter_jobs(...)`` and ``Agent_Perry().filter_jobs(...)``
        valid calls.

        Parameters
        ----------
        jobs : pd.DataFrame
            Job listings to filter.
        search_setting : dict
            Search setting the listings were scraped for (currently unused).

        Returns
        -------
        pd.DataFrame
            The listings that passed the filter. No filter rules exist yet,
            so ``jobs`` is handed back unchanged to honour the annotated
            return type.
        """
        # TODO: implement the actual filtering rules.
        return jobs

class Agent_Krieger:
    """Placeholder agent: no attributes or behaviour are defined yet."""



class Agent_Fisher():
    """
    Agent Fisher is the muscle of the operation. It is responsible for running the scrapes.

    get_user_configs - returns a list of dictionaries, each dictionary is a user config.
    get_proxy - returns the proxy to be passed to the scraper as a string.
    get_date - returns a string of the date in the format YYYYMMDD.

    A run is identified by date, user and search_setting (stored in the user
    configs - eg. IT jobs in London, remote teaching jobs in Germany). Results
    are written to ``users/<username>/<date>/<search_setting name>/<keyword>.csv``;
    the presence of such a file is what marks a keyword as done.
    """

    # Proxy file used when no explicit path is supplied.
    DEFAULT_PROXY_PATH = os.path.join("results", "proxy.txt")

    # Failed scrape attempts tolerated per keyword within one
    # run_search_setting call before the keyword is skipped for that call.
    MAX_ATTEMPTS_PER_KEYWORD = 3

    # Header written for keywords that produced no jobs, so that an "empty"
    # result file still exists and marks the keyword as done.
    JOB_COLUMNS = [
        "job_url", "site", "title", "company", "company_url", "location", "job_type",
        "date_posted", "interval", "min_amount", "max_amount", "currency",
        "is_remote", "num_urgent_words", "benefits", "emails", "description",
    ]

    def __init__(self, proxy_path: str):
        """
        Load the run date, the proxy and every user config.

        Parameters
        ----------
        proxy_path : str
            Path of the text file holding the proxy string. A falsy value
            falls back to ``DEFAULT_PROXY_PATH``.
        """
        self.date_run = self.get_date()
        self.proxy = self.get_proxy(proxy_path or self.DEFAULT_PROXY_PATH)
        self.user_configs = self.get_user_configs()

    # --- INIT Functions --- #

    @staticmethod
    def get_date() -> str:
        """Return today's local date formatted as ``YYYYMMDD``."""
        return datetime.datetime.now().strftime("%Y%m%d")

    @staticmethod
    def get_user_configs() -> list[dict]:
        """
        Load every JSON user config found under ``users/<something>/``.

        Returns
        -------
        list[dict]
            One parsed JSON document per config file, ordered by file path.
        """
        # Without recursive=True, "**" behaves like "*": exactly one directory
        # level below "users" is searched. Sorted for a reproducible order.
        user_config_paths = sorted(glob.glob('users/**/*.json'))
        print(f" ---> Config file paths found : {user_config_paths}")

        user_configs = []
        for user_config_path in user_config_paths:
            with open(user_config_path, encoding="utf-8") as f:
                user_configs.append(json.load(f))

        return user_configs

    @staticmethod
    def get_proxy(proxy_path: str = DEFAULT_PROXY_PATH) -> str:
        """
        Read the proxy string from ``proxy_path``.

        Returns
        -------
        str
            File content with surrounding whitespace stripped.

        Raises
        ------
        OSError
            If the file cannot be opened.
        """
        with open(proxy_path, "r") as f:
            return f.read().strip()

    # --- SEARCH --- #

    @staticmethod
    def update_keywords_left(kewords_for_this_search: list, ss_path: str) -> list[str]:
        """
        Determine which keywords still have to be run for a search setting.

        A keyword counts as done when ``<ss_path>/<keyword>.csv`` exists, which
        is the file run_search_setting writes after a clean scrape. Keywords
        whose scrape raised have no file and therefore stay in the list.

        Parameters
        ----------
        kewords_for_this_search : list
            All keywords configured for the search setting.
        ss_path : str
            user/date/search_setting - path to the search setting folder.

        Returns
        -------
        list[str]
            Keywords without a result file, in their configured order.
        """
        # Each expected file is checked directly instead of globbing "*.csv":
        # glob skips dotfiles, so the result of an empty keyword (".csv")
        # would never be seen and that keyword would be rerun forever.
        keywords_done = [
            keyword for keyword in kewords_for_this_search
            if os.path.isfile(os.path.join(ss_path, f"{keyword}.csv"))
        ]
        keywords_left = [
            keyword for keyword in kewords_for_this_search
            if keyword not in keywords_done
        ]

        print(f"--- Path: {ss_path}/*.csv ---")
        print(f"Keywords done: {len(keywords_done)} : {keywords_done}")
        print(f"Keywords left: {len(keywords_left)} : {keywords_left}")
        return keywords_left

    def run_search_setting(self, username: str, search_setting: dict, date: str) -> bool:
        """
        Scrape every outstanding keyword of one search setting and save the results.

        Keywords are processed in random order. Each clean scrape is written to
        ``users/<username>/<date>/<name>/<keyword>.csv`` (a header-only file
        when nothing was found), which marks the keyword as done.

        Parameters
        ----------
        username : str
            Name of the user folder below ``users``.
        search_setting : dict
            Must provide ``name``, ``site_name``, ``hours_old``, ``is_remote``,
            ``results_wanted``, ``country_indeed`` and the keyword list under
            ``keywords`` (``search_terms`` is accepted as a fallback key).
        date : str
            Run date used as folder name.

        Returns
        -------
        bool
            True when every keyword ended up with a result file. False when
            the run was stopped because of a bad proxy, or when at least one
            keyword was skipped after ``MAX_ATTEMPTS_PER_KEYWORD`` failures.

        Raises
        ------
        KeyError
            If a required key is missing from ``search_setting``.
        """
        if 'keywords' in search_setting:
            keywords = search_setting['keywords']
        else:
            # The user configs shown in the notebook store the list under this key.
            keywords = search_setting['search_terms']

        ss_path = os.path.join("users", username, date, search_setting['name'])
        # to_csv does not create missing parent folders.
        os.makedirs(ss_path, exist_ok=True)

        keywords_left = self.update_keywords_left(keywords, ss_path)
        failures = {}      # keyword -> number of failed attempts in this call
        abandoned = set()  # keywords skipped after too many failures

        while keywords_left:

            keyword = random.choice(keywords_left)
            kw_path = os.path.join(ss_path, f"{keyword}.csv")

            print(f"Keyword: *** {keyword} *** -> Starting search...")

            try:
                jobs = scrape_jobs(
                    site_name=search_setting['site_name'],

                    search_term=keyword,
                    proxy=self.proxy,

                    hours_old=search_setting['hours_old'],
                    is_remote=search_setting['is_remote'],
                    results_wanted=search_setting['results_wanted'],
                    country_indeed=search_setting['country_indeed']  # only needed for indeed / glassdoor
                )

            except Exception as e:
                print(f"Error with keyword: {keyword}")
                print(e)

                if "Bad proxy" in str(e):
                    # if the proxy is not working, the program will never advance, so we stop it.
                    print("Bad proxy, stopping the program")

                    # TODO: Outbound email to notify the admin, maybe change proxy
                    return False

                # There is also : "HTTPSConnectionPool(host='apis.indeed.com', port=443): Max retries
                # exceeded with url: /graphql (Caused by ProxyError('Unable to connect to proxy',
                # RemoteDisconnected('Remote end closed connection without response')))" Error.
                # Such errors are retried, but only a bounded number of times so a
                # persistent failure cannot keep the loop spinning forever.
                failures[keyword] = failures.get(keyword, 0) + 1
                if failures[keyword] >= self.MAX_ATTEMPTS_PER_KEYWORD:
                    print(f"Giving up on keyword: {keyword} after {failures[keyword]} failed attempts")
                    abandoned.add(keyword)
                    keywords_left = [kw for kw in keywords_left if kw != keyword]

                continue

            print(f"Number of jobs found: {0 if jobs is None else len(jobs)}")

            # Emptiness is checked before de-duplicating: an empty result may
            # lack the "job_url" column, which would make drop_duplicates raise.
            if jobs is None or jobs.empty:
                print(f"No jobs found for keyword: {keyword}. Writing empty file...")
                pd.DataFrame(columns=self.JOB_COLUMNS).to_csv(
                    kw_path, quoting=csv.QUOTE_NONNUMERIC, escapechar="\\", index=False)

            else:
                jobs = jobs.drop_duplicates(subset=["job_url"], keep="first")
                print(f"Writing jobs to file: {kw_path}")
                jobs.to_csv(kw_path, quoting=csv.QUOTE_NONNUMERIC, escapechar="\\", index=False)

            keywords_left = [
                kw for kw in self.update_keywords_left(keywords, ss_path)
                if kw not in abandoned
            ]

        return not abandoned

    # --- Overarching --- #

    def run_user(self) -> None:
        """Run all search settings of a single user. Not implemented yet."""
        pass

    def run_all_users(self):
        """
        Entry point for a full run over all loaded user configs.

        Currently only reports the run date; the loop over ``self.user_configs``
        is still to be written. The proxy is already available as
        ``self.proxy`` and is therefore not read from disk again here.
        """
        print(f" ---> Date : {self.date_run}")



def send_logs() -> None:
    """Placeholder for sending the run logs; does nothing yet."""
    return None

def send_email() -> None:
    """Placeholder for sending an e-mail; does nothing yet."""
    return None

def run_all_users(self):
    """
    Module-level draft of ``Agent_Fisher.run_all_users``.

    Parameters
    ----------
    self : object
        Expected to expose ``date_run`` the way ``Agent_Fisher.__init__`` sets
        it: as a plain attribute holding the run date string.

    Notes
    -----
    ``date_run`` is read as an attribute, not called, because
    ``Agent_Fisher`` stores it as a string. The proxy file is not read here
    since the value was never used.
    """
    date = self.date_run

    print(f" ---> Date : {date}")

 ---> Config file paths found : ['users/tdawg/tdawg_config.json', 'users/lucky/lucky_config.json']


[{'user': 'tdawg', 'email': 'test@test.de'},
 {'user': 'lucky',
  'email': 's4mipojo@uni-trier.de',
  'jobs': [{'id': 1,
    'name': 'lux_py',
    'site_name': ['indeed'],
    'search_terms': ['', 'python', 'database'],
    'results_wanted': 500,
    'hours_old': 72,
    'is_remote': False,
    'country_indeed': 'luxembourg',
    'proxy': 'proxy',
    'run_count': 0,
    'run_left': 0,
    'created': '2016-01-01T00:00:00Z',
    'updated': '2016-01-01T00:00:00Z'},
   {'id': 2,
    'name': 'ger_py',
    'site_name': ['indeed'],
    'search_terms': ['', 'python', 'database'],
    'results_wanted': 500,
    'hours_old': 72,
    'is_remote': True,
    'country_indeed': 'germany',
    'proxy': 'proxy',
    'status': 'running',
    'created': '2016-01-01T00:00:00Z',
    'updated': '2016-01-01T00:00:00Z'}]}]