In [1]:
import pandas as pd
import numpy as np

from methods import sites, files_io, scrape, urls, postings_utils
from methods.macros import *
from schedule import tech_collect, tech_filter

# Directory prefix for the tech postings files. File names are appended to it
# directly further down (path + "<name>.json"), hence the trailing slash.
# RELATIVE_POSTINGS_PATH is not defined in the visible cells — presumably it is
# provided by the star import from methods.macros; TODO confirm.
path = f"{RELATIVE_POSTINGS_PATH}/tech/"

#### Option A: gather new data by running the script

In [None]:
# Run the collection script once and unpack the two entries this notebook uses
# from the mapping it returns: the "results" entry and the "added" entry.
info = tech_collect.main()
current, newly_added = info["results"], info["added"]

Collecting tech job postings...
Found 663 postings in total

Keyword counts in descriptions:
{'graph data': 2, 'time series': 1, 'operations research': 2, 'graph': 30, 'algorithm': 16, 'sensor': 39, 'spatial': 1, 'geospatial': 0, 'geographical': 0, 'machine learning': 50, 'science': 0, 'analytics': 73, 'radar': 4, 'lidar': 0, 'robotics': 0}

Added postings:

Recruiting & Onboarding Specialist - AI/Life Sciences (f/m/d) - at - AITHYRA GmbH
Frontend Developer (m/f/d) - at - Boehringer Ingelheim RCV GmbH & Co KG
Junior Software Developer (f/m/d) - at - Siemens AG Österreich
Data Platform Engineer (m/w/d) - at - Fronius
IT Systems Engineer - Automation and AI (m/f/d) - at - TTTECH Computertechnik AG
Senior Software Engineer (all genders) - at - twinformatics GmbH
Compliance Expert:in – Schwerpunkt AML / Sanctions Compliance  (w/m/d) - at - Raiffeisen Niederösterreich-Wien
Senior AI Engineer (w/m/d) - at - Fabasoft
Software Engineer C++  Video Products (all genders) - at - Kapsch TrafficCom

#### Option B: reload last data

In [2]:
def _load_postings(file_name):
    """Load one stored file from the tech postings directory (`path`)."""
    return files_io.load_file_if_str(path + file_name)


history = _load_postings("postings_history.json")
current = _load_postings("current_postings.json")
newly_added = _load_postings("newly_added_postings.json")

# Alternative, if the files are not stored efficiently but as a time series of
# "dataframes": load the file that get_filename_from_dir returns for index=-1.
#results = files_io.load_file_if_str(path + files_io.get_filename_from_dir(path=path, index=-1))

In [None]:
# Display the newly added postings (assigned by whichever of Option A / Option B was run).
newly_added

### Analyze current data

In [3]:
# Run the filter script on the current postings. output_path=None is passed
# explicitly — presumably this means nothing is written to disk; TODO confirm.
interesting = tech_filter.main(current_postings=current, output_path=None)
#interesting = dict(sorted(interesting.items(), key=lambda x: x[1]['points'], reverse=True))

# Attributes to print for each posting.
printed_attrs = ["title", "company", "keywords", "points", "salary_monthly_guessed", "snippet"]

# Compare the newly added postings against an empty reference list. Both return
# values are deliberately discarded; only the printed report is of interest here.
_, _ = postings_utils.compare_postings(
    newly_added,
    [],
    print_attrs=printed_attrs,
    printed_text_max_length=200,
)

New items above points threshold: 

karriere.at7669694: 
	title: Lead Data Engineer / Lead Data Scientist (w/m/d),
	company: SSI Schäfer Österreich,
	salary_monthly_guessed: 4500,
	keywords: ['data scientist', 'data science', 'algorithm', 'python', 'scientist', 'machine learning', 'brain', 'data engineering', 'management', 'lead', 'data engineer', 'engineer', 'data', 'test', 'matplotlib', 'smart', 'home office', 'design', 'pipeline', 'git', 'dashboard', 'workflow', 'lean', 'stakeholder', 'daten', 'ETL', 'CI/CD', 'ML', 'ELT'],
	points: 4.34,
	snippet: Lead Data Engineer / Lead Data Scientist (w/m/d) SupplyBrain GmbH 2025-11-17 Standort: Graz, AT, 8020 Zielgruppe: Berufserfahrene Arbeitszeitmodell: Vollzeit Beschäftigungsart...

karriere.at7680257: 
	title: Senior AI Engineer (w/m/d),
	company: Helvetia Versicherungen AG,
	salary_monthly_guessed: 4642,
	keywords: ['data science', 'python', 'engineer', 'data', 'docker', 'test', 'agile', 'deutsch', 'senior', 'azure', 'cloud', 'recruit', 'A

### Historical data analysis

In [4]:
# Load the history inside this cell so the section also runs after Option A alone:
# `history` is otherwise only assigned in the Option B cell, and this cell would
# fail with a NameError on a kernel where only Option A was executed.
history = files_io.load_file_if_str(path + "postings_history.json")

print("No. postings so far: ", len(history))
interesting_history = tech_filter.main(current_postings=history, output_path=None)

# One row per posting, indexed by the keys of `history`.
df_history = pd.DataFrame.from_dict(history, orient='index')

# A company counts as "rare" when it has at most this many postings in the history.
RARE_COMPANY_MAX_POSTINGS = 3

# Count postings per company once, instead of re-filtering the whole frame for
# every unique company. value_counts() leaves out missing values, so a missing
# company looks up as 0 below and is still treated as rare — the same outcome the
# per-company equality filter produced. Iterating over unique() keeps the
# first-appearance order of the resulting list.
postings_per_company = df_history['company'].value_counts().to_dict()
rare_companies = [
    company
    for company in df_history['company'].unique()
    if postings_per_company.get(company, 0) <= RARE_COMPANY_MAX_POSTINGS
]

# Columns (and their order) kept in the rare-company view.
history_columns = [
    'title', 'company', 'salary',
    'locations', 'url', 'snippet', 'date',
    'id', 'salary_monthly_guessed', 'description', 'points', 'keywords',
    'collected_on', 'role', 'requirements', 'nice_to_have', 'benefits',
]

# Postings of rare companies, highest points first.
rare_postings = df_history[df_history['company'].isin(rare_companies)]
df_rare_companies = rare_postings.sort_values(by='points', ascending=False)[history_columns]

#file_name = 'postings_2025-01-19-00-33-53' #early postings file as reference
#_, removed_all = postings_utils.compare_postings(results, f'{path}{file_name}.json', print_attrs=[])

No. postings so far:  8195


In [None]:
# Display the full postings history. The last recorded run reported 8195 entries,
# so expect a very long output.
history

In [None]:
# Display what tech_filter.main returned for the full postings history.
interesting_history

In [None]:
# Display the companies that have at most 3 postings in the history.
rare_companies