In [1]:
import pandas as pd
import numpy as np

from methods import sites, actions, scrape, urls
from methods.macros import *
from schedule import tech_collect

# Folder with the tech postings files. RELATIVE_POSTINGS_PATH is not defined in
# this cell, so it presumably comes from the star import of methods.macros — confirm.
# The trailing slash matters: later cells build file paths by appending a file
# name directly to `path`.
path = f"{RELATIVE_POSTINGS_PATH}/tech/"

#### Option A: gather data from script

In [2]:
# Run the collection script; the value it returns is indexed by "results" to
# obtain the postings mapping that the analysis cells below work on.
info = tech_collect.main()
results = info["results"]

Collecting tech job postings...
Found 663 postings in total

Keyword counts in descriptions:
{'graph data': 2, 'time series': 1, 'operations research': 2, 'graph': 30, 'algorithm': 16, 'sensor': 39, 'spatial': 1, 'geospatial': 0, 'geographical': 0, 'machine learning': 50, 'science': 0, 'analytics': 73, 'radar': 4, 'lidar': 0, 'robotics': 0}

Added postings:

Recruiting & Onboarding Specialist - AI/Life Sciences (f/m/d) - at - AITHYRA GmbH
Frontend Developer (m/f/d) - at - Boehringer Ingelheim RCV GmbH & Co KG
Junior Software Developer (f/m/d) - at - Siemens AG Österreich
Data Platform Engineer (m/w/d) - at - Fronius
IT Systems Engineer - Automation and AI (m/f/d) - at - TTTECH Computertechnik AG
Senior Software Engineer (all genders) - at - twinformatics GmbH
Compliance Expert:in – Schwerpunkt AML / Sanctions Compliance  (w/m/d) - at - Raiffeisen Niederösterreich-Wien
Senior AI Engineer (w/m/d) - at - Fabasoft
Software Engineer C++  Video Products (all genders) - at - Kapsch TrafficCom

#### Option B: reload last data

In [2]:
# Load `results` from a file already stored under `path` instead of collecting anew.
# NOTE(review): index=-1 presumably selects the most recent file — confirm the
# ordering used by actions.get_filename_from_dir.
results = actions.load_file_if_str(path + actions.get_filename_from_dir(path=path, index=-1))

TODO: clean up the all-time postings — update them to the new dict structure (filling missing fields with NaN) and recalculate their points — or at least sort them into "good" and "old" folders.

### Analyze current data

In [3]:
# Keep only the postings whose description mentions at least one of the main
# keywords, ordered by points (highest first).
#
# Each description is lower-cased once rather than once per keyword, and
# any() receives a generator so it can stop at the first matching keyword.
# Postings without a string description are skipped and a missing 'points'
# entry sorts as 0, mirroring the guards in the historical-analysis cell,
# instead of raising KeyError/AttributeError.
descriptions_lower = {
    post: value['description'].lower()
    for post, value in results.items()
    if isinstance(value.get('description'), str)
}
interesting = dict(
    sorted(
        (
            (post, results[post])
            for post, text in descriptions_lower.items()
            if any(keyword in text for keyword in tech_collect.MAIN_DESCRIPTION_KEYWORDS)
        ),
        key=lambda item: item[1].get('points', 0),
        reverse=True,  # sorted() is stable, so ties keep the order of `results`
    )
)

In [4]:
# Compare the current postings with the file at index -2 of the postings folder.
# NOTE(review): with index=-1 used for "last data" above, -2 is presumably the
# file collected just before it — confirm.
file_name = actions.get_filename_from_dir(path, index=-2)

# Attributes listed for every posting in the printed comparison report.
compare_print_attrs = [
    "title",
    "company",
    "keywords",
    "points",
    "salary_monthly_guessed",
    "snippet",
]

added, removed = actions.compare_postings(
    results,
    f'{path}{file_name}',
    print_attrs=compare_print_attrs,
    printed_text_max_length=200,
)

New items above points threshold: 

karriere.at7602260: 
	title: Recruiting & Onboarding Specialist - AI/Life Sciences (f/m/d),
	company: AITHYRA GmbH,
	salary_monthly_guessed: 4642,
	keywords: ['aithyra', 'data scientist', 'scientist', 'head', 'researcher', 'lead', 'robot', 'research', 'acquisition', 'leader', 'engineer', 'data', 'support', 'quality', 'home office', 'design', 'information', 'excel', 'insurance', 'data-driven', 'team leader', 'head of', 'stakeholder', 'communication', 'director', 'health', 'AI', 'ML', 'HR'],
	points: 3.773,
	snippet: About AITHYRA AITHYRA, the new Research Institute for Biomedical Artificial Intelligence in Vienna, Austria, is dedicated to advancing a biomedical revolution by connecting cutting-edge...

karriere.at7604316: 
	title: Frontend Developer (m/f/d),
	company: Boehringer Ingelheim RCV GmbH & Co KG,
	salary_monthly_guessed: 4285,
	keywords: ['graph', 'data scientist', 'control', 'manager', 'scientist', 'python', 'machine learning', 'product', '

In [None]:
# Display the full mapping of current postings (the output can be very long).
results

In [None]:
# Display the current postings that matched a main description keyword,
# ordered by points (highest first).
interesting

In [None]:
# Display the first value returned by actions.compare_postings —
# presumably the postings that are new relative to the compared file; confirm.
added

### Historical data analysis

In [5]:
# Build the historical data set from the postings folder.
# NOTE(review): combine_postings presumably merges every stored postings file
# under `path` into one mapping — confirm against methods.actions.
postings_h = actions.combine_postings(folder_path=path)

# Tabular view of the same data: one row per key of `postings_h`.
df_h = pd.DataFrame.from_dict(postings_h, orient="index")

print("No. postings so far: ", len(postings_h))

# Historical postings whose description mentions at least one main keyword,
# ordered by points (highest first; postings without 'points' sort as 0).
#
# The description is validated and lower-cased once per posting instead of
# once per keyword. Only non-empty *string* descriptions are searched: a
# truthiness check alone would let a NaN placeholder through and then fail
# on `.lower()`.
descriptions_lower_h = {
    post: value['description'].lower()
    for post, value in postings_h.items()
    if isinstance(value.get('description'), str) and value['description']
}
interesting_postings_h = dict(
    sorted(
        (
            (post, postings_h[post])
            for post, text in descriptions_lower_h.items()
            if any(keyword in text for keyword in tech_collect.MAIN_DESCRIPTION_KEYWORDS)
        ),
        key=lambda item: item[1].get('points', 0),
        reverse=True,  # sorted() is stable, so ties keep the order of `postings_h`
    )
)

# A company counts as "rare" when it has at most this many historical postings.
MAX_POSTINGS_FOR_RARE = 3

# Count postings per company in a single pass instead of re-filtering the whole
# frame once per unique company. value_counts() drops missing company values,
# so they look up as 0 below and stay in the list — the same outcome an
# equality filter against NaN/None produces (it matches no rows).
company_counts = df_h['company'].value_counts().to_dict()
rare_companies = [
    company
    for company in df_h['company'].unique()  # keeps first-appearance order
    if company_counts.get(company, 0) <= MAX_POSTINGS_FOR_RARE
]

# Columns shown for the rare-company postings, in display order.
rare_company_columns = [
    'title', 'company', 'salary',
    'locations', 'url', 'snippet', 'date',
    'id', 'salary_monthly_guessed', 'description', 'points', 'keywords',
    'collected_on', 'role', 'requirements', 'nice_to_have', 'benefits',
]

# Postings from rare companies, highest points first.
df_rare_companies = (
    df_h[df_h['company'].isin(rare_companies)]
    .sort_values(by='points', ascending=False)
    [rare_company_columns]
)

# Compare the current `results` against one fixed, hard-coded snapshot file.
# This rebinds `file_name` from the earlier comparison cell; unlike there, the
# ".json" extension is appended explicitly here.
file_name = 'postings_2025-01-19-00-33-53'
# Only the second returned value is kept (as `removed_h`).
# NOTE(review): print_attrs=[] presumably suppresses the per-posting attribute
# printout — confirm against actions.compare_postings.
_, removed_h = actions.compare_postings(results, f'{path}{file_name}.json', print_attrs=[])

No. postings so far:  6332


In [None]:
# Display the keyword-matched historical postings, ordered by points (highest first).
interesting_postings_h

In [None]:
# Display the complete historical postings mapping (the output can be very long).
postings_h

In [None]:
# Display postings from companies with at most 3 historical postings,
# sorted by points in descending order.
df_rare_companies

In [None]:
# Display the second value returned by compare_postings for the fixed snapshot —
# presumably postings present in that snapshot but not in `results`; confirm.
removed_h