<a href="https://colab.research.google.com/github/pevu97/Jobb-Offer-Classifier/blob/main/notebook/notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
!pip install beautifulsoup4

def get_job_details(job_id, timeout=30):
  """Fetch the JSON record of a single nofluffjobs.com posting.

  Args:
    job_id: Posting identifier; it is interpolated into the request URL as-is.
    timeout: Seconds to wait for the server before giving up. Without a
      timeout ``requests`` waits indefinitely, so a single stalled connection
      would freeze a bulk download.

  Returns:
    The response body decoded from JSON.

  Raises:
    requests.exceptions.Timeout: If the server does not answer in time.
    ValueError: (or a subclass) if the response body is not valid JSON.
  """
  url = f"https://nofluffjobs.com/api/posting/{job_id}"

  headers = {
      "User-Agent": "Mozilla/5.0",
      "Accept": "application/json"
  }

  response = requests.get(url, headers=headers, timeout=timeout)

  # The HTTP status is not checked here; whatever JSON the server sends back
  # is handed to the caller unchanged.
  return response.json()


def get_job_list(sentence, timeout=30):
  """Search nofluffjobs.com postings by a requirement keyword.

  Sends a POST to the search endpoint with a fixed set of paging, salary,
  region and language query parameters and ``requirement=<sentence>`` as the
  raw search expression.

  Args:
    sentence: Keyword placed after ``requirement=`` in the search payload.
    timeout: Seconds to wait for the server before giving up. Without a
      timeout ``requests`` waits indefinitely on a stalled connection.

  Returns:
    The response body decoded from JSON. The notebook reads the list of
    results from its ``'postings'`` key.

  Raises:
    requests.exceptions.Timeout: If the server does not answer in time.
    ValueError: (or a subclass) if the response body is not valid JSON.
  """
  url = "https://nofluffjobs.com/api/search/posting"
  params = {
      "pageFrom": 1,
      "pageTo": 1,
      "pageSize": 20,
      "salaryCurrency": "PLN",
      "salaryPeriod": "month",
      "region": "pl",
      "language": "pl-PL",
      "withSalaryMatch": "true"
  }

  headers = {
      "Content-Type": "application/infiniteSearch+json",
      "User-Agent": "Mozilla/5.0"
  }

  payload = {
      "rawSearch": f"requirement={sentence}"
  }

  response = requests.post(
      url, headers=headers, params=params, json=payload, timeout=timeout
  )

  # The HTTP status is not checked here; the decoded body is returned as-is.
  return response.json()




In [2]:
offer_list = get_job_list("python")
type(offer_list)

dict

In [3]:
offer_list['postings'][215]['id']

'full-stack-engineer-python-pandas-azure-microservices-square-one-resources-west-pomeranian'

In [4]:
# Despite the name, this holds the `id` of every posting in the search result.
jobs_titles = [posting['id'] for posting in offer_list['postings']]

In [5]:
job_details = get_job_details(jobs_titles[0])

In [6]:
keywords = [
    "python", "java", "javascript", "react", "django", "sql", "node", "devops", "qa",
    "junior", "mid", "entry", "trainee", "intern"
]

# Gather posting ids for every keyword. Each keyword's results live in their
# own name so that `offer_list` (the search response dict from an earlier
# cell) is not rebound to a plain list, which would break re-running the
# cells that index it with ['postings'].
# NOTE(review): a posting that matches several keywords is appended once per
# match, so `title_list` may contain repeated ids; confirm whether that is
# intended before training on the result.
title_list = []
for key in keywords:
  postings = get_job_list(key)['postings']
  title_list.extend(posting['id'] for posting in postings)
len(title_list)

2039

In [7]:
title_list[1]

'senior-python-developer-astek-polska-lower-silesian'

In [8]:
# Download the detail record for every collected posting id, one request per
# id, in the order the ids were collected.
job_details = []
job_details.extend(get_job_details(posting_id) for posting_id in title_list)


In [9]:
len(job_details)

2039

In [10]:
from collections import Counter

# Tally the first listed seniority level (lower-cased) of each record;
# records without a non-empty basics.seniority are skipped.
labels = []
for offer in job_details:
  seniority = offer.get('basics', {}).get('seniority')
  if seniority:
    labels.append(seniority[0].lower())
print(Counter(labels))

Counter({'senior': 932, 'mid': 769, 'junior': 160, 'trainee': 140, 'expert': 38})


In [11]:
jobs = {'senior':[], 'mid':[], 'junior':[]}

In [12]:
# Rebuild the buckets from scratch so that re-running this cell does not
# append every offer a second time.
jobs = {'senior': [], 'mid': [], 'junior': []}

# (substring looked for in the seniority value, bucket the offer goes to).
# 'trainee' is folded into 'junior' and 'expert' into 'senior'.
seniority_buckets = [
    ('senior', 'senior'),
    ('mid', 'mid'),
    ('junior', 'junior'),
    ('trainee', 'junior'),
    ('expert', 'senior'),
]

for offer in job_details:
  seniority = offer.get('basics', {}).get('seniority')
  if not seniority:
    # Records without a seniority entry cannot be labelled; skip them.
    continue
  # Only the first listed seniority level decides the label.
  level = seniority[0].lower()
  for needle, bucket in seniority_buckets:
    if needle in level:
      jobs[bucket].append(offer)

In [13]:
len(jobs['senior'])

970

In [14]:
type(jobs['senior'][0])

dict

In [15]:
jobs['senior'][0]['title']

'Senior Python Developer'

In [16]:

from bs4 import BeautifulSoup


# Preview: strip the HTML markup from the first senior offer's requirements
# description and show the remaining plain text.
text = jobs['senior'][0]['requirements']['description']
clean_text = BeautifulSoup(text, 'html.parser').get_text()
print(clean_text)


Minimum 8 lat doświadczenia w programowaniu backendowym w Pythonie, biegłość w frameworkach Django, Flask lub FastAPI oraz praktyczne stosowanie zasad SOLID i DRY
Doświadczenie w projektowaniu i utrzymaniu złożonych systemów opartych na mikroserwisach, REST API oraz pracy w środowiskach chmurowych (AWS, GCP, Azure)
Znajomość platform do orkiestracji workflowów (np. Temporal) oraz umiejętność integracji z katalogami usług
Praktyczna znajomość CI/CD, narzędzi Infrastructure as Code (Terraform, Ansible) oraz programowania asynchronicznego w Pythonie
Dobra znajomość baz danych relacyjnych (PostgreSQL, MySQL) i NoSQL (MongoDB, DynamoDB) wraz z umiejętnością projektowania schematów
Silne zdolności analityczne, umiejętność rozwiązywania problemów technicznych, skuteczna komunikacja oraz doświadczenie w pracy zespołowej

Mile widziane:

Znajomość Docker, Kubernetes, języków Go, Rust lub Java
Doświadczenie z architekturą zdarzeniową, systemami kolejkowymi (Kafka, RabbitMQ) oraz serverless (AWS

In [17]:
jobs['senior'][0]['requirements']['musts'][0]

{'value': 'Python', 'type': 'main'}

In [18]:
# Print every field value of the first "must" requirement of the first senior offer.
first_must = jobs['senior'][0]['requirements']['musts'][0]
for value in first_must.values():
  print(value)

Python
main


In [19]:
new_offers = {'senior':['senior'], 'mid':['mid' ], 'junior':[ 'junior']}

In [20]:
# Strip the HTML from each senior offer's requirements description.
# Nothing is collected here: `soup` and `clean_text` are overwritten on every
# pass, so only the values from the last offer with a description remain.
for offer in jobs['senior']:
  description_html = offer['requirements']['description']
  if not description_html:
    continue
  soup = BeautifulSoup(description_html, 'html.parser')
  clean_text = soup.get_text()


In [22]:
from bs4 import BeautifulSoup
import pandas as pd

def clean_html(html_str):
    """Return the text content of an HTML fragment with the markup removed.

    ``None`` or any other falsy value is treated as an empty document, and the
    extracted text pieces are joined with a single space.
    """
    markup = html_str if html_str else ""
    parsed = BeautifulSoup(markup, "html.parser")
    return parsed.get_text(separator=" ")

def _offer_text(offer):
    """Flatten one offer record into the single text sample stored in the dataset.

    The sample is the title, the cleaned details description, the cleaned
    requirements description and the must-have / nice-to-have skill names,
    each on its own line. Missing keys fall back to empty values.
    """
    title = offer.get("title", "")
    desc_text = clean_html(offer.get("details", {}).get("description", ""))

    requirements = offer.get("requirements", {})
    req_text = clean_html(requirements.get("description", ""))

    must_skills = " ".join(m.get("value", "") for m in requirements.get("musts", []))
    nice_skills = " ".join(n.get("value", "") for n in requirements.get("nices", []))

    return f"{title}\n{desc_text}\n{req_text}\nMust have: {must_skills}\nNice to have: {nice_skills}"


# One {"text", "label"} row per offer; the label is the bucket the offer sits in.
dataset = [
    {"text": _offer_text(offer), "label": label}
    for label, offers in jobs.items()
    for offer in offers
]

# Convert to a DataFrame and save
df = pd.DataFrame(dataset)
df.to_csv("job_offers_dataset.csv", index=False)
print("✅ Zapisano plik job_offers_dataset.csv")


✅ Zapisano plik job_offers_dataset.csv


In [23]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
senior,970
mid,769
junior,300


In [26]:
# Size of the smallest class; every class is down-sampled to this many rows.
# Derived from the data instead of hard-coded so that sampling never asks for
# more rows than a class contains when the scraped data changes.
min_count = int(df['label'].value_counts().min())

In [27]:
# Draw `min_count` rows from each class with a fixed seed, so the three
# subsets are equally sized and the draw is repeatable.
senior, mid, junior = (
    df[df['label'] == class_name].sample(n=min_count, random_state=42)
    for class_name in ('senior', 'mid', 'junior')
)

In [28]:
new_df = pd.concat([senior, mid, junior], ignore_index=True)


In [29]:
new_df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
senior,300
mid,300
junior,300


In [31]:
X = new_df['text'].values

In [34]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

y = le.fit_transform(new_df['label'])


In [36]:
le.classes_

array(['junior', 'mid', 'senior'], dtype=object)

In [37]:
X[0]

"AI/ML Principal Software Engineer\nJoin our client's team in the medical industry as an AI/ML Principal Software Engineer, and help build cutting-edge software solutions powered by machine learning for real-world applications. \n In this role, you will play a key part in designing and building scalable, high-performance systems that bring machine learning models into production environments. This is an exciting opportunity to lead critical initiatives at the intersection of software engineering and AI, while working remotely within a collaborative and forward-thinking team.\n\n Minimum of 7 years of experience building and deploying complex, production-grade software systems \n Proficient in backend technologies (e.g., Python, Java, Node.js) and frameworks (e.g., Django, Flask, Spring Boot) \n Skilled in frontend frameworks such as React or Angular \n Strong background in containerization (Docker) and orchestration (Kubernetes) \n Proven expertise with AWS and scalable cloud-based arc