In [1]:
import pandas as pd
import re
import os

from bs4 import BeautifulSoup

In [2]:
def is_page_empty(bs) -> bool:
    """Report whether the parsed page contains the emptiness marker heading.

    The marker is an ``<h2>`` element looked up with the class string
    ``'text-white font-weight-bold'``.

    Args:
        bs: Parsed document exposing a BeautifulSoup-style ``find`` method.

    Returns:
        True when the marker heading is found, False when ``find`` returns None.
    """
    marker = bs.find('h2', class_='text-white font-weight-bold')
    return marker is not None

In [3]:
def parse_salary(salary) -> dict:
    """Split a salary label such as ``'10 000 - 15 000 PLN'`` into its parts.

    Args:
        salary: Salary text made of one or two numbers followed by a
            currency token.

    Returns:
        dict with the keys ``'low'`` and ``'high'`` (digit strings; ``'high'``
        repeats ``'low'`` when the label holds a single number) and
        ``'currency'`` (the last whitespace-separated token of the label).

    Raises:
        IndexError: If ``salary`` contains no digits (including a blank label).
    """
    # Thousands separators are frequently non-breaking or narrow spaces rather
    # than plain ASCII spaces; ``\s`` covers all of them, so '10\xa0000' stays
    # one number instead of being split into '10' and '000'.
    compact = re.sub(r'\s+', '', salary)
    bounds = re.findall(r'[0-9]+', compact)

    low_bound = bounds[0]
    high_bound = bounds[1] if len(bounds) > 1 else low_bound
    currency = salary.split()[-1]

    return {'low': low_bound,
            'high': high_bound,
            'currency': currency}

In [4]:
def parse_city(city) -> dict:
    """Split a location label into city and country.

    A label containing ``'Zdalna'`` (Polish for "remote") is reported as city
    ``'Zdalna'`` with country ``'N/A'``. Any other label is split on commas:
    the first piece is the city and the last piece is the country. A label
    without a comma yields country ``'N/A'`` instead of raising.

    Args:
        city: Location text, e.g. ``'Warsaw, POL'``.

    Returns:
        dict with the keys ``'city'`` and ``'country'``, both stripped of
        surrounding whitespace.
    """
    if 'Zdalna' in city:
        return {'city': 'Zdalna', 'country': 'N/A'}

    # Tolerate labels with no comma or with extra segments (e.g. a region
    # between city and country); strict two-way unpacking would raise there.
    parts = [part.strip() for part in city.split(',')]
    country = parts[-1] if len(parts) > 1 else 'N/A'

    return {'city': parts[0], 'country': country}

In [5]:
def parse_jobs(jobs) -> list:
    """Apply ``parse_job`` to every posting and return the results in order.

    Args:
        jobs: Iterable of posting elements accepted by ``parse_job``.

    Returns:
        A new list with one parsed dict per posting.
    """
    return [parse_job(posting) for posting in jobs]

In [6]:
def parse_job(job) -> dict:
    """Extract the fields of a single job posting element.

    Args:
        job: BeautifulSoup element wrapping one posting.

    Returns:
        dict with the keys ``'location'`` (result of ``parse_city``),
        ``'salary'`` (result of ``parse_salary``), ``'name'``, ``'company'``
        and ``'technology'`` (``'N/A'`` when no technology link is present).

    Raises:
        AttributeError: If the posting lacks the info block, the title or the
            company element (``find`` returns None for a missing element).
    """
    job_info = job.find(
        'div',
        class_='posting-info position-relative d-none d-lg-flex flex-grow-1',
    ).find_all('span')

    # The first span of the info block carries the salary label.
    salary = job_info[0].text.strip()
    salary_data = parse_salary(salary)

    # Prefer the dedicated city element; otherwise fall back to the second
    # span of the info block.
    location = job.find('nfj-posting-item-city')
    if location is None:
        location = job_info[1]
    location = parse_city(location.text.strip())

    name = job.find('h2', class_='posting-title__position').text.strip()

    # The company label is presumably rendered as "w <company>" ("w" being the
    # Polish preposition "at") - TODO confirm against the raw HTML. Drop only
    # that leading token: removing every 'w' would also corrupt names such as
    # "Software Mind".
    company_label = job.find('span', class_='posting-title__company').text
    company = re.sub(r'^\s*w\s+', '', company_label).strip()

    technology = job.find('a', class_='btn btn-outline-secondary btn-sm text-truncate')
    technology = technology.text.strip() if technology else 'N/A'

    return {
        'location': location,
        'salary': salary_data,
        'name': name,
        'company': company,
        'technology': technology
    }

In [7]:
def get_data(page) -> list:
    """Collect the parsed job offers found on one parsed page.

    Every ``div`` with class ``posting-image`` is located and its parent
    element is handed to ``parse_jobs`` as one posting.

    Args:
        page: Parsed document exposing a BeautifulSoup-style ``find_all``.

    Returns:
        A new list containing the dicts produced by ``parse_jobs``.
    """
    postings = []
    for image in page.find_all('div', class_='posting-image'):
        postings.append(image.parent)

    return list(parse_jobs(postings))

In [8]:
data_dir = '../data/raw'
results = []

# os.scandir yields entries in arbitrary order, so sort by name to make the
# row order of `results` reproducible. Subdirectories (for example
# .ipynb_checkpoints) are skipped because they cannot be opened as files.
with os.scandir(data_dir) as entries:
    raw_files = sorted(
        (entry for entry in entries if entry.is_file()),
        key=lambda entry: entry.name)

for entry in raw_files:
    with open(entry.path, encoding='UTF-8') as f:
        html = f.read()

    # The part of the file name before the first '_' is stored as the offer's
    # 'job' label.
    job = entry.name.split('_')[0]

    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser is installed, emits a warning, and may build a different tree on
    # another machine.
    bs = BeautifulSoup(html, 'html.parser')

    offers = get_data(bs)
    for offer in offers:
        offer['job'] = job
    results += offers

In [9]:
# Preview only: `offers` holds the postings of the last file read by the
# loading loop, not the full data set (that is `results`).
offers[:5]

[{'location': {'city': 'Warsaw', 'country': 'POL'},
  'salary': {'low': '5000', 'high': '10000', 'currency': 'PLN'},
  'name': 'Data Scientist',
  'company': 'Devire',
  'technology': 'N/A',
  'job': 'data scientist'},
 {'location': {'city': 'Warsaw', 'country': 'POL'},
  'salary': {'low': '10000', 'high': '13000', 'currency': 'PLN'},
  'name': 'Data Scientist',
  'company': 'NatWest Group',
  'technology': 'N/A',
  'job': 'data scientist'},
 {'location': {'city': 'Warszawa', 'country': 'POL'},
  'salary': {'low': '9000', 'high': '18000', 'currency': 'PLN'},
  'name': 'Data Scientist',
  'company': 'Time S.A.',
  'technology': 'N/A',
  'job': 'data scientist'},
 {'location': {'city': 'Zdalna', 'country': 'N/A'},
  'salary': {'low': '11760', 'high': '19320', 'currency': 'PLN'},
  'name': 'Data Scientist',
  'company': '7N Sp. z o.o.',
  'technology': 'N/A',
  'job': 'data scientist'},
 {'location': {'city': 'Wrocław', 'country': 'POL'},
  'salary': {'low': '10000', 'high': '14000', 'cur

In [10]:
# Flatten the nested 'location' and 'salary' dicts into prefixed columns
# such as location_city and salary_low.
df = pd.json_normalize(data=results, sep='_')
df.head(n=5)

Unnamed: 0,name,company,technology,job,location_city,location_country,salary_low,salary_high,salary_currency
0,Data Analyst,AVENGA,,data analyst,Zdalna,,15120,18480,PLN
1,Python Engineer,ASTEK Polska,,data analyst,Zdalna,,17430,20160,PLN
2,Data Scientist (senior),PRUFTECHNIK Technology,,data analyst,Wrocław,POL,15540,21460,PLN
3,Data Analyst,Siepomaga.pl,,data analyst,Poznań,POL,7000,14000,PLN
4,Data Engineer,Brytlyt,,data analyst,Zdalna,,10000,18000,PLN


In [11]:
# Create the target folder when it is missing so the export also works on a
# fresh checkout; 'utf-8' is the canonical name of the codec selected by the
# previous 'UTF' alias, so the written bytes are unchanged.
output_path = '../data/interim/job_offers.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, sep=';', encoding='utf-8', index=False)

In [12]:
# Preview only: show the first few parsed offers instead of dumping the
# whole list into the notebook output.
results[:5]

[{'location': {'city': 'Zdalna', 'country': 'N/A'},
  'salary': {'low': '15120', 'high': '18480', 'currency': 'PLN'},
  'name': 'Data Analyst',
  'company': 'AVENGA',
  'technology': 'N/A',
  'job': 'data analyst'},
 {'location': {'city': 'Zdalna', 'country': 'N/A'},
  'salary': {'low': '17430', 'high': '20160', 'currency': 'PLN'},
  'name': 'Python Engineer',
  'company': 'ASTEK Polska',
  'technology': 'N/A',
  'job': 'data analyst'},
 {'location': {'city': 'Wrocław', 'country': 'POL'},
  'salary': {'low': '15540', 'high': '21460', 'currency': 'PLN'},
  'name': 'Data Scientist (senior)',
  'company': 'PRUFTECHNIK Technology',
  'technology': 'N/A',
  'job': 'data analyst'},
 {'location': {'city': 'Poznań', 'country': 'POL'},
  'salary': {'low': '7000', 'high': '14000', 'currency': 'PLN'},
  'name': 'Data Analyst',
  'company': 'Siepomaga.pl',
  'technology': 'N/A',
  'job': 'data analyst'},
 {'location': {'city': 'Zdalna', 'country': 'N/A'},
  'salary': {'low': '10000', 'high': '1800