In [1]:
import pandas as pd
import re
import os

from bs4 import BeautifulSoup

In [2]:
def is_page_empty(bs) -> bool:
    empty_condition = bs.find('h2', class_='text-white font-weight-bold')

    if empty_condition is None:
        return False
    else:
        return True

In [3]:
def parse_salary(salary) -> dict:
    # pomijamy dziwne ogłoszenie firmy Alten
    if salary != '$':
        bounds = re.findall('[0-9]+', salary.replace(' ', ''))

        low_bound = bounds[0]+bounds[1]
        # paskudny hack
        high_bound = bounds[2]+bounds[3] if len(bounds) > 2 else bounds[0]+bounds[1]
        currency = salary.split()[-1]

        return {'low': low_bound,
                'high': high_bound,
                'currency': currency}
    

In [4]:
def parse_city(city) -> dict:
    # pomijamy navalgo z dolarem
    if re.search('(Zdalna)', city) or re.search('\d{2}', city):
         _city = "Zdalna"
         country = 'N/A'
    else:       
        (_city, country) = city.split(',')
        country = country.strip()

    return {'city': _city, 'country': country}

In [5]:
def parse_jobs(jobs) -> list:
    results = []
    
    for job in jobs:
        results.append(parse_job(job))

    return results

In [6]:
def parse_job(job) -> dict:
    job_info = job.find('div', class_='posting-info position-relative d-lg-flex flex-grow-1 align-items-center ng-star-inserted').find_all('span')
    
    salary = job_info[0].text.strip()
    salary_data = parse_salary(salary)
    
    location = job.find('nfj-posting-item-city')
    if location is None:
        location = job_info[1]

    location = location.text.strip()
    location = parse_city(location)

    name = job.find('h3', class_='posting-title__position').text.strip()
    
    company = job.find('span', class_='posting-title__company').text.replace('w', '').strip()

    technology = job.find('a', class_='btn btn-outline-secondary btn-sm text-truncate')
    if technology:
        technology = technology.text.strip()
    else:
        technology = 'N/A'

    return {
        'location': location,
        'salary': salary_data,
        'name': name,
        'company': company,
        'technology': technology
    } 

In [7]:
def get_data(page) -> list:
    results = []

    jobs = [x.parent for x in page.find_all('div', class_='posting-image')]
    page_data = parse_jobs(jobs)
    results += page_data

    return results

In [8]:
data_dir = '../data/raw'
results = []

for entry in os.scandir(data_dir):
    with open(
        os.path.join(data_dir, entry.name),
        encoding='UTF-8') as f:
            html = f.read()

    job = entry.name.split('_')[0]
    bs = BeautifulSoup(html)
    offers = get_data(bs)
    for offer in offers:
        offer['job'] = job
    results += offers

In [9]:
offers

[{'location': {'city': 'Zdalna', 'country': 'N/A'},
  'salary': {'low': '12000', 'high': '17000', 'currency': 'PLN'},
  'name': 'Remote Java Developer',
  'company': 'EXATEL S.A.',
  'technology': 'N/A',
  'job': 'java developer'},
 {'location': {'city': 'Zdalna', 'country': 'N/A'},
  'salary': {'low': '11760', 'high': '20160', 'currency': 'PLN'},
  'name': 'Java Developer (Mid/Senior)',
  'company': 'BCF Softare Sp. z o. o.',
  'technology': 'N/A',
  'job': 'java developer'},
 {'location': {'city': 'Zdalna', 'country': 'N/A'},
  'salary': {'low': '12600', 'high': '18480', 'currency': 'PLN'},
  'name': 'Java Developer',
  'company': 'Clurgo Sp. z o.o.',
  'technology': 'N/A',
  'job': 'java developer'},
 {'location': {'city': 'Zdalna', 'country': 'N/A'},
  'salary': {'low': '14280', 'high': '19320', 'currency': 'PLN'},
  'name': 'Java Developer',
  'company': 'Clurgo Sp. z o.o.',
  'technology': 'N/A',
  'job': 'java developer'},
 {'location': {'city': 'Zdalna', 'country': 'N/A'},
  's

In [10]:
df = pd.json_normalize(results)
df.head()

Unnamed: 0,name,company,technology,job,location.city,location.country,salary.low,salary.high,salary.currency,salary
0,Full Stack Java Developer,Sky and Space Co,,java developer,Zdalna,,16000,18000,PLN,
1,Java Developer,Contina Sp. z o.o.,,java developer,Zdalna,,7000,12000,PLN,
2,Java Developer (with AWS),ASTEK Polska,,java developer,Zdalna,,21000,28350,PLN,
3,Junior Java Developer,Softarely,,java developer,Warszawa,POL,14700,18480,PLN,
4,Junior Java/Groovy Developer,ProData Consult,,java developer,Warszawa,POL,10080,16800,PLN,


In [11]:
df.to_csv('../data/interim/job_offers.csv', sep=';', encoding='UTF', index=False)

In [12]:
results

[{'location': {'city': 'Zdalna', 'country': 'N/A'},
  'salary': {'low': '16000', 'high': '18000', 'currency': 'PLN'},
  'name': 'Full Stack Java Developer',
  'company': 'Sky and Space Co',
  'technology': 'N/A',
  'job': 'java developer'},
 {'location': {'city': 'Zdalna', 'country': 'N/A'},
  'salary': {'low': '7000', 'high': '12000', 'currency': 'PLN'},
  'name': 'Java Developer',
  'company': 'Contina Sp. z o.o.',
  'technology': 'N/A',
  'job': 'java developer'},
 {'location': {'city': 'Zdalna', 'country': 'N/A'},
  'salary': {'low': '21000', 'high': '28350', 'currency': 'PLN'},
  'name': 'Java Developer (with AWS)',
  'company': 'ASTEK Polska',
  'technology': 'N/A',
  'job': 'java developer'},
 {'location': {'city': 'Warszawa', 'country': 'POL'},
  'salary': {'low': '14700', 'high': '18480', 'currency': 'PLN'},
  'name': 'Junior Java Developer',
  'company': 'Softarely',
  'technology': 'N/A',
  'job': 'java developer'},
 {'location': {'city': 'Warszawa', 'country': 'POL'},
  'sa