In [61]:
import pandas as pd
import re
import os

from bs4 import BeautifulSoup

In [62]:
def is_page_empty(bs) -> bool:
    """Tell whether a parsed listing page is flagged as empty.

    The page is treated as empty when it contains an ``<h2>`` element with
    the CSS classes ``text-white font-weight-bold`` (presumably the
    "no results" banner -- confirm against the scraped HTML).

    Args:
        bs: parsed page exposing a BeautifulSoup-style ``find`` method.

    Returns:
        True if the marker heading is present, False otherwise.
    """
    marker = bs.find('h2', class_='text-white font-weight-bold')
    return marker is not None

In [63]:
def parse_salary(salary) -> "dict | None":
    """Parse a salary label into its lower/upper bound and currency.

    The label is expected to look like ``"<low> - <high> <currency>"`` or
    ``"<amount> <currency>"``. Amounts may contain any non-digit thousands
    separator (e.g. a non-breaking space); every digit group on one side of
    the range dash is joined into a single number, so amounts with one,
    two or more groups are all handled.

    Args:
        salary: stripped text of the salary element.

    Returns:
        ``{'low': str, 'high': str, 'currency': str}`` with the bounds kept
        as digit strings. For a single amount ``high`` equals ``low``.
        ``None`` when the label is the bare ``'$'`` placeholder or contains
        no digits at all.
    """
    # Skip the odd Alten posting whose salary is just '$'.
    if salary == '$':
        return None

    tokens = salary.split()
    if not tokens:
        return None
    currency = tokens[-1]

    # Split on the range separator first (hyphen-minus, the Unicode dashes
    # U+2010..U+2015, or the minus sign) so each bound is parsed on its own
    # instead of guessing which digit groups belong together.
    sides = re.split(r'\s*[-\u2010-\u2015\u2212]\s*', salary, maxsplit=1)
    amounts = [''.join(re.findall('[0-9]+', side)) for side in sides]

    low_bound = amounts[0]
    if not low_bound:
        return None
    # A label with a single amount describes a fixed salary: high == low.
    high_bound = amounts[1] if len(amounts) > 1 and amounts[1] else low_bound

    return {'low': low_bound,
            'high': high_bound,
            'currency': currency}
    

In [64]:
def parse_city(city) -> dict:
    """Split a location label into city and country.

    Args:
        city: stripped location text, normally ``"<City>, <COUNTRY>"``.

    Returns:
        ``{'city': str, 'country': str}``. Labels containing ``Zdalna``
        (remote) or any two consecutive digits are reported as
        ``{'city': 'Zdalna', 'country': 'N/A'}``. A label without a comma
        is returned as the city with country ``'N/A'``; when there are
        several commas only the last one separates the country.
    """
    # Original note: "we skip navalgo with the dollar" -- the two-digit
    # check presumably catches labels that are not real city names
    # (TODO confirm against the raw data).
    if 'Zdalna' in city or re.search(r'\d{2}', city):
        return {'city': 'Zdalna', 'country': 'N/A'}

    # rpartition never raises: no comma -> empty separator, many commas ->
    # split on the last one only.
    name, separator, country = city.rpartition(',')
    if not separator:
        return {'city': city.strip(), 'country': 'N/A'}

    return {'city': name.strip(), 'country': country.strip()}

In [65]:
def parse_jobs(jobs) -> list:
    """Run ``parse_job`` over every posting element.

    Args:
        jobs: iterable of posting elements accepted by ``parse_job``.

    Returns:
        A list with one parsed dict per element, in input order.
    """
    return [parse_job(posting) for posting in jobs]

In [66]:
def parse_job(job) -> dict:
    """Extract the fields of a single job posting element.

    Args:
        job: BeautifulSoup element wrapping one posting.

    Returns:
        A dict with the keys ``location`` (result of ``parse_city``),
        ``salary`` (result of ``parse_salary``), ``name``, ``company`` and
        ``technology`` (``'N/A'`` when the posting has no technology tag).

    Raises:
        AttributeError: if the posting-info block, title or company element
            is missing from the posting.
    """
    job_info = job.find(
        'div',
        class_='posting-info position-relative d-lg-flex flex-grow-1 align-items-center ng-star-inserted'
    ).find_all('span')

    salary = job_info[0].text.strip()
    salary_data = parse_salary(salary)

    # Prefer the dedicated city element; fall back to the second info span.
    location = job.find('nfj-posting-item-city')
    if location is None:
        location = job_info[1]

    location = location.text.strip()
    location = parse_city(location)

    name = job.find('h3', class_='posting-title__position').text.strip()

    company = job.find('span', class_='posting-title__company').text.strip()
    # The label is presumably rendered as "w <company>" (Polish "at").
    # Drop only that leading standalone token; removing every "w" would
    # also corrupt names such as "Software Mind".
    company_tokens = company.split(maxsplit=1)
    if company_tokens and company_tokens[0] == 'w':
        company = company_tokens[1] if len(company_tokens) > 1 else ''

    technology = job.find('a', class_='btn btn-outline-secondary btn-sm text-truncate')
    if technology:
        technology = technology.text.strip()
    else:
        technology = 'N/A'

    return {
        'location': location,
        'salary': salary_data,
        'name': name,
        'company': company,
        'technology': technology
    }

In [67]:
def get_data(page) -> list:
    """Collect the parsed offers found on one listing page.

    Each ``div.posting-image`` element marks a posting; its parent element
    is what gets handed to ``parse_jobs``.

    Args:
        page: parsed page exposing a BeautifulSoup-style ``find_all``.

    Returns:
        A new list of parsed offer dicts, in document order.
    """
    postings = []
    for image_div in page.find_all('div', class_='posting-image'):
        postings.append(image_div.parent)

    return list(parse_jobs(postings))

In [68]:
# Directory with the scraped listing pages; file names are expected to look
# like "<job>_<rest>", the part before the first underscore being the label
# stored in each offer's 'job' field.
data_dir = '../data/raw'
results = []

# os.scandir yields entries in arbitrary order and may include directories:
# keep regular files only and sort by name so the row order is reproducible.
# The context manager closes the directory iterator deterministically.
with os.scandir(data_dir) as entries:
    raw_files = sorted(
        (entry for entry in entries if entry.is_file()),
        key=lambda entry: entry.name,
    )

for entry in raw_files:
    with open(entry.path, encoding='UTF-8') as f:
        html = f.read()

    job = entry.name.split('_')[0]
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed, so results can differ between machines.
    bs = BeautifulSoup(html, 'html.parser')

    offers = get_data(bs)
    for offer in offers:
        offer['job'] = job
    results += offers

In [69]:
# Sanity check: `offers` is reassigned on every iteration of the loading
# loop, so this shows only the offers parsed from the last file processed.
offers

[{'location': {'city': 'Zdalna', 'country': 'N/A'},
  'salary': {'low': '20000', 'high': '28000', 'currency': 'PLN'},
  'name': 'Senior Data Scientist',
  'company': 'IIIT',
  'technology': 'N/A',
  'job': 'data scientist'},
 {'location': {'city': 'Warszawa', 'country': 'POL'},
  'salary': {'low': '4500', 'high': '6500', 'currency': 'PLN'},
  'name': 'Junior Data Scientist',
  'company': 'Cenatorium',
  'technology': 'N/A',
  'job': 'data scientist'},
 {'location': {'city': 'Zdalna', 'country': 'N/A'},
  'salary': {'low': '13650', 'high': '17850', 'currency': 'PLN'},
  'name': 'Data Scientist (Mid/Senior)',
  'company': 'Onelo S.A.',
  'technology': 'N/A',
  'job': 'data scientist'},
 {'location': {'city': 'Warsaw', 'country': 'POL'},
  'salary': {'low': '22000', 'high': '30000', 'currency': 'PLN'},
  'name': 'Lead Data Scientist',
  'company': 'Adform Sp. z o.o.',
  'technology': 'N/A',
  'job': 'data scientist'},
 {'location': {'city': 'Sopot', 'country': 'POL'},
  'salary': {'low': 

In [70]:
# Flatten the nested offer dicts into one row per offer; nested keys are
# joined with '_' (e.g. location_city, salary_low).
df = pd.json_normalize(results, sep='_')
# Preview the first rows only.
df.head()

Unnamed: 0,name,company,technology,job,location_city,location_country,salary_low,salary_high,salary_currency,salary
0,Business Analyst/Senior Business Analyst,SoftServe,,data analyst,Zdalna,,11000,21500,PLN,
1,Power BI Developer / Data Analyst,Objectivity,,data analyst,Zdalna,,14000,20000,PLN,
2,Junior Data Analyst,Coinfirm,,data analyst,Zdalna,,3100,5000,PLN,
3,Data Analyst,Packhelp,,data analyst,Warsaw,POL,13000,15000,PLN,
4,Data analyst (analityk danych),Alterdata.io sp. z o.o.,,data analyst,Zdalna,,12000,18000,PLN,


In [71]:
# Persist the flattened offers as a semicolon-separated CSV without the index
# column; the encoding matches the UTF-8 used when reading the raw pages.
df.to_csv('../data/interim/job_offers.csv', sep=';', encoding='utf-8', index=False)

In [72]:
# Debug view of every parsed offer across all files.
# NOTE(review): this prints the whole list -- consider a slice such as
# results[:5] to keep the notebook output small.
results

[{'location': {'city': 'Zdalna', 'country': 'N/A'},
  'salary': {'low': '11000', 'high': '21500', 'currency': 'PLN'},
  'name': 'Business Analyst/Senior Business Analyst',
  'company': 'SoftServe',
  'technology': 'N/A',
  'job': 'data analyst'},
 {'location': {'city': 'Zdalna', 'country': 'N/A'},
  'salary': {'low': '14000', 'high': '20000', 'currency': 'PLN'},
  'name': 'Power BI Developer / Data Analyst',
  'company': 'Objectivity',
  'technology': 'N/A',
  'job': 'data analyst'},
 {'location': {'city': 'Zdalna', 'country': 'N/A'},
  'salary': {'low': '3100', 'high': '5000', 'currency': 'PLN'},
  'name': 'Junior Data Analyst',
  'company': 'Coinfirm',
  'technology': 'N/A',
  'job': 'data analyst'},
 {'location': {'city': 'Warsaw', 'country': 'POL'},
  'salary': {'low': '13000', 'high': '15000', 'currency': 'PLN'},
  'name': 'Data Analyst',
  'company': 'Packhelp',
  'technology': 'N/A',
  'job': 'data analyst'},
 {'location': {'city': 'Zdalna', 'country': 'N/A'},
  'salary': {'low'