In [1]:
import pandas as pd
import re
import os

from bs4 import BeautifulSoup

In [2]:
def is_page_empty(bs) -> bool:
    empty_condition = bs.find('h2', class_='text-white font-weight-bold')

    if empty_condition is None:
        return False
    else:
        return True

In [3]:
def parse_salary(salary) -> dict:
    print(salary)
    # pomijamy dziwne ogłoszenie firmy Alten
    if salary != '$':
        bounds = re.findall('[0-9]+', salary.replace(' ', ''))

        low_bound = bounds[0]+bounds[1]
        # paskudny hack
        high_bound = bounds[2]+bounds[3] if len(bounds) > 2 else bounds[0]+bounds[1]
        currency = salary.split()[-1]

        return {'low': low_bound,
                'high': high_bound,
                'currency': currency}
    

In [4]:
def parse_city(city) -> dict:
    # pomijamy navalgo z dolarem
    if re.search('(Zdalna)', city) or re.search('\d{2}', city):
         _city = "Zdalna"
         country = 'N/A'
    else:       
        (_city, country) = city.split(',')
        country = country.strip()

    return {'city': _city, 'country': country}

In [5]:
def parse_jobs(jobs) -> list:
    results = []
    
    for job in jobs:
        results.append(parse_job(job))

    return results

In [6]:
def parse_job(job) -> dict:
    job_info = job.find('div', class_='posting-info position-relative d-lg-flex flex-grow-1 align-items-center ng-star-inserted').find_all('span')
    
    salary = job_info[0].text.strip()
    salary_data = parse_salary(salary)
    
    location = job.find('nfj-posting-item-city')
    if location is None:
        location = job_info[1]

    location = location.text.strip()
    location = parse_city(location)

    name = job.find('h3', class_='posting-title__position').text.strip()
    
    company = job.find('span', class_='posting-title__company').text.replace('w', '').strip()

    technology = job.find('a', class_='btn btn-outline-secondary btn-sm text-truncate')
    if technology:
        technology = technology.text.strip()
    else:
        technology = 'N/A'

    return {
        'location': location,
        'salary': salary_data,
        'name': name,
        'company': company,
        'technology': technology
    } 

In [7]:
def get_data(page) -> list:
    results = []

    jobs = [x.parent for x in page.find_all('div', class_='posting-image')]
    page_data = parse_jobs(jobs)
    results += page_data

    return results

In [8]:
data_dir = '../data/raw'
results = []

for entry in os.scandir(data_dir):
    with open(
        os.path.join(data_dir, entry.name),
        encoding='UTF-8') as f:
            html = f.read()

    job = entry.name.split('_')[0]
    bs = BeautifulSoup(html)
    offers = get_data(bs)
    for offer in offers:
        offer['job'] = job
    results += offers

12 000  - 18 000  PLN
12 000  - 18 000  PLN
11 000  - 21 500  PLN
14 000  - 20 000  PLN
3 100  - 5 000  PLN
13 000  - 15 000  PLN
12 000  - 18 000  PLN
5 500  - 8 300  PLN
8 500  - 12 000  PLN
21 000  - 25 095  PLN
18 480  - 21 840  PLN
10 080  - 18 970  PLN
16 149  - 24 234  PLN
14 000  - 21 000  PLN
13 650  - 23 100  PLN
20 000  - 22 340  PLN
18 000  - 24 000  PLN
10 000  - 16 000  PLN
5 000  - 7 000  PLN
13 440  - 25 200  PLN
7 000  - 12 000  PLN
6 500  - 8 500  PLN
10 000  - 20 000  PLN
$
5 000  - 7 600  PLN
15 120  - 21 840  PLN
16 800  - 24 500  PLN
8 799  - 17 598  PLN
13 650  - 17 850  PLN
16 773  - 23 962  PLN
10 100  - 16 900  PLN
15 000  - 22 000  PLN
18 000  - 21 000  PLN
19 169  - 31 150  PLN
17 000  - 32 000  PLN
29 400  - 32 550  PLN
16 800  - 25 200  PLN
26 880  - 30 240  PLN
14 000  - 22 000  PLN
28 000  - 38 000  PLN
21 000  - 26 250  PLN
46 200  PLN
10 923  - 21 847  PLN
20 000  - 28 000  PLN
18 000  - 33 000  PLN
20 160  - 30 240  PLN
20 160  - 30 240  PLN
18 480  -

In [9]:
offers

[{'location': {'city': 'Zdalna', 'country': 'N/A'},
  'salary': {'low': '8000', 'high': '15000', 'currency': 'PLN'},
  'name': 'Software Tester',
  'company': 'RST Softare Masters',
  'technology': 'N/A',
  'job': 'test automation'},
 {'location': {'city': 'Gdańsk', 'country': 'POL'},
  'salary': {'low': '8000', 'high': '10000', 'currency': 'PLN'},
  'name': 'Junior Test Automation Engineer',
  'company': 'Plenti',
  'technology': 'N/A',
  'job': 'test automation'},
 {'location': {'city': 'Wrocław', 'country': 'POL\n + 1'},
  'salary': {'low': '13440', 'high': '19320', 'currency': 'PLN'},
  'name': 'Test Automation Engineer (Python)',
  'company': 'Siili',
  'technology': 'N/A',
  'job': 'test automation'},
 {'location': {'city': 'Warsaw', 'country': 'POL'},
  'salary': {'low': '9000', 'high': '12500', 'currency': 'PLN'},
  'name': 'Automation Test Analyst (Java)',
  'company': 'HL Tech',
  'technology': 'N/A',
  'job': 'test automation'},
 {'location': {'city': 'Warsaw', 'country': 'P

In [10]:
df = pd.json_normalize(results)
df.head()

Unnamed: 0,name,company,technology,job,location.city,location.country,salary.low,salary.high,salary.currency,salary
0,Product Data Analyst,No Fluff Jobs,,data analyst,Zdalna,,12000,18000,PLN,
1,Data Analyst (Private Equity),Devire,,data analyst,Warsaw,POL,12000,18000,PLN,
2,Business Analyst/Senior Business Analyst,SoftServe,,data analyst,Zdalna,,11000,21500,PLN,
3,Power BI Developer / Data Analyst,Objectivity,,data analyst,Zdalna,,14000,20000,PLN,
4,Junior Data Analyst,Coinfirm,,data analyst,Zdalna,,3100,5000,PLN,


In [11]:
df.to_csv('../data/interim/job_offers.csv', sep=';', encoding='UTF', index=False)

In [12]:
results

[{'location': {'city': 'Zdalna', 'country': 'N/A'},
  'salary': {'low': '12000', 'high': '18000', 'currency': 'PLN'},
  'name': 'Product Data Analyst',
  'company': 'No Fluff Jobs',
  'technology': 'N/A',
  'job': 'data analyst'},
 {'location': {'city': 'Warsaw', 'country': 'POL'},
  'salary': {'low': '12000', 'high': '18000', 'currency': 'PLN'},
  'name': 'Data Analyst (Private Equity)',
  'company': 'Devire',
  'technology': 'N/A',
  'job': 'data analyst'},
 {'location': {'city': 'Zdalna', 'country': 'N/A'},
  'salary': {'low': '11000', 'high': '21500', 'currency': 'PLN'},
  'name': 'Business Analyst/Senior Business Analyst',
  'company': 'SoftServe',
  'technology': 'N/A',
  'job': 'data analyst'},
 {'location': {'city': 'Zdalna', 'country': 'N/A'},
  'salary': {'low': '14000', 'high': '20000', 'currency': 'PLN'},
  'name': 'Power BI Developer / Data Analyst',
  'company': 'Objectivity',
  'technology': 'N/A',
  'job': 'data analyst'},
 {'location': {'city': 'Zdalna', 'country': 'N/