In [9]:
import requests
import os
import re

from time import sleep
from bs4 import BeautifulSoup

In [10]:
def get_page_jobs(page) -> list:
    """
    Collect the job offers listed on a NoFluffJobs results page.

    Every offer is located through its 'posting-image' <div>; the parent of
    that <div> is the element that carries the offer's 'id' and 'href'.

    @param page: BeautifulSoup object of the results page
    @returns: list of dicts {job_id, job_url}, one per offer found
    """
    image_divs = page.find_all('div', class_='posting-image')

    # the parent of each image is the element holding id + link of the offer
    offer_nodes = (div.parent for div in image_divs)

    return [{'job_id': node['id'], 'job_url': node['href']}
            for node in offer_nodes]

In [11]:
def is_empty_page(page) -> bool:
    """
    Check whether the page is empty.

    The page counts as empty when it contains an <h2> element with the
    classes 'text-white font-weight-bold'.

    @param page: BeautifulSoup object of the page
    @returns: True when that <h2> is present, False otherwise
    """
    empty_marker = page.find('h2', class_='text-white font-weight-bold')
    return empty_marker is not None

In [12]:
def get_contract_type(type_: str) -> str:
    """
    Map a contract description to a short contract code.

    The markers are checked in order and the first one contained in the
    text wins:
        'B2B'      -> 'B2B'
        'dzieło'   -> 'UoD'
        'zlecenie' -> 'UZ'
        'pracę'    -> 'UoP'

    @param type_: contract description text
    @returns: the contract code, or 'N/F' when no marker is present
    """
    markers = (
        ('B2B', 'B2B'),
        ('dzieło', 'UoD'),
        ('zlecenie', 'UZ'),
        ('pracę', 'UoP'),
    )
    for marker, code in markers:
        # `in` also matches a marker at position 0; the former
        # `type_.find(marker) > 0` test silently missed that case
        if marker in type_:
            return code
    return 'N/F'

In [13]:
def parse_salary(salary) -> dict:
    """
    Split a salary text into its lower bound, upper bound and currency.

    The bounds are the first and second run of digits in the text (kept as
    strings); when only one number is present it is used for both bounds.
    The currency is the last whitespace-separated token of the text.

    @param salary: salary text, e.g. '15000 - 22000 PLN'
    @returns: dict {low, high, currency}; 'low'/'high' are None when the
              text has no digits, 'currency' is None when the text is blank
    """
    bounds = re.findall('[0-9]+', salary)
    tokens = salary.split()

    # A text without digits (or a blank text) used to raise IndexError,
    # which aborted the whole scraping run; report missing parts as None.
    low_bound = bounds[0] if bounds else None
    high_bound = bounds[1] if len(bounds) > 1 else low_bound
    currency = tokens[-1] if tokens else None

    return {'low': low_bound,
            'high': high_bound,
            'currency': currency}

In [14]:
def get_job_data(bs) -> dict:
    """
    Extract title, company, salaries and locations from a job-offer page.

    @param bs: BeautifulSoup object of a single offer page
    @returns: dict with the keys 'job_title', 'company', 'salary'
              (contract code -> parsed salary) and 'locations' (list of
              str); None when is_empty_page(bs) reports an empty page
    @raises AttributeError: propagated when the header, title, company,
              salary list or a salary's <p>/<h4> lookup returns None.
              Only a missing location falls back to ['N/A'].
    """
    if is_empty_page(bs):
        return None

    # --- header: title and company ---------------------------------------
    header = bs.find(id='posting-header')
    title = header.find('h1').string
    company = header.find('dd').string

    # --- salaries: one '.salary' node per contract type ------------------
    salary_nodes = bs.find('nfj-posting-salaries-list').find_all(class_='salary')
    type_labels = [node.find('p').text for node in salary_nodes]
    amount_texts = [node.find('h4').text for node in salary_nodes]

    parsed_amounts = [parse_salary(text) for text in amount_texts]
    contract_codes = [get_contract_type(label) for label in type_labels]

    # a later entry overwrites an earlier one that has the same code
    salary_by_contract = dict(zip(contract_codes, parsed_amounts))

    # --- locations ---------------------------------------------------------
    info_row = bs.find('div', class_='additional-info-row')
    try:
        raw_locations = info_row.find('li', class_='text-break').text

        # drop fixed fragments that are not part of the place names
        for noise in ('(Po pandemii)', 'location_on'):
            raw_locations = raw_locations.replace(noise, '')

        locations = [part.strip() for part in raw_locations.split(',')]
    except AttributeError:
        # either the info row or the location item was not found
        locations = ['N/A']

    return {
        'job_title': title.strip(),
        'company': company.strip(),
        'salary': salary_by_contract,
        'locations': locations,
    }

In [15]:
offers = []
data_path = '../data/raw'

# os.scandir yields entries in arbitrary order and also returns
# sub-directories (e.g. .ipynb_checkpoints), which open() cannot read.
# Keep regular files only and sort them so every run processes the pages
# in the same order; the `with` closes the directory iterator.
with os.scandir(data_path) as entries:
    page_files = sorted((e for e in entries if e.is_file()),
                        key=lambda e: e.name)

for entry in page_files:
    print('Processing ' + entry.name)

    with open(entry.path, encoding='UTF-8') as f:
        html = f.read()

    # name the parser explicitly: without it bs4 picks whichever parser is
    # installed, so results may differ between environments
    bs = BeautifulSoup(html, 'html.parser')
    offers += get_page_jobs(bs)

Processing data analyst_1_20210317_121632.html
Processing data analyst_2_20210317_121632.html
Processing data engineer_1_20210317_121632.html
Processing data engineer_2_20210317_121632.html
Processing data engineer_3_20210317_121632.html
Processing data engineer_4_20210317_121632.html
Processing data scientist_1_20210317_121632.html


In [16]:
MAX_ATTEMPTS = 10        # fetch attempts per offer before it is skipped
RETRY_DELAY = 5          # seconds to wait between failed attempts
REQUEST_TIMEOUT = 30     # seconds before a hanging request is abandoned

results = []
for i, offer in enumerate(offers, start=1):
    print(i)
    print(offer['job_url'])

    url = 'https://nofluffjobs.com' + offer['job_url']
    x = None  # stays None when every attempt fails
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            # without a timeout a stalled connection blocks the run forever
            r = requests.get(url, timeout=REQUEST_TIMEOUT)
            x = get_job_data(BeautifulSoup(r.text, 'html.parser'))
            break
        except (AttributeError, requests.RequestException):
            # AttributeError: an expected element was missing from the page.
            # RequestException: network problem or timeout; retrying keeps
            # the offers collected so far instead of aborting the whole run.
            print('Timeout {} of {}'.format(attempt, MAX_ATTEMPTS))
            if attempt < MAX_ATTEMPTS:
                sleep(RETRY_DELAY)

    if x:
        x['id'] = i
        results.append(x)

    # pause between offers
    sleep(5)

1
/pl/job/ux-ui-designer-hl-tech-warsaw-g8dhirzb
2
/pl/job/power-bi-data-analyst-devire-warszawa-dkwkjy8e
3
/pl/job/data-analyst-iiit-warszawa-yzunafaq
4
/pl/job/senior-data-analyst-devire-remote-zpig62pp
5
/pl/job/office-manager-prime-force-warsaw-v8b03tip
6
/pl/job/senior-power-bi-developer-lingaro-remote-zkfyy6qe
7
/pl/job/data-analyst-business-intelligence-westwing-robakowo-1aqisyg9
8
/pl/job/technical-product-devops-kyp-ai-remote-fwv28vkw
9
/pl/job/tester-data-analyst-dpdgroup-it-solutions-warszawa-dvrkq1gg
10
/pl/job/business-analyst-ework-group-gdansk-mdac2ic9
11
/pl/job/data-scientist-gfk-polonia-warszawa-ivqru8sq
12
/pl/job/big-data-analyst-solid-bridge-solutions-warsaw-vg3i0mzi
13
/pl/job/senior-python-developer-data-science-astek-polska-remote-sorrjdky
14
/pl/job/data-scientist-tidio-remote-s0vwd0hw
15
/pl/job/business-analyst-prodata-consult-gdansk-wy5irucl
16
/pl/job/business-analyst-qlogix-krakow-2nadvpc5
17
/pl/job/sap-business-training-coordinator-opel-manufacturing-pol

In [17]:
# Display the offer dicts collected by the scraping loop.
results

ob_title': 'Data Scientist',
  'company': 'Tidio',
  'salary': {'B2B': {'low': '15000', 'high': '22000', 'currency': 'PLN'}},
  'locations': ['Warszawa']},
 {'job_title': 'Business Analyst',
  'company': 'ProData Consult',
  'salary': {'B2B': {'low': '16800', 'high': '18480', 'currency': 'PLN'}},
  'locations': ['Gdańsk']},
 {'job_title': 'Business Analyst',
  'company': 'Qlogix sp. z o. o.',
  'salary': {'B2B': {'low': '9000', 'high': '15000', 'currency': 'PLN'},
   'UoP': {'low': '11000', 'high': '11000', 'currency': 'PLN'}},
  'locations': ['Kraków']},
 {'job_title': 'SAP Business Training Coordinator',
  'company': 'Opel Manufacturing Poland Sp. o.o.',
  'salary': {'UoP': {'low': '6000', 'high': '8000', 'currency': 'PLN'}},
  'locations': ['Gliwice']},
 {'job_title': 'Senior Product Manager',
  'company': 'Precursive Ltd',
  'salary': {'B2B': {'low': '14000', 'high': '16000', 'currency': 'PLN'}},
  'locations': ['N/A']},
 {'job_title': 'Data Scientist',
  'company': 'Samba TV',
  '

In [36]:
# Take the most recently appended offer as a sample record and display it.
# NOTE: this rebinds `x`, the name the scraping loop also uses.
x = results[-1]
x

{'job_title': 'Data Scientist (IoT)',
 'company': 'Nexer Group',
 'salary': {'B2B': {'low': '14448', 'high': '21840', 'currency': 'PLN'},
  'UoP': {'low': '12000', 'high': '18000', 'currency': 'PLN'}},
 'locations': ['Wrocław'],
 'salary_types': ['job_title', 'company', 'salary', 'locations']}

In [28]:
import itertools as it

# Flatten the contract types (keys of the salary dict) and the locations of
# the sample offer into one list. `x['salary'].values` without a call is a
# bound method, which it.chain cannot iterate (TypeError); the keys are the
# contract types this cell is meant to list.
list(it.chain(x['salary'].keys(), x['locations']))

['B2B', 'UoP', 'Wrocław']

In [38]:
# contract types offered in the sample record (keys of its salary dict)
salary_types = [*x['salary'].keys()]

In [40]:
# Display the salary entry of the sample offer.
# NOTE(review): the recorded output is a list of keys, while get_job_data
# stores a dict here - presumably `x` was modified by a cell that is no
# longer in the notebook; confirm with Restart & Run All.
x['salary'] 

['B2B', 'UoP']