In [428]:
import re
import glob
import pandas as pd

from bs4 import BeautifulSoup as bs
from datetime import date

# Date stamp (DD_MM_YYYY), computed once when this cell runs; it is embedded in
# the file name of the exported CSV.
today = format(date.today(), '%d_%m_%Y')

def find_files(position: str) -> list:
    """Return the saved listing pages for ``position`` in a stable order.

    Searches ``./../data/raw/{position}_*.html`` relative to the current
    working directory.

    Args:
        position: File-name prefix of the saved pages, e.g. ``python_developer``.

    Returns:
        Matching paths sorted alphabetically; an empty list when nothing matches.
    """
    # glob yields matches in arbitrary, filesystem-dependent order; sorting
    # keeps the row order of the exported CSV reproducible between runs.
    return sorted(glob.glob(f'./../data/raw/{position}_*.html'))

def read_file(path: str):
    """Parse one saved listing page and collect its job-posting elements.

    Args:
        path: Path to an HTML file.

    Returns:
        A list with one entry per element whose class attribute is
        ``list-container ng-star-inserted``; each entry holds the
        ``posting-list-item`` elements found inside that container.
    """
    # Binary mode hands the raw bytes to BeautifulSoup so it detects the
    # document encoding itself instead of depending on the platform's default
    # text encoding.
    with open(path, 'rb') as file:
        # Naming the parser silences bs4's "guessed parser" warning and keeps
        # the parse tree independent of which optional parsers are installed.
        soup = bs(file, 'html.parser')
    containers = soup.find_all(class_="list-container ng-star-inserted")
    return [container.find_all(class_="posting-list-item") for container in containers]

def parse_location(location):
    """Turn a posting's location element into a ``city`` / ``country`` dict.

    Args:
        location: Element exposing a ``.text`` attribute, or ``None`` when the
            posting has no location element.

    Returns:
        A dict that always contains both ``city`` and ``country``:
        * ``None`` input -> ``{'city': 'Zdalna', 'country': 'N/A'}``;
        * ``"City, Country ..."`` -> the city and the first word of the text
          after the first comma;
        * text without a comma -> that text as the city, ``'N/A'`` as country.
    """
    if location is None:
        return {'city': "Zdalna", 'country': "N/A"}

    # Strip every comma-separated part so the city does not keep the padding
    # that may precede the comma.
    parts = [part.strip() for part in location.text.replace('\n', "").split(',')]

    city = parts[0] or "N/A"
    country = "N/A"
    if len(parts) > 1 and parts[1]:
        # Keep only the first word; anything after it in that segment is dropped.
        country = parts[1].split(' ')[0]

    # Both keys are always set so every row ends up with the same columns.
    return {'city': city, 'country': country}

def parse_salary(salary):
    """Extract the salary range and the currency from a posting's salary text.

    Args:
        salary: Raw salary text. Amounts are recognised when written with
            whitespace-separated thousands groups, e.g. ``"10 000 - 15 000 PLN"``.

    Returns:
        A dict with keys ``low``, ``high`` and ``currency``. ``low`` / ``high``
        are ints (the smallest and largest amount found; equal when only one
        amount is present) or the string ``'N/A'`` when no amount is found.
        ``currency`` is the first run of ASCII letters in the text, or ``'N/A'``.
    """
    salary_dict = {}

    # One amount = digits followed by one or more whitespace-separated
    # three-digit groups ("10 000", "1 000 000"). \s also matches the
    # non-breaking space commonly used as a thousands separator.
    amounts = re.findall(r'\d+(?:\s\d{3})+', salary)
    # int() rejects inner whitespace of any kind, so remove all of it rather
    # than only U+00A0.
    bounds = [int(re.sub(r'\s', '', amount)) for amount in amounts]

    if bounds:
        # min/max covers one, two (in either order) or more amounts alike, so
        # 'low' and 'high' are always populated.
        salary_dict['low'], salary_dict['high'] = min(bounds), max(bounds)
    else:
        salary_dict['low'], salary_dict['high'] = 'N/A', 'N/A'

    currency = re.search(r'[a-zA-Z]+', salary)
    salary_dict['currency'] = currency.group(0) if currency else "N/A"

    return salary_dict

def parse_technology(technology):
    """Return the trimmed text of a technology element, or ``"N/A"`` for ``None``.

    Args:
        technology: Element exposing a ``.text`` attribute, or ``None``.

    Returns:
        ``technology.text`` without surrounding whitespace, or ``"N/A"``.
    """
    if technology is None:
        return "N/A"
    label = technology.text
    return label.strip()

def _element_text(element, default=""):
    """Return ``element.text``, or ``default`` when the lookup found nothing."""
    return default if element is None else element.text


def generate_dictionaries(jobs, position):
    """Build one flat-ish dictionary per job posting.

    Args:
        jobs: Iterable of posting elements supporting ``find(...)``.
        position: Value stored under the ``job`` key of every record.

    Returns:
        A list of dicts with keys ``name``, ``company``, ``technology``,
        ``job``, ``location`` (dict from ``parse_location``) and ``salary``
        (dict from ``parse_salary``).
    """
    job_offers = []
    for job in jobs:
        # find() returns None for a missing element; fall back to a placeholder
        # instead of raising AttributeError on ``.text`` and aborting the run
        # because of a single incomplete posting.
        name = _element_text(job.find(class_="posting-title__position"), "N/A").strip()
        company = _element_text(job.find(class_='posting-title__company'), "N/A").strip()
        # An empty string makes parse_salary report 'N/A' for all its fields.
        salary = parse_salary(_element_text(job.find(angularticscategory="engagement")))
        location = parse_location(job.find('nfj-posting-item-city'))
        technology = parse_technology(job.find('common-posting-item-tag'))
        job_offers.append({
            'name': name,
            'company': company,
            'technology': technology,
            'job': position,
            'location': location,
            'salary': salary,
        })
    return job_offers

def get_dataframe(job_offers, position):
    """Flatten a list of job-offer dictionaries into a DataFrame.

    Args:
        job_offers: List of (possibly nested) dictionaries.
        position: Accepted for call-site symmetry; not used by this function.

    Returns:
        The result of ``pd.json_normalize(job_offers)``.
    """
    flattened = pd.json_normalize(job_offers)
    return flattened
    
def save_to_csv(df_list, position):
    """Concatenate the collected frames and write them to one CSV file.

    The file is written to ``./../data/interim/{position}_{today}.csv`` (UTF-8,
    without the index column), where ``today`` is the module-level date stamp.

    Args:
        df_list: List of DataFrames to combine.
        position: Prefix of the output file name.

    Returns:
        None. When ``df_list`` is empty nothing is written.
    """
    # pd.concat raises ValueError on an empty sequence (e.g. when no input
    # files were found), so report it and return instead of crashing.
    if not df_list:
        print('Brak danych do zapisania')
        return
    df = pd.concat(df_list)
    df.to_csv(f'./../data/interim/{position}_{today}.csv', encoding='utf8', index=False)
    print('Zapisano dane do plików .csv')
    
def parse_len(files):
    """Format the number of files with the grammatically correct Polish noun.

    Args:
        files: Sized collection of files.

    Returns:
        ``"1 plik"`` for one file; ``"N pliki"`` when N ends in 2-4 but not in
        12-14 (2, 3, 4, 22, 23, ...); ``"N plików"`` otherwise (0, 5-21, 25, ...).
    """
    count = len(files)
    if count == 1:
        return '1 plik'
    # Polish "few" form: last digit 2-4, except the teens 12-14.
    if count % 10 in (2, 3, 4) and count % 100 not in (12, 13, 14):
        return f'{count} pliki'
    return f"{count} plików"

In [429]:
def main(position):
    """Parse every saved listing page for ``position`` and export one CSV.

    Prints how many files were found and how many postings were read, then
    passes the collected DataFrames to ``save_to_csv``.

    Args:
        position: File-name prefix of the saved pages; underscores are shown
            as spaces in the progress message.
    """
    files = find_files(position)

    print(f"Znaleziono {parse_len(files)} z ofertami pracy dla {position.replace('_', ' ')}")
    df_list = []
    jobs_count = 0

    for file in files:
        # read_file yields one list of postings per listing container.
        for jobs in read_file(file):
            job_offers = generate_dictionaries(jobs, position)
            df_list.append(get_dataframe(job_offers, position))
            jobs_count += len(jobs)
    print(f"Pobrano {jobs_count} ogłoszeń")

    # Nothing was parsed (no files, or no listing containers in them):
    # concatenating an empty list of frames raises ValueError, so skip the export.
    if not df_list:
        return

    save_to_csv(df_list, position)

In [433]:
# Run the whole parse-and-export pipeline for the "python_developer" pages.
main('python_developer')

Znaleziono 5 plików z ofertami pracy dla python developer
Pobrano 100 ogłoszeń
Zapisano dane do plików .csv
