In [6]:
import requests
from bs4 import BeautifulSoup
from selenium.webdriver import Chrome
import os.path
import glob
import time
import pandas as pd
from datetime import datetime

In [9]:
class Paths:
    notebook = os.path.dirname(os.path.realpath('webscraping_1'))
    proj = os.path.dirname(notebook)
    driver = os.path.join(proj, 'drivers', 'chromedriver')
    data_raw = os.path.join(proj, 'data', 'raw')
    data_interim = os.path.join(proj, 'data', 'interim')
    data_processed = os.path.join(proj, 'data', 'processed')

### Webscraping functions

In [10]:
def get_html_code(job_name, page_num):
    browser = Chrome(Paths.driver)
    browser.get("view-source:" + f'https://nofluffjobs.com/pl/jobs/?criteria=keyword%3D"{job_name}"&page={page_num}')
    html = browser.find_element_by_tag_name('body').text
    
    return html

In [2]:
def any_offers_left(html_code):
    soup = BeautifulSoup(html_code, 'html.parser')
        
    main_container = soup.select_one('nfj-main-content')
    postings_container = main_container.select_one('nfj-postings-list')
    list_container = postings_container.select_one('div.list-container')
    
    if list_container == None:
        return False
    else:
        return True

In [14]:
def save_html_codes(job_name):
    i = 1
    html = get_html_code(job_name, i)
    
    while True:
        time.sleep(3)
        html = get_html_code(job_name, i)
        
        if not any_offers_left(html):
            break
        
        completeName = os.path.join(Paths.data_raw, f"{job_name}_{i}.html")
        
        with open(completeName, 'w', encoding="utf-8") as my_file:
            my_file.write(html)
        i+=1

In [36]:
def get_offer_info(html):
    html_offers_list = []
    
    with open(html, encoding='utf-8') as html:
        soup = BeautifulSoup(html, 'html.parser')
    
    main_container = soup.select_one('nfj-main-content')
    postings_container = main_container.select_one('nfj-postings-list')
    list_container = postings_container.select_one('div.list-container')
    items_container = list_container.select('a.posting-list-item')
    job = soup.select_one('nfj-search-box')
    
    for elem in range(len(items_container)):
        offer = []
        offer.append(items_container[elem]) # kod html zawierajacy info o ofercie
        offer.append(job)                   # kod html zawierajacy info o hasle wyszukiwania
        html_offers_list.append(offer)
    
    return html_offers_list

In [138]:
def offer_data_dict(offer_code, offer_code_job):

    name = offer_code.select_one('h3.posting-title__position').text.strip()
    company = offer_code.select_one('span.d-block').text.strip().replace('@ ', '')
    technology = offer_code.select_one('nfj-posting-item-tags').text.split('PLN')[1].strip()
    job = offer_code_job.select_one('mat-chip.mat-chip').text.strip()
    localization = offer_code.select_one('nfj-posting-item-city')
    if localization == None:
        city = 'Zdalna'
        country = 'N/A'
    else:
        localization = localization.text.strip()
        city = localization.split(', ')[0].strip()
        country = localization.split(', ')[1].strip()
    
    salary = offer_code.select_one('span.text-truncate.badgy.salary')
    salary_formatted = salary.text.strip().split('-')
    try:
        low = str(salary_formatted[0].strip()[:-4]) + " 000"
    except IndexError:
        low = None
    try:
        high = str(salary_formatted[1][:-3].strip()[:-4]) + " 000"
    except IndexError:
        high = None
    try:
        currency = salary_formatted[1][-3:].strip()
    except IndexError:
        currency = None
    
    return {
        'name': name,
        'company': company,
        'technology': technology,
        'job': job,
        'location': {'city': city, 'country': country},
        'salary': {'low': low, 'high': high, 'currency': currency} 
        }

### Executing the code

In [140]:
job_names = [
                'data analyst',
                'data scientist',
                'data engineer'
            ]

for job in job_names:
    save_html_codes(job)

files_list = glob.glob(fr'{Paths.data_raw}\**.html')
offers_in_html = []
offers_dict_list = []
for file in files_list:
    offers_in_html.append(get_offer_info(file))

for i, elem in enumerate(offers_in_html):
    for k, element in enumerate(offers_in_html[i]):
        offers_dict_list.append(offer_data_dict(element[0], element[1]))

  browser = Chrome(Paths.driver)
  html = browser.find_element_by_tag_name('body').text


### Saving the data as a .csv file

In [141]:
df = pd.json_normalize(offers_dict_list, sep='_')
df

Unnamed: 0,name,company,technology,job,location_city,location_country,salary_low,salary_high,salary_currency
0,Data Analyst,FLYR Poland sp.z.o.o,SQL,data analyst,Kraków,POL,11 000,16 000,PLN
1,Junior Data Analyst,Coinfirm,mysql,data analyst,Zdalna,,3 000,5 000,PLN
2,Junior Technical Product Manager,Scalaric,,data analyst,Kraków,POL,5 000,10 000,PLN
3,Business System Analyst (Data&Analytics),Elitmind,,data analyst,Zdalna,,15 000,21 000,PLN
4,Data analyst (analityk danych),Alterdata.io sp. z o.o.,,data analyst,Zdalna,,12 000,21 000,PLN
...,...,...,...,...,...,...,...,...,...
141,Data Scientist,Simon - Kucher & Partners,,data scientist,Zdalna,,13 000,15 000,PLN
142,Senior Data Scientist,Relayr,python,data scientist,Katowice,POL\n + 4,21 000,24 000,PLN
143,Senior Data Scientist,hubQuest,python,data scientist,Zdalna,,25 000,34 000,PLN
144,Data Scientist,Avanade Poland,python,data scientist,Warszawa,POL\n + 3,15 000,25 000,PLN


In [142]:
df.to_csv(
         fr'{Paths.data_interim}\job_offers_{datetime.today().strftime("%d-%m-%Y")}.csv',
        sep=';',
        encoding='utf-8',
        index=False
)



In [145]:
df_1 = pd.read_csv(
    fr'{Paths.data_interim}\job_offers_{datetime.today().strftime("%d-%m-%Y")}.csv',
    sep=';',
    encoding='utf-8',
)
df_1

Unnamed: 0,name,company,technology,job,location_city,location_country,salary_low,salary_high,salary_currency
0,Data Analyst,FLYR Poland sp.z.o.o,SQL,data analyst,Kraków,POL,11 000,16 000,PLN
1,Junior Data Analyst,Coinfirm,mysql,data analyst,Zdalna,,3 000,5 000,PLN
2,Junior Technical Product Manager,Scalaric,,data analyst,Kraków,POL,5 000,10 000,PLN
3,Business System Analyst (Data&Analytics),Elitmind,,data analyst,Zdalna,,15 000,21 000,PLN
4,Data analyst (analityk danych),Alterdata.io sp. z o.o.,,data analyst,Zdalna,,12 000,21 000,PLN
...,...,...,...,...,...,...,...,...,...
141,Data Scientist,Simon - Kucher & Partners,,data scientist,Zdalna,,13 000,15 000,PLN
142,Senior Data Scientist,Relayr,python,data scientist,Katowice,POL\n + 4,21 000,24 000,PLN
143,Senior Data Scientist,hubQuest,python,data scientist,Zdalna,,25 000,34 000,PLN
144,Data Scientist,Avanade Poland,python,data scientist,Warszawa,POL\n + 3,15 000,25 000,PLN
