### Imports

In [1]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
# from concurrent.futures import ThreadPoolExecutor
from tqdm.contrib.concurrent import thread_map
pd.set_option('display.max_colwidth', None)

  from .autonotebook import tqdm as notebook_tqdm


### GET AIRLINES URLS ###

In [2]:
url = 'https://www.pilotjobsnetwork.com/'
max_urls = 5

def get_airlines_urls(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    links = soup.find_all('a')
    for link in links:
        href = link.get('href')
        if href and 'jobs/' in href:
            airline_url = url + href
            airline_name = link.text.strip()
            yield airline_url, airline_name # Generator creation

### GET INFO FROM PAGES ###

In [3]:
def get_airline_tables(airline_tuple):
    airline_url, airline_name = airline_tuple
    try:
        list_tables = pd.read_html(airline_url)
        salary = list_tables[3].iloc[2:4,1].tolist()
        # iloc iloc[2:4,2] gets you the dates, but remove the salary ... smthing to 
        last_update = list_tables[3].iloc[2:4,2].tolist()
        return [airline_url, airline_name] + salary + last_update
    except Exception as e:
        print(e)
        try:
            return list_tables[3]
        except NameError:
            return airline_url

### MAP IT !! ###

In [4]:
list_salaries = list(thread_map(get_airline_tables, get_airlines_urls(url)))
# print dans un excel le list_salaries

106it [00:09, 20.86it/s]

'ascii' codec can't encode characters in position 18-19: ordinal not in range(128)


226it [00:16, 20.65it/s]



501it [00:34, 14.65it/s]


### DF AS OUTPUT ###

In [5]:
df_salary = pd.DataFrame([salary for salary in list_salaries if type(salary) == list])
df_salary

Unnamed: 0,0,1,2,3,4,5
0,https://www.pilotjobsnetwork.com/jobs/British_Airways_Mainline,10Mar - UK (Le) - British Airways Mainline,"Short Haul Â£166,517 Long Haul Â£196,438","Short Haul & Long Haul Â£87,876 Based on year 1 on pay scale. Time to command: Short Haul: 1yr Long Haul: 18yrs",9/Mar/23,9/Mar/23
1,https://www.pilotjobsnetwork.com/jobs/Cargo_Air,10Mar - Bulgaria (Ca) - Cargo Air,Basic salary: 4000 EUR/month (after taxes). Per/Diem: 130 EUR/day (after taxes) for each day out of home. Average Total Pay: 6000 EUR/month (after taxes),Basic salary: 3500 EUR/month (after taxes). Per/Diem: 130 EUR/day (after taxes) for each day out of home. Average Total Pay: 5000 EUR/month (after taxes),23/Nov/19,23/Nov/19
2,https://www.pilotjobsnetwork.com/jobs/DHL_Air_UK,9Mar - UK (Ca) - DHL Air UK,172102,121646,9/Mar/23,9/Mar/23
3,https://www.pilotjobsnetwork.com/jobs/LOT_Polish_Airlines,9Mar - Poland (Ma) - LOT Polish Airlines,Longhaul (per month): minimum: 12860 PLN expect: 38500 PLN Shorhaul (per month): minimum: 12340 PLN expect: 35100 PLN,Longhaul (per month): minimum: 11020 PLN expect: 33000 PLN Shorhaul (per month): minimum: 10500 PLN expect: 29900 PLN,2/Dec/22,2/Dec/22
4,https://www.pilotjobsnetwork.com/jobs/Peach_Aviation,9Mar - Japan (Ma) - Peach Aviation,,,,
...,...,...,...,...,...,...
494,https://www.pilotjobsnetwork.com/jobs/Aeropartner,18Aug - Czech Republic (Fr) - Aeropartner,,2200Ã¢âÂ¬ gross,,18/Aug/19
495,https://www.pilotjobsnetwork.com/jobs/Airnorth,13Aug - Australia (Re) - Airnorth,,"45000 bras, 40 metro",,2/Jun/08
496,https://www.pilotjobsnetwork.com/jobs/FlyinGroup,13Aug - Belgium (Fr) - FlyinGroup,,,,
497,https://www.pilotjobsnetwork.com/jobs/MS_AVIATION,12Aug - Austria (Fr) - MS AVIATION,,,,


### ERRORS LIST ###

In [6]:
[salary for salary in list_salaries if type(salary) == str]

['https://www.pilotjobsnetwork.com/jobs/Air_CaraÃ¯bes_Atlantique',

### DF REWORK ###

In [7]:
# First name columns
df_salary.columns = ["URL", "AirlineName", "CaptMax", "CaptMin", "DateCaptMax", "DateCaptMin"]
df_salary

Unnamed: 0,URL,AirlineName,CaptMax,CaptMin,DateCaptMax,DateCaptMin
0,https://www.pilotjobsnetwork.com/jobs/British_Airways_Mainline,10Mar - UK (Le) - British Airways Mainline,"Short Haul Â£166,517 Long Haul Â£196,438","Short Haul & Long Haul Â£87,876 Based on year 1 on pay scale. Time to command: Short Haul: 1yr Long Haul: 18yrs",9/Mar/23,9/Mar/23
1,https://www.pilotjobsnetwork.com/jobs/Cargo_Air,10Mar - Bulgaria (Ca) - Cargo Air,Basic salary: 4000 EUR/month (after taxes). Per/Diem: 130 EUR/day (after taxes) for each day out of home. Average Total Pay: 6000 EUR/month (after taxes),Basic salary: 3500 EUR/month (after taxes). Per/Diem: 130 EUR/day (after taxes) for each day out of home. Average Total Pay: 5000 EUR/month (after taxes),23/Nov/19,23/Nov/19
2,https://www.pilotjobsnetwork.com/jobs/DHL_Air_UK,9Mar - UK (Ca) - DHL Air UK,172102,121646,9/Mar/23,9/Mar/23
3,https://www.pilotjobsnetwork.com/jobs/LOT_Polish_Airlines,9Mar - Poland (Ma) - LOT Polish Airlines,Longhaul (per month): minimum: 12860 PLN expect: 38500 PLN Shorhaul (per month): minimum: 12340 PLN expect: 35100 PLN,Longhaul (per month): minimum: 11020 PLN expect: 33000 PLN Shorhaul (per month): minimum: 10500 PLN expect: 29900 PLN,2/Dec/22,2/Dec/22
4,https://www.pilotjobsnetwork.com/jobs/Peach_Aviation,9Mar - Japan (Ma) - Peach Aviation,,,,
...,...,...,...,...,...,...
494,https://www.pilotjobsnetwork.com/jobs/Aeropartner,18Aug - Czech Republic (Fr) - Aeropartner,,2200Ã¢âÂ¬ gross,,18/Aug/19
495,https://www.pilotjobsnetwork.com/jobs/Airnorth,13Aug - Australia (Re) - Airnorth,,"45000 bras, 40 metro",,2/Jun/08
496,https://www.pilotjobsnetwork.com/jobs/FlyinGroup,13Aug - Belgium (Fr) - FlyinGroup,,,,
497,https://www.pilotjobsnetwork.com/jobs/MS_AVIATION,12Aug - Austria (Fr) - MS AVIATION,,,,


### PIVOT TABLE

In [8]:
df_stack = (df_salary.set_index(['URL', 'AirlineName', 'DateCaptMax', 'DateCaptMin'])
   .rename_axis(['Top/Base'], axis=1)
   .stack(dropna=False) # Put True to remove NaNs
   .reset_index())
df_stack.columns = ['URL', 'AirlineName', 'DateCaptMax', 'DateCaptMin', 'Top/Base', 'Salary_info']
df_stack

Unnamed: 0,URL,AirlineName,DateCaptMax,DateCaptMin,Top/Base,Salary_info
0,https://www.pilotjobsnetwork.com/jobs/British_Airways_Mainline,10Mar - UK (Le) - British Airways Mainline,9/Mar/23,9/Mar/23,CaptMax,"Short Haul Â£166,517 Long Haul Â£196,438"
1,https://www.pilotjobsnetwork.com/jobs/British_Airways_Mainline,10Mar - UK (Le) - British Airways Mainline,9/Mar/23,9/Mar/23,CaptMin,"Short Haul & Long Haul Â£87,876 Based on year 1 on pay scale. Time to command: Short Haul: 1yr Long Haul: 18yrs"
2,https://www.pilotjobsnetwork.com/jobs/Cargo_Air,10Mar - Bulgaria (Ca) - Cargo Air,23/Nov/19,23/Nov/19,CaptMax,Basic salary: 4000 EUR/month (after taxes). Per/Diem: 130 EUR/day (after taxes) for each day out of home. Average Total Pay: 6000 EUR/month (after taxes)
3,https://www.pilotjobsnetwork.com/jobs/Cargo_Air,10Mar - Bulgaria (Ca) - Cargo Air,23/Nov/19,23/Nov/19,CaptMin,Basic salary: 3500 EUR/month (after taxes). Per/Diem: 130 EUR/day (after taxes) for each day out of home. Average Total Pay: 5000 EUR/month (after taxes)
4,https://www.pilotjobsnetwork.com/jobs/DHL_Air_UK,9Mar - UK (Ca) - DHL Air UK,9/Mar/23,9/Mar/23,CaptMax,172102
...,...,...,...,...,...,...
993,https://www.pilotjobsnetwork.com/jobs/FlyinGroup,13Aug - Belgium (Fr) - FlyinGroup,,,CaptMin,
994,https://www.pilotjobsnetwork.com/jobs/MS_AVIATION,12Aug - Austria (Fr) - MS AVIATION,,,CaptMax,
995,https://www.pilotjobsnetwork.com/jobs/MS_AVIATION,12Aug - Austria (Fr) - MS AVIATION,,,CaptMin,
996,https://www.pilotjobsnetwork.com/jobs/ESMA_Aviation,12Aug - France (Re) - ESMA Aviation,,,CaptMax,


### Select date (min/max) based on "CaptMin/CaptMax"

In [9]:
# Create a unique column for date depending on Top/Base column value. Extract Year
df_stack['Date_reworked'] = np.where(df_stack['Top/Base'] == 'CaptMax', df_stack['DateCaptMax'], df_stack['DateCaptMin'])
df_stack['Date_reworked'] = pd.to_datetime(df_stack['Date_reworked'])
df_stack['Year'] = df_stack['Date_reworked'].dt.year
df_stack['Year'] = df_stack['Year'].astype('Int64')


# Split column AirlineName on " - " seperator
df_stack[["DateFromLink", "Country", "Name"]] = df_stack["AirlineName"].str.split(" - ", expand=True)

# # Cleaning columns. Droping non necessary
clean_order = ['URL', 'Country', 'Name', 'Year', 'Top/Base', 'Salary_info']
df_stack = df_stack[clean_order]
df_stack.to_excel('airline_list.xlsx')
df_stack

Unnamed: 0,URL,Country,Name,Year,Top/Base,Salary_info
0,https://www.pilotjobsnetwork.com/jobs/British_Airways_Mainline,UK (Le),British Airways Mainline,2023,CaptMax,"Short Haul Â£166,517 Long Haul Â£196,438"
1,https://www.pilotjobsnetwork.com/jobs/British_Airways_Mainline,UK (Le),British Airways Mainline,2023,CaptMin,"Short Haul & Long Haul Â£87,876 Based on year 1 on pay scale. Time to command: Short Haul: 1yr Long Haul: 18yrs"
2,https://www.pilotjobsnetwork.com/jobs/Cargo_Air,Bulgaria (Ca),Cargo Air,2019,CaptMax,Basic salary: 4000 EUR/month (after taxes). Per/Diem: 130 EUR/day (after taxes) for each day out of home. Average Total Pay: 6000 EUR/month (after taxes)
3,https://www.pilotjobsnetwork.com/jobs/Cargo_Air,Bulgaria (Ca),Cargo Air,2019,CaptMin,Basic salary: 3500 EUR/month (after taxes). Per/Diem: 130 EUR/day (after taxes) for each day out of home. Average Total Pay: 5000 EUR/month (after taxes)
4,https://www.pilotjobsnetwork.com/jobs/DHL_Air_UK,UK (Ca),DHL Air UK,2023,CaptMax,172102
...,...,...,...,...,...,...
993,https://www.pilotjobsnetwork.com/jobs/FlyinGroup,Belgium (Fr),FlyinGroup,,CaptMin,
994,https://www.pilotjobsnetwork.com/jobs/MS_AVIATION,Austria (Fr),MS AVIATION,,CaptMax,
995,https://www.pilotjobsnetwork.com/jobs/MS_AVIATION,Austria (Fr),MS AVIATION,,CaptMin,
996,https://www.pilotjobsnetwork.com/jobs/ESMA_Aviation,France (Re),ESMA Aviation,,CaptMax,


# GTP-3

In [10]:
import openai
import re

# Set up your API key
openai.api_key = "key"

# Define a function to extract salary information
def extract_salary(text):

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
        {"role": "system", "content": "You are a helpful assistant to extract relevant information from complex text."},
        {"role": "user", "content": "Calculate the annual salary in local currency of an airline pilot from this text from https://www.pilotjobsnetwork.com/: '14.540 x 13 = 189.021,43 euro TRE = +1.700 euro/month TRI = +1.600 euro/month LTC = +800 euro/month (+variable allowances for each check/training event)'"},
        {"role": "assistant", "content": "189,021.43 EUR annually"},
        {"role": "user", "content": "Calculate the annual salary in local currency of an airline pilot from this text from https://www.pilotjobsnetwork.com/: '4800'"},
        {"role": "assistant", "content": "57600 annually"},
        {"role": "user", "content": "Calculate the annual salary in local currency of an airline pilot from this text from https://www.pilotjobsnetwork.com/: 'about monthly 3150eur after tax'"},
        {"role": "assistant", "content": "37,800 EUR annually."},
        {"role": "user", "content": "Calculate the annual salary in local currency of an airline pilot from this text from https://www.pilotjobsnetwork.com/: 'see below'"},
        {"role": "assistant", "content": "salary not available."},
        {"role": "user", "content": f"Calculate the annual salary in local currency of an airline pilot from this text from https://www.pilotjobsnetwork.com/: '{text}'"}],
        max_tokens=15,
        temperature=0.2)
    completed_text = response['choices'][0]['message']['content']
    return completed_text

In [11]:
texts = [
    "+2.5% per year",
    "See above",
    "3200",
    "about 2900",
    "8.611,37 x 13 = 111.947,81 euro (or at least +1.300 euro/month after upgrade to CPT). That means all FO with high seniority, who used to be capped at rank 11, will move to CPT rank 4 after upgrade."
]

responses = []
for text in texts : 
    resp = extract_salary(text)
    responses.append(resp)

responses

AuthenticationError: Incorrect API key provided: key. You can find your API key at https://platform.openai.com/account/api-keys.