# SETUP

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from unicodedata import normalize
import re
import pycountry
from countryinfo import CountryInfo
from currencies import Currency
import requests
import json
from bs4 import BeautifulSoup

# Airline and country

In [2]:
url = "https://www.pilotjobsnetwork.com/"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

country_airlines = []

elements = soup.find_all("a", href=lambda href: href and "jobs" in href)
for element in elements :
    text = element.text.strip()
    country_airlines.append(text)

# country_airlines

In [3]:
airlines_by_country = []
for text in country_airlines:
    # Regex pattern to match the country and airline
    apattern = r'^[^-]*-[^-]*-\s*([^(]+)'
    cpattern = r'(\w+)\s\((\w{2})\)'
    

    country_match = re.search(cpattern, text)
    country = country_match.group(1)
    airline_match = re.search(apattern, text)
    airline = airline_match.group(1)

    # Append the airline and country to the list
    airlines_by_country.append((airline, country))

# Create the table
df = pd.DataFrame(airlines_by_country, columns=['Airline', 'Country'])
df['Country'] = df['Country'].replace('UK', 'United Kingdom')
# # Print the table
# df.head(10)

In [4]:
# Define countries standard name (ISO)
# Define a function to map country names to their ISO codes
def map_country_name(name):
    try:
        return pycountry.countries.search_fuzzy(name)[0].name
    except LookupError:
        return None
# Add a column to the dataframe with the ISO code of the country
df['ISO'] = df['Country'].apply(map_country_name)

In [5]:
df_airline = df

In [6]:
df.to_csv("metadata_ctry_airlines.csv")

# Average gross income by country and Currency code (frozen)

In [7]:
url_3 = "https://www.worlddata.info/average-income.php"
df_3 = pd.read_html(url_3, attrs={'class': 'std100 hover', 'id': 'tabsort'})
df_3 = df_3[0]
df_3['Country'] = df_3['Country'].str.replace(' \*', '', regex=True)

In [8]:
# Define countries standard name (ISO)
# Define a function to map country names to their ISO codes
def map_country_name(name):
    try:
        return pycountry.countries.search_fuzzy(name)[0].name
    except LookupError:
        return None
# Add a column to the dataframe with the ISO code of the country
df_3['ISO'] = df_3['Country'].apply(map_country_name)

In [9]:
# Define a function to get the ISO code for each country
def get_iso_code(country_name):
    try:
        country = pycountry.countries.search_fuzzy(country_name)[0]
        return country.alpha_3
    except LookupError:
        return None
df_3['ISO_code'] = df_3['ISO'].apply(get_iso_code)

In [10]:
def get_currency(country):
    try:
        currency = CountryInfo(country).currencies()
        return currency
    except LookupError:
        return None
df_3['Currency'] = df_3['ISO'].apply(get_currency)

def first_item_to_string(arr):
    return str(arr[0])
# df_3['Currency'] = df_3['Currency'].apply(first_item_to_string)

In [11]:
df_3.columns = ['Rank', 'Country', 'yearly_income', 'monthly_income', 'ISO', 'ISO_code', 'Currency']

In [12]:
df_3["yearly_income"] = df_3["yearly_income"].str.replace(",", "").str.replace("$", "").astype(float)
df_3["monthly_income"] = df_3["monthly_income"].str.replace(",", "").str.replace("$", "").astype(float)
df_3["Currency"] = df_3["Currency"].str.replace("[',\[\]]", "")

  df_3["yearly_income"] = df_3["yearly_income"].str.replace(",", "").str.replace("$", "").astype(float)
  df_3["monthly_income"] = df_3["monthly_income"].str.replace(",", "").str.replace("$", "").astype(float)
  df_3["Currency"] = df_3["Currency"].str.replace("[',\[\]]", "")


In [13]:
df_income_code_cur = df_3

In [14]:
df_3.to_csv("metadata_ctry_salary_2.csv")

# Currency Exchange rates

In [15]:
df_V1 = pd.read_csv("METADATA_V1.csv")
df_V1

Unnamed: 0,Country,year_income_gross,month_income_gross,ISO,Currency
0,Monaco,186080,15507,Monaco,EUR
1,Bermuda,122470,10206,Bermuda,BMD
2,Switzerland,90600,7550,Switzerland,CHF
3,Luxembourg,88190,7349,Luxembourg,EUR
4,Norway,83880,6990,Norway,NOK
...,...,...,...,...,...
64,Pakistan,1470,123,Pakistan,PKR
65,Nepal,1220,102,Nepal,NPR
66,Timor-Leste,1140,95,Timor-Leste,LCU
67,Ethiopia,940,78,Ethiopia,ETB


In [16]:
df_V1.to_json("ctry_salry.json", orient="records")

In [17]:
payload = {}
headers= {
  "apikey": "pPf6uwtTqHvmPDaWzHFzihcpKRAK4SC5"}
from_usd = "USD"
amount_convert = 1
def convert_usd(code_currency):
    try:
        response = requests.request("GET", f"https://api.apilayer.com/exchangerates_data/convert?to={code_currency}&from={from_usd}&amount={amount_convert}", headers=headers, data = payload)
        return json.loads(response.text)["info"]["rate"]
    except LookupError:
        return None

df_V1['XR'] = df_V1['Currency'].apply(convert_usd)

In [18]:
df_V2 = df_V1[["Currency", "XR"]]
df_V2 = df_V2.drop_duplicates()

In [19]:
df_XR = df_V2

In [20]:
df_V2.to_csv("XR_list.csv")


# Index cost of living (frozen)

In [21]:
url_4 = "https://www.numbeo.com/cost-of-living/rankings_by_country.jsp"
df_4 = pd.read_html(url_4, attrs={'id': 't2'})
df_4 = df_4[0]

In [22]:
# Define a function to map country names to their ISO codes
def map_country_name(name):
    try:
        return pycountry.countries.search_fuzzy(name)[0].name
    except LookupError:
        return name
# Add a column to the dataframe with the ISO code of the country
df_4['ISO'] = df_4['Country'].apply(map_country_name)

df_4= df_4.replace("Hong Kong (China)", "Hong Kong")
df_4= df_4.replace("Macao (China)", "Macao")
df_4= df_4.replace("Taiwan (China)", "Taiwan")
df_4= df_4.replace("Kosovo (Disputed Territory)", "Kosovo")
df_4 = df_4[['ISO','Cost of Living Index']]
df_4 = df_4.dropna(how='any',axis=0) 

In [23]:
df_index = df_4

In [24]:
df_4.to_csv("cost_of_living_index.csv")

# Average salary income tax (frozen)

In [25]:
df_5 = pd.read_csv("salary_tax.csv")
# Define a function to map country names to their ISO codes
def map_country_name(name):
    try:
        return pycountry.countries.search_fuzzy(name)[0].name
    except LookupError:
        return name
# Add a column to the dataframe with the ISO code of the country
df_5['ISO'] = df_5['Country'].apply(map_country_name)
df_5["Last"].astype(float)
df_5.columns = ["Country", "Income_tax", "ISO"]

In [26]:
df_tax = df_5

In [27]:
df_5.to_csv("income_tax_updated.csv")

# Inflation rate (frozen)

In [104]:
#Load inflation xls and add ISO countries
df_V3 = pd.read_csv("SQ_pred_database - 7 - Inflation.csv")
df_V3 

# Define a function to get the ISO name for each country
def get_iso_code(name):
    try:
        return pycountry.countries.search_fuzzy(name)[0].name
    except LookupError:
        return None
df_V3['ISO_name'] = df_V3['Country'].apply(get_iso_code)
df_V3

Unnamed: 0,ISO_code,Country,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,ISO_name
0,ABW,Aruba,208,432,063,-237,042,048,-089,-047,358,426,122,074,604,Aruba
1,AFG,Afghanistan,218,118,644,739,467,-066,438,498,063,23,544,506,,Afghanistan
2,AGO,Angola,1448,1348,1028,878,73,916,3238,2984,1963,1708,2102,2385,2135,Angola
3,ALB,Albania,357,341,203,193,162,191,129,199,203,141,162,204,673,Albania
4,ARE,United Arab Emirates,088,088,066,109,234,407,162,197,306,-193,-208,018,522,United Arab Emirates
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188,WSM,Samoa,-02,288,619,-021,-124,192,013,13,42,219,-157,-301,875,Samoa
189,YEM,"Yemen, Rep.",1117,1954,989,1097,81,394,5,304,276,10,2618,457,4385,
190,ZAF,South Africa,406,502,572,578,614,451,659,527,45,413,322,461,7,South Africa
191,ZMB,Zambia,85,644,659,699,782,1011,1786,658,749,98,1573,2202,1099,Zambia


In [105]:
df_inflation = df_V3

In [30]:
df_V3.to_csv("inlfation_rate_ISO.csv")

# Category

In [31]:
from tqdm.contrib.concurrent import thread_map
pd.set_option('display.max_colwidth', None)

#GET AIRLINES URLS
url = 'https://www.pilotjobsnetwork.com/'
max_urls = 5

def get_airlines_urls(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    links = soup.find_all('a')
    for link in links:
        href = link.get('href')
        if href and 'jobs/' in href:
            airline_url = url + href
            yield airline_url
##GET INFO FROM PAGES
def get_airline_tables(airline_url):
    try:
        response = requests.get(airline_url)
        # Parse the HTML content of the response using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')
        # Find all text that contains the phrase "pilot jobs --->"
        matching_text = soup.find_all(text=lambda text: text and "pilot jobs --->" in text)
        # Regular expression to match text between parentheses
        for text in matching_text:
            paren_regex = r'\((.*?)\)'
            categories = []
            for text in matching_text:
                categories += re.findall(paren_regex, text)
            return [airline_url] + categories
    except NameError:
        return airline_url

# MAP IT 
list_category = list(thread_map(get_airline_tables, get_airlines_urls(url)))

#DF AS OUTPUT
df_category = pd.DataFrame([category for category in list_category if type(category) == list])

  from .autonotebook import tqdm as notebook_tqdm
  matching_text = soup.find_all(text=lambda text: text and "pilot jobs --->" in text)
501it [00:33, 14.86it/s]


In [32]:
# Define the regex pattern
regex_pattern = r'[^/]+(?=/?$)'

# Define a function to apply the regex pattern and replace underscores with spaces
def clean_url(url):
    match = re.search(regex_pattern, url)
    cleaned_url = match.group(0).replace('_', ' ')
    return cleaned_url
df_category[0] = df_category[0].apply(clean_url)
df_category.columns = ["Airline", "Category"]
df_category

Unnamed: 0,Airline,Category
0,British Airways Mainline,Legacy
1,Cargo Air,Cargo
2,DHL Air UK,Cargo
3,LOT Polish Airlines,Major/National/Low Cost
4,Peach Aviation,Major/National/Low Cost
...,...,...
495,Aeropartner,Fractional/Corporate
496,Airnorth,Regional
497,FlyinGroup,Fractional/Corporate
498,MS AVIATION,Fractional/Corporate


In [33]:
df_category.to_csv("category.csv")

# Conso df (to train model)

## Loading PPJN data scraping

In [75]:
df_extract_ppjn = pd.read_csv("PPJN_extract_salary.csv")
df_extract_ppjn.head()

#fill in blank with gross + remove Nan salary + convert salary to numeric
df_extract_ppjn["gross-net"] = df_extract_ppjn["gross-net"].fillna("gross")
df_extract_ppjn = df_extract_ppjn.dropna()
df_extract_ppjn["salary"] = df_extract_ppjn["salary"].str.replace(',', '.')
df_extract_ppjn["salary"] = pd.to_numeric(df_extract_ppjn["salary"])
df_extract_ppjn["Year"] = df_extract_ppjn["Year"].astype(int)

## Mapping with metadata

In [76]:
#Create df with all features
df_conso = df_extract_ppjn[["Airline", "Year", "Top/Base", "salary", "currency", "gross-net"]]

#Add category
cat_map = dict(zip(df_category['Airline'], df_category['Category']))
df_conso["Category"] = df_conso["Airline"].map(cat_map)

#Add country
country_map = dict(zip(df_airline['Airline'], df_airline['ISO']))
df_conso["Country"] = df_conso["Airline"].map(country_map)

#Add income
income_map = dict(zip(df_income_code_cur["ISO"], df_income_code_cur["yearly_income"]))
df_conso["Income"] = df_conso["Country"].map(income_map)

#Add cost of living
index_map = dict(zip(df_index["ISO"], df_index["Cost of Living Index"]))
df_conso["Index"] = df_conso["Country"].map(index_map)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_conso["Category"] = df_conso["Airline"].map(cat_map)


## Standardize salary 

In [77]:
# Standardize the salary in USD/GROSS/YEARLY/EST.2023
def standard_salary(x) :
    #rate
    xr_dict = dict(zip(df_XR.Currency, df_XR.XR))
    df_conso['new_salary'] = df_conso.apply(lambda x: x['salary'] * xr_dict[x['currency']] if x['currency'] != 'USD' else x['salary'], axis=1)
    #gross
    tax_dict = dict(zip(df_tax.ISO, df_tax.Income_tax))
    df_conso['new_salary_2'] = df_conso.apply(lambda x: x['new_salary'] * (1+tax_dict[x['Country']]/100) if x['gross-net'] == 'net' else x['new_salary'], axis=1)
    #inflation
    
    

In [106]:
for x in range(2010, 2022):
    df_inflation[f"{x}"] = df_inflation[f"{x}"].str.replace(',', '.').astype(float)
df_inflation

Unnamed: 0,ISO_code,Country,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,ISO_name
0,ABW,Aruba,2.08,4.32,0.63,-2.37,0.42,0.48,-0.89,-0.47,3.58,4.26,1.22,0.74,604,Aruba
1,AFG,Afghanistan,2.18,11.80,6.44,7.39,4.67,-0.66,4.38,4.98,0.63,2.30,5.44,5.06,,Afghanistan
2,AGO,Angola,14.48,13.48,10.28,8.78,7.30,9.16,32.38,29.84,19.63,17.08,21.02,23.85,2135,Angola
3,ALB,Albania,3.57,3.41,2.03,1.93,1.62,1.91,1.29,1.99,2.03,1.41,1.62,2.04,673,Albania
4,ARE,United Arab Emirates,0.88,0.88,0.66,1.09,2.34,4.07,1.62,1.97,3.06,-1.93,-2.08,0.18,522,United Arab Emirates
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188,WSM,Samoa,-0.20,2.88,6.19,-0.21,-1.24,1.92,0.13,1.30,4.20,2.19,-1.57,-3.01,875,Samoa
189,YEM,"Yemen, Rep.",11.17,19.54,9.89,10.97,8.10,39.40,5.00,30.40,27.60,10.00,26.18,45.70,4385,
190,ZAF,South Africa,4.06,5.02,5.72,5.78,6.14,4.51,6.59,5.27,4.50,4.13,3.22,4.61,7,South Africa
191,ZMB,Zambia,8.50,6.44,6.59,6.99,7.82,10.11,17.86,6.58,7.49,9.80,15.73,22.02,1099,Zambia


In [109]:
# Define a dictionary to store the cumulative inflation rates by country
cumulative_inflation = {}

# Loop through the rows in the original dataframe
for i, row in df_conso.iterrows():
    # Get the country and year from the current row
    country = row['Country']
    year = row['Year']
    
    # If this is the first row for this country, set the cumulative inflation to 1
    if country not in cumulative_inflation:
        cumulative_inflation[country] = 1
    
    # Calculate the cumulative inflation rate for this country up to the current year
    cumulative_rate = 1
    for y in range(2010, year):
        cumulative_rate *= (1 + df_inflation.loc[(df_inflation['ISO_name'] == country), str(y)])

    # Calculate the updated salary for 2023
    updated_salary = row['new_salary_2'] * cumulative_rate ** (2023 - year)
    
    # Update the original dataframe with the updated salary
    df_conso.at[i, 'updated salary for 2023'] = updated_salary



TypeError: unsupported operand type(s) for +: 'int' and 'str'

In [80]:
df_conso

Unnamed: 0,Airline,Year,Top/Base,salary,currency,gross-net,Category,Country,Income,Index,new_salary,new_salary_2
0,British Airways Mainline,2023,CaptMax,181477.5,GBP,gross,Legacy,United Kingdom,44480.0,61.5,217301.889504,217301.889504
1,British Airways Mainline,2023,CaptMin,87876.0,GBP,gross,Legacy,United Kingdom,44480.0,61.5,105223.076370,105223.076370
2,Cargo Air,2019,CaptMax,48000.0,EUR,net,Cargo,Bulgaria,,40.5,50847.996271,55932.795898
3,Cargo Air,2019,CaptMin,42000.0,EUR,net,Cargo,Bulgaria,,40.5,44491.996737,48941.196411
4,DHL Air UK,2023,CaptMax,172102.0,GBP,gross,Cargo,United Kingdom,44480.0,61.5,206075.628038,206075.628038
...,...,...,...,...,...,...,...,...,...,...,...,...
101,Swiss International Air Lines,2015,CaptMin,107000.0,CHF,gross,Legacy,Switzerland,90600.0,114.2,115197.450584,115197.450584
102,Atlantic Airways,2023,CaptMax,86528.0,EUR,gross,Regional,Iceland,63460.0,83.3,91661.987945,91661.987945
103,Atlantic Airways,2023,CaptMin,54824.0,EUR,gross,Regional,Iceland,63460.0,83.3,58076.886408,58076.886408
104,Exxaero,2020,CaptMax,78000.0,EUR,gross,Fractional/Corporate,Netherlands,55200.0,68.6,82627.993941,82627.993941


In [88]:
df_inflation

Unnamed: 0,ISO_code,Country,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,ISO_name
0,ABW,Aruba,208,432,063,-237,042,048,-089,-047,358,426,122,074,604,Aruba
1,AFG,Afghanistan,218,118,644,739,467,-066,438,498,063,23,544,506,,Afghanistan
2,AGO,Angola,1448,1348,1028,878,73,916,3238,2984,1963,1708,2102,2385,2135,Angola
3,ALB,Albania,357,341,203,193,162,191,129,199,203,141,162,204,673,Albania
4,ARE,United Arab Emirates,088,088,066,109,234,407,162,197,306,-193,-208,018,522,United Arab Emirates
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188,WSM,Samoa,-02,288,619,-021,-124,192,013,13,42,219,-157,-301,875,Samoa
189,YEM,"Yemen, Rep.",1117,1954,989,1097,81,394,5,304,276,10,2618,457,4385,
190,ZAF,South Africa,406,502,572,578,614,451,659,527,45,413,322,461,7,South Africa
191,ZMB,Zambia,85,644,659,699,782,1011,1786,658,749,98,1573,2202,1099,Zambia


In [90]:
# for col in df_inflation.columns:
#     df_inflation[col] = pd.to_numeric(df_inflation[col], errors='coerce')

# df_inflation

Unnamed: 0,ISO_code,Country,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,ISO_name
0,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188,,,,,,,,,,,,,,,,
189,,,,,,,,,5.0,,,10.0,,,,
190,,,,,,,,,,,,,,,7.0,
191,,,,,,,,,,,,,,,,


Unnamed: 0,ISO_code,Country,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,ISO_name
0,ABW,Aruba,2.08,4.32,0.63,-2.37,0.42,0.48,-0.89,-0.47,3.58,4.26,1.22,0.74,604,Aruba
1,AFG,Afghanistan,2.18,11.80,6.44,7.39,4.67,-0.66,4.38,4.98,0.63,2.30,5.44,5.06,,Afghanistan
2,AGO,Angola,14.48,13.48,10.28,8.78,7.30,9.16,32.38,29.84,19.63,17.08,21.02,23.85,2135,Angola
3,ALB,Albania,3.57,3.41,2.03,1.93,1.62,1.91,1.29,1.99,2.03,1.41,1.62,2.04,673,Albania
4,ARE,United Arab Emirates,0.88,0.88,0.66,1.09,2.34,4.07,1.62,1.97,3.06,-1.93,-2.08,0.18,522,United Arab Emirates
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188,WSM,Samoa,-0.20,2.88,6.19,-0.21,-1.24,1.92,0.13,1.30,4.20,2.19,-1.57,-3.01,875,Samoa
189,YEM,"Yemen, Rep.",11.17,19.54,9.89,10.97,8.10,39.40,5.00,30.40,27.60,10.00,26.18,45.70,4385,
190,ZAF,South Africa,4.06,5.02,5.72,5.78,6.14,4.51,6.59,5.27,4.50,4.13,3.22,4.61,7,South Africa
191,ZMB,Zambia,8.50,6.44,6.59,6.99,7.82,10.11,17.86,6.58,7.49,9.80,15.73,22.02,1099,Zambia
