# SETUP

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from unicodedata import normalize
import re
import pycountry
from countryinfo import CountryInfo
from currencies import Currency
import requests
import json
from bs4 import BeautifulSoup

# Airline and country

In [2]:
url = "https://www.pilotjobsnetwork.com/"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

country_airlines = []

elements = soup.find_all("a", href=lambda href: href and "jobs" in href)
for element in elements :
    text = element.text.strip()
    country_airlines.append(text)

# country_airlines

In [3]:
airlines_by_country = []
for text in country_airlines:
    # Regex pattern to match the country and airline
    apattern = r'^[^-]*-[^-]*-\s*([^(]+)'
    cpattern = r'(\w+)\s\((\w{2})\)'
    

    country_match = re.search(cpattern, text)
    country = country_match.group(1)
    airline_match = re.search(apattern, text)
    airline = airline_match.group(1)

    # Append the airline and country to the list
    airlines_by_country.append((airline, country))

# Create the table
df = pd.DataFrame(airlines_by_country, columns=['Airline', 'Country'])
df['Country'] = df['Country'].replace('UK', 'United Kingdom')
# # Print the table
# df.head(10)

In [4]:
# Define countries standard name (ISO)
# Define a function to map country names to their ISO codes
def map_country_name(name):
    try:
        return pycountry.countries.search_fuzzy(name)[0].name
    except LookupError:
        return None
# Add a column to the dataframe with the ISO code of the country
df['ISO'] = df['Country'].apply(map_country_name)

In [5]:
df_airline = df

In [6]:
df.to_csv("metadata_ctry_airlines.csv")

# Average gross income by country and Currency code (frozen)

In [7]:
url_3 = "https://www.worlddata.info/average-income.php"
df_3 = pd.read_html(url_3, attrs={'class': 'std100 hover', 'id': 'tabsort'})
df_3 = df_3[0]
df_3['Country'] = df_3['Country'].str.replace(' \*', '', regex=True)

In [8]:
# Define countries standard name (ISO)
# Define a function to map country names to their ISO codes
def map_country_name(name):
    try:
        return pycountry.countries.search_fuzzy(name)[0].name
    except LookupError:
        return None
# Add a column to the dataframe with the ISO code of the country
df_3['ISO'] = df_3['Country'].apply(map_country_name)

In [9]:
# Define a function to get the ISO code for each country
def get_iso_code(country_name):
    try:
        country = pycountry.countries.search_fuzzy(country_name)[0]
        return country.alpha_3
    except LookupError:
        return None
df_3['ISO_code'] = df_3['ISO'].apply(get_iso_code)

In [10]:
def get_currency(country):
    try:
        currency = CountryInfo(country).currencies()
        return currency
    except LookupError:
        return None
df_3['Currency'] = df_3['ISO'].apply(get_currency)

def first_item_to_string(arr):
    return str(arr[0])
# df_3['Currency'] = df_3['Currency'].apply(first_item_to_string)

In [11]:
df_3.columns = ['Rank', 'Country', 'yearly_income', 'monthly_income', 'ISO', 'ISO_code', 'Currency']

In [12]:
df_3["yearly_income"] = df_3["yearly_income"].str.replace(",", "").str.replace("$", "").astype(float)
df_3["monthly_income"] = df_3["monthly_income"].str.replace(",", "").str.replace("$", "").astype(float)
df_3["Currency"] = df_3["Currency"].str.replace("[',\[\]]", "")

  df_3["yearly_income"] = df_3["yearly_income"].str.replace(",", "").str.replace("$", "").astype(float)
  df_3["monthly_income"] = df_3["monthly_income"].str.replace(",", "").str.replace("$", "").astype(float)
  df_3["Currency"] = df_3["Currency"].str.replace("[',\[\]]", "")


In [13]:
df_income_code_cur = df_3

In [14]:
df_3.to_csv("metadata_ctry_salary_2.csv")

# Currency Exchange rates

In [15]:
df_V1 = pd.read_csv("METADATA_V1.csv")
df_V1

Unnamed: 0,Country,year_income_gross,month_income_gross,ISO,Currency
0,Monaco,186080,15507,Monaco,EUR
1,Bermuda,122470,10206,Bermuda,BMD
2,Switzerland,90600,7550,Switzerland,CHF
3,Luxembourg,88190,7349,Luxembourg,EUR
4,Norway,83880,6990,Norway,NOK
...,...,...,...,...,...
64,Pakistan,1470,123,Pakistan,PKR
65,Nepal,1220,102,Nepal,NPR
66,Timor-Leste,1140,95,Timor-Leste,LCU
67,Ethiopia,940,78,Ethiopia,ETB


In [None]:
df_V1.to_json("ctry_salry.json", orient="records")

In [16]:
payload = {}
headers= {
  "apikey": "pPf6uwtTqHvmPDaWzHFzihcpKRAK4SC5"}
from_usd = "USD"
amount_convert = 1
def convert_usd(code_currency):
    try:
        response = requests.request("GET", f"https://api.apilayer.com/exchangerates_data/convert?to={code_currency}&from={from_usd}&amount={amount_convert}", headers=headers, data = payload)
        return json.loads(response.text)["info"]["rate"]
    except LookupError:
        return None

df_V1['XR'] = df_V1['Currency'].apply(convert_usd)

In [17]:
df_V2 = df_V1[["Currency", "XR"]]
df_V2 = df_V2.drop_duplicates()

In [18]:
df_XR = df_V2

In [19]:
df_V2.to_csv("XR_list.csv")


# Index cost of living (frozen)

In [20]:
url_4 = "https://www.numbeo.com/cost-of-living/rankings_by_country.jsp"
df_4 = pd.read_html(url_4, attrs={'id': 't2'})
df_4 = df_4[0]

In [21]:
# Define a function to map country names to their ISO codes
def map_country_name(name):
    try:
        return pycountry.countries.search_fuzzy(name)[0].name
    except LookupError:
        return name
# Add a column to the dataframe with the ISO code of the country
df_4['ISO'] = df_4['Country'].apply(map_country_name)

df_4= df_4.replace("Hong Kong (China)", "Hong Kong")
df_4= df_4.replace("Macao (China)", "Macao")
df_4= df_4.replace("Taiwan (China)", "Taiwan")
df_4= df_4.replace("Kosovo (Disputed Territory)", "Kosovo")
df_4 = df_4[['ISO','Cost of Living Index']]
df_4 = df_4.dropna(how='any',axis=0) 

In [22]:
df_index = df_4

In [23]:
df_4.to_csv("cost_of_living_index.csv")

# Average salary income tax (frozen)

In [24]:
df_5 = pd.read_csv("salary_tax.csv")
# Define a function to map country names to their ISO codes
def map_country_name(name):
    try:
        return pycountry.countries.search_fuzzy(name)[0].name
    except LookupError:
        return name
# Add a column to the dataframe with the ISO code of the country
df_5['ISO'] = df_5['Country'].apply(map_country_name)
df_5["Last"].astype(float)
df_5.columns = ["Country", "Income_tax", "ISO"]

In [25]:
df_tax = df_5

In [26]:
df_5.to_csv("income_tax_updated.csv")

# Inflation rate (frozen)

In [27]:
#Load inflation xls and add ISO countries
df_V3 = pd.read_csv("SQ_pred_database - 7 - Inflation.csv")
df_V3 

# Define a function to get the ISO name for each country
def get_iso_code(name):
    try:
        return pycountry.countries.search_fuzzy(name)[0].name
    except LookupError:
        return None
df_V3['ISO_name'] = df_V3['Country'].apply(get_iso_code)
df_V3

Unnamed: 0,ISO_code,Country,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,ISO_name
0,ABW,Aruba,208,432,063,-237,042,048,-089,-047,358,426,122,074,604,Aruba
1,AFG,Afghanistan,218,118,644,739,467,-066,438,498,063,23,544,506,,Afghanistan
2,AGO,Angola,1448,1348,1028,878,73,916,3238,2984,1963,1708,2102,2385,2135,Angola
3,ALB,Albania,357,341,203,193,162,191,129,199,203,141,162,204,673,Albania
4,ARE,United Arab Emirates,088,088,066,109,234,407,162,197,306,-193,-208,018,522,United Arab Emirates
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188,WSM,Samoa,-02,288,619,-021,-124,192,013,13,42,219,-157,-301,875,Samoa
189,YEM,"Yemen, Rep.",1117,1954,989,1097,81,394,5,304,276,10,2618,457,4385,
190,ZAF,South Africa,406,502,572,578,614,451,659,527,45,413,322,461,7,South Africa
191,ZMB,Zambia,85,644,659,699,782,1011,1786,658,749,98,1573,2202,1099,Zambia


In [28]:
df_inflation = df_V3

In [29]:
df_V3.to_csv("inlfation_rate_ISO.csv")

# Category

In [30]:
from tqdm.contrib.concurrent import thread_map
pd.set_option('display.max_colwidth', None)

#GET AIRLINES URLS
url = 'https://www.pilotjobsnetwork.com/'
max_urls = 5

def get_airlines_urls(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    links = soup.find_all('a')
    for link in links:
        href = link.get('href')
        if href and 'jobs/' in href:
            airline_url = url + href
            yield airline_url
##GET INFO FROM PAGES
def get_airline_tables(airline_url):
    try:
        response = requests.get(airline_url)
        # Parse the HTML content of the response using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')
        # Find all text that contains the phrase "pilot jobs --->"
        matching_text = soup.find_all(text=lambda text: text and "pilot jobs --->" in text)
        # Regular expression to match text between parentheses
        for text in matching_text:
            paren_regex = r'\((.*?)\)'
            categories = []
            for text in matching_text:
                categories += re.findall(paren_regex, text)
            return [airline_url] + categories
    except NameError:
        return airline_url

# MAP IT 
list_category = list(thread_map(get_airline_tables, get_airlines_urls(url)))

#DF AS OUTPUT
df_category = pd.DataFrame([category for category in list_category if type(category) == list])

  from .autonotebook import tqdm as notebook_tqdm
  matching_text = soup.find_all(text=lambda text: text and "pilot jobs --->" in text)
501it [00:37, 13.24it/s]


In [31]:
# Define the regex pattern
regex_pattern = r'[^/]+(?=/?$)'

# Define a function to apply the regex pattern and replace underscores with spaces
def clean_url(url):
    match = re.search(regex_pattern, url)
    cleaned_url = match.group(0).replace('_', ' ')
    return cleaned_url
df_category[0] = df_category[0].apply(clean_url)
df_category.columns = ["Airline", "Category"]
df_category

Unnamed: 0,Airline,Category
0,Eurowings GmbH,Major/National/Low Cost
1,DC Aviation,Fractional/Corporate
2,ASL Airlines Belgium,Cargo
3,Greater Bay Airlines,Major/National/Low Cost
4,Eurowings Europe Ltd,Major/National/Low Cost
...,...,...
495,FlyinGroup,Fractional/Corporate
496,MS AVIATION,Fractional/Corporate
497,ESMA Aviation,Regional
498,Compass Airlines,Major/National/Low Cost


In [32]:
df_category.to_csv("category.csv")