# SETUP

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from unicodedata import normalize
import re
import pycountry
from countryinfo import CountryInfo
from currencies import Currency

# Scraping metadata PPJN and clean country names

In [3]:
import requests
from bs4 import BeautifulSoup

url = "https://www.pilotjobsnetwork.com/"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

country_airlines = []

elements = soup.find_all("a", href=lambda href: href and "jobs" in href)
for element in elements :
    text = element.text.strip()
    country_airlines.append(text)

# country_airlines

In [4]:
airlines_by_country = []
for text in country_airlines:
    # Regex pattern to match the country and airline
    apattern = r'^[^-]*-[^-]*-\s*([^(]+)'
    cpattern = r'(\w+)\s\((\w{2})\)'
    

    country_match = re.search(cpattern, text)
    country = country_match.group(1)
    airline_match = re.search(apattern, text)
    airline = airline_match.group(1)

    # Append the airline and country to the list
    airlines_by_country.append((airline, country))

# Create the table
df = pd.DataFrame(airlines_by_country, columns=['Airline', 'Country'])
df['Country'] = df['Country'].replace('UK', 'United Kingdom')
# # Print the table
# df.head(10)

In [5]:
# Define countries standard name (ISO)
# Define a function to map country names to their ISO codes
def map_country_name(name):
    try:
        return pycountry.countries.search_fuzzy(name)[0].name
    except LookupError:
        return None
# Add a column to the dataframe with the ISO code of the country
df['ISO'] = df['Country'].apply(map_country_name)

In [6]:
df.head(10)

Unnamed: 0,Airline,Country,ISO
0,Aerowest,Germany,Germany
1,Air Hamburg Luftverkehrsgesellschaft mbH,Germany,Germany
2,Austrian,Austria,Austria
3,Heston Airlines,Lithuania,Lithuania
4,HOP,France,France
5,Aegean Airlines,Greece,Greece
6,Cargo Air,Bulgaria,Bulgaria
7,Air Japan,Japan,Japan
8,DC Aviation,Germany,Germany
9,Virgin Atlantic Airways,United Kingdom,United Kingdom


In [7]:
df.to_csv("metadata_ctry_airlines.csv")

# Extract gross income per country and Currency

In [8]:
url_3 = "https://www.worlddata.info/average-income.php"
df_3 = pd.read_html(url_3, attrs={'class': 'std100 hover', 'id': 'tabsort'})
df_3 = df_3[0]
df_3['Country'] = df_3['Country'].str.replace(' \*', '', regex=True)

In [9]:
df_3

Unnamed: 0,Rank,Country,Avg. income per year,Avg. income per month
0,1,Monaco,"186,080 $","15,507 $"
1,2,Bermuda,"122,470 $","10,206 $"
2,3,Switzerland,"90,600 $","7,550 $"
3,4,Luxembourg,"88,190 $","7,349 $"
4,5,Norway,"83,880 $","6,990 $"
...,...,...,...,...
64,65,Pakistan,"1,470 $",123 $
65,66,Nepal,"1,220 $",102 $
66,67,Timor-Leste,"1,140 $",95 $
67,68,Ethiopia,940 $,78 $


In [10]:
# Define countries standard name (ISO)
# Define a function to map country names to their ISO codes
def map_country_name(name):
    try:
        return pycountry.countries.search_fuzzy(name)[0].name
    except LookupError:
        return None
# Add a column to the dataframe with the ISO code of the country
df_3['ISO'] = df_3['Country'].apply(map_country_name)

In [11]:
# Define a function to get the ISO code for each country
def get_iso_code(country_name):
    try:
        country = pycountry.countries.search_fuzzy(country_name)[0]
        return country.alpha_3
    except LookupError:
        return None
df_3['ISO_code'] = df_3['ISO'].apply(get_iso_code)

In [12]:
def get_currency(country):
    try:
        currency = CountryInfo(country).currencies()
        return currency
    except LookupError:
        return None
df_3['Currency'] = df_3['ISO'].apply(get_currency)

def first_item_to_string(arr):
    return str(arr[0])
# df_3['Currency'] = df_3['Currency'].apply(first_item_to_string)

In [13]:
df_3.rename(columns={'Avg. income per year': 'year_income', 'Avg. income per month': 'month_income'}, inplace=True)

In [14]:
df_3.columns

Index(['Rank', 'Country', 'Avg. income per year', 'Avg. income per month',
       'ISO', 'ISO_code', 'Currency'],
      dtype='object')

In [15]:
df_3['Currency'] = df_3['Currency'].astype(str)
df_3['year_income'] = df_3['year_income'].astype(str)
df_3['month_income'] = df_3['month_income'].astype(str)

KeyError: 'year_income'

In [None]:
# df_3["year_income"] = pd.to_numeric(df_3["year_income"].str.replace(",", "").str.replace("$", ""))
# df_3["month_income"] = pd.to_numeric(df_3["month_income"].str.replace(",", "").str.replace("$", ""))
# df_3["Currency"] = df_3["Currency"].str.replace("[", "").str.replace("]", "").str.replace("'", "")

def clean_income(df):
    df["year_income"] = df["year_income"].str.replace(",", "").str.replace("$", "").astype(float)
    df["month_income"] = df["month_income"].str.replace(",", "").str.replace("$", "").astype(float)
    df["Currency"] = df["Currency"].str.replace("[',\[\]]", "")
    return df

df_3 = clean_income(df_3)

KeyError: 'year_income'

In [None]:
df_3.to_csv("metadata_ctry_salary_2.csv")

In [None]:
df_3

Unnamed: 0,Rank,Country,year_income,month_income,ISO,ISO_code,Currency
0,1,Monaco,"186,080 $","15,507 $",Monaco,MCO,EUR
1,2,Bermuda,"122,470 $","10,206 $",Bermuda,BMU,BMD
2,3,Switzerland,"90,600 $","7,550 $",Switzerland,CHE,"CHE, CHF, CHW"
3,4,Luxembourg,"88,190 $","7,349 $",Luxembourg,LUX,EUR
4,5,Norway,"83,880 $","6,990 $",Norway,NOR,NOK
...,...,...,...,...,...,...,...
64,65,Pakistan,"1,470 $",123 $,Pakistan,PAK,PKR
65,66,Nepal,"1,220 $",102 $,Nepal,NPL,NPR
66,67,Timor-Leste,"1,140 $",95 $,Timor-Leste,TLS,
67,68,Ethiopia,940 $,78 $,Ethiopia,ETH,ETB


# Load clean database and get the XR for currencies

In [None]:
df_V1 = pd.read_csv("METADATA_V1.csv")
df_V1

Unnamed: 0,Country,year_income_gross,month_income_gross,ISO,Currency
0,Monaco,186080,15507,Monaco,EUR
1,Bermuda,122470,10206,Bermuda,BMD
2,Switzerland,90600,7550,Switzerland,CHF
3,Luxembourg,88190,7349,Luxembourg,EUR
4,Norway,83880,6990,Norway,NOK
...,...,...,...,...,...
64,Pakistan,1470,123,Pakistan,PKR
65,Nepal,1220,102,Nepal,NPR
66,Timor-Leste,1140,95,Timor-Leste,LCU
67,Ethiopia,940,78,Ethiopia,ETB


In [None]:
from forex_python.converter import CurrencyRates
# Define a function to get XR vs Dollar

def get_rate(cur):
    try:
        c = CurrencyRates()
        obj = datetime.datetime(2014, 5, 23, 18, 36, 28, 151012)
        rate = c.get_rate('USD', cur)
        return rate
    except LookupError:
        return None
df_V1['rate_vs_USD'] = df_V1['Currency'].apply(get_rate)
df_V1

SyntaxError: '(' was never closed (2229328224.py, line 7)

# Main data

In [None]:
url_3 = "https://www.pilotjobsnetwork.com/jobs/Aegean_Airlines"
df_3 = pd.read_html(url_3, attrs={'class': 'col-2'})
df_3 = df_3[0]

In [None]:
df_3

Unnamed: 0,0,1,2
0,,,
1,Brief Payscale (please state whether before or...,Brief Payscale (please state whether before or...,last update
2,Capt top,5000/month net,16/Feb/15
3,Capt base,3200/month net,16/Feb/15
4,FO top,2100/Month Gross,19/Mar/18
5,FO base,1800,5/Dec/22
6,SO top,,
7,SO base,,
8,FE top,,
9,FE base,,
