# SETUP

In [101]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from unicodedata import normalize
import re
import pycountry
from countryinfo import CountryInfo
from currencies import Currency
import requests
import json
from bs4 import BeautifulSoup

# Scraping metadata PPJN and clean country names

In [2]:
url = "https://www.pilotjobsnetwork.com/"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

country_airlines = []

elements = soup.find_all("a", href=lambda href: href and "jobs" in href)
for element in elements :
    text = element.text.strip()
    country_airlines.append(text)

# country_airlines

In [3]:
airlines_by_country = []
for text in country_airlines:
    # Regex pattern to match the country and airline
    apattern = r'^[^-]*-[^-]*-\s*([^(]+)'
    cpattern = r'(\w+)\s\((\w{2})\)'
    

    country_match = re.search(cpattern, text)
    country = country_match.group(1)
    airline_match = re.search(apattern, text)
    airline = airline_match.group(1)

    # Append the airline and country to the list
    airlines_by_country.append((airline, country))

# Create the table
df = pd.DataFrame(airlines_by_country, columns=['Airline', 'Country'])
df['Country'] = df['Country'].replace('UK', 'United Kingdom')
# # Print the table
# df.head(10)

In [4]:
# Define countries standard name (ISO)
# Define a function to map country names to their ISO codes
def map_country_name(name):
    try:
        return pycountry.countries.search_fuzzy(name)[0].name
    except LookupError:
        return None
# Add a column to the dataframe with the ISO code of the country
df['ISO'] = df['Country'].apply(map_country_name)

In [5]:
df.head(10)

Unnamed: 0,Airline,Country,ISO
0,Eurowings GmbH,Germany,Germany
1,Greater Bay Airlines,Kong,Hong Kong
2,Eurowings Europe Ltd,Malta,Malta
3,Avion Express,Lithuania,Lithuania
4,Corendon Airlines,Turkey,Turkey
5,Hifly,Portugal,Portugal
6,Air Japan,Japan,Japan
7,Air Hamburg Luftverkehrsgesellschaft mbH,Germany,Germany
8,Aerowest,Germany,Germany
9,Austrian,Austria,Austria


In [6]:
df.to_csv("metadata_ctry_airlines.csv")

# Extract gross income per country and Currency

In [7]:
url_3 = "https://www.worlddata.info/average-income.php"
df_3 = pd.read_html(url_3, attrs={'class': 'std100 hover', 'id': 'tabsort'})
df_3 = df_3[0]
df_3['Country'] = df_3['Country'].str.replace(' \*', '', regex=True)

In [8]:
# Define countries standard name (ISO)
# Define a function to map country names to their ISO codes
def map_country_name(name):
    try:
        return pycountry.countries.search_fuzzy(name)[0].name
    except LookupError:
        return None
# Add a column to the dataframe with the ISO code of the country
df_3['ISO'] = df_3['Country'].apply(map_country_name)

In [9]:
# Define a function to get the ISO code for each country
def get_iso_code(country_name):
    try:
        country = pycountry.countries.search_fuzzy(country_name)[0]
        return country.alpha_3
    except LookupError:
        return None
df_3['ISO_code'] = df_3['ISO'].apply(get_iso_code)

In [10]:
def get_currency(country):
    try:
        currency = CountryInfo(country).currencies()
        return currency
    except LookupError:
        return None
df_3['Currency'] = df_3['ISO'].apply(get_currency)

def first_item_to_string(arr):
    return str(arr[0])
# df_3['Currency'] = df_3['Currency'].apply(first_item_to_string)

In [11]:
df_3.columns

Index(['Rank', 'Country', 'Avg. income per year', 'Avg. income per month',
       'ISO', 'ISO_code', 'Currency'],
      dtype='object')

In [12]:
df_3.columns = ['Rank', 'Country', 'yearly_income', 'monthly_income', 'ISO', 'ISO_code', 'Currency']

In [13]:
df_3["yearly_income"] = df_3["yearly_income"].str.replace(",", "").str.replace("$", "").astype(float)
df_3["monthly_income"] = df_3["monthly_income"].str.replace(",", "").str.replace("$", "").astype(float)
df_3["Currency"] = df_3["Currency"].str.replace("[',\[\]]", "")

  df_3["yearly_income"] = df_3["yearly_income"].str.replace(",", "").str.replace("$", "").astype(float)
  df_3["monthly_income"] = df_3["monthly_income"].str.replace(",", "").str.replace("$", "").astype(float)
  df_3["Currency"] = df_3["Currency"].str.replace("[',\[\]]", "")


In [14]:
df_3.to_csv("metadata_ctry_salary_2.csv")

In [15]:
df_3

Unnamed: 0,Rank,Country,yearly_income,monthly_income,ISO,ISO_code,Currency
0,1,Monaco,186080.0,15507.0,Monaco,MCO,
1,2,Bermuda,122470.0,10206.0,Bermuda,BMU,
2,3,Switzerland,90600.0,7550.0,Switzerland,CHE,
3,4,Luxembourg,88190.0,7349.0,Luxembourg,LUX,
4,5,Norway,83880.0,6990.0,Norway,NOR,
...,...,...,...,...,...,...,...
64,65,Cambodia,1580.0,132.0,Cambodia,KHM,
65,66,Pakistan,1470.0,123.0,Pakistan,PAK,
66,67,Nepal,1220.0,102.0,Nepal,NPL,
67,68,Timor-Leste,1140.0,95.0,Timor-Leste,TLS,


# Load clean database and get the XR for currencies

In [16]:
df_V1 = pd.read_csv("METADATA_V1.csv")
df_V1

Unnamed: 0,Country,year_income_gross,month_income_gross,ISO,Currency
0,Monaco,186080,15507,Monaco,EUR
1,Bermuda,122470,10206,Bermuda,BMD
2,Switzerland,90600,7550,Switzerland,CHF
3,Luxembourg,88190,7349,Luxembourg,EUR
4,Norway,83880,6990,Norway,NOK
...,...,...,...,...,...
64,Pakistan,1470,123,Pakistan,PKR
65,Nepal,1220,102,Nepal,NPR
66,Timor-Leste,1140,95,Timor-Leste,LCU
67,Ethiopia,940,78,Ethiopia,ETB


In [17]:
# from forex_python.converter import CurrencyRates
# import pandas as pd
# import datetime
# c = CurrencyRates()

# def get_rate(x):
#     dt = datetime.datetime(2023, 3, 3, 10, 00, 00, 100000)
#     try:
#         op = c.get_rate(x, 'USD', dt)
#     except Exception as re:
#         print(re)
#         op=None
#     return op

# df_V1['Rate'] = df_V1['Currency'].apply(get_rate)

# df_V1

Currency Rates Source Not Ready
Currency Rates Source Not Ready
Currency Rates Source Not Ready
Currency Rates Source Not Ready
Currency Rates Source Not Ready
Currency Rates Source Not Ready
Currency Rates Source Not Ready
Currency Rates Source Not Ready
Currency Rates Source Not Ready
Currency Rates Source Not Ready
Currency Rates Source Not Ready
Currency Rates Source Not Ready
Currency Rates Source Not Ready
Currency Rates Source Not Ready
Currency Rates Source Not Ready
Currency Rates Source Not Ready
Currency Rates Source Not Ready
Currency Rates Source Not Ready
Currency Rates Source Not Ready
Currency Rates Source Not Ready
Currency Rates Source Not Ready
Currency Rates Source Not Ready
Currency Rates Source Not Ready
Currency Rates Source Not Ready
Currency Rates Source Not Ready
Currency Rates Source Not Ready
Currency Rates Source Not Ready
Currency Rates Source Not Ready
Currency Rates Source Not Ready
Currency Rates Source Not Ready
Currency Rates Source Not Ready
Currency

Unnamed: 0,Country,year_income_gross,month_income_gross,ISO,Currency,Rate
0,Monaco,186080,15507,Monaco,EUR,
1,Bermuda,122470,10206,Bermuda,BMD,
2,Switzerland,90600,7550,Switzerland,CHF,
3,Luxembourg,88190,7349,Luxembourg,EUR,
4,Norway,83880,6990,Norway,NOK,
...,...,...,...,...,...,...
64,Pakistan,1470,123,Pakistan,PKR,
65,Nepal,1220,102,Nepal,NPR,
66,Timor-Leste,1140,95,Timor-Leste,LCU,
67,Ethiopia,940,78,Ethiopia,ETB,


In [40]:
payload = {}
headers= {
  "apikey": "pPf6uwtTqHvmPDaWzHFzihcpKRAK4SC5"}
from_usd = "USD"
amount_convert = 1
def convert_usd(code_currency):
    try:
        response = requests.request("GET", f"https://api.apilayer.com/exchangerates_data/convert?to={code_currency}&from={from_usd}&amount={amount_convert}", headers=headers, data = payload)
        return json.loads(response.text)["info"]["rate"]
    except LookupError:
        return None

df_V1['XR'] = df_V1['Currency'].apply(convert_usd)

In [42]:
df_V2 = df_V1[["Currency", "XR"]]
df_V2 = df_V2.drop_duplicates()

Unnamed: 0,Currency,XR
0,EUR,0.937799
1,BMD,1.0
2,CHF,0.932565
4,NOK,10.474465
6,USD,1.0
7,DKK,6.97854
8,SGD,1.345925
9,ISK,140.579907
10,QAR,3.640992
11,SEK,10.503835


In [43]:
df_V2.to_csv("XR_list.csv")


# Get the index cost of living from numbeo

In [19]:
url_4 = "https://www.numbeo.com/cost-of-living/rankings_by_country.jsp"
df_4 = pd.read_html(url_4, attrs={'id': 't2'})
df_4 = df_4[0]

In [20]:
# Define a function to map country names to their ISO codes
def map_country_name(name):
    try:
        return pycountry.countries.search_fuzzy(name)[0].name
    except LookupError:
        return name
# Add a column to the dataframe with the ISO code of the country
df_4['ISO'] = df_4['Country'].apply(map_country_name)

df_4= df_4.replace("Hong Kong (China)", "Hong Kong")
df_4= df_4.replace("Macao (China)", "Macao")
df_4= df_4.replace("Taiwan (China)", "Taiwan")
df_4= df_4.replace("Kosovo (Disputed Territory)", "Kosovo")
df_4 = df_4[['ISO','Cost of Living Index']]
df_4 = df_4.dropna(how='any',axis=0) 

In [21]:
df_4.to_csv("cost_of_living_index.csv")

# Get the average salary income tax

In [22]:
# url_5 = "https://tradingeconomics.com/country-list/personal-income-tax-rate"
# df_5 = pd.read_html(url_5, attrs={'id': 'table'})
# df_5 = df_5[0]

In [23]:
df_5 = pd.read_csv("salary_tax.csv")
# Define a function to map country names to their ISO codes
def map_country_name(name):
    try:
        return pycountry.countries.search_fuzzy(name)[0].name
    except LookupError:
        return name
# Add a column to the dataframe with the ISO code of the country
df_5['ISO'] = df_5['Country'].apply(map_country_name)
df_5["Last"].astype(float)
df_5.columns = ["Country", "Income_tax", "ISO"]

In [24]:
df_5.to_csv("income_tax_updated.csv")

# Inflation rate

In [46]:
#Load inflation xls and add ISO countries
df_V3 = pd.read_csv("SQ_pred_database - 7 - Inflation.csv")
df_V3 

# Define a function to get the ISO name for each country
def get_iso_code(name):
    try:
        return pycountry.countries.search_fuzzy(name)[0].name
    except LookupError:
        return None
df_V3['ISO_name'] = df_V3['Country'].apply(get_iso_code)
df_V3

Unnamed: 0,ISO_code,Country,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,ISO_name
0,ABW,Aruba,208,432,063,-237,042,048,-089,-047,358,426,122,074,604,Aruba
1,AFG,Afghanistan,218,118,644,739,467,-066,438,498,063,23,544,506,,Afghanistan
2,AGO,Angola,1448,1348,1028,878,73,916,3238,2984,1963,1708,2102,2385,2135,Angola
3,ALB,Albania,357,341,203,193,162,191,129,199,203,141,162,204,673,Albania
4,ARE,United Arab Emirates,088,088,066,109,234,407,162,197,306,-193,-208,018,522,United Arab Emirates
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188,WSM,Samoa,-02,288,619,-021,-124,192,013,13,42,219,-157,-301,875,Samoa
189,YEM,"Yemen, Rep.",1117,1954,989,1097,81,394,5,304,276,10,2618,457,4385,
190,ZAF,South Africa,406,502,572,578,614,451,659,527,45,413,322,461,7,South Africa
191,ZMB,Zambia,85,644,659,699,782,1011,1786,658,749,98,1573,2202,1099,Zambia


In [47]:
df_V3.to_csv("inlfation_rate_ISO.csv")

# Get the categories by Airline : Passengers, business, cargo

In [109]:
from tqdm.contrib.concurrent import thread_map
pd.set_option('display.max_colwidth', None)

# GET AIRLINES URLS
url = 'https://www.pilotjobsnetwork.com/'
max_urls = 5

def get_airlines_urls(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    links = soup.find_all('a')
    for link in links:
        href = link.get('href')
        if href and 'jobs/' in href:
            airline_url = url + href
            airline_name = link.text.strip()
            yield airline_url, airline_name # crée un générateur

# GET INFO FROM PAGES
def get_airline_tables(airline_tuple):
    airline_url, airline_name = airline_tuple
    try:
        response = requests.get(url)
        # Parse the HTML content of the response using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')
        # Find all text that contains the phrase "pilot jobs --->"
        matching_text = soup.find_all(text=lambda text: text and "pilot jobs --->" in text)
        # Regular expression to match text between parentheses
        for text in matching_text:
            paren_regex = r'\((.*?)\)'
            category = re.search(paren_regex, matching_text)
            if category:
                return [airline_url, airline_name] + category.group(1)
    
    except NameError:
        return airline_url

In [110]:
list_salaries = list(thread_map(get_airline_tables, get_airlines_urls(url)))
# print dans un excel le list_salaries

  matching_text = soup.find_all(text=lambda text: text and "pilot jobs --->" in text)
501it [01:10,  7.15it/s]


In [113]:
list_salaries

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [48]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = 'https://www.pilotjobsnetwork.com/operatorlist.php?reg=North+America'  # replace with actual URL
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
table = soup.select_one('body > table > tbody > tr:nth-child(3) > td > table > tbody > tr > td > table > tbody > tr > td.carea > table > tbody > tr > td > table > tbody > tr:nth-child(2)')

df = pd.read_html(str(table))[0]


ValueError: No tables found

In [67]:
url_8 = 'https://www.pilotjobsnetwork.com/operatorlist.php?reg=North+America'

response = requests.get(url_8).content
soup = BeautifulSoup(response)
table_1 = soup.find_all("table")[0].content

df_8 = pd.read_html(table_1)
df_8

TypeError: cannot parse from 'NoneType'

In [87]:
url_8 = 'https://www.pilotjobsnetwork.com/operatorlist.php?reg=North+America'

response = requests.get(url_8).content
soup = BeautifulSoup(response, "html.parser")


table_1 = soup.body.table

table_1
# df_8 = pd.read_html(str(table_1))
# df_8

<table border="0" cellpadding="0" cellspacing="0" width="100%">
<tr>
<td width="100%">
<table border="0" cellpadding="0" cellspacing="0" width="100%">
<tr>
<td><img src="i/top1.jpg"/></td>
<td background="i/top1bg.jpg" width="100%"><img src="i/spacer.gif"/></td>
</tr>
</table>
</td>
</tr>
<tr>
<td background="i/top2bg.gif" width="100%">
<table border="0" cellpadding="0" cellspacing="0" width="100%">
<tr>
<td><img height="24" src="i/spacer.gif" width="40"/></td>
<td><a href="#"><img border="0" src="i/l1.jpg"/></a></td>
<td> </td>
<td><a href="#"><img border="0" src="i/l2.jpg"/></a></td>
<td> </td>
<td><a href="#"><img border="0" src="i/l3.jpg"/></a></td>
<td> </td>
<td><a href="#"><img border="0" src="i/l4.jpg"/></a></td>
<td> </td>
<td><a href="#"><img border="0" src="i/l5.jpg"/></a></td>
<td> </td>
<td width="100%"><img src="i/spacer.gif"/></td>
</tr>
</table>
</td>
</tr>
<tr>
<td height="395" width="100%">
<table border="0" cellpadding="0" cellspacing="0" width="100%">
<tr>
<td valig

In [93]:

# Send a GET request to the URL and get the HTML response
response = requests.get(url_8).content

# Use BeautifulSoup to parse the HTML response
soup = BeautifulSoup(response, 'html.parser')

# Find all <table> tags in the HTML and store them in a list
tables = soup.find_all('table')

# Loop through each table and convert it into a DataFrame using pandas
for i, table in enumerate(tables):
    df = pd.read_html(table)
    print(f'Table {i+1}:')
    print(df.head())

TypeError: 'NoneType' object is not callable

In [99]:
url_xx = "https://www.pilotjobsnetwork.com/jobs/Eurowings_GmbH"

import requests
from bs4 import BeautifulSoup

# URL of the webpage to scrape
url = url_xx

# Send a GET request to the URL and store the response
response = requests.get(url)

# Parse the HTML content of the response using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# Find all text that contains the phrase "pilot jobs --->"
matching_text = soup.find_all(text=lambda text: text and "pilot jobs --->" in text)

# Print the matching text
print(matching_text)


[' (Major/National/Low Cost) \n                                    pilot jobs ---> Germany']


  matching_text = soup.find_all(text=lambda text: text and "pilot jobs --->" in text)


In [108]:
import re

# Regular expression to match text between parentheses
paren_regex = r'\((.*?)\)'

# Loop through each matching text and extract the text between parentheses
for text in matching_text:
    # Use regex to find the text between parentheses
    match = re.search(paren_regex, text)
    
    # If a match is found, print the text between parentheses
    if match:
        print(match.group(1))

        print(match)
        print(matching_text)


Major/National/Low Cost
<re.Match object; span=(1, 26), match='(Major/National/Low Cost)'>
[' (Major/National/Low Cost) \n                                    pilot jobs ---> Germany']


# Main data

In [25]:
url_3 = "https://www.pilotjobsnetwork.com/jobs/Aegean_Airlines"
df_3 = pd.read_html(url_3, attrs={'class': 'col-2'})
df_3 = df_3[0]

In [26]:
df_3

Unnamed: 0,0,1,2
0,,,
1,Brief Payscale (please state whether before or...,Brief Payscale (please state whether before or...,last update
2,Capt top,5000/month net,16/Feb/15
3,Capt base,3200/month net,16/Feb/15
4,FO top,2100/Month Gross,19/Mar/18
5,FO base,1800,5/Dec/22
6,SO top,,
7,SO base,,
8,FE top,,
9,FE base,,
