In [1]:
import pandas as pd
import numpy as np
pd.set_option("display.max_rows", 500)
import math
from google.cloud import bigquery
from google.oauth2 import service_account
import os
import warnings
warnings.filterwarnings(action="ignore")



In [13]:
# Read the crawled flight offers and drop exact duplicate rows
df_crawled_routes = pd.read_excel("flight_data_t_plus_8.xlsx").drop_duplicates()

# Read df_routes and normalise its headers: lower case, spaces replaced by underscores
df_routes = pd.read_excel("flight_routes.xlsx")
df_routes.columns = df_routes.columns.str.lower().str.replace(" ", "_")

# Read the airport data
df_airport_data = pd.read_excel("airport_data.xlsx")
# Split "Location" on its first comma into latitude and longitude (both still strings at this point).
# `n` must be passed by keyword: pandas 2.x made it keyword-only, so the positional form raises TypeError.
df_airport_data[["latitude", "longitude"]] = df_airport_data["Location"].str.split(",", n=1, expand=True)

In [14]:
# Create a function to calculate the distance between two airports based on their latitude and longitude
def haversine_distance(lat1, lon1, lat2, lon2, radius=6371):
    """
    Calculate the great-circle distance between two points on a sphere
    given their latitude and longitude coordinates.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in degrees.
    radius : float, optional
        Radius of the sphere; the result is expressed in the same unit.
        Defaults to 6371, the Earth's radius in kilometers.

    Returns
    -------
    float
        The distance rounded to two decimals. NaN if any coordinate is NaN.
    """
    # Convert latitude and longitude from degrees to radians
    lat1_rad, lon1_rad, lat2_rad, lon2_rad = map(math.radians, (lat1, lon1, lat2, lon2))

    # Haversine formula
    dlon = lon2_rad - lon1_rad
    dlat = lat2_rad - lat1_rad
    a = math.sin(dlat / 2) ** 2 + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(dlon / 2) ** 2

    # Floating-point rounding can push `a` marginally above 1 for (near-)antipodal points,
    # which would make sqrt(1 - a) raise a domain error. A NaN fails this comparison and
    # therefore still propagates to the result unchanged.
    if a > 1.0:
        a = 1.0

    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    return round(radius * c, 2)

In [15]:
# Coordinate lookup with one row per IATA code.
# NOTE(review): drop_duplicates keeps the first row per code, so if airport_data.xlsx lists a code
# more than once with different coordinates, the first occurrence wins. Verify the source file.
airport_coords = df_airport_data[["IataCode", "latitude", "longitude"]].drop_duplicates("IataCode")

# Origin side: attach the coordinates of the *origin* airport (plain "latitude"/"longitude" for now)
df_crawled_routes = df_crawled_routes.merge(
    airport_coords,
    how="left",
    left_on="origin_city_id",
    right_on="IataCode",
).drop(columns="IataCode")

# Destination side: attach the coordinates of the *destination* airport. The column names now clash
# with the ones added above, so the suffixes tag the existing pair as origin and the new pair as arrival.
df_crawled_routes = df_crawled_routes.merge(
    airport_coords,
    how="left",
    left_on="arrival_city_id",
    right_on="IataCode",
    suffixes=["_origin_city", "_arrival_city"],
).drop(columns="IataCode")

# The coordinates came out of a string split, so convert them to numbers
coord_cols = ["latitude_origin_city", "longitude_origin_city", "latitude_arrival_city", "longitude_arrival_city"]
df_crawled_routes[coord_cols] = df_crawled_routes[coord_cols].apply(pd.to_numeric)

# Bring in the columns of df_routes (including the route competition status) by matching the
# searched origin/arrival city names against its departure_city/arrival_city pair.
# Crawled rows without a matching route are kept, with nulls in the df_routes columns.
df_crawled_routes = df_crawled_routes.merge(
    df_routes,
    how="left",
    left_on=["origin_city_search_term", "arrival_city_search_term"],
    right_on=["departure_city", "arrival_city"],
)

# Normalise the competitor names: collapse every "Wizz ..." variant to a single name, replace dots
# and spaces with underscores and lower-case the result (so all Wizz variants end up as "wizz")
def _normalise_competitor(name):
    """Return the normalised competitor name; non-string values (e.g. NaN) are passed through unchanged."""
    if not isinstance(name, str):
        # A missing competitor would otherwise make the `in` test below raise a TypeError
        return name
    if "Wizz" in name:
        name = "Wizz"
    return name.replace(".", "_").replace(" ", "_").lower()

df_crawled_routes["competitor"] = df_crawled_routes["competitor"].apply(_normalise_competitor)

In [16]:
# Compute the distance between the origin and arrival coordinates of every crawled row
df_crawled_routes["flight_distance_km"] = df_crawled_routes.apply(
    lambda flight: haversine_distance(
        flight["latitude_origin_city"],
        flight["longitude_origin_city"],
        flight["latitude_arrival_city"],
        flight["longitude_arrival_city"],
    ),
    axis=1,
)

# Keep the direct flights (stop_count equal to 0) in their own data frame
is_direct = df_crawled_routes["stop_count"] == 0
df_crawled_routes_stop_count_0 = df_crawled_routes[is_direct]

In [17]:
# Choose the relevant columns from the direct-flight data frame
relevant_cols = [
    "origin_city_search_term", "arrival_city_search_term", "route_competition_status",
    "origin_city_id", "latitude_origin_city", "longitude_origin_city", "origin_airport_name", "origin_airport_display_code",
    "arrival_city_id", "latitude_arrival_city", "longitude_arrival_city",  "arrival_airport_name", "arrival_airport_display_code",
    "stop_count", "competitor", "price_eur", "flight_duration", "flight_distance_km", "crawling_date"
]

# Sort the dataset by "crawling_date", "origin_city_search_term", "arrival_city_search_term", "competitor",
# "price_eur" and "flight_duration", so that equally priced flights are ordered by duration
sort_cols = ["crawling_date", "origin_city_search_term", "arrival_city_search_term", "competitor", "price_eur", "flight_duration"]
df_crawled_routes_stop_count_0 = (
    df_crawled_routes_stop_count_0[relevant_cols]
    .sort_values(sort_cols)
    .drop_duplicates()
    .reset_index(drop=True)
)

# Pick the cheapest price_eur in each partition
# We don't include the airports, so we can get one price_eur per competitor
partition_1_cols = [
    "crawling_date", "origin_city_search_term", "origin_city_id", "latitude_origin_city", "longitude_origin_city",
    "arrival_city_search_term", "arrival_city_id", "latitude_arrival_city", "longitude_arrival_city",
    "route_competition_status", "competitor", "stop_count"
]
# method="first" breaks price ties by row order, so exactly one row per partition gets rank 1.
# NOTE(review): groupby leaves out rows whose key columns contain NaN by default, so rows with a
# missing coordinate or route_competition_status get no rank and are filtered out below. Confirm
# this is intended.
df_crawled_routes_stop_count_0["min_price_flag"] = (
    df_crawled_routes_stop_count_0
    .groupby(partition_1_cols, as_index=False)["price_eur"]
    .rank(method="first", ascending=True)
)
df_min_price_per_comp_route = df_crawled_routes_stop_count_0[df_crawled_routes_stop_count_0["min_price_flag"] == 1]

# The more expensive flights are already gone at this point. The former inner join back onto the
# distinct (origin, arrival) pairs of the same data frame matched every row exactly once, so its only
# effect was a fresh 0..n-1 index, which is produced directly here.
df_final = df_min_price_per_comp_route.reset_index(drop=True)

df_final

Unnamed: 0,origin_city_search_term,arrival_city_search_term,route_competition_status,origin_city_id,latitude_origin_city,longitude_origin_city,origin_airport_name,origin_airport_display_code,arrival_city_id,latitude_arrival_city,longitude_arrival_city,arrival_airport_name,arrival_airport_display_code,stop_count,competitor,price_eur,flight_duration,flight_distance_km,crawling_date,min_price_flag
0,Aalborg,Kaunas,Monopoly always,AAL,57.093056,9.850000,Aalborg,AAL,KUN,54.963919,24.084778,Kaunas,KUN,0,ryanair,12.00,100,913.71,2023-06-19,1.0
1,Aalborg,London (GB),Monopoly not always,AAL,57.093056,9.850000,Aalborg,AAL,LOND,51.504117,-0.094347,London Stansted,STN,0,ryanair,12.67,110,894.24,2023-06-19,1.0
2,Aarhus,Faro (PT),Monopoly always,AAR,56.310278,10.618056,Aarhus,AAR,FAO,46.915556,-96.814722,Faro,FAO,0,ryanair,16.80,230,6714.46,2023-06-19,1.0
3,Aarhus,Gdansk,Monopoly always,AAR,56.310278,10.618056,Aarhus,AAR,GDN,54.376558,18.470078,Gdansk,GDN,0,ryanair,19.20,70,540.66,2023-06-19,1.0
4,Aarhus,London (GB),Monopoly not always,AAR,56.310278,10.618056,Aarhus,AAR,LOND,51.504117,-0.094347,London Stansted,STN,0,ryanair,19.38,105,880.42,2023-06-19,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12646,Liverpool,Ibiza,Monopoly not always,LPL,53.335831,-2.855101,Liverpool,LPL,IBZ,38.873611,1.372778,Ibiza,IBZ,0,jet2,130.45,175,1640.09,2024-05-13,1.0
12647,Milan,Lanzarote,2+ comps exc Wizz,MILA,45.468394,9.173017,Milan Linate,LIN,ACE,28.950250,-13.605745,Lamezia Terme,SUF,0,ita_airways,51.75,100,2710.32,2024-05-13,1.0
12648,Newcastle (GB),Fuerteventura,1 comp No Wizz,NCL,55.037778,-1.686667,Newcastle,NCL,FUE,28.452717,-13.863761,Faro,FAO,0,jet2,195.90,200,3113.63,2024-05-13,1.0
12649,Sofia,Bristol (GB),1 comp No Wizz,SOF,42.693412,23.406932,Skelleftea,SFT,BRS,51.383688,-2.713563,Stockholm Arlanda,ARN,0,scandinavian_airlines,60.65,70,2186.74,2024-05-13,1.0


In [18]:
# Change the data frame so that each competitor's price is shown as one column
pivot_table_index_cols = [
    "crawling_date", "origin_city_search_term", "origin_city_id", "latitude_origin_city", "longitude_origin_city",
    "arrival_city_search_term", "arrival_city_id", "latitude_arrival_city", "longitude_arrival_city", "route_competition_status", "flight_distance_km", "stop_count"
]
# pivot_table aggregates with the mean when no aggfunc is given.
# NOTE(review): this assumes df_final holds a single price per index combination and competitor,
# otherwise the value shown is an average. Verify.
df_final_pivot = pd.pivot_table(
    data=df_final,
    values="price_eur",
    index=pivot_table_index_cols,
    columns=["competitor"]
).reset_index()

# Display Ryanair and Wizz as the first two competitors (each only if it is present in the data,
# so a crawl without one of them does not fail with a KeyError)
all_df_final_pivot_cols = df_final_pivot.columns.tolist()
first_cols = pivot_table_index_cols + [comp for comp in ["ryanair", "wizz"] if comp in all_df_final_pivot_cols]
# Keep the remaining competitor columns in the order the pivot produced them. A set difference
# would shuffle them differently from run to run and make the resulting table layout unstable.
exclusion_list = [col for col in all_df_final_pivot_cols if col not in first_cols]
rearranged_col_list = first_cols + exclusion_list
# Reorder the columns and remove the name "competitor" from the column index
df_final_pivot = df_final_pivot[rearranged_col_list].rename_axis(None, axis=1)

# Display the final table
df_final_pivot

Unnamed: 0,crawling_date,origin_city_search_term,origin_city_id,latitude_origin_city,longitude_origin_city,arrival_city_search_term,arrival_city_id,latitude_arrival_city,longitude_arrival_city,route_competition_status,...,eastern_airways,iberia,iberia_express,aer_lingus,sky_express,twin_jet,azman_air_services,easyjet_europe,tarom,ethiopian_airlines
0,2023-06-19,Aalborg,AAL,57.093056,9.850000,Kaunas,KUN,54.963919,24.084778,Monopoly always,...,,,,,,,,,,
1,2023-06-19,Aalborg,AAL,57.093056,9.850000,London (GB),LOND,51.504117,-0.094347,Monopoly not always,...,,,,,,,,,,
2,2023-06-19,Aarhus,AAR,56.310278,10.618056,Faro (PT),FAO,46.915556,-96.814722,Monopoly always,...,,,,,,,,,,
3,2023-06-19,Aarhus,AAR,56.310278,10.618056,Gdansk,GDN,54.376558,18.470078,Monopoly always,...,,,,,,,,,,
4,2023-06-19,Aarhus,AAR,56.310278,10.618056,London (GB),LOND,51.504117,-0.094347,Monopoly not always,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7226,2024-05-13,Zadar,ZAD,44.097778,15.356667,Frankfurt,FRAN,50.117272,8.644397,1 comp No Wizz,...,,,,,,,,,,
7227,2024-05-13,Zadar,ZAD,44.097778,15.356667,Vienna,VIE,48.122100,16.557510,1 comp No Wizz,...,,,,,,,,,,
7228,2024-05-13,Zagreb,ZAG,45.740863,16.067501,Frankfurt,FRAN,50.117272,8.644397,1 comp No Wizz,...,,,,,,,,,,
7229,2024-05-13,Zagreb,ZAG,45.740863,16.067501,Paris (FR),PARI,48.856622,2.342876,2+ comps exc Wizz,...,,,,,,,,,,


In [19]:
# Append the rest of the routes that did not have direct flights to df_final_pivot

# Route-level key columns: the pivot index without the stop count (selected by name rather than
# by position, so reordering pivot_table_index_cols cannot silently drop the wrong column)
route_key_cols = [col for col in pivot_table_index_cols if col != "stop_count"]

# First create a data frame containing the indirect routes: left-join every crawled route onto the
# routes that have a direct flight; routes without a match come back with a null stop_count
df_indirect_routes = pd.merge(
    left=df_crawled_routes[route_key_cols].drop_duplicates().reset_index(drop=True),
    right=df_crawled_routes_stop_count_0[pivot_table_index_cols].drop_duplicates().reset_index(drop=True),
    on=route_key_cols,
    how="left"
)
df_indirect_routes = df_indirect_routes[df_indirect_routes["stop_count"].isnull()].reset_index(drop=True)
# Label only the stop_count column. A frame-wide fillna would also write the label into any missing
# coordinate, competition status or distance and turn those columns into mixed text/number columns.
df_indirect_routes["stop_count"] = "> 0 stops"

# Next, create df_final_pivot_full by stacking the direct and the indirect routes.
# ignore_index avoids duplicate row labels, since both inputs are numbered from 0.
df_final_pivot_full = pd.concat(
    [df_final_pivot, df_indirect_routes], axis=0, ignore_index=True
)

# Convert stop_count to string so the column has a single type ("0" or "> 0 stops").
# NOTE(review): this yields "0" only if the direct-flight stop_count is an integer 0; a float 0.0
# would become "0.0". Verify the dtype coming out of the pivot.
df_final_pivot_full["stop_count"] = df_final_pivot_full["stop_count"].map(str)

# Add a partition field to allow for easier filtering
def partition_func(df, ryanair_col, stop_count_col):
    """
    Classify one row into a data partition.

    Parameters
    ----------
    df : mapping-like row (e.g. a pandas Series passed by DataFrame.apply with axis=1)
        Must support lookup by ryanair_col and stop_count_col.
    ryanair_col : str
        Key of the Ryanair price.
    stop_count_col : str
        Key of the stop count, expected as the strings "0" or "> 0 stops".

    Returns
    -------
    str or None
        "partition_1" for stop count "0" with a Ryanair price,
        "partition_2" for stop count "0" without a Ryanair price,
        "partition_3" for stop count "> 0 stops",
        None for any other stop count.
    """
    stop_count = df[stop_count_col]
    if stop_count == "0":
        # pd.isna also accepts None and non-float values, and plain `not` avoids relying on
        # bitwise `~`, which is only a logical negation for NumPy booleans
        return "partition_2" if pd.isna(df[ryanair_col]) else "partition_1"
    if stop_count == "> 0 stops":
        return "partition_3"
    return None

# Classify every row of the combined table and store the label in "data_partition"
partition_labels = df_final_pivot_full.apply(
    lambda row: partition_func(row, ryanair_col="ryanair", stop_count_col="stop_count"),
    axis=1,
)
df_final_pivot_full["data_partition"] = partition_labels

In [22]:
# Upload the data to BigQuery
BQ_PROJECT_ID = "web-scraping-371310"
BQ_DATASET = f"{BQ_PROJECT_ID}.crawled_datasets"

# Service-account key file expected in the user's home directory
credentials = service_account.Credentials.from_service_account_file(
    filename=os.path.join(os.path.expanduser("~"), "bq_credentials.json"),
    scopes=["https://www.googleapis.com/auth/cloud-platform"]
)
client = bigquery.Client(project=BQ_PROJECT_ID, credentials=credentials)

# One load configuration per table; both replace the destination table's contents on every run.
# No explicit schema is set (the previous schema list for the raw table was empty).
job_config_raw = bigquery.LoadJobConfig()
job_config_raw.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

job_config_pivot = bigquery.LoadJobConfig()
job_config_pivot.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

# Upload the raw data with its own configuration (it previously reused job_config_pivot,
# which left job_config_raw unused)
client.load_table_from_dataframe(
    dataframe=df_crawled_routes,
    destination=f"{BQ_DATASET}.benoit_flight_route_data_raw",
    job_config=job_config_raw
).result()

# Upload the pivoted table; .result() blocks until the load job has finished
client.load_table_from_dataframe(
    dataframe=df_final_pivot_full,
    destination=f"{BQ_DATASET}.benoit_flight_route_data_pivoted",
    job_config=job_config_pivot
).result()


LoadJob<project=web-scraping-371310, location=EU, id=abedfea6-5863-4879-9066-766bef61c4fc>