In [275]:
import pandas as pd
pd.set_option("display.max_rows", 500)
import math
from google.cloud import bigquery
from google.oauth2 import service_account
import os
import warnings
warnings.filterwarnings(action="ignore")

In [276]:
# Read the crawled flight offers; exact duplicate rows are dropped right away
df_crawled_routes = pd.read_excel("flight_data.xlsx").drop_duplicates()

# Read the route reference table and normalise its headers to lower snake_case
df_routes = pd.read_excel("flight_routes.xlsx")
df_routes.columns = df_routes.columns.str.lower().str.replace(" ", "_")

# Read the airport data
df_airport_data = pd.read_excel("airport_data.xlsx")
# Split "Location" on its first comma into the latitude and longitude columns.
# `n` has to be passed by keyword: passing it positionally is deprecated in pandas 1.x
# and raises a TypeError in pandas 2.x. Both new columns are still text at this point.
df_airport_data[["latitude", "longitude"]] = df_airport_data["Location"].str.split(",", n=1, expand=True)

In [277]:
# Create a function to calculate the distance between two airports based on their latitude and longitude
def haversine_distance(lat1, lon1, lat2, lon2):
    """
    Calculate the great-circle distance between two points on the Earth's surface
    using the haversine formula.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in degrees.

    Returns
    -------
    float
        Distance in kilometres (Earth radius of 6371 km), rounded to 2 decimals.
        If any input is NaN the result is NaN.
    """
    # Convert latitude and longitude from degrees to radians
    lat1_rad = math.radians(lat1)
    lon1_rad = math.radians(lon1)
    lat2_rad = math.radians(lat2)
    lon2_rad = math.radians(lon2)

    # Haversine formula
    dlon = lon2_rad - lon1_rad
    dlat = lat2_rad - lat1_rad
    a = math.sin(dlat / 2) ** 2 + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(dlon / 2) ** 2

    # Floating-point rounding can push `a` marginally above 1 for (near-)antipodal
    # points, which would make math.sqrt(1 - a) raise ValueError. An explicit
    # comparison is used instead of min() so that a NaN value still propagates.
    if a > 1.0:
        a = 1.0

    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    radius = 6371  # Radius of the Earth in kilometers
    distance = radius * c

    return round(distance, 2)

In [278]:
# Add the latitude and longitude fields of the *origin* and *destination* airports.
# One row per IataCode is kept (the first one encountered) so the left joins cannot multiply rows.
# NOTE(review): the rendered pivot output shows e.g. FAO ("Faro (PT)") at roughly (46.9, -96.8),
# which does not look like a location in Portugal - verify that the first row per IataCode
# in airport_data.xlsx is the intended one.
airport_coords = df_airport_data[["IataCode", "latitude", "longitude"]].drop_duplicates("IataCode")
for endpoint in ("origin", "arrival"):
    df_crawled_routes = pd.merge(
        left=df_crawled_routes,
        # Name the coordinate columns explicitly rather than relying on merge suffixes
        right=airport_coords.rename(columns={
            "latitude": "latitude_" + endpoint + "_city",
            "longitude": "longitude_" + endpoint + "_city",
        }),
        left_on=endpoint + "_city_id",
        right_on="IataCode",
        how="left"
    ).drop(["IataCode"], axis=1)

# Change the lat/long columns to a numeric type (they come out of the string split as text)
coord_cols = ["latitude_origin_city", "longitude_origin_city", "latitude_arrival_city", "longitude_arrival_city"]
df_crawled_routes[coord_cols] = df_crawled_routes[coord_cols].apply(pd.to_numeric)

# Add the route competition status from df_routes
# NOTE(review): if df_routes holds more than one row per (departure_city, arrival_city)
# this left join duplicates crawled rows - confirm the reference table is unique on that pair.
df_crawled_routes = pd.merge(
    left=df_crawled_routes,
    right=df_routes,
    left_on=["origin_city_search_term", "arrival_city_search_term"],
    right_on=["departure_city", "arrival_city"],
    how="left"
)

# Change all instances of Wizz Air {x} to just Wizz, replace any dot or space character in
# "competitor" with an underscore, and finally lower-case the name.
# Vectorised .str methods are used so that a missing competitor stays missing instead of
# raising "argument of type 'float' is not iterable" inside a Python lambda.
competitor_names = df_crawled_routes["competitor"]
is_wizz = competitor_names.str.contains("Wizz", regex=False, na=False)
df_crawled_routes["competitor"] = (
    competitor_names.mask(is_wizz, "Wizz")
    .str.replace(".", "_", regex=False)
    .str.replace(" ", "_", regex=False)
    .str.lower()
)

In [279]:
# Add the distance field (great-circle distance in km between origin and arrival coordinates).
# The four coordinate columns are iterated in lockstep instead of using a row-wise
# DataFrame.apply(axis=1): that avoids building one Series per row and also works on an
# empty frame, where apply(axis=1) returns a DataFrame that cannot be assigned to one column.
df_crawled_routes["flight_distance_km"] = [
    haversine_distance(lat1=lat1, lon1=lon1, lat2=lat2, lon2=lon2)
    for lat1, lon1, lat2, lon2 in zip(
        df_crawled_routes["latitude_origin_city"],
        df_crawled_routes["longitude_origin_city"],
        df_crawled_routes["latitude_arrival_city"],
        df_crawled_routes["longitude_arrival_city"],
    )
]
df_crawled_routes

Unnamed: 0,price_eur,origin_airport_name,origin_airport_display_code,arrival_airport_name,arrival_airport_display_code,flight_departure_time,flight_arrival_time,competitor,flight_duration,origin_city_search_term,...,arrival_city_id,crawling_date,latitude_origin_city,longitude_origin_city,latitude_arrival_city,longitude_arrival_city,departure_city,arrival_city,route_competition_status,flight_distance_km
0,213.10,Aalborg,AAL,Kaunas,KUN,2023-06-04T13:05:00,2023-06-04T17:55:00,dat,230,Aalborg,...,KUN,2023-06-04,57.093056,9.850000,54.963919,24.084778,Aalborg,Kaunas,Monopoly always,913.71
1,241.00,Aalborg,AAL,Kaunas,KUN,2023-06-04T13:20:00,2023-06-04T17:55:00,norwegian,215,Aalborg,...,KUN,2023-06-04,57.093056,9.850000,54.963919,24.084778,Aalborg,Kaunas,Monopoly always,913.71
2,228.32,Aalborg,AAL,Kaunas,KUN,2023-06-04T12:55:00,2023-06-04T17:55:00,scandinavian_airlines,240,Aalborg,...,KUN,2023-06-04,57.093056,9.850000,54.963919,24.084778,Aalborg,Kaunas,Monopoly always,913.71
3,152.83,Aalborg,AAL,Kaunas,KUN,2023-06-04T09:05:00,2023-06-04T17:55:00,norwegian,470,Aalborg,...,KUN,2023-06-04,57.093056,9.850000,54.963919,24.084778,Aalborg,Kaunas,Monopoly always,913.71
4,205.95,Aalborg,AAL,Kaunas,KUN,2023-06-04T09:40:00,2023-06-04T17:55:00,scandinavian_airlines,435,Aalborg,...,KUN,2023-06-04,57.093056,9.850000,54.963919,24.084778,Aalborg,Kaunas,Monopoly always,913.71
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172352,626.00,Zaragoza,ZAZ,Paris Beauvais,BVA,2023-06-04T18:10:00,2023-06-05T10:05:00,vueling_airlines,955,Zaragoza,...,PARI,2023-06-04,41.663423,-1.011086,48.856622,2.342876,Zaragoza,Paris (FR),1 comp No Wizz,841.60
172353,585.00,Zaragoza,ZAZ,Paris Beauvais,BVA,2023-06-04T18:10:00,2023-06-05T11:15:00,vueling_airlines,1025,Zaragoza,...,PARI,2023-06-04,41.663423,-1.011086,48.856622,2.342876,Zaragoza,Paris (FR),1 comp No Wizz,841.60
172354,616.00,Zaragoza,ZAZ,Paris Orly,ORY,2023-06-04T13:20:00,2023-06-05T07:20:00,ryanair,1080,Zaragoza,...,PARI,2023-06-04,41.663423,-1.011086,48.856622,2.342876,Zaragoza,Paris (FR),1 comp No Wizz,841.60
172355,500.70,Zaragoza,ZAZ,Paris Orly,ORY,2023-06-04T18:10:00,2023-06-05T17:55:00,vueling_airlines,1425,Zaragoza,...,PARI,2023-06-04,41.663423,-1.011086,48.856622,2.342876,Zaragoza,Paris (FR),1 comp No Wizz,841.60


In [280]:
# Choose the relevant columns from the data frame
relevant_cols = [
    "origin_city_search_term", "arrival_city_search_term", "route_competition_status",
    "origin_city_id", "latitude_origin_city", "longitude_origin_city", "origin_airport_name", "origin_airport_display_code",
    "arrival_city_id", "latitude_arrival_city", "longitude_arrival_city",  "arrival_airport_name", "arrival_airport_display_code",
    "competitor", "price_eur", "flight_duration", "flight_distance_km", "crawling_date"
]

# Sort the dataset by "crawling_date", "origin_city_search_term", "arrival_city_search_term",
# "competitor", "price_eur" and "flight_duration", then drop exact duplicates.
# Note that df_crawled_routes is narrowed to relevant_cols from here on.
sort_cols = ["crawling_date", "origin_city_search_term", "arrival_city_search_term", "competitor", "price_eur", "flight_duration"]
df_crawled_routes = df_crawled_routes[relevant_cols].sort_values(sort_cols).drop_duplicates().reset_index(drop=True)

# Pick the cheapest price_eur in each partition
# We don't include the airports, so we can get one price_eur per competitor
# NOTE(review): groupby drops rows whose key columns contain NaN by default, so offers
# without matched coordinates or without a route_competition_status disappear here -
# confirm that this is intended.
partition_1_cols = [
    "crawling_date", "origin_city_search_term", "origin_city_id", "latitude_origin_city", "longitude_origin_city",
    "arrival_city_search_term", "arrival_city_id", "latitude_arrival_city", "longitude_arrival_city",
    "route_competition_status", "competitor",
]
df_min_price_per_comp_route = df_crawled_routes.groupby(partition_1_cols, as_index=False)["price_eur"].min()

# Re-attach flight_distance_km to the per-competitor minimum prices.
# The join uses the full route key (search terms, city ids and coordinates) rather than the
# two search terms alone: the distance is computed from the coordinates, so each full key has
# exactly one distance and the join cannot fan out rows. validate= asserts that invariant.
route_key_cols = [
    "origin_city_search_term", "origin_city_id", "latitude_origin_city", "longitude_origin_city",
    "arrival_city_search_term", "arrival_city_id", "latitude_arrival_city", "longitude_arrival_city",
]
df_final = pd.merge(
    left=df_min_price_per_comp_route,
    right=df_crawled_routes[route_key_cols + ["flight_distance_km"]].drop_duplicates(),
    on=route_key_cols,
    how="inner",
    validate="many_to_one"
)
df_final

Unnamed: 0,crawling_date,origin_city_search_term,origin_city_id,latitude_origin_city,longitude_origin_city,arrival_city_search_term,arrival_city_id,latitude_arrival_city,longitude_arrival_city,route_competition_status,competitor,price_eur,flight_distance_km
0,2023-06-04,Aalborg,AAL,57.093056,9.850000,Kaunas,KUN,54.963919,24.084778,Monopoly always,dat,139.79,913.71
1,2023-06-04,Aalborg,AAL,57.093056,9.850000,Kaunas,KUN,54.963919,24.084778,Monopoly always,klm,310.00,913.71
2,2023-06-04,Aalborg,AAL,57.093056,9.850000,Kaunas,KUN,54.963919,24.084778,Monopoly always,norwegian,113.94,913.71
3,2023-06-04,Aalborg,AAL,57.093056,9.850000,Kaunas,KUN,54.963919,24.084778,Monopoly always,ryanair,385.82,913.71
4,2023-06-04,Aalborg,AAL,57.093056,9.850000,Kaunas,KUN,54.963919,24.084778,Monopoly always,scandinavian_airlines,165.88,913.71
...,...,...,...,...,...,...,...,...,...,...,...,...,...
39191,2023-06-04,Zaragoza,ZAZ,41.663423,-1.011086,Santiago de Compostela,SCQ,42.900000,-8.416800,Monopoly always,vueling_airlines,477.53,624.34
39192,2023-06-04,Zaragoza,ZAZ,41.663423,-1.011086,Venice (IT),VENI,45.466510,12.345653,1 comp No Wizz,ryanair,107.00,1154.51
39193,2023-06-04,Zaragoza,ZAZ,41.663423,-1.011086,Venice (IT),VENI,45.466510,12.345653,1 comp No Wizz,volotea,74.00,1154.51
39194,2023-06-04,Zaragoza,ZAZ,41.663423,-1.011086,Venice (IT),VENI,45.466510,12.345653,1 comp No Wizz,vueling_airlines,484.00,1154.51


In [281]:
# Change the data frame so that each competitor's price is shown as one column
pivot_table_index_cols = [
    "crawling_date", "origin_city_search_term", "origin_city_id", "latitude_origin_city", "longitude_origin_city",
    "arrival_city_search_term", "arrival_city_id", "latitude_arrival_city", "longitude_arrival_city", "route_competition_status", "flight_distance_km"
]
df_final_pivot = pd.pivot_table(
    data=df_final,
    values="price_eur",
    index=pivot_table_index_cols,
    columns=["competitor"],
    # Explicit aggregation: should several prices share a cell, keep the cheapest
    # rather than silently averaging them (pivot_table defaults to the mean).
    aggfunc="min"
).reset_index()

# Display Ryan Air and Wizz as the first two competitors
all_df_final_pivot_cols = df_final_pivot.columns.tolist()
first_cols = pivot_table_index_cols + ["ryanair", "wizz"]
# Keep the remaining competitors in the pivot's own column order. A set difference
# would yield an arbitrary order that can change between runs, and with it the
# column order of the uploaded table.
exclusion_list = [col for col in all_df_final_pivot_cols if col not in first_cols]
rearranged_col_list = first_cols + exclusion_list
# reindex (rather than plain column selection) so that a crawl without any ryanair or
# wizz offer produces an empty column instead of a KeyError.
# rename_axis removes the name "competitor" from the column index.
df_final_pivot = df_final_pivot.reindex(columns=rearranged_col_list).rename_axis(None, axis=1)

# Display the final table
df_final_pivot

Unnamed: 0,crawling_date,origin_city_search_term,origin_city_id,latitude_origin_city,longitude_origin_city,arrival_city_search_term,arrival_city_id,latitude_arrival_city,longitude_arrival_city,route_competition_status,...,transavia,czech_airlines,aer_lingus,icelandair,air_india,bulgarian_air_charter,blue_islands,china_eastern,ethiopian_airlines,citizenplane
0,2023-06-04,Aalborg,AAL,57.093056,9.850000,Kaunas,KUN,54.963919,24.084778,Monopoly always,...,,,,,,,,,,
1,2023-06-04,Aalborg,AAL,57.093056,9.850000,London (GB),LOND,51.504117,-0.094347,Monopoly not always,...,,,,,,,,,,
2,2023-06-04,Aalborg,AAL,57.093056,9.850000,Stockholm,STOC,59.329474,18.062640,Monopoly always,...,,,,,,,,,,
3,2023-06-04,Aarhus,AAR,56.310278,10.618056,Faro (PT),FAO,46.915556,-96.814722,Monopoly always,...,,,,,,,,,,
4,2023-06-04,Aarhus,AAR,56.310278,10.618056,Gdansk,GDN,54.376558,18.470078,Monopoly always,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5099,2023-06-04,Zaragoza,ZAZ,41.663423,-1.011086,Palma de Mallorca,PMI,18.729167,-91.656944,2+ comps exc Wizz,...,,,,,,,,,,
5100,2023-06-04,Zaragoza,ZAZ,41.663423,-1.011086,Paris (FR),PARI,48.856622,2.342876,1 comp No Wizz,...,,,,,,,,,,
5101,2023-06-04,Zaragoza,ZAZ,41.663423,-1.011086,Santiago de Compostela,SCQ,42.900000,-8.416800,Monopoly always,...,,,,,,,,,,
5102,2023-06-04,Zaragoza,ZAZ,41.663423,-1.011086,Venice (IT),VENI,45.466510,12.345653,1 comp No Wizz,...,,,,,,,,,,


In [282]:
# Upload the data to BigQuery
# The service-account key file is read from the current user's home directory
credentials = service_account.Credentials.from_service_account_file(
    filename=os.path.join(os.path.expanduser("~"), "bq_credentials.json"),
    scopes=["https://www.googleapis.com/auth/cloud-platform"]
)
client = bigquery.Client(project="web-scraping-371310", credentials=credentials)

# WRITE_TRUNCATE: each run replaces the contents of the destination table
job_config_raw = bigquery.LoadJobConfig()
job_config_raw.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

job_config_pivot = bigquery.LoadJobConfig()
job_config_pivot.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

# Upload the raw data (uses its own job config; previously the pivot config was passed here
# and job_config_raw was never used).
# Note: df_crawled_routes is the frame as left by the earlier cells, i.e. already restricted
# to the relevant columns, sorted and de-duplicated.
client.load_table_from_dataframe(
    dataframe=df_crawled_routes,
    destination="web-scraping-371310.crawled_datasets.benoit_flight_route_data_raw",
    job_config=job_config_raw
).result()

# Upload the pivoted table; .result() blocks until the load job has finished
client.load_table_from_dataframe(
    dataframe=df_final_pivot,
    destination="web-scraping-371310.crawled_datasets.benoit_flight_route_data_pivoted",
    job_config=job_config_pivot
).result()

LoadJob<project=web-scraping-371310, location=EU, id=40a8d253-e5ab-48a0-95d6-eaa62e972e15>