### For this project, the flight price dataset is available from specific US and Canada airports to specific US airports.
### The data is available from March 2025 to 10th February, 2026. 
### The data is retrieved using some of the API endpoints of Goflightlab.

# Data cleaning strategy
### Save the selected airport's details, extracted from the airport_data file into a csv, after cleaning the dataset.
### Save the required airlines information, extracted from the airlines_list file into a csv, after cleaning the dataset.
### Save the flight price dataset in a csv, after necessary cleaning.

In [1]:
# import dependencies
# Import the pandas library for data manipulation and analysis
import pandas as pd
# Import the glob module to find all file paths matching a pattern
import glob

# 1. Import Canada and US airports dataset. 


### Concatenate Canada and US airports information. From the concatenated dataset, retrieve only the required airports. They are
#### CANADAN AIRPORTS  --- 1.Toronto Island(YTZ)      2.Ottawa International(YOW)       3.Toronto Pearson International(YYZ)                       
#### 4.Montreal Pierre Elliott Trudeau (YUL)                                                                        
#### US AIRPORTS --- 1.Dallas Fort Worth International(DFW)  2.Denver International(DEN)   3.Atlanta Hartsfield-Jackson(ATL)  4.Chicago O'Hare International(ORD)
#### 5.Los Angeles International(LAX)

In [2]:
# Read in the Canada airport data
canada_airport_data = pd.read_csv("Resources/raw_data/airport_data/airport_data_CA.csv")
canada_airport_data.head()

Unnamed: 0,Name,IATA Code,ICAO Code,Country,City,City Code,Latitude,Longitude,Timezone,Departures
0,Abbotsford International Airport,YXX,CYXX,CA,Abbotsford,YVR,49.02529,-122.37735,America/Vancouver,941.0
1,Aklavik/Freddie Carmichael Airport,LAK,CYKD,CA,Aklavik,LAK,68.22333,-135.00583,America/Yellowknife,348.0
2,Akulivik Airport,AKV,CYKO,CA,Akulivik,AKV,60.81861,-78.14861,America/Toronto,270.0
3,Alberni Valley Regional Airport,YPB,,CA,Port Alberni,YPB,49.31933,-124.92979,America/Vancouver,
4,Alert Bay Airport,YAL,CYAL,CA,Alert Bay,,50.5822,-126.916,America/Vancouver,


In [3]:
# Read in the US airport data
us_airport_data = pd.read_csv("Resources/raw_data/airport_data/airport_data_US.csv")
us_airport_data.head()

Unnamed: 0,Name,IATA Code,ICAO Code,Country,City,City Code,Latitude,Longitude,Timezone,Departures
0,A L Mangham Jr. Regional Airport,OCH,KOCH,US,Nacogdoches,OCH,31.57788,-94.70668,America/Chicago,
1,A P Hill Army Airfield (Fort A P Hill),APH,KAPH,US,Bowling Green,,38.06968,-77.31809,America/New_York,
2,Aban Uqua Airportse,QKA,KQKA,US,KXXQO City,,-14.735092,-91.112751,Etc/GMT+6,
3,Abbeville Chris Crusta Memorial Airport,,KIYA,US,New Iberia,ARA,29.97161,-92.0847,America/Chicago,
4,Aberdeen Regional Airport,ABR,KABR,US,Aberdeen,ABR,45.4497,-98.42148,America/Chicago,374.0


In [4]:
# concat canada and us airport data
merged_canada_us_airport_data = pd.concat([canada_airport_data, us_airport_data])
merged_canada_us_airport_data.head()

Unnamed: 0,Name,IATA Code,ICAO Code,Country,City,City Code,Latitude,Longitude,Timezone,Departures
0,Abbotsford International Airport,YXX,CYXX,CA,Abbotsford,YVR,49.02529,-122.37735,America/Vancouver,941.0
1,Aklavik/Freddie Carmichael Airport,LAK,CYKD,CA,Aklavik,LAK,68.22333,-135.00583,America/Yellowknife,348.0
2,Akulivik Airport,AKV,CYKO,CA,Akulivik,AKV,60.81861,-78.14861,America/Toronto,270.0
3,Alberni Valley Regional Airport,YPB,,CA,Port Alberni,YPB,49.31933,-124.92979,America/Vancouver,
4,Alert Bay Airport,YAL,CYAL,CA,Alert Bay,,50.5822,-126.916,America/Vancouver,


##### Filtering the merged airport data to get the required Canadian and US airport

In [5]:
# Filter the merged Canada-US airport dataset to include only the busiest airports based on their IATA codes.
can_us_busiest_airport = merged_canada_us_airport_data.loc[merged_canada_us_airport_data['IATA Code'].isin(["YUL", "YYZ", "YTZ", "YOW", "YKF", "ATL", "DFW", "DEN", "ORD","LAX"])]

#Reset the index of the filtered DataFrame to ensure it starts from 0 and remove the old index
filtered_airports = can_us_busiest_airport.reset_index(drop=True) 

# Display the filtered DataFrame
filtered_airports


Unnamed: 0,Name,IATA Code,ICAO Code,Country,City,City Code,Latitude,Longitude,Timezone,Departures
0,Billy Bishop Toronto City Airport,YTZ,CYTZ,CA,Toronto,YTO,43.62974,-79.39828,America/Toronto,752.0
1,Kitchener/Waterloo Airport,YKF,CYKF,CA,Breslau,YKF,43.45747,-80.38593,America/Toronto,454.0
2,Montreal-Pierre Elliott Trudeau International ...,YUL,CYUL,CA,Montreal,YMQ,45.46106,-73.75019,America/Toronto,35190.0
3,Ottawa Macdonald-Cartier International Airport,YOW,CYOW,CA,Ottawa,YOW,45.3225,-75.66917,America/Toronto,10293.0
4,Toronto Pearson International Airport,YYZ,CYYZ,CA,Toronto,YTO,43.68066,-79.61286,America/Toronto,72355.0
5,Chicago O'Hare International Airport,ORD,KORD,US,Chicago,CHI,41.97959,-87.90446,America/Chicago,177167.0
6,Dallas/Fort Worth International Airport,DFW,KDFW,US,Dallas-Ft Worth,DFW,32.89595,-97.0372,America/Chicago,172447.0
7,Denver International Airport,DEN,KDEN,US,Denver,DEN,39.85891,-104.67326,America/Denver,100565.0
8,Hartsfield-Jackson Atlanta International Airport,ATL,KATL,US,Atlanta,ATL,33.64099,-84.42265,America/New_York,168164.0
9,Los Angeles International Airport,LAX,KLAX,US,Los Angeles,LAX,33.94251,-118.40897,America/Los_Angeles,112571.0


##### Dropping unwanted columns

In [6]:
# Define a list of columns to drop from the DataFrame
column_to_drop = ["ICAO Code","City Code"]

# Drop the specified columns from the filtered_airports DataFrame. The "errors='ignore'" parameter ensures that if the columns are not found, no error is raised
filtered_airports = filtered_airports.drop(columns=column_to_drop, errors="ignore")

# Reset the index of the DataFrame to maintain sequential ordering. The "inplace=True" parameter modifies the DataFrame directly without creating a copy
filtered_airports.reset_index(drop=True, inplace=True)
filtered_airports

Unnamed: 0,Name,IATA Code,Country,City,Latitude,Longitude,Timezone,Departures
0,Billy Bishop Toronto City Airport,YTZ,CA,Toronto,43.62974,-79.39828,America/Toronto,752.0
1,Kitchener/Waterloo Airport,YKF,CA,Breslau,43.45747,-80.38593,America/Toronto,454.0
2,Montreal-Pierre Elliott Trudeau International ...,YUL,CA,Montreal,45.46106,-73.75019,America/Toronto,35190.0
3,Ottawa Macdonald-Cartier International Airport,YOW,CA,Ottawa,45.3225,-75.66917,America/Toronto,10293.0
4,Toronto Pearson International Airport,YYZ,CA,Toronto,43.68066,-79.61286,America/Toronto,72355.0
5,Chicago O'Hare International Airport,ORD,US,Chicago,41.97959,-87.90446,America/Chicago,177167.0
6,Dallas/Fort Worth International Airport,DFW,US,Dallas-Ft Worth,32.89595,-97.0372,America/Chicago,172447.0
7,Denver International Airport,DEN,US,Denver,39.85891,-104.67326,America/Denver,100565.0
8,Hartsfield-Jackson Atlanta International Airport,ATL,US,Atlanta,33.64099,-84.42265,America/New_York,168164.0
9,Los Angeles International Airport,LAX,US,Los Angeles,33.94251,-118.40897,America/Los_Angeles,112571.0


##### Update TimeZone column

In [7]:
# Replace specific timezone values in the 'Timezone' column with corresponding standard time zone abbreviations
filtered_airports['Timezone'] = filtered_airports['Timezone'].replace({'America/Toronto': 'EST',
                                                                       'America/Chicago': 'CST',
                                                                       'America/Denver': 'MST',
                                                                       'America/New_York':'EST', 
                                                                        'America/Los_Angeles':'PST'                                                                           
                                                                       })

##### Save the cleaned airportas extracted_airports.csv

In [8]:
filtered_airports.to_csv("extracted_airports.csv", index=False)

# 2. Import flight price dataset 
##### Loop through each month's flight price data to get a year  dataset.
##### Convert the departure_time and arrival_time fields to datetime format. 
##### Split the departure_time to Departure Date and Departure Time and  arrival_time to Arrival Date and Arrival Time.
##### Drop unwanted columns.
##### Retrieve only the required airports dataset.
##### Confirm that only the required airports data is retrieved.
##### Find the unique airline names from marketing_airline and operating_airline columns.


In [9]:
# Define the pattern to match all the flight price CSV files
file_pattern = "Resources/raw_data/flight_price_data/flight_data_full_*.csv"

In [10]:
# Get a list of matching CSV file names
csv_files = glob.glob(file_pattern)
csv_files

['Resources/raw_data/flight_price_data\\flight_data_full_April.csv',
 'Resources/raw_data/flight_price_data\\flight_data_full_August.csv',
 'Resources/raw_data/flight_price_data\\flight_data_full_December.csv',
 'Resources/raw_data/flight_price_data\\flight_data_full_February2026.csv',
 'Resources/raw_data/flight_price_data\\flight_data_full_January2026.csv',
 'Resources/raw_data/flight_price_data\\flight_data_full_July.csv',
 'Resources/raw_data/flight_price_data\\flight_data_full_June.csv',
 'Resources/raw_data/flight_price_data\\flight_data_full_March.csv',
 'Resources/raw_data/flight_price_data\\flight_data_full_May.csv',
 'Resources/raw_data/flight_price_data\\flight_data_full_November.csv',
 'Resources/raw_data/flight_price_data\\flight_data_full_October.csv',
 'Resources/raw_data/flight_price_data\\flight_data_full_September.csv']

In [11]:

# Create an empty list to store dataframes
flight_price_full_year = []
# Loop through each file and read it into a dataframe
for file in csv_files:
    # Read CSV
    flight_price_df = pd.read_csv(file)  

    # Append dataframe to list
    flight_price_full_year.append(flight_price_df) 

In [12]:
# Combine all dataframes in the list into a single DataFrame
flight_price_full_year_df = pd.concat(flight_price_full_year, ignore_index=True)
flight_price_full_year_df

Unnamed: 0,date,itinerary_id,cabin_class,sort_by,price_raw,price_formatted,currency,flight_number,origin_airport,origin_city,...,arrival_time,duration_minutes,stop_count,marketing_airline,operating_airline,change_allowed,cancellation_allowed,is_self_transfer,has_flexible_options,score
0,2025-04-01,18467-2504011020--31679-1-10968-2504011436,economy,fastest,476.60,C$477,,5545,Toronto Pearson International,,...,2025-04-01T11:39:00,79,1,WestJet,SkyWest DBA Delta Connection,False,False,False,False,0.999000
1,2025-04-01,18467-2504011020--31679-1-10968-2504011436,economy,fastest,476.60,C$477,,8400,Detroit Wayne County,,...,2025-04-01T14:36:00,176,1,WestJet,Delta,False,False,False,False,0.999000
2,2025-04-01,18467-2504010600--31679-1-10968-2504011037,economy,fastest,476.60,C$477,,6342,Toronto Pearson International,,...,2025-04-01T07:32:00,92,1,WestJet,Endeavor Air DBA Delta Connection,False,False,False,False,0.611641
3,2025-04-01,18467-2504010600--31679-1-10968-2504011037,economy,fastest,476.60,C$477,,7021,Detroit Wayne County,,...,2025-04-01T10:37:00,186,1,WestJet,Delta,False,False,False,False,0.611641
4,2025-04-01,18467-2504011320--31679-1-10968-2504011758,economy,fastest,476.60,C$477,,6343,Toronto Pearson International,,...,2025-04-01T14:36:00,76,1,WestJet,Endeavor Air DBA Delta Connection,False,False,False,False,0.627755
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22124,2025-09-30,18467-2509301540--32573-1-10968-2509302150,first,fastest,581.49,C$582,,3108,Charlotte Douglas,,...,2025-09-30T21:50:00,155,1,American Airlines,American Airlines,False,False,False,False,0.352783
22125,2025-09-30,18467-2509300725--32573-1-10968-2509301342,first,fastest,581.49,C$582,,4559,Toronto Pearson International,,...,2025-09-30T09:10:00,105,1,American Airlines,Republic Airways AS American Eagle,False,False,False,False,0.380391
22126,2025-09-30,18467-2509300725--32573-1-10968-2509301342,first,fastest,581.49,C$582,,2697,New York John F. Kennedy,,...,2025-09-30T13:42:00,232,1,American Airlines,American Airlines,False,False,False,False,0.380391
22127,2025-09-30,18467-2509301130--32573-1-10968-2509301841,first,fastest,581.49,C$582,,3604,Toronto Pearson International,,...,2025-09-30T12:25:00,115,1,American Airlines,Envoy Air As American Eagle,False,False,False,False,0.362377


##### Splitting the departure_time and arrival_time columns into two columns departure_date/arrival_date and departure_time/arrival_time respectively.

In [13]:
# Convert arrival_time to datetime format
flight_price_full_year_df['departure_time'] = pd.to_datetime(flight_price_full_year_df['departure_time'])
    # Extract date and time into separate columns arrival_date and arrival_time
flight_price_full_year_df['Departure Date'] = flight_price_full_year_df['departure_time'].dt.date
flight_price_full_year_df['Departure Time'] = flight_price_full_year_df['departure_time'].dt.time
  
    #splitting the arrival_time column into two columns arrival_date and arrival_time_only
    # Convert arrival_time to datetime format
flight_price_full_year_df['arrival_time'] = pd.to_datetime(flight_price_full_year_df['arrival_time'])

    # Extract date and time into separate columns arrival_date and arrival_time
flight_price_full_year_df['Arrival Date'] = flight_price_full_year_df['arrival_time'].dt.date
flight_price_full_year_df['Arrival Time'] = flight_price_full_year_df['arrival_time'].dt.time


##### Remove unwanted columns from flight price dataset.

In [14]:
columns_to_drop = [
        "sort_by", "price_raw", "origin_city", "origin_country", "currency",
        "destination_city", "destination_country", "change_allowed", 
        "cancellation_allowed", "has_flexible_options","date","departure_time", "arrival_time"
    ]
flight_price_full_year_df = flight_price_full_year_df.drop(columns=columns_to_drop, errors="ignore")
flight_price_full_year_df.reset_index(drop=True, inplace=True)
flight_price_full_year_df

Unnamed: 0,itinerary_id,cabin_class,price_formatted,flight_number,origin_airport,destination_airport,duration_minutes,stop_count,marketing_airline,operating_airline,is_self_transfer,score,Departure Date,Departure Time,Arrival Date,Arrival Time
0,18467-2504011020--31679-1-10968-2504011436,economy,C$477,5545,Toronto Pearson International,Detroit Wayne County,79,1,WestJet,SkyWest DBA Delta Connection,False,0.999000,2025-04-01,10:20:00,2025-04-01,11:39:00
1,18467-2504011020--31679-1-10968-2504011436,economy,C$477,8400,Detroit Wayne County,Dallas Fort Worth International,176,1,WestJet,Delta,False,0.999000,2025-04-01,12:40:00,2025-04-01,14:36:00
2,18467-2504010600--31679-1-10968-2504011037,economy,C$477,6342,Toronto Pearson International,Detroit Wayne County,92,1,WestJet,Endeavor Air DBA Delta Connection,False,0.611641,2025-04-01,06:00:00,2025-04-01,07:32:00
3,18467-2504010600--31679-1-10968-2504011037,economy,C$477,7021,Detroit Wayne County,Dallas Fort Worth International,186,1,WestJet,Delta,False,0.611641,2025-04-01,08:31:00,2025-04-01,10:37:00
4,18467-2504011320--31679-1-10968-2504011758,economy,C$477,6343,Toronto Pearson International,Detroit Wayne County,76,1,WestJet,Endeavor Air DBA Delta Connection,False,0.627755,2025-04-01,13:20:00,2025-04-01,14:36:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22124,18467-2509301540--32573-1-10968-2509302150,first,C$582,3108,Charlotte Douglas,Dallas Fort Worth International,155,1,American Airlines,American Airlines,False,0.352783,2025-09-30,20:15:00,2025-09-30,21:50:00
22125,18467-2509300725--32573-1-10968-2509301342,first,C$582,4559,Toronto Pearson International,New York John F. Kennedy,105,1,American Airlines,Republic Airways AS American Eagle,False,0.380391,2025-09-30,07:25:00,2025-09-30,09:10:00
22126,18467-2509300725--32573-1-10968-2509301342,first,C$582,2697,New York John F. Kennedy,Dallas Fort Worth International,232,1,American Airlines,American Airlines,False,0.380391,2025-09-30,10:50:00,2025-09-30,13:42:00
22127,18467-2509301130--32573-1-10968-2509301841,first,C$582,3604,Toronto Pearson International,Chicago O'Hare International,115,1,American Airlines,Envoy Air As American Eagle,False,0.362377,2025-09-30,11:30:00,2025-09-30,12:25:00


##### Retrieve data of selected airports only. 

In [15]:
# Select only specific airports data in the flight_price dataset
selected_airports = {"Toronto Island","Ottawa International",
                     "Toronto Pearson International", "Chicago O'Hare International",
                     "Dallas Fort Worth International","Denver International",
                     "Atlanta Hartsfield-Jackson","Montreal Pierre Elliott Trudeau",
                     "Los Angeles International"
                     }

selected_flight_price_data =  flight_price_full_year_df[ flight_price_full_year_df['origin_airport'].isin(selected_airports) &  flight_price_full_year_df['destination_airport'].isin(selected_airports)]

##### Confirm that all retrieved records are of selected airports.

In [16]:
# Confirming that only those records with required airports were selected 
airports = pd.Series(list(set(selected_flight_price_data['origin_airport']) | set(selected_flight_price_data['destination_airport'])))  # Union of both
airports

0         Atlanta Hartsfield-Jackson
1    Dallas Fort Worth International
2      Toronto Pearson International
3                     Toronto Island
4       Chicago O'Hare International
5               Denver International
6          Los Angeles International
7    Montreal Pierre Elliott Trudeau
8               Ottawa International
dtype: object

##### Retrieve unique airline names from marketing_airline and operating_airline columns of flight price data

In [17]:
# retrieve unique airlines from marketing airline and operating airline fields of the required data
unique_airlines = pd.Series(list(set(selected_flight_price_data['operating_airline']) | set(selected_flight_price_data['marketing_airline'])))  # Union of both
unique_airlines

0             Envoy Air As American Eagle
1               Air Canada Express - Jazz
2              SkyWest DBA United Express
3                                 WestJet
4                         Alaska Airlines
5                         Spirit Airlines
6                       American Airlines
7                     Porter Airlines Inc
8            Porter Airlines (Canada) Ltd
9                       Frontier Airlines
10                                  Delta
11                      GOL Linhas Aéreas
12                             Air Canada
13    Republic Airways DBA United Express
14                                 United
15     Republic Airways AS American Eagle
16                              Lufthansa
dtype: object

# 3. Import Airlines dataset

In [18]:
# Read in the  Airline dataset
airlines_data = pd.read_csv("Resources/raw_data/airlines_list_data.csv")
airlines_data.head()

Unnamed: 0,Name,country_code,iata_code,iata_prefix,iata_accounting,callsign,is international,iosa_registered,iosa_expiry,is_passenger,...,is_scheduled,total_aircrafts,average_fleet_age,accidents_last_5y,crashes_last_5y,website,twitter,facebook,instagram,linkedin
0,10 Tanker Air Carrier,US,,,,,,,,,...,,,,,,,,,,
1,135 Airways,US,,,,GENERAL,,,,,...,,,,,,,,,,
2,1903 Aviation,SE,,,,HIGHSCORE,,,,1.0,...,,,,,,,,,,
3,Air 1st Aviation Companies of Oklahoma,US,,,,ROUGHRIDER,,,,,...,,,,,,,,,,
4,"2 Sqn, No 1 Elementary Flying Training School",UK,,,,WYTON,,,,,...,,,,,,,,,,


##### Retrieve only required airline names (as per the unique airline names of flight price dataset) from airline dataset.

In [19]:
# Get only the required airline names from airline dataset
available_airlines = airlines_data.loc[airlines_data['iata_code'].isin(["OO", "MQ", "F9", "DL", "YX", "LH", "PD", "WS", "AA","UA","QK","AS","AC","NK","G3","P3"])]
airlines_info = available_airlines.reset_index(drop=True) 
airlines_info

Unnamed: 0,Name,country_code,iata_code,iata_prefix,iata_accounting,callsign,is international,iosa_registered,iosa_expiry,is_passenger,...,is_scheduled,total_aircrafts,average_fleet_age,accidents_last_5y,crashes_last_5y,website,twitter,facebook,instagram,linkedin
0,Alaska Airlines,US,AS,27.0,27.0,ALASKA,,1.0,2025-01-16T00:00:00.000Z,1.0,...,1.0,158.0,8.1,1.0,0.0,,,facebook.com/alaskaairlines,instagram.com/alaskaair,
1,American Airlines,US,AA,1.0,1.0,AMERICAN,,1.0,2025-07-29T00:00:00.000Z,1.0,...,1.0,684.0,10.2,26.0,0.0,,,facebook.com/aa,instagram.com/americanair,linkedin.com/company/american-airlines
2,Air Canada,CA,AC,14.0,14.0,AIRCANADA,,1.0,2024-04-18T00:00:00.000Z,1.0,...,1.0,98.0,11.7,0.0,0.0,,,facebook.com/aircanada,instagram.com/_u/aircanada,
3,Delta Air Lines,US,DL,6.0,6.0,DELTA,,1.0,2024-10-17T00:00:00.000Z,1.0,...,1.0,591.0,12.5,22.0,0.0,,,facebook.com/delta,instagram.com/delta/,linkedin.com/company/delta-air-lines
4,Envoy Air,US,MQ,93.0,93.0,ENVOY,,1.0,2025-11-18T00:00:00.000Z,1.0,...,1.0,161.0,9.6,0.0,0.0,,,facebook.com/envoyaircareers,instagram.com/envoyaircareers/,linkedin.com/company/envoyair
5,Frontier Airlines,US,F9,422.0,422.0,FRONTIERFLIGHT,,1.0,2024-01-03T00:00:00.000Z,1.0,...,1.0,83.0,3.0,4.0,0.0,,,,,
6,GOL Linhas Aereas Inteligentes,BR,G3,127.0,127.0,GOLTRANSPORTE,,1.0,,1.0,...,1.0,103.0,10.7,0.0,0.0,,,,instagram.com/voegoloficial,linkedin.com/company/gol
7,Jazz Aviation,CA,QK,,,JAZZ,,1.0,2024-10-21T00:00:00.000Z,1.0,...,1.0,85.0,13.8,3.0,0.0,,,facebook.com/JobsAtJazz,instagram.com/jazzaviationlp/,linkedin.com/company/jazz-aviation-lp/
8,Lufthansa Cargo,DE,LH,20.0,20.0,LUFTHANSACARGO,,1.0,2026-02-02T00:00:00.000Z,,...,1.0,15.0,10.7,0.0,0.0,,,facebook.com/lufthansacargoag,instagram.com/lufthansacargoag/,linkedin.com/company/lufthansa-cargo
9,Lufthansa,DE,LH,220.0,220.0,LUFTHANSA,,1.0,,1.0,...,1.0,131.0,8.9,1.0,0.0,,,facebook.com/Lufthansa,instagram.com/lufthansa/,


In [20]:
# Fetch airline column names
airlines_info.columns

Index(['Name', 'country_code', 'iata_code', 'iata_prefix', 'iata_accounting',
       'callsign', 'is international', 'iosa_registered', 'iosa_expiry',
       'is_passenger', 'is_cargo', 'is_scheduled', 'total_aircrafts',
       'average_fleet_age', 'accidents_last_5y', 'crashes_last_5y', 'website',
       'twitter', 'facebook', 'instagram', 'linkedin'],
      dtype='object')

##### Remove unwanted columns

In [21]:
fields_to_drop = [
       'iata_prefix', 'iata_accounting',
       'callsign', 'is international','is_passenger', 'is_cargo', 'website',
       'twitter', 'facebook', 'instagram', 'linkedin','crashes_last_5y'
    ]
wanted_airlines_fields = airlines_info.drop(columns=fields_to_drop, errors="ignore")
wanted_airlines_fields.reset_index(drop=True, inplace=True)
        

In [22]:
wanted_airlines_fields

Unnamed: 0,Name,country_code,iata_code,iosa_registered,iosa_expiry,is_scheduled,total_aircrafts,average_fleet_age,accidents_last_5y
0,Alaska Airlines,US,AS,1.0,2025-01-16T00:00:00.000Z,1.0,158.0,8.1,1.0
1,American Airlines,US,AA,1.0,2025-07-29T00:00:00.000Z,1.0,684.0,10.2,26.0
2,Air Canada,CA,AC,1.0,2024-04-18T00:00:00.000Z,1.0,98.0,11.7,0.0
3,Delta Air Lines,US,DL,1.0,2024-10-17T00:00:00.000Z,1.0,591.0,12.5,22.0
4,Envoy Air,US,MQ,1.0,2025-11-18T00:00:00.000Z,1.0,161.0,9.6,0.0
5,Frontier Airlines,US,F9,1.0,2024-01-03T00:00:00.000Z,1.0,83.0,3.0,4.0
6,GOL Linhas Aereas Inteligentes,BR,G3,1.0,,1.0,103.0,10.7,0.0
7,Jazz Aviation,CA,QK,1.0,2024-10-21T00:00:00.000Z,1.0,85.0,13.8,3.0
8,Lufthansa Cargo,DE,LH,1.0,2026-02-02T00:00:00.000Z,1.0,15.0,10.7,0.0
9,Lufthansa,DE,LH,1.0,,1.0,131.0,8.9,1.0


##### Lufthansa and  Lufthansa Cargo have the same iata code.We are working on passenger flights. Therefore, remove Lufthansa Cargo from the list

In [23]:
# Delete the row where the airline name is Lufthansa Cargo
wanted_airlines = wanted_airlines_fields[wanted_airlines_fields['Name'] != 'Lufthansa Cargo']
wanted_airlines

Unnamed: 0,Name,country_code,iata_code,iosa_registered,iosa_expiry,is_scheduled,total_aircrafts,average_fleet_age,accidents_last_5y
0,Alaska Airlines,US,AS,1.0,2025-01-16T00:00:00.000Z,1.0,158.0,8.1,1.0
1,American Airlines,US,AA,1.0,2025-07-29T00:00:00.000Z,1.0,684.0,10.2,26.0
2,Air Canada,CA,AC,1.0,2024-04-18T00:00:00.000Z,1.0,98.0,11.7,0.0
3,Delta Air Lines,US,DL,1.0,2024-10-17T00:00:00.000Z,1.0,591.0,12.5,22.0
4,Envoy Air,US,MQ,1.0,2025-11-18T00:00:00.000Z,1.0,161.0,9.6,0.0
5,Frontier Airlines,US,F9,1.0,2024-01-03T00:00:00.000Z,1.0,83.0,3.0,4.0
6,GOL Linhas Aereas Inteligentes,BR,G3,1.0,,1.0,103.0,10.7,0.0
7,Jazz Aviation,CA,QK,1.0,2024-10-21T00:00:00.000Z,1.0,85.0,13.8,3.0
9,Lufthansa,DE,LH,1.0,,1.0,131.0,8.9,1.0
10,Porter Airlines Inc,CA,P3,,,1.0,,,


In [24]:
wanted_airlines = wanted_airlines.rename(columns={'Name':'airline_name','iata_code':'iata', 'is_scheduled':'is_airline_passenger'})
wanted_airlines

Unnamed: 0,airline_name,country_code,iata,iosa_registered,iosa_expiry,is_airline_passenger,total_aircrafts,average_fleet_age,accidents_last_5y
0,Alaska Airlines,US,AS,1.0,2025-01-16T00:00:00.000Z,1.0,158.0,8.1,1.0
1,American Airlines,US,AA,1.0,2025-07-29T00:00:00.000Z,1.0,684.0,10.2,26.0
2,Air Canada,CA,AC,1.0,2024-04-18T00:00:00.000Z,1.0,98.0,11.7,0.0
3,Delta Air Lines,US,DL,1.0,2024-10-17T00:00:00.000Z,1.0,591.0,12.5,22.0
4,Envoy Air,US,MQ,1.0,2025-11-18T00:00:00.000Z,1.0,161.0,9.6,0.0
5,Frontier Airlines,US,F9,1.0,2024-01-03T00:00:00.000Z,1.0,83.0,3.0,4.0
6,GOL Linhas Aereas Inteligentes,BR,G3,1.0,,1.0,103.0,10.7,0.0
7,Jazz Aviation,CA,QK,1.0,2024-10-21T00:00:00.000Z,1.0,85.0,13.8,3.0
9,Lufthansa,DE,LH,1.0,,1.0,131.0,8.9,1.0
10,Porter Airlines Inc,CA,P3,,,1.0,,,


In [25]:
# Add airline_id as an index-based unique identifier
wanted_airlines.loc[:,"airline_id"] = range(1, len(wanted_airlines) + 1)
wanted_airlines.columns

Index(['airline_name', 'country_code', 'iata', 'iosa_registered',
       'iosa_expiry', 'is_airline_passenger', 'total_aircrafts',
       'average_fleet_age', 'accidents_last_5y', 'airline_id'],
      dtype='object')

In [26]:
# Reorder the columns
wanted_airlines=wanted_airlines[['airline_id', 'airline_name','iata','country_code','is_airline_passenger','total_aircrafts','average_fleet_age','accidents_last_5y','iosa_registered','iosa_expiry']]
wanted_airlines

Unnamed: 0,airline_id,airline_name,iata,country_code,is_airline_passenger,total_aircrafts,average_fleet_age,accidents_last_5y,iosa_registered,iosa_expiry
0,1,Alaska Airlines,AS,US,1.0,158.0,8.1,1.0,1.0,2025-01-16T00:00:00.000Z
1,2,American Airlines,AA,US,1.0,684.0,10.2,26.0,1.0,2025-07-29T00:00:00.000Z
2,3,Air Canada,AC,CA,1.0,98.0,11.7,0.0,1.0,2024-04-18T00:00:00.000Z
3,4,Delta Air Lines,DL,US,1.0,591.0,12.5,22.0,1.0,2024-10-17T00:00:00.000Z
4,5,Envoy Air,MQ,US,1.0,161.0,9.6,0.0,1.0,2025-11-18T00:00:00.000Z
5,6,Frontier Airlines,F9,US,1.0,83.0,3.0,4.0,1.0,2024-01-03T00:00:00.000Z
6,7,GOL Linhas Aereas Inteligentes,G3,BR,1.0,103.0,10.7,0.0,1.0,
7,8,Jazz Aviation,QK,CA,1.0,85.0,13.8,3.0,1.0,2024-10-21T00:00:00.000Z
9,9,Lufthansa,LH,DE,1.0,131.0,8.9,1.0,1.0,
10,10,Porter Airlines Inc,P3,CA,1.0,,,,,


In [27]:
# Save the airline file
wanted_airlines.to_csv("extracted_airlines.csv", index=False)

In [28]:
# Create a mapping dictionary based on the airlines dataset
airline_name_mapping = {
"WestJet" : "Westjet",
"Porter Airlines (Canada) Ltd": "Porter Airlines",
"Delta": "Delta Air Lines" ,
"American Airlines": "American Airlines",
"United":"United Airlines",
"Spirit Airlines": "Spirit Airlines",
"Air Canada": "Air Canada",
"Alaska Airlines":"Alaska Airlines",
"GOL Linhas AÃ©reas": "GOL Linhas Aereas Inteligentes",
"Frontier Airlines": "Frontier Airlines",
"Lufthansa":" Lufthansa",
"Porter Airlines Inc": " Porter Airlines Inc",
"Envoy Air As American Eagle": "Envoy Air",
"Republic Airways AS American Eagle": "American Eagle",
"Air Canada Express - Jazz": "Jazz Aviation",
"Republic Airways DBA United Express": "Republic Airline United Express",
"SkyWest DBA United Express": "SkyWest Delta Connection"
}

selected_flight_price_data.loc[:, ['marketing_airline', 'operating_airline']] = selected_flight_price_data[['marketing_airline', 'operating_airline']].replace(airline_name_mapping).reset_index()

selected_flight_price_data

Unnamed: 0,itinerary_id,cabin_class,price_formatted,flight_number,origin_airport,destination_airport,duration_minutes,stop_count,marketing_airline,operating_airline,is_self_transfer,score,Departure Date,Departure Time,Arrival Date,Arrival Time
8,18467-2504010745--31679-1-10968-2504011249,economy,C$477,6340,Toronto Pearson International,Atlanta Hartsfield-Jackson,152,1,American Airlines,American Airlines,False,0.544887,2025-04-01,07:45:00,2025-04-01,10:17:00
9,18467-2504010745--31679-1-10968-2504011249,economy,C$477,6482,Atlanta Hartsfield-Jackson,Dallas Fort Worth International,149,1,American Airlines,American Airlines,False,0.544887,2025-04-01,11:20:00,2025-04-01,12:49:00
16,"18390-2504010700--31954,-31825-2-10968-2504011735",economy,C$253,2205,Toronto Island,Ottawa International,59,2,American Airlines,Envoy Air,True,0.462699,2025-04-01,07:00:00,2025-04-01,07:59:00
29,18467-2504010615--32385-1-10910-2504011111,premium_economy,C$581,2662,Toronto Pearson International,Atlanta Hartsfield-Jackson,154,1,American Airlines,American Airlines,False,0.576769,2025-04-01,06:15:00,2025-04-01,08:49:00
31,18467-2504010745--32385-1-10968-2504011249,premium_economy,C$581,2988,Toronto Pearson International,Atlanta Hartsfield-Jackson,152,1,United Airlines,Air Canada,False,0.565300,2025-04-01,07:45:00,2025-04-01,10:17:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22112,18467-2509300820--31722-0-10968-2509301042,first,C$846,8169,Toronto Pearson International,Dallas Fort Worth International,202,0,,,False,0.668470,2025-09-30,08:20:00,2025-09-30,10:42:00
22113,18467-2509301310--32573-1-10968-2509301744,first,C$582,3606,Toronto Pearson International,Chicago O'Hare International,115,1,,,False,0.590065,2025-09-30,13:10:00,2025-09-30,14:05:00
22114,18467-2509301310--32573-1-10968-2509301744,first,C$582,2429,Chicago O'Hare International,Dallas Fort Worth International,149,1,,,False,0.590065,2025-09-30,15:15:00,2025-09-30,17:44:00
22127,18467-2509301130--32573-1-10968-2509301841,first,C$582,3604,Toronto Pearson International,Chicago O'Hare International,115,1,,,False,0.362377,2025-09-30,11:30:00,2025-09-30,12:25:00


# Understanding flight_price_full_year_df

In [29]:
# Display first few rows of the combined dataframe
flight_price_full_year_df.head()

Unnamed: 0,itinerary_id,cabin_class,price_formatted,flight_number,origin_airport,destination_airport,duration_minutes,stop_count,marketing_airline,operating_airline,is_self_transfer,score,Departure Date,Departure Time,Arrival Date,Arrival Time
0,18467-2504011020--31679-1-10968-2504011436,economy,C$477,5545,Toronto Pearson International,Detroit Wayne County,79,1,WestJet,SkyWest DBA Delta Connection,False,0.999,2025-04-01,10:20:00,2025-04-01,11:39:00
1,18467-2504011020--31679-1-10968-2504011436,economy,C$477,8400,Detroit Wayne County,Dallas Fort Worth International,176,1,WestJet,Delta,False,0.999,2025-04-01,12:40:00,2025-04-01,14:36:00
2,18467-2504010600--31679-1-10968-2504011037,economy,C$477,6342,Toronto Pearson International,Detroit Wayne County,92,1,WestJet,Endeavor Air DBA Delta Connection,False,0.611641,2025-04-01,06:00:00,2025-04-01,07:32:00
3,18467-2504010600--31679-1-10968-2504011037,economy,C$477,7021,Detroit Wayne County,Dallas Fort Worth International,186,1,WestJet,Delta,False,0.611641,2025-04-01,08:31:00,2025-04-01,10:37:00
4,18467-2504011320--31679-1-10968-2504011758,economy,C$477,6343,Toronto Pearson International,Detroit Wayne County,76,1,WestJet,Endeavor Air DBA Delta Connection,False,0.627755,2025-04-01,13:20:00,2025-04-01,14:36:00


In [30]:
# Retrieve all information about the flight price dataset
flight_price_full_year_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22129 entries, 0 to 22128
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   itinerary_id         22129 non-null  object 
 1   cabin_class          22129 non-null  object 
 2   price_formatted      22129 non-null  object 
 3   flight_number        22129 non-null  int64  
 4   origin_airport       22129 non-null  object 
 5   destination_airport  22129 non-null  object 
 6   duration_minutes     22129 non-null  int64  
 7   stop_count           22129 non-null  int64  
 8   marketing_airline    22129 non-null  object 
 9   operating_airline    22129 non-null  object 
 10  is_self_transfer     22129 non-null  bool   
 11  score                22129 non-null  float64
 12  Departure Date       22129 non-null  object 
 13  Departure Time       22129 non-null  object 
 14  Arrival Date         22129 non-null  object 
 15  Arrival Time         22129 non-null 

In [31]:
flight_price_full_year_df.describe()

Unnamed: 0,flight_number,duration_minutes,stop_count,score
count,22129.0,22129.0,22129.0,22129.0
mean,3137.151973,154.803199,0.984952,0.530657
std,2228.424728,53.138467,0.546797,0.206661
min,12.0,55.0,0.0,0.052416
25%,1115.0,119.0,1.0,0.386031
50%,2714.0,142.0,1.0,0.480256
75%,4980.0,186.0,1.0,0.614521
max,9552.0,1020.0,4.0,0.9998


In [32]:
# Check for missing values
flight_price_full_year_df.isnull().sum()

itinerary_id           0
cabin_class            0
price_formatted        0
flight_number          0
origin_airport         0
destination_airport    0
duration_minutes       0
stop_count             0
marketing_airline      0
operating_airline      0
is_self_transfer       0
score                  0
Departure Date         0
Departure Time         0
Arrival Date           0
Arrival Time           0
dtype: int64

In [33]:
flight_price_full_year_df.drop(columns=['duration_minutes'], inplace=True)

In [34]:
selected_flight_price_data.reset_index()

Unnamed: 0,index,itinerary_id,cabin_class,price_formatted,flight_number,origin_airport,destination_airport,duration_minutes,stop_count,marketing_airline,operating_airline,is_self_transfer,score,Departure Date,Departure Time,Arrival Date,Arrival Time
0,8,18467-2504010745--31679-1-10968-2504011249,economy,C$477,6340,Toronto Pearson International,Atlanta Hartsfield-Jackson,152,1,American Airlines,American Airlines,False,0.544887,2025-04-01,07:45:00,2025-04-01,10:17:00
1,9,18467-2504010745--31679-1-10968-2504011249,economy,C$477,6482,Atlanta Hartsfield-Jackson,Dallas Fort Worth International,149,1,American Airlines,American Airlines,False,0.544887,2025-04-01,11:20:00,2025-04-01,12:49:00
2,16,"18390-2504010700--31954,-31825-2-10968-2504011735",economy,C$253,2205,Toronto Island,Ottawa International,59,2,American Airlines,Envoy Air,True,0.462699,2025-04-01,07:00:00,2025-04-01,07:59:00
3,29,18467-2504010615--32385-1-10910-2504011111,premium_economy,C$581,2662,Toronto Pearson International,Atlanta Hartsfield-Jackson,154,1,American Airlines,American Airlines,False,0.576769,2025-04-01,06:15:00,2025-04-01,08:49:00
4,31,18467-2504010745--32385-1-10968-2504011249,premium_economy,C$581,2988,Toronto Pearson International,Atlanta Hartsfield-Jackson,152,1,United Airlines,Air Canada,False,0.565300,2025-04-01,07:45:00,2025-04-01,10:17:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11122,22112,18467-2509300820--31722-0-10968-2509301042,first,C$846,8169,Toronto Pearson International,Dallas Fort Worth International,202,0,,,False,0.668470,2025-09-30,08:20:00,2025-09-30,10:42:00
11123,22113,18467-2509301310--32573-1-10968-2509301744,first,C$582,3606,Toronto Pearson International,Chicago O'Hare International,115,1,,,False,0.590065,2025-09-30,13:10:00,2025-09-30,14:05:00
11124,22114,18467-2509301310--32573-1-10968-2509301744,first,C$582,2429,Chicago O'Hare International,Dallas Fort Worth International,149,1,,,False,0.590065,2025-09-30,15:15:00,2025-09-30,17:44:00
11125,22127,18467-2509301130--32573-1-10968-2509301841,first,C$582,3604,Toronto Pearson International,Chicago O'Hare International,115,1,,,False,0.362377,2025-09-30,11:30:00,2025-09-30,12:25:00


In [35]:
selected_flight_price_data.to_csv ('extracted_flight_price_data.csv', index = False, header=True)   

In [36]:
unique_itineraries = selected_flight_price_data['itinerary_id'].unique()
unique_itineraries

array(['18467-2504010745--31679-1-10968-2504011249',
       '18390-2504010700--31954,-31825-2-10968-2504011735',
       '18467-2504010615--32385-1-10910-2504011111', ...,
       '18467-2509301635--31722-0-10968-2509301857',
       '18467-2509300820--31722-0-10968-2509301042',
       '18467-2509301130--32573-1-10968-2509301841'], dtype=object)

In [37]:
# Create a DataFrame
itinerary_df = pd.DataFrame({"itineraries": unique_itineraries})
itinerary_df
# Add a new column with sequential labels ('it1', 'it2', ...)
itinerary_df['itinerary_id'] = [f"it{i+1}" for i in range(len(itinerary_df))]
itinerary_df = itinerary_df[['itinerary_id','itineraries']]
itinerary_df

Unnamed: 0,itinerary_id,itineraries
0,it1,18467-2504010745--31679-1-10968-2504011249
1,it2,"18390-2504010700--31954,-31825-2-10968-2504011735"
2,it3,18467-2504010615--32385-1-10910-2504011111
3,it4,18467-2504010745--32385-1-10968-2504011249
4,it5,18467-2504011220--32385-1-10910-2504011753
...,...,...
6322,it6323,18467-2509301320--32385-1-10968-2509301833
6323,it6324,18467-2509301130--32385-1-10968-2509301647
6324,it6325,18467-2509301635--31722-0-10968-2509301857
6325,it6326,18467-2509300820--31722-0-10968-2509301042
