### For this project, the flight price dataset is available from specific US and Canada airports to specific US airports.
### The data is available from March 2025 to 10th February, 2026. 
### The data is retrieved using some of the API endpoints of Goflightlab.

# Data cleaning strategy
### Save the selected airport's details, extracted from the airport_data file into a csv, after cleaning the dataset.
### Save the required airlines information, extracted from the airlines_list file into a csv, after cleaning the dataset.
### Save the flight price dataset in a csv, after necessary cleaning.

In [1]:
# import dependencies
# Import the pandas library for data manipulation and analysis
import pandas as pd
# Import the glob module to find all file paths matching a pattern
import glob

# 1. Import Canada and US airports dataset. 


### Concatenate Canada and US airports information. From the concatenated dataset, retrieve only the required airports. They are
#### CANADAN AIRPORTS  --- 1.Toronto Island(YTZ)      2.Ottawa International(YOW)       3.Toronto Pearson International(YYZ)                       
#### 4.Montreal Pierre Elliott Trudeau (YUL)                                                                        
#### US AIRPORTS --- 1.Dallas Fort Worth International(DFW)  2.Denver International(DEN)   3.Atlanta Hartsfield-Jackson(ATL)  4.Chicago O'Hare International(ORD)
#### 5.Los Angeles International(LAX)

In [2]:
# Read in the Canada airport data
canada_airport_data = pd.read_csv("Resources/raw_data/airport_data/airport_data_CA.csv")
canada_airport_data.head()

Unnamed: 0,Name,IATA Code,ICAO Code,Country,City,City Code,Latitude,Longitude,Timezone,Departures
0,Abbotsford International Airport,YXX,CYXX,CA,Abbotsford,YVR,49.02529,-122.37735,America/Vancouver,941.0
1,Aklavik/Freddie Carmichael Airport,LAK,CYKD,CA,Aklavik,LAK,68.22333,-135.00583,America/Yellowknife,348.0
2,Akulivik Airport,AKV,CYKO,CA,Akulivik,AKV,60.81861,-78.14861,America/Toronto,270.0
3,Alberni Valley Regional Airport,YPB,,CA,Port Alberni,YPB,49.31933,-124.92979,America/Vancouver,
4,Alert Bay Airport,YAL,CYAL,CA,Alert Bay,,50.5822,-126.916,America/Vancouver,


In [3]:
# Read in the US airport data
us_airport_data = pd.read_csv("Resources/raw_data/airport_data/airport_data_US.csv")
us_airport_data.head()

Unnamed: 0,Name,IATA Code,ICAO Code,Country,City,City Code,Latitude,Longitude,Timezone,Departures
0,A L Mangham Jr. Regional Airport,OCH,KOCH,US,Nacogdoches,OCH,31.57788,-94.70668,America/Chicago,
1,A P Hill Army Airfield (Fort A P Hill),APH,KAPH,US,Bowling Green,,38.06968,-77.31809,America/New_York,
2,Aban Uqua Airportse,QKA,KQKA,US,KXXQO City,,-14.735092,-91.112751,Etc/GMT+6,
3,Abbeville Chris Crusta Memorial Airport,,KIYA,US,New Iberia,ARA,29.97161,-92.0847,America/Chicago,
4,Aberdeen Regional Airport,ABR,KABR,US,Aberdeen,ABR,45.4497,-98.42148,America/Chicago,374.0


In [4]:
# concat canada and us airport data
merged_canada_us_airport_data = pd.concat([canada_airport_data, us_airport_data])
merged_canada_us_airport_data.head()

Unnamed: 0,Name,IATA Code,ICAO Code,Country,City,City Code,Latitude,Longitude,Timezone,Departures
0,Abbotsford International Airport,YXX,CYXX,CA,Abbotsford,YVR,49.02529,-122.37735,America/Vancouver,941.0
1,Aklavik/Freddie Carmichael Airport,LAK,CYKD,CA,Aklavik,LAK,68.22333,-135.00583,America/Yellowknife,348.0
2,Akulivik Airport,AKV,CYKO,CA,Akulivik,AKV,60.81861,-78.14861,America/Toronto,270.0
3,Alberni Valley Regional Airport,YPB,,CA,Port Alberni,YPB,49.31933,-124.92979,America/Vancouver,
4,Alert Bay Airport,YAL,CYAL,CA,Alert Bay,,50.5822,-126.916,America/Vancouver,


##### Filter the merged airport data to get the required Canadian and US airportS

In [5]:
# Filter the merged Canada-US airport dataset to include only the busiest airports based on their IATA codes.
can_us_busiest_airport = merged_canada_us_airport_data.loc[merged_canada_us_airport_data['IATA Code'].isin(["YUL", "YYZ", "YTZ", "YOW", "YKF", "ATL", "DFW", "DEN", "ORD","LAX"])]

#Reset the index of the filtered DataFrame to ensure it starts from 0 and remove the old index
airports_df = can_us_busiest_airport.reset_index(drop=True) 

# Display the filtered DataFrame
airports_df.head()


Unnamed: 0,Name,IATA Code,ICAO Code,Country,City,City Code,Latitude,Longitude,Timezone,Departures
0,Billy Bishop Toronto City Airport,YTZ,CYTZ,CA,Toronto,YTO,43.62974,-79.39828,America/Toronto,752.0
1,Kitchener/Waterloo Airport,YKF,CYKF,CA,Breslau,YKF,43.45747,-80.38593,America/Toronto,454.0
2,Montreal-Pierre Elliott Trudeau International ...,YUL,CYUL,CA,Montreal,YMQ,45.46106,-73.75019,America/Toronto,35190.0
3,Ottawa Macdonald-Cartier International Airport,YOW,CYOW,CA,Ottawa,YOW,45.3225,-75.66917,America/Toronto,10293.0
4,Toronto Pearson International Airport,YYZ,CYYZ,CA,Toronto,YTO,43.68066,-79.61286,America/Toronto,72355.0


##### Drop unwanted columns

In [6]:
# Define a list of columns to drop from the DataFrame
column_to_drop = ["ICAO Code","City Code"]

# Drop the specified columns from the filtered_airports DataFrame. The "errors='ignore'" parameter ensures that if the columns are not found, no error is raised
airport_df = airports_df.drop(columns=column_to_drop, errors="ignore")

# Reset the index of the DataFrame to maintain sequential ordering. The "inplace=True" parameter modifies the DataFrame directly without creating a copy
airport_df.reset_index(drop=True, inplace=True)
airport_df.head()

Unnamed: 0,Name,IATA Code,Country,City,Latitude,Longitude,Timezone,Departures
0,Billy Bishop Toronto City Airport,YTZ,CA,Toronto,43.62974,-79.39828,America/Toronto,752.0
1,Kitchener/Waterloo Airport,YKF,CA,Breslau,43.45747,-80.38593,America/Toronto,454.0
2,Montreal-Pierre Elliott Trudeau International ...,YUL,CA,Montreal,45.46106,-73.75019,America/Toronto,35190.0
3,Ottawa Macdonald-Cartier International Airport,YOW,CA,Ottawa,45.3225,-75.66917,America/Toronto,10293.0
4,Toronto Pearson International Airport,YYZ,CA,Toronto,43.68066,-79.61286,America/Toronto,72355.0


##### Update TimeZone column

In [7]:
# Replace specific timezone values in the 'Timezone' column with corresponding standard time zone abbreviations
airport_df['Timezone'] = airport_df['Timezone'].replace({'America/Toronto': 'EST',
                                                                       'America/Chicago': 'CST',
                                                                       'America/Denver': 'MST',
                                                                       'America/New_York':'EST', 
                                                                        'America/Los_Angeles':'PST'                                                                           
                                                                       })

##### Convert column headers to lower case

In [8]:
# converts all column names in the airport_df DataFrame to lowercase.
airport_df.columns = airport_df.columns.str.lower()
airport_df.head()

Unnamed: 0,name,iata code,country,city,latitude,longitude,timezone,departures
0,Billy Bishop Toronto City Airport,YTZ,CA,Toronto,43.62974,-79.39828,EST,752.0
1,Kitchener/Waterloo Airport,YKF,CA,Breslau,43.45747,-80.38593,EST,454.0
2,Montreal-Pierre Elliott Trudeau International ...,YUL,CA,Montreal,45.46106,-73.75019,EST,35190.0
3,Ottawa Macdonald-Cartier International Airport,YOW,CA,Ottawa,45.3225,-75.66917,EST,10293.0
4,Toronto Pearson International Airport,YYZ,CA,Toronto,43.68066,-79.61286,EST,72355.0


##### Get all information about the dataframe

In [9]:
airport_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   name        10 non-null     object 
 1   iata code   10 non-null     object 
 2   country     10 non-null     object 
 3   city        10 non-null     object 
 4   latitude    10 non-null     float64
 5   longitude   10 non-null     float64
 6   timezone    10 non-null     object 
 7   departures  10 non-null     float64
dtypes: float64(3), object(5)
memory usage: 768.0+ bytes


##### Remove all leading and trailing spaces.lambda converts all columns to object type.Therefore
##### Convert departures to int type.
##### Convert latitude and longitude to float type.

In [12]:
# Remove Leading & Trailing Spaces
airport_df = airport_df.astype(str).apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
# Convert 'departures' to numeric, forcing errors to NaN (if there are any non-numeric values)
airport_df['departures'] = pd.to_numeric(airport_df['departures'], errors='coerce')
# Convert to int64 after numeric conversion
airport_df['departures'] = airport_df['departures'].astype('int64')
airport_df['latitude'] = airport_df['departures'].astype(float)
airport_df['longitude'] = airport_df['longitude'].astype(float)

In [13]:
airport_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   name        10 non-null     object 
 1   iata code   10 non-null     object 
 2   country     10 non-null     object 
 3   city        10 non-null     object 
 4   latitude    10 non-null     float64
 5   longitude   10 non-null     float64
 6   timezone    10 non-null     object 
 7   departures  10 non-null     int64  
dtypes: float64(2), int64(1), object(5)
memory usage: 768.0+ bytes


##### Save the cleaned file

In [49]:
airport_df.to_csv('cleaning_output/cleaned_extracted_airports.csv', index=False)

# 2. Import flight price dataset 
##### Loop through each month's flight price data to get a year  dataset.
##### Convert duration in minutes to duration in hours. 
##### Drop unwanted columns.
##### Retrieve only the required airports dataset.
##### Confirm that only the required airports data is retrieved.
##### Map airport names to their IATA code.
##### Find the unique airline names from marketing_airline and operating_airline columns.


In [14]:
# Define the pattern to match all the flight price CSV files
file_pattern = "Resources/raw_data/flight_price_data/flight_data_full_*.csv"

In [15]:
# Get a list of matching CSV file names
csv_files = glob.glob(file_pattern)
csv_files

['Resources/raw_data/flight_price_data\\flight_data_full_April.csv',
 'Resources/raw_data/flight_price_data\\flight_data_full_August.csv',
 'Resources/raw_data/flight_price_data\\flight_data_full_December.csv',
 'Resources/raw_data/flight_price_data\\flight_data_full_February2026.csv',
 'Resources/raw_data/flight_price_data\\flight_data_full_January2026.csv',
 'Resources/raw_data/flight_price_data\\flight_data_full_July.csv',
 'Resources/raw_data/flight_price_data\\flight_data_full_June.csv',
 'Resources/raw_data/flight_price_data\\flight_data_full_March.csv',
 'Resources/raw_data/flight_price_data\\flight_data_full_May.csv',
 'Resources/raw_data/flight_price_data\\flight_data_full_November.csv',
 'Resources/raw_data/flight_price_data\\flight_data_full_October.csv',
 'Resources/raw_data/flight_price_data\\flight_data_full_September.csv']

In [16]:

# Create an empty list to store dataframes
flight_price_full_year = []
# Loop through each file and read it into a dataframe
for file in csv_files:
    # Read CSV
    flight_price_df = pd.read_csv(file)  

    # Append dataframe to list
    flight_price_full_year.append(flight_price_df) 

In [17]:
# Combine all dataframes in the list into a single DataFrame
flight_price_full_year_df = pd.concat(flight_price_full_year, ignore_index=True)
flight_price_full_year_df.head()

Unnamed: 0,date,itinerary_id,cabin_class,sort_by,price_raw,price_formatted,currency,flight_number,origin_airport,origin_city,...,arrival_time,duration_minutes,stop_count,marketing_airline,operating_airline,change_allowed,cancellation_allowed,is_self_transfer,has_flexible_options,score
0,4/1/2025,18467-2504011020--31679-1-10968-2504011436,economy,fastest,476.6,C$477,,5545,Toronto Pearson International,,...,2025-04-01T11:39:00,79,1,WestJet,SkyWest DBA Delta Connection,False,False,False,False,0.999
1,4/1/2025,18467-2504011020--31679-1-10968-2504011436,economy,fastest,476.6,C$477,,8400,Detroit Wayne County,,...,2025-04-01T14:36:00,176,1,WestJet,Delta,False,False,False,False,0.999
2,4/1/2025,18467-2504010600--31679-1-10968-2504011037,economy,fastest,476.6,C$477,,6342,Toronto Pearson International,,...,2025-04-01T07:32:00,92,1,WestJet,Endeavor Air DBA Delta Connection,False,False,False,False,0.611641
3,4/1/2025,18467-2504010600--31679-1-10968-2504011037,economy,fastest,476.6,C$477,,7021,Detroit Wayne County,,...,2025-04-01T10:37:00,186,1,WestJet,Delta,False,False,False,False,0.611641
4,4/1/2025,18467-2504011320--31679-1-10968-2504011758,economy,fastest,476.6,C$477,,6343,Toronto Pearson International,,...,2025-04-01T14:36:00,76,1,WestJet,Endeavor Air DBA Delta Connection,False,False,False,False,0.627755


##### Converting duration from minutes to hours

In [18]:
flight_price_full_year_df['duration_in_hours'] = (flight_price_full_year_df['duration_minutes']/60).round(2)

##### Remove unwanted columns from flight price dataset.

In [19]:
columns_to_drop = [
        "sort_by", "price_raw", "origin_city", "origin_country", "currency",
        "destination_city", "destination_country", "change_allowed", 
        "cancellation_allowed", "has_flexible_options","date","duration_minutes","score"
    ]
flight_price_full_year_df = flight_price_full_year_df.drop(columns=columns_to_drop, errors="ignore")
flight_price_full_year_df.reset_index(drop=True, inplace=True)
flight_price_full_year_df.head()

Unnamed: 0,itinerary_id,cabin_class,price_formatted,flight_number,origin_airport,destination_airport,departure_time,arrival_time,stop_count,marketing_airline,operating_airline,is_self_transfer,duration_in_hours
0,18467-2504011020--31679-1-10968-2504011436,economy,C$477,5545,Toronto Pearson International,Detroit Wayne County,2025-04-01T10:20:00,2025-04-01T11:39:00,1,WestJet,SkyWest DBA Delta Connection,False,1.32
1,18467-2504011020--31679-1-10968-2504011436,economy,C$477,8400,Detroit Wayne County,Dallas Fort Worth International,2025-04-01T12:40:00,2025-04-01T14:36:00,1,WestJet,Delta,False,2.93
2,18467-2504010600--31679-1-10968-2504011037,economy,C$477,6342,Toronto Pearson International,Detroit Wayne County,2025-04-01T06:00:00,2025-04-01T07:32:00,1,WestJet,Endeavor Air DBA Delta Connection,False,1.53
3,18467-2504010600--31679-1-10968-2504011037,economy,C$477,7021,Detroit Wayne County,Dallas Fort Worth International,2025-04-01T08:31:00,2025-04-01T10:37:00,1,WestJet,Delta,False,3.1
4,18467-2504011320--31679-1-10968-2504011758,economy,C$477,6343,Toronto Pearson International,Detroit Wayne County,2025-04-01T13:20:00,2025-04-01T14:36:00,1,WestJet,Endeavor Air DBA Delta Connection,False,1.27


##### Retrieve data of selected airports only. 

In [20]:
# Select only specific airports data in the flight_price dataset
selected_airports = {"Toronto Island","Ottawa International",
                     "Toronto Pearson International", "Chicago O'Hare International",
                     "Dallas Fort Worth International","Denver International",
                     "Atlanta Hartsfield-Jackson","Montreal Pierre Elliott Trudeau",
                     "Los Angeles International"
                     }

selected_flight_price_data =  flight_price_full_year_df[ flight_price_full_year_df['origin_airport'].isin(selected_airports) &  flight_price_full_year_df['destination_airport'].isin(selected_airports)]

In [78]:
selected_flight_price_data.head()

Unnamed: 0,itinerary_id,cabin_class,price_formatted,flight_number,origin_airport,destination_airport,departure_time,arrival_time,stop_count,marketing_airline,operating_airline,is_self_transfer,duration_in_hours
8,18467-2504010745--31679-1-10968-2504011249,economy,C$477,6340,Toronto Pearson International,Atlanta Hartsfield-Jackson,2025-04-01T07:45:00,2025-04-01T10:17:00,1,WestJet,Delta,False,2.53
9,18467-2504010745--31679-1-10968-2504011249,economy,C$477,6482,Atlanta Hartsfield-Jackson,Dallas Fort Worth International,2025-04-01T11:20:00,2025-04-01T12:49:00,1,WestJet,Delta,False,2.48
16,"18390-2504010700--31954,-31825-2-10968-2504011735",economy,C$253,2205,Toronto Island,Ottawa International,2025-04-01T07:00:00,2025-04-01T07:59:00,2,Porter Airlines (Canada) Ltd,Porter Airlines Inc,True,0.98
29,18467-2504010615--32385-1-10910-2504011111,premium_economy,C$581,2662,Toronto Pearson International,Atlanta Hartsfield-Jackson,2025-04-01T06:15:00,2025-04-01T08:49:00,1,Delta,Delta,False,2.57
31,18467-2504010745--32385-1-10968-2504011249,premium_economy,C$581,2988,Toronto Pearson International,Atlanta Hartsfield-Jackson,2025-04-01T07:45:00,2025-04-01T10:17:00,1,Delta,Delta,False,2.53


##### Confirm that all retrieved records are of selected airports.

In [21]:
# Confirming that only those records with required airports were selected 
airports = pd.Series(list(set(selected_flight_price_data['origin_airport']) | set(selected_flight_price_data['destination_airport'])))  # Union of both
airports

0      Toronto Pearson International
1                     Toronto Island
2    Montreal Pierre Elliott Trudeau
3               Denver International
4          Los Angeles International
5       Chicago O'Hare International
6         Atlanta Hartsfield-Jackson
7               Ottawa International
8    Dallas Fort Worth International
dtype: object

##### Map airport names to their IATA codes

In [22]:
# Dictionary mapping airport names to their corresponding IATA codes
iata_dict={
    "Toronto Island":"YTZ",
    "Ottawa International":"YOW",
    "Toronto Pearson International":"YYZ",
    "Chicago O'Hare International":"ORD",
    "Dallas Fort Worth International":"DFW",
    "Denver International":"DEN",
    "Atlanta Hartsfield-Jackson":"ATL",
    "Montreal Pierre Elliott Trudeau":"YUL",
    "Los Angeles International":"LAX"
}   

# Replace airport names in the 'origin_airport' column with corresponding IATA codes using the dictionary
selected_flight_price_data.loc[:, 'origin_airport'] = selected_flight_price_data['origin_airport'].map(iata_dict)

# Replace airport names in the 'destination_airport' column with corresponding IATA codes using the dictionary
selected_flight_price_data.loc[:, 'destination_airport'] = selected_flight_price_data['destination_airport'].map(iata_dict)

# Display DataFrame information
selected_flight_price_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11127 entries, 8 to 22128
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   itinerary_id         11127 non-null  object 
 1   cabin_class          11127 non-null  object 
 2   price_formatted      11127 non-null  object 
 3   flight_number        11127 non-null  int64  
 4   origin_airport       11127 non-null  object 
 5   destination_airport  11127 non-null  object 
 6   departure_time       11127 non-null  object 
 7   arrival_time         11127 non-null  object 
 8   stop_count           11127 non-null  int64  
 9   marketing_airline    11127 non-null  object 
 10  operating_airline    11127 non-null  object 
 11  is_self_transfer     11127 non-null  bool   
 12  duration_in_hours    11127 non-null  float64
dtypes: bool(1), float64(1), int64(2), object(9)
memory usage: 1.1+ MB


##### Retrieve unique airline names from marketing_airline and operating_airline columns of flight price data

In [81]:
# retrieve unique airlines from marketing airline and operating airline fields of the required data
unique_airlines = pd.Series(list(set(selected_flight_price_data['operating_airline']) | set(selected_flight_price_data['marketing_airline'])))  # Union of both
unique_airlines

0                       Frontier Airlines
1               Air Canada Express - Jazz
2                     Porter Airlines Inc
3              SkyWest DBA United Express
4                                   Delta
5                         Alaska Airlines
6                       American Airlines
7             Envoy Air As American Eagle
8                         Spirit Airlines
9                                 WestJet
10     Republic Airways AS American Eagle
11    Republic Airways DBA United Express
12                             GOL Linhas
13                             Air Canada
14                              Lufthansa
15           Porter Airlines (Canada) Ltd
16                                 United
dtype: object

# 3. Import Airlines dataset
##### Retrieve only the required airline names (as per the unique airline names from flight price dataset) from airline dataset.
##### Remove unwanted columns.
##### Rename columns.
##### Remove leading and trailing spaces.
##### Convert to datetime format. Fill missing value with NAT
##### Datatype conversion
##### Create a mapping dictionary based on the airline dataset

In [23]:
# Read in the  Airline dataset
airlines_data = pd.read_csv("Resources/raw_data/airlines_list_data.csv")
airlines_data.head()

Unnamed: 0,Name,country_code,iata_code,iata_prefix,iata_accounting,callsign,is international,iosa_registered,iosa_expiry,is_passenger,...,is_scheduled,total_aircrafts,average_fleet_age,accidents_last_5y,crashes_last_5y,website,twitter,facebook,instagram,linkedin
0,10 Tanker Air Carrier,US,,,,,,,,,...,,,,,,,,,,
1,135 Airways,US,,,,GENERAL,,,,,...,,,,,,,,,,
2,1903 Aviation,SE,,,,HIGHSCORE,,,,1.0,...,,,,,,,,,,
3,Air 1st Aviation Companies of Oklahoma,US,,,,ROUGHRIDER,,,,,...,,,,,,,,,,
4,"2 Sqn, No 1 Elementary Flying Training School",UK,,,,WYTON,,,,,...,,,,,,,,,,


##### Retrieve only the required airline names (as per the unique airline names from flight price dataset) from airline dataset.

In [24]:
# Get only the required airline names from airline dataset
available_airlines = airlines_data.loc[airlines_data['iata_code'].isin(["OO", "MQ", "F9", "DL", "YX", "LH", "PD", "WS", "AA","UA","QK","AS","AC","NK","G3","P3"])]
airlines_info = available_airlines.reset_index(drop=True) 
airlines_info.head()

Unnamed: 0,Name,country_code,iata_code,iata_prefix,iata_accounting,callsign,is international,iosa_registered,iosa_expiry,is_passenger,...,is_scheduled,total_aircrafts,average_fleet_age,accidents_last_5y,crashes_last_5y,website,twitter,facebook,instagram,linkedin
0,Alaska Airlines,US,AS,27.0,27.0,ALASKA,,1.0,2025-01-16T00:00:00.000Z,1.0,...,1.0,158.0,8.1,1.0,0.0,,,facebook.com/alaskaairlines,instagram.com/alaskaair,
1,American Airlines,US,AA,1.0,1.0,AMERICAN,,1.0,2025-07-29T00:00:00.000Z,1.0,...,1.0,684.0,10.2,26.0,0.0,,,facebook.com/aa,instagram.com/americanair,linkedin.com/company/american-airlines
2,Air Canada,CA,AC,14.0,14.0,AIRCANADA,,1.0,2024-04-18T00:00:00.000Z,1.0,...,1.0,98.0,11.7,0.0,0.0,,,facebook.com/aircanada,instagram.com/_u/aircanada,
3,Delta Air Lines,US,DL,6.0,6.0,DELTA,,1.0,2024-10-17T00:00:00.000Z,1.0,...,1.0,591.0,12.5,22.0,0.0,,,facebook.com/delta,instagram.com/delta/,linkedin.com/company/delta-air-lines
4,Envoy Air,US,MQ,93.0,93.0,ENVOY,,1.0,2025-11-18T00:00:00.000Z,1.0,...,1.0,161.0,9.6,0.0,0.0,,,facebook.com/envoyaircareers,instagram.com/envoyaircareers/,linkedin.com/company/envoyair


In [25]:
# Fetch airline column names
airlines_info.columns

Index(['Name', 'country_code', 'iata_code', 'iata_prefix', 'iata_accounting',
       'callsign', 'is international', 'iosa_registered', 'iosa_expiry',
       'is_passenger', 'is_cargo', 'is_scheduled', 'total_aircrafts',
       'average_fleet_age', 'accidents_last_5y', 'crashes_last_5y', 'website',
       'twitter', 'facebook', 'instagram', 'linkedin'],
      dtype='object')

##### Remove unwanted columns

In [26]:
fields_to_drop = [
       'iata_prefix', 'iata_accounting',
       'callsign', 'is international','is_passenger', 'is_cargo', 'website',
       'twitter', 'facebook', 'instagram', 'linkedin','crashes_last_5y'
    ]
wanted_airlines_fields = airlines_info.drop(columns=fields_to_drop, errors="ignore")
wanted_airlines_fields.reset_index(drop=True, inplace=True)
        

In [None]:
# Display first five rows
wanted_airlines_fields.head()

Unnamed: 0,Name,country_code,iata_code,iosa_registered,iosa_expiry,is_scheduled,total_aircrafts,average_fleet_age,accidents_last_5y
0,Alaska Airlines,US,AS,1.0,2025-01-16T00:00:00.000Z,1.0,158.0,8.1,1.0
1,American Airlines,US,AA,1.0,2025-07-29T00:00:00.000Z,1.0,684.0,10.2,26.0
2,Air Canada,CA,AC,1.0,2024-04-18T00:00:00.000Z,1.0,98.0,11.7,0.0
3,Delta Air Lines,US,DL,1.0,2024-10-17T00:00:00.000Z,1.0,591.0,12.5,22.0
4,Envoy Air,US,MQ,1.0,2025-11-18T00:00:00.000Z,1.0,161.0,9.6,0.0


##### Lufthansa and  Lufthansa Cargo have the same iata code.We are working on passenger flights. Therefore, remove Lufthansa Cargo from the list

In [28]:
# Delete the row where the airline name is Lufthansa Cargo
wanted_airlines = wanted_airlines_fields[wanted_airlines_fields['Name'] != 'Lufthansa Cargo']
wanted_airlines.head()

Unnamed: 0,Name,country_code,iata_code,iosa_registered,iosa_expiry,is_scheduled,total_aircrafts,average_fleet_age,accidents_last_5y
0,Alaska Airlines,US,AS,1.0,2025-01-16T00:00:00.000Z,1.0,158.0,8.1,1.0
1,American Airlines,US,AA,1.0,2025-07-29T00:00:00.000Z,1.0,684.0,10.2,26.0
2,Air Canada,CA,AC,1.0,2024-04-18T00:00:00.000Z,1.0,98.0,11.7,0.0
3,Delta Air Lines,US,DL,1.0,2024-10-17T00:00:00.000Z,1.0,591.0,12.5,22.0
4,Envoy Air,US,MQ,1.0,2025-11-18T00:00:00.000Z,1.0,161.0,9.6,0.0


##### Rename the columns

In [29]:
wanted_airlines = wanted_airlines.rename(columns={'Name':'airline_name','iata_code':'iata', 'is_scheduled':'is_airline_passenger'})
wanted_airlines.head()

Unnamed: 0,airline_name,country_code,iata,iosa_registered,iosa_expiry,is_airline_passenger,total_aircrafts,average_fleet_age,accidents_last_5y
0,Alaska Airlines,US,AS,1.0,2025-01-16T00:00:00.000Z,1.0,158.0,8.1,1.0
1,American Airlines,US,AA,1.0,2025-07-29T00:00:00.000Z,1.0,684.0,10.2,26.0
2,Air Canada,CA,AC,1.0,2024-04-18T00:00:00.000Z,1.0,98.0,11.7,0.0
3,Delta Air Lines,US,DL,1.0,2024-10-17T00:00:00.000Z,1.0,591.0,12.5,22.0
4,Envoy Air,US,MQ,1.0,2025-11-18T00:00:00.000Z,1.0,161.0,9.6,0.0


In [30]:
# Remove Leading & Trailing Spaces
airlines_df = wanted_airlines.astype(str).apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
airlines_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17 entries, 0 to 17
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   airline_name          17 non-null     object
 1   country_code          17 non-null     object
 2   iata                  17 non-null     object
 3   iosa_registered       17 non-null     object
 4   iosa_expiry           17 non-null     object
 5   is_airline_passenger  17 non-null     object
 6   total_aircrafts       17 non-null     object
 7   average_fleet_age     17 non-null     object
 8   accidents_last_5y     17 non-null     object
dtypes: object(9)
memory usage: 1.3+ KB


##### Convert the datatypes.

In [31]:
 # Convert to datetime
airlines_df['iosa_expiry'] = pd.to_datetime(airlines_df['iosa_expiry'], errors='coerce')
# Fill missing value with NAT 
airlines_df['iosa_expiry'] = airlines_df['iosa_expiry'].fillna(pd.NaT)

In [32]:
# Converting float to boolean
airlines_df[['iosa_registered','is_airline_passenger']] =airlines_df[['iosa_registered','is_airline_passenger']].astype(bool)


In [33]:
# Converting float to int
airlines_df[['total_aircrafts','accidents_last_5y']]= airlines_df[['total_aircrafts','accidents_last_5y']].apply(pd.to_numeric, errors='coerce').round(0).astype('int64')

In [37]:
# Converting object to float
airlines_df['average_fleet_age']=airlines_df['average_fleet_age'].astype(float)

In [38]:
airlines_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17 entries, 0 to 17
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype              
---  ------                --------------  -----              
 0   airline_name          17 non-null     object             
 1   country_code          17 non-null     object             
 2   iata                  17 non-null     object             
 3   iosa_registered       17 non-null     bool               
 4   iosa_expiry           15 non-null     datetime64[ns, UTC]
 5   is_airline_passenger  17 non-null     bool               
 6   total_aircrafts       17 non-null     int64              
 7   average_fleet_age     17 non-null     float64            
 8   accidents_last_5y     17 non-null     int64              
dtypes: bool(2), datetime64[ns, UTC](1), float64(1), int64(2), object(3)
memory usage: 1.1+ KB


Save the dataframe as csv

In [39]:
airlines_df.to_csv('cleaning_output/cleaned_extracted_airlines.csv', index=False)

In [None]:
# Create a mapping dictionary based on the airlines dataset
airline_name_mapping = {
"WestJet" : "Westjet",
"Porter Airlines (Canada) Ltd": "Porter Airlines",
"Delta": "Delta Air Lines" ,
"American Airlines": "American Airlines",
"United":"United Airlines",
"Spirit Airlines": "Spirit Airlines",
"Lufthansa":"Lufthansa",
"Frontier Airlines":"Frontier Airlines",
"Air Canada": "Air Canada",
"Alaska Airlines":"Alaska Airlines",
"GOL Linhas": "GOL Linhas Aereas Inteligentes",
"Porter Airlines Inc": " Porter Airlines Inc",
"Envoy Air As American Eagle": "Envoy Air",
"Republic Airways AS American Eagle": "American Eagle",
"Air Canada Express - Jazz": "Jazz Aviation",
"Republic Airways DBA United Express": "Republic Airline United Express",
"SkyWest DBA United Express": "SkyWest Airlines"
}

# Replace values in the 'marketing_airline' and 'operating_airline' columns using the mapping dictionary
selected_flight_price_data.loc[:, ['marketing_airline', 'operating_airline']] = selected_flight_price_data[['marketing_airline', 'operating_airline']].replace(airline_name_mapping)

# Display the first few rows of the updated DataFrame to verify the changes
selected_flight_price_data.head()

Unnamed: 0,itinerary_id,cabin_class,price_formatted,flight_number,origin_airport,destination_airport,departure_time,arrival_time,stop_count,marketing_airline,operating_airline,is_self_transfer,duration_in_hours
8,18467-2504010745--31679-1-10968-2504011249,economy,C$477,6340,YYZ,ATL,2025-04-01T07:45:00,2025-04-01T10:17:00,1,Westjet,Delta Air Lines,False,2.53
9,18467-2504010745--31679-1-10968-2504011249,economy,C$477,6482,ATL,DFW,2025-04-01T11:20:00,2025-04-01T12:49:00,1,Westjet,Delta Air Lines,False,2.48
16,"18390-2504010700--31954,-31825-2-10968-2504011735",economy,C$253,2205,YTZ,YOW,2025-04-01T07:00:00,2025-04-01T07:59:00,2,Porter Airlines,Porter Airlines Inc,True,0.98
29,18467-2504010615--32385-1-10910-2504011111,premium_economy,C$581,2662,YYZ,ATL,2025-04-01T06:15:00,2025-04-01T08:49:00,1,Delta Air Lines,Delta Air Lines,False,2.57
31,18467-2504010745--32385-1-10968-2504011249,premium_economy,C$581,2988,YYZ,ATL,2025-04-01T07:45:00,2025-04-01T10:17:00,1,Delta Air Lines,Delta Air Lines,False,2.53


In [42]:
# Rename column names
selected_flight_price_data.rename(columns={'price_formatted':'one_way_price',
                                   'is_self_tansfer':'self_tranfer_allowed'})

Unnamed: 0,itinerary_id,cabin_class,one_way_price,flight_number,origin_airport,destination_airport,departure_time,arrival_time,stop_count,marketing_airline,operating_airline,is_self_transfer,duration_in_hours
8,18467-2504010745--31679-1-10968-2504011249,economy,C$477,6340,YYZ,ATL,2025-04-01T07:45:00,2025-04-01T10:17:00,1,Westjet,Delta Air Lines,False,2.53
9,18467-2504010745--31679-1-10968-2504011249,economy,C$477,6482,ATL,DFW,2025-04-01T11:20:00,2025-04-01T12:49:00,1,Westjet,Delta Air Lines,False,2.48
16,"18390-2504010700--31954,-31825-2-10968-2504011735",economy,C$253,2205,YTZ,YOW,2025-04-01T07:00:00,2025-04-01T07:59:00,2,Porter Airlines,Porter Airlines Inc,True,0.98
29,18467-2504010615--32385-1-10910-2504011111,premium_economy,C$581,2662,YYZ,ATL,2025-04-01T06:15:00,2025-04-01T08:49:00,1,Delta Air Lines,Delta Air Lines,False,2.57
31,18467-2504010745--32385-1-10968-2504011249,premium_economy,C$581,2988,YYZ,ATL,2025-04-01T07:45:00,2025-04-01T10:17:00,1,Delta Air Lines,Delta Air Lines,False,2.53
...,...,...,...,...,...,...,...,...,...,...,...,...,...
22112,18467-2509300820--31722-0-10968-2509301042,first,C$846,8169,YYZ,DFW,2025-09-30T08:20:00,2025-09-30T10:42:00,0,United Airlines,Air Canada,False,3.37
22113,18467-2509301310--32573-1-10968-2509301744,first,C$582,3606,YYZ,ORD,2025-09-30T13:10:00,2025-09-30T14:05:00,1,American Airlines,Envoy Air,False,1.92
22114,18467-2509301310--32573-1-10968-2509301744,first,C$582,2429,ORD,DFW,2025-09-30T15:15:00,2025-09-30T17:44:00,1,American Airlines,American Airlines,False,2.48
22127,18467-2509301130--32573-1-10968-2509301841,first,C$582,3604,YYZ,ORD,2025-09-30T11:30:00,2025-09-30T12:25:00,1,American Airlines,Envoy Air,False,1.92


# Understanding flight_price_full_year_df

In [43]:
# Display first few rows of the combined dataframe
selected_flight_price_data.head()

Unnamed: 0,itinerary_id,cabin_class,price_formatted,flight_number,origin_airport,destination_airport,departure_time,arrival_time,stop_count,marketing_airline,operating_airline,is_self_transfer,duration_in_hours
8,18467-2504010745--31679-1-10968-2504011249,economy,C$477,6340,YYZ,ATL,2025-04-01T07:45:00,2025-04-01T10:17:00,1,Westjet,Delta Air Lines,False,2.53
9,18467-2504010745--31679-1-10968-2504011249,economy,C$477,6482,ATL,DFW,2025-04-01T11:20:00,2025-04-01T12:49:00,1,Westjet,Delta Air Lines,False,2.48
16,"18390-2504010700--31954,-31825-2-10968-2504011735",economy,C$253,2205,YTZ,YOW,2025-04-01T07:00:00,2025-04-01T07:59:00,2,Porter Airlines,Porter Airlines Inc,True,0.98
29,18467-2504010615--32385-1-10910-2504011111,premium_economy,C$581,2662,YYZ,ATL,2025-04-01T06:15:00,2025-04-01T08:49:00,1,Delta Air Lines,Delta Air Lines,False,2.57
31,18467-2504010745--32385-1-10968-2504011249,premium_economy,C$581,2988,YYZ,ATL,2025-04-01T07:45:00,2025-04-01T10:17:00,1,Delta Air Lines,Delta Air Lines,False,2.53


In [44]:
selected_flight_price_data.describe()

Unnamed: 0,flight_number,stop_count,duration_in_hours
count,11127.0,11127.0,11127.0
mean,3187.528444,0.776849,2.632175
std,2440.465516,0.507973,0.670955
min,31.0,0.0,0.98
25%,1111.0,0.0,2.25
50%,2710.0,1.0,2.42
75%,4238.0,1.0,3.33
max,8909.0,4.0,5.67


In [45]:
# Check for missing values
selected_flight_price_data.isnull().sum()

itinerary_id           0
cabin_class            0
price_formatted        0
flight_number          0
origin_airport         0
destination_airport    0
departure_time         0
arrival_time           0
stop_count             0
marketing_airline      0
operating_airline      0
is_self_transfer       0
duration_in_hours      0
dtype: int64

In [46]:
selected_flight_price_data.reset_index()

Unnamed: 0,index,itinerary_id,cabin_class,price_formatted,flight_number,origin_airport,destination_airport,departure_time,arrival_time,stop_count,marketing_airline,operating_airline,is_self_transfer,duration_in_hours
0,8,18467-2504010745--31679-1-10968-2504011249,economy,C$477,6340,YYZ,ATL,2025-04-01T07:45:00,2025-04-01T10:17:00,1,Westjet,Delta Air Lines,False,2.53
1,9,18467-2504010745--31679-1-10968-2504011249,economy,C$477,6482,ATL,DFW,2025-04-01T11:20:00,2025-04-01T12:49:00,1,Westjet,Delta Air Lines,False,2.48
2,16,"18390-2504010700--31954,-31825-2-10968-2504011735",economy,C$253,2205,YTZ,YOW,2025-04-01T07:00:00,2025-04-01T07:59:00,2,Porter Airlines,Porter Airlines Inc,True,0.98
3,29,18467-2504010615--32385-1-10910-2504011111,premium_economy,C$581,2662,YYZ,ATL,2025-04-01T06:15:00,2025-04-01T08:49:00,1,Delta Air Lines,Delta Air Lines,False,2.57
4,31,18467-2504010745--32385-1-10968-2504011249,premium_economy,C$581,2988,YYZ,ATL,2025-04-01T07:45:00,2025-04-01T10:17:00,1,Delta Air Lines,Delta Air Lines,False,2.53
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11122,22112,18467-2509300820--31722-0-10968-2509301042,first,C$846,8169,YYZ,DFW,2025-09-30T08:20:00,2025-09-30T10:42:00,0,United Airlines,Air Canada,False,3.37
11123,22113,18467-2509301310--32573-1-10968-2509301744,first,C$582,3606,YYZ,ORD,2025-09-30T13:10:00,2025-09-30T14:05:00,1,American Airlines,Envoy Air,False,1.92
11124,22114,18467-2509301310--32573-1-10968-2509301744,first,C$582,2429,ORD,DFW,2025-09-30T15:15:00,2025-09-30T17:44:00,1,American Airlines,American Airlines,False,2.48
11125,22127,18467-2509301130--32573-1-10968-2509301841,first,C$582,3604,YYZ,ORD,2025-09-30T11:30:00,2025-09-30T12:25:00,1,American Airlines,Envoy Air,False,1.92


In [47]:
# Retrieve all information about the flight price dataset
selected_flight_price_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11127 entries, 8 to 22128
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   itinerary_id         11127 non-null  object 
 1   cabin_class          11127 non-null  object 
 2   price_formatted      11127 non-null  object 
 3   flight_number        11127 non-null  int64  
 4   origin_airport       11127 non-null  object 
 5   destination_airport  11127 non-null  object 
 6   departure_time       11127 non-null  object 
 7   arrival_time         11127 non-null  object 
 8   stop_count           11127 non-null  int64  
 9   marketing_airline    11127 non-null  object 
 10  operating_airline    11127 non-null  object 
 11  is_self_transfer     11127 non-null  bool   
 12  duration_in_hours    11127 non-null  float64
dtypes: bool(1), float64(1), int64(2), object(9)
memory usage: 1.1+ MB


In [48]:
# Remove Leading & Trailing Spaces
price_df =selected_flight_price_data.astype(str).apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
price_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11127 entries, 8 to 22128
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   itinerary_id         11127 non-null  object
 1   cabin_class          11127 non-null  object
 2   price_formatted      11127 non-null  object
 3   flight_number        11127 non-null  object
 4   origin_airport       11127 non-null  object
 5   destination_airport  11127 non-null  object
 6   departure_time       11127 non-null  object
 7   arrival_time         11127 non-null  object
 8   stop_count           11127 non-null  object
 9   marketing_airline    11127 non-null  object
 10  operating_airline    11127 non-null  object
 11  is_self_transfer     11127 non-null  object
 12  duration_in_hours    11127 non-null  object
dtypes: object(13)
memory usage: 1.2+ MB


In [49]:
# Renaming a single column
price_df = price_df.rename(columns={'price_formatted': 'one_way_price_candollar',
                                    'departure_time':'departure_date_time',
                                    'arrival_time':'arrival_date_time'})
price_df

Unnamed: 0,itinerary_id,cabin_class,one_way_price_candollar,flight_number,origin_airport,destination_airport,departure_date_time,arrival_date_time,stop_count,marketing_airline,operating_airline,is_self_transfer,duration_in_hours
8,18467-2504010745--31679-1-10968-2504011249,economy,C$477,6340,YYZ,ATL,2025-04-01T07:45:00,2025-04-01T10:17:00,1,Westjet,Delta Air Lines,False,2.53
9,18467-2504010745--31679-1-10968-2504011249,economy,C$477,6482,ATL,DFW,2025-04-01T11:20:00,2025-04-01T12:49:00,1,Westjet,Delta Air Lines,False,2.48
16,"18390-2504010700--31954,-31825-2-10968-2504011735",economy,C$253,2205,YTZ,YOW,2025-04-01T07:00:00,2025-04-01T07:59:00,2,Porter Airlines,Porter Airlines Inc,True,0.98
29,18467-2504010615--32385-1-10910-2504011111,premium_economy,C$581,2662,YYZ,ATL,2025-04-01T06:15:00,2025-04-01T08:49:00,1,Delta Air Lines,Delta Air Lines,False,2.57
31,18467-2504010745--32385-1-10968-2504011249,premium_economy,C$581,2988,YYZ,ATL,2025-04-01T07:45:00,2025-04-01T10:17:00,1,Delta Air Lines,Delta Air Lines,False,2.53
...,...,...,...,...,...,...,...,...,...,...,...,...,...
22112,18467-2509300820--31722-0-10968-2509301042,first,C$846,8169,YYZ,DFW,2025-09-30T08:20:00,2025-09-30T10:42:00,0,United Airlines,Air Canada,False,3.37
22113,18467-2509301310--32573-1-10968-2509301744,first,C$582,3606,YYZ,ORD,2025-09-30T13:10:00,2025-09-30T14:05:00,1,American Airlines,Envoy Air,False,1.92
22114,18467-2509301310--32573-1-10968-2509301744,first,C$582,2429,ORD,DFW,2025-09-30T15:15:00,2025-09-30T17:44:00,1,American Airlines,American Airlines,False,2.48
22127,18467-2509301130--32573-1-10968-2509301841,first,C$582,3604,YYZ,ORD,2025-09-30T11:30:00,2025-09-30T12:25:00,1,American Airlines,Envoy Air,False,1.92


In [58]:
# Remove 'c$', then remove commas and convert to integer
price_df['one_way_price_candollar'] = price_df['one_way_price_candollar'].replace({'C\$': '', ',': ''}, regex=True).astype('int64')

In [59]:
# Convert flight_number to integer
price_df[['flight_number','stop_count']] = price_df[['flight_number','stop_count']].astype('int64')

In [60]:
# Convert departure_date_time column to datetime format
price_df['departure_date_time']=pd.to_datetime(price_df['departure_date_time'], errors='coerce')

In [61]:
# Convert arrival_date_time column to datetime format
price_df['arrival_date_time']=pd.to_datetime(price_df['arrival_date_time'], errors='coerce')

In [62]:
# Convert duration_in_hours to float type
price_df['duration_in_hours'] = price_df['duration_in_hours'].astype(float)

In [63]:
# Convert is_self_transfer to boolean type
price_df['is_self_transfer'] = price_df['is_self_transfer'].astype(bool)

In [64]:
price_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11127 entries, 8 to 22128
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   itinerary_id             11127 non-null  object        
 1   cabin_class              11127 non-null  object        
 2   one_way_price_candollar  11127 non-null  int64         
 3   flight_number            11127 non-null  int64         
 4   origin_airport           11127 non-null  object        
 5   destination_airport      11127 non-null  object        
 6   departure_date_time      11127 non-null  datetime64[ns]
 7   arrival_date_time        11127 non-null  datetime64[ns]
 8   stop_count               11127 non-null  int64         
 9   marketing_airline        11127 non-null  object        
 10  operating_airline        11127 non-null  object        
 11  is_self_transfer         11127 non-null  bool          
 12  duration_in_hours        11127 non-nu

In [65]:
price_df.to_csv('cleaning_output/cleaned_price_data.csv', index=False)