In [1]:
import pandas as pd
import os
import random
import csv


In [2]:
# Build a small random subset of the full flight dataset so the rest of the
# notebook has less data to process.

input_file_path = '../data/FlightData.csv'
reduced_flight_data_path = '../data/ReducedFlightData.csv'

# Upper bound on the number of data rows (header excluded) to copy.
rows_to_copy = 6000

# Fixed seed so that Restart & Run All always draws the same sample
# (same value as the random_state used in Method 2 below).
sample_seed = 42

# Method 1: using csv reader and writer

# newline='' on both handles leaves newline handling to the csv module, which
# is needed for quoted fields that contain line breaks. An explicit encoding
# keeps the copy independent of the platform's locale default. Mode 'w'
# truncates any previous output file, so no delete is needed beforehand.
with open(input_file_path, 'r', newline='', encoding='utf-8') as input_file, \
        open(reduced_flight_data_path, 'w', newline='', encoding='utf-8') as output_file:
    csv_reader = csv.reader(input_file)
    csv_writer = csv.writer(output_file)

    # Copy the header row through unchanged.
    header = next(csv_reader)
    csv_writer.writerow(header)

    # random.sample needs a sequence, so all data rows are read into memory.
    all_rows = list(csv_reader)

    # Never ask for more rows than exist; random.sample would raise ValueError.
    rows_to_copy = min(rows_to_copy, len(all_rows))

    # A dedicated Random instance keeps the seed local to this cell instead of
    # reseeding the module-level generator.
    random_rows = random.Random(sample_seed).sample(all_rows, rows_to_copy)
    csv_writer.writerows(random_rows)

# Method 2: using pandas sample(VERY SLOW)
# pd.DataFrame.sample(
#     pd.read_csv(input_file_path),
#     n=6000,
#     random_state=42
# ).to_csv(reduced_flight_data_path, index=False)


print(f'Copied {rows_to_copy} random rows from {input_file_path} to {reduced_flight_data_path}')



Copied 6000 random rows from ../data/FlightData.csv to ../data/ReducedFlightData.csv


In [3]:
# Load the reduced subset (not the full dataset) written by the sampling cell.
# Reusing reduced_flight_data_path keeps this cell pointing at the same file
# that the sampling cell writes.
data = pd.read_csv(reduced_flight_data_path)
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 0 to 5999
Data columns (total 18 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   from_airport_code                6000 non-null   object 
 1   from_country                     6000 non-null   object 
 2   dest_airport_code                6000 non-null   object 
 3   dest_country                     6000 non-null   object 
 4   aircraft_type                    5918 non-null   object 
 5   airline_number                   6000 non-null   object 
 6   airline_name                     6000 non-null   object 
 7   flight_number                    6000 non-null   object 
 8   departure_time                   6000 non-null   object 
 9   arrival_time                     6000 non-null   object 
 10  duration                         6000 non-null   int64  
 11  stops                            6000 non-null   int64  
 12  price               

In [4]:
# Missing values in 'aircraft_type':
# keep only the rows where this column is populated (all other rows are dropped).
data = data.loc[data['aircraft_type'].notna()]

In [5]:
# Missing values in 'price':
# keep only the rows where this column is populated (all other rows are dropped).
data = data.loc[data['price'].notna()]

In [6]:
# Missing values in 'co2_emissions':
# keep only the rows where this column is populated (all other rows are dropped).
data = data.loc[data['co2_emissions'].notna()]

In [7]:
# Missing values in 'avg_co2_emission_for_this_route':
# keep only the rows where this column is populated (all other rows are dropped).
data = data.loc[data['avg_co2_emission_for_this_route'].notna()]


In [8]:
# Missing values in 'co2_percentage':
# keep only the rows where this column is populated (all other rows are dropped).
data = data.loc[data['co2_percentage'].notna()]

In [9]:
# Fix the data types of the columns

# Column groups, named once so each list is written in a single place.
date_columns = ['departure_time', 'arrival_time', 'scan_date']
categorical_columns = [
    'from_airport_code', 'dest_airport_code', 'from_country', 'dest_country',
    'aircraft_type', 'airline_number', 'airline_name', 'flight_number',
]

# Set the date columns to the correct data type
data[date_columns] = data[date_columns].apply(pd.to_datetime)

# Set the categorical columns to the correct data type
data[categorical_columns] = data[categorical_columns].astype('category')

# Set the numeric columns to the correct data type.
# co2_percentage holds text containing a '%' sign; remove it and convert to float.
# Casting to str first keeps this cell re-runnable: on a second run the column
# is already float64 and has no .str accessor. regex=False makes the literal
# replacement explicit across pandas versions.
data['co2_percentage'] = (
    data['co2_percentage']
    .astype(str)
    .str.replace('%', '', regex=False)
    .astype('float64')
)


In [10]:
# Check the result of the dtype conversions: print each column's dtype and non-null count.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5317 entries, 0 to 5999
Data columns (total 18 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   from_airport_code                5317 non-null   category      
 1   from_country                     5317 non-null   category      
 2   dest_airport_code                5317 non-null   category      
 3   dest_country                     5317 non-null   category      
 4   aircraft_type                    5317 non-null   category      
 5   airline_number                   5317 non-null   category      
 6   airline_name                     5317 non-null   category      
 7   flight_number                    5317 non-null   category      
 8   departure_time                   5317 non-null   datetime64[ns]
 9   arrival_time                     5317 non-null   datetime64[ns]
 10  duration                         5317 non-null   int64         
 

In [11]:
# Write the data without missing values to a new file, then delete the old
# reduced flight data file.

no_null_flight_data_path = '../data/PreProcessedFlightData.csv'

# to_csv overwrites an existing file, so no delete is needed beforehand.
# NOTE: CSV stores plain text, so the category/datetime dtypes set earlier are
# not preserved in the file and must be re-applied when it is read back.
data.to_csv(no_null_flight_data_path, index=False)

# Remove the intermediate file only after the new one has been written, and
# only if it is still present, so that re-running this cell does not raise
# FileNotFoundError and a failed write does not lose both files.
if os.path.exists(reduced_flight_data_path):
    os.remove(reduced_flight_data_path)

In [12]:
# Preview the first rows of the cleaned, type-converted frame.
data.head()

Unnamed: 0,from_airport_code,from_country,dest_airport_code,dest_country,aircraft_type,airline_number,airline_name,flight_number,departure_time,arrival_time,duration,stops,price,currency,co2_emissions,avg_co2_emission_for_this_route,co2_percentage,scan_date
0,FRA,Germany,CPH,Denmark,Airbus A321|Embraer 195,multi,[Lufthansa| Austrian],LH1238|LH1238,2022-05-07 11:50:00,2022-05-07 19:15:00,445,1,425.0,USD,227000.0,118000.0,91.0,2022-04-29 17:52:59
1,DEL,India,BLR,India,Airbus A321|Airbus A320,6E,[IndiGo],6E2014|6E2014,2022-05-01 13:25:00,2022-05-01 23:40:00,615,1,104.0,USD,208000.0,183000.0,13.0,2022-04-29 17:52:59
2,PVG,China,SZX,China,Airbus A320,ZH,[Shenzhen],ZH9512,2022-05-30 17:30:00,2022-05-30 20:10:00,160,0,223.0,USD,164000.0,164000.0,0.0,2022-04-29 17:52:59
3,SCL,Chile,ATL,United States,Boeing 787|Boeing 767|Boeing 757,multi,[LATAM| Delta],LA2376|LA2376|LA2376,2022-05-01 17:50:00,2022-05-02 16:14:00,1344,2,2062.0,USD,883000.0,827000.0,6.0,2022-04-29 17:52:59
4,BRU,Belgium,IST,Turkey,Airbus A319|Airbus A320|Airbus A321neo,multi,[Brussels Airlines| Lufthansa],SN2641|SN2641|SN2641,2022-05-14 07:00:00,2022-05-14 21:15:00,795,2,363.0,USD,333000.0,216000.0,54.0,2022-04-29 17:52:59


In [13]:
# Summary statistics (count, mean, min, quartiles, max, std) for the numeric and datetime columns.
data.describe()

Unnamed: 0,departure_time,arrival_time,duration,stops,price,co2_emissions,avg_co2_emission_for_this_route,co2_percentage,scan_date
count,5317,5317,5317.0,5317.0,5317.0,5317.0,5317.0,5317.0,5317
mean,2022-06-09 01:33:04.457400832,2022-06-10 00:15:55.937558784,1442.278729,1.635133,1720.007523,1098928.0,873368.1,39.795185,2022-04-29 17:52:59.000000512
min,2022-04-30 00:25:00,2022-04-30 09:42:00,55.0,0.0,61.0,54000.0,53000.0,-40.0,2022-04-29 17:52:59
25%,2022-05-03 19:10:00,2022-05-04 22:15:00,998.0,1.0,644.0,560000.0,434000.0,0.0,2022-04-29 17:52:59
50%,2022-05-15 11:45:00,2022-05-16 11:35:00,1389.0,2.0,1209.0,966000.0,900000.0,12.0,2022-04-29 17:52:59
75%,2022-07-28 21:15:00,2022-07-29 18:40:00,1840.0,2.0,2011.0,1340000.0,1193000.0,37.0,2022-04-29 17:52:59
max,2022-08-28 23:55:00,2022-08-31 10:35:00,4515.0,5.0,24043.0,10273000.0,2408000.0,778.0,2022-04-29 17:52:59
std,,,656.528151,0.632419,1906.872323,932058.3,520279.0,84.073515,


In [14]:
# Print the frame summary (index range, column dtypes, non-null counts) once more.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5317 entries, 0 to 5999
Data columns (total 18 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   from_airport_code                5317 non-null   category      
 1   from_country                     5317 non-null   category      
 2   dest_airport_code                5317 non-null   category      
 3   dest_country                     5317 non-null   category      
 4   aircraft_type                    5317 non-null   category      
 5   airline_number                   5317 non-null   category      
 6   airline_name                     5317 non-null   category      
 7   flight_number                    5317 non-null   category      
 8   departure_time                   5317 non-null   datetime64[ns]
 9   arrival_time                     5317 non-null   datetime64[ns]
 10  duration                         5317 non-null   int64         
 