In [86]:
import pandas as pd
import os
import random
import csv


In [87]:
# Take the large dataset and create a smaller, randomly sampled subset of it.

input_file_path = '../data/FlightData.csv'
reduced_flight_data_path = '../data/ReducedFlightData.csv'

# choose how many rows to copy
rows_to_copy = 6000

# fixed seed so that the same rows are sampled on every run of this cell
random_seed = 42

# Both files are opened with newline='' as the csv module requires; without it
# quoted fields containing line breaks may be read incorrectly.
# Opening the output in 'w' mode truncates any previous copy, so the file does
# not need to be deleted beforehand.
with open(input_file_path, 'r', newline='') as input_file, \
        open(reduced_flight_data_path, 'w', newline='') as output_file:
    csv_reader = csv.reader(input_file)
    csv_writer = csv.writer(output_file)

    # the first line is the header and is always copied
    header = next(csv_reader)
    csv_writer.writerow(header)

    # sampling without replacement needs the whole population in memory
    all_rows = list(csv_reader)

    # never ask for more rows than the input actually has
    rows_to_copy = min(rows_to_copy, len(all_rows))

    # a dedicated generator keeps the sampling independent of the global
    # random state
    random_rows = random.Random(random_seed).sample(all_rows, rows_to_copy)
    csv_writer.writerows(random_rows)

print(f'Copied {rows_to_copy} random rows from {input_file_path} to {reduced_flight_data_path}')



Copied 6000 random rows from ../data/FlightData.csv to ../data/ReducedFlightData.csv


In [88]:
# Load the reduced (randomly sampled) dataset written by the previous cell.
# The path variable is reused so the file name is defined in one place only.
data = pd.read_csv(reduced_flight_data_path)

# Show column names, non-null counts and dtypes.
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 0 to 5999
Data columns (total 18 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   from_airport_code                6000 non-null   object 
 1   from_country                     6000 non-null   object 
 2   dest_airport_code                6000 non-null   object 
 3   dest_country                     6000 non-null   object 
 4   aircraft_type                    5905 non-null   object 
 5   airline_number                   6000 non-null   object 
 6   airline_name                     6000 non-null   object 
 7   flight_number                    6000 non-null   object 
 8   departure_time                   6000 non-null   object 
 9   arrival_time                     6000 non-null   object 
 10  duration                         6000 non-null   int64  
 11  stops                            6000 non-null   int64  
 12  price               

In [89]:
# Handle missing values

# Method 1: Delete rows with missing values.
# A row is removed when it lacks a value in any of the columns listed below.
# dropna's default how='any' makes this single call equivalent to dropping on
# each column in turn.
columns_requiring_values = [
    'aircraft_type',
    'price',
    'co2_emissions',
    'avg_co2_emission_for_this_route',
    'co2_percentage',
]

data = data.dropna(subset=columns_requiring_values)

In [94]:
# Fix the data types of the columns

date_columns = ['departure_time', 'arrival_time', 'scan_date']
categorical_columns = [
    'from_airport_code',
    'dest_airport_code',
    'from_country',
    'dest_country',
    'aircraft_type',
    'airline_number',
    'airline_name',
    'flight_number',
]

# Set the date columns to the correct data type
data[date_columns] = data[date_columns].apply(pd.to_datetime)

# set the categorical columns to the correct data type
data[categorical_columns] = data[categorical_columns].astype('category')

# set the numeric columns to the correct data type
# The '%' sign is only stripped while the column still holds text: the .str
# accessor raises AttributeError on a numeric column, which used to make this
# cell fail when it was run a second time.
# regex=False treats '%' as a literal character regardless of pandas version.
if not pd.api.types.is_numeric_dtype(data['co2_percentage']):
    data['co2_percentage'] = data['co2_percentage'].str.replace('%', '', regex=False)
data['co2_percentage'] = data['co2_percentage'].astype('float64')


In [95]:
# Re-inspect the non-null counts and dtypes after the cleaning and type-conversion cells.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5356 entries, 0 to 5999
Data columns (total 18 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   from_airport_code                5356 non-null   category      
 1   from_country                     5356 non-null   category      
 2   dest_airport_code                5356 non-null   category      
 3   dest_country                     5356 non-null   category      
 4   aircraft_type                    5356 non-null   category      
 5   airline_number                   5356 non-null   category      
 6   airline_name                     5356 non-null   category      
 7   flight_number                    5356 non-null   category      
 8   departure_time                   5356 non-null   datetime64[ns]
 9   arrival_time                     5356 non-null   datetime64[ns]
 10  duration                         5356 non-null   int64         
 

In [96]:
# Write the data without missing values to its own file, then delete the old
# reduced flight data file.

no_null_flight_data_path = '../data/NoNullFlightData.csv'

# The cleaned file is written first so the intermediate file is only removed
# once its replacement exists. to_csv opens the target in write mode by
# default, so a previous copy is overwritten and need not be deleted first.
data.to_csv(no_null_flight_data_path, index=False)

# Guarded removal: re-running this cell after the reduced file is already gone
# no longer raises FileNotFoundError.
if os.path.exists(reduced_flight_data_path):
    os.remove(reduced_flight_data_path)

In [97]:
# Preview the first rows of the cleaned frame.
data.head()

Unnamed: 0,from_airport_code,from_country,dest_airport_code,dest_country,aircraft_type,airline_number,airline_name,flight_number,departure_time,arrival_time,duration,stops,price,currency,co2_emissions,avg_co2_emission_for_this_route,co2_percentage,scan_date
0,ADD,Ethiopia,CDG,France,Boeing 737|Boeing 737|Airbus A321,multi,[EgyptAir| Lufthansa],MS852|MS852|MS852,2022-05-01 04:10:00,2022-05-01 18:40:00,930,2,1764.0,USD,705000.0,579000.0,21.0,2022-04-29 17:52:59
1,BRU,Belgium,LIS,Portugal,Canadair RJ 900|Airbus A319,multi,[Lufthansa| Tap Air Portugal],LH1013|LH1013,2022-05-14 14:40:00,2022-05-14 22:10:00,510,1,331.0,USD,300000.0,183000.0,64.0,2022-04-29 17:52:59
2,CPH,Denmark,CDG,France,Boeing 737|Boeing 737,KL,[KLM],KL1134|KL1134,2022-05-01 19:10:00,2022-05-02 08:40:00,810,1,193.0,USD,158000.0,126000.0,25.0,2022-04-29 17:52:59
3,DUB,Dublin,ALG,Algeria,Airbus A320|Airbus A318,multi,[Aer Lingus| Air France],EI514|EI514,2022-05-03 12:55:00,2022-05-04 13:10:00,1455,1,2049.0,USD,287000.0,294000.0,-2.0,2022-04-29 17:52:59
4,YYZ,Canada,MEX,Mexico,Boeing 737|Boeing 737MAX 8 Passenger,multi,[WestJet| Aeromexico],WS707|WS707,2022-05-29 10:00:00,2022-05-29 22:20:00,800,1,547.0,USD,675000.0,414000.0,62.0,2022-04-29 17:52:59


In [99]:
# Summary statistics (count, mean, min, quartiles, max, std) for the datetime and numeric columns.
data.describe()

Unnamed: 0,departure_time,arrival_time,duration,stops,price,co2_emissions,avg_co2_emission_for_this_route,co2_percentage,scan_date
count,5356,5356,5356.0,5356.0,5356.0,5356.0,5356.0,5356.0,5356
mean,2022-06-10 01:58:23.020911104,2022-06-11 00:35:15.660941056,1432.308626,1.634429,1694.959671,1089716.0,861506.5,42.308439,2022-04-29 17:52:59.000000256
min,2022-04-30 02:40:00,2022-04-30 09:35:00,55.0,0.0,45.0,62000.0,53000.0,-39.0,2022-04-29 17:52:59
25%,2022-05-03 20:55:00,2022-05-05 07:41:15,960.0,1.0,621.0,517750.0,414000.0,0.0,2022-04-29 17:52:59
50%,2022-05-15 14:07:30,2022-05-16 15:55:00,1375.0,2.0,1170.0,939500.0,876000.0,12.0,2022-04-29 17:52:59
75%,2022-07-29 02:10:00,2022-07-29 20:05:00,1830.0,2.0,2014.5,1348250.0,1186500.0,38.0,2022-04-29 17:52:59
max,2022-08-28 23:45:00,2022-08-31 13:30:00,4985.0,5.0,27044.0,9519000.0,2344000.0,797.0,2022-04-29 17:52:59
std,,,672.568052,0.630417,1878.098262,929982.0,526935.6,90.238169,
