In [31]:
import pandas as pd
import os
import random
import csv
import plotly.express as px

In [32]:
# Take the large dataset and create a smaller, randomly sampled subset of it.

input_file_path = '../data/FlightData.csv'
reduced_flight_data_path = '../data/ReducedFlightData.csv'

# Remove any subset left over from a previous run before writing a new one.
if os.path.exists(reduced_flight_data_path):
    os.remove(reduced_flight_data_path)

# Upper bound on the number of data rows (header excluded) to copy.
rows_to_copy = 6000

# Fixed seed so that re-running the notebook from a fresh kernel selects the
# same rows (same value as the random_state used by Method 2 below).
random_seed = 42

# Method 1: using csv reader and writer

# Both files are opened with newline='' as the csv module requires; otherwise
# newlines embedded in quoted fields may be mis-read or extra '\r' written.
with open(input_file_path, 'r', newline='') as input_file, open(reduced_flight_data_path, 'w', newline='') as output_file:
    csv_reader = csv.reader(input_file)
    csv_writer = csv.writer(output_file)

    # Copy the header row through unchanged so the subset stays loadable.
    header = next(csv_reader)
    csv_writer.writerow(header)

    # random.sample needs a sequence, so the remaining rows are read into memory.
    all_rows = list(csv_reader)

    # Never ask for more rows than the input has (sample() would raise ValueError).
    rows_to_copy = min(rows_to_copy, len(all_rows))

    # A dedicated Random instance keeps the sampling reproducible without
    # touching the global random state used elsewhere.
    random_rows = random.Random(random_seed).sample(all_rows, rows_to_copy)
    csv_writer.writerows(random_rows)

# Method 2: using pandas sample(VERY SLOW)
# pd.DataFrame.sample(
#     pd.read_csv(input_file_path),
#     n=6000,
#     random_state=42
# ).to_csv(reduced_flight_data_path, index=False)


print(f'Copied {rows_to_copy} random rows from {input_file_path} to {reduced_flight_data_path}')



Copied 6000 random rows from ../data/FlightData.csv to ../data/ReducedFlightData.csv


In [33]:
# Load the reduced subset written by the sampling cell above (not the full
# dataset) and show its columns, non-null counts and dtypes.
# Reuses reduced_flight_data_path so the file location is defined in one place.
data = pd.read_csv(reduced_flight_data_path)
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 0 to 5999
Data columns (total 18 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   from_airport_code                6000 non-null   object 
 1   from_country                     6000 non-null   object 
 2   dest_airport_code                6000 non-null   object 
 3   dest_country                     6000 non-null   object 
 4   aircraft_type                    5904 non-null   object 
 5   airline_number                   6000 non-null   object 
 6   airline_name                     6000 non-null   object 
 7   flight_number                    6000 non-null   object 
 8   departure_time                   6000 non-null   object 
 9   arrival_time                     6000 non-null   object 
 10  duration                         6000 non-null   int64  
 11  stops                            6000 non-null   int64  
 12  price               

In [34]:
# Missing values in 'aircraft_type'
#
# Method 1: discard every row in which this column is empty.
data = data.dropna(axis='index', how='any', subset=['aircraft_type'])

In [35]:
# Missing values in 'price'
#
# Method 1: discard every row in which this column is empty.
data = data.dropna(axis='index', how='any', subset=['price'])

In [36]:
# Missing values in 'co2_emissions'
#
# Method 1: discard every row in which this column is empty.
data = data.dropna(axis='index', how='any', subset=['co2_emissions'])

In [37]:
# Missing values in 'avg_co2_emission_for_this_route'
#
# Method 1: discard every row in which this column is empty.
data = data.dropna(axis='index', how='any', subset=['avg_co2_emission_for_this_route'])


In [38]:
# Missing values in 'co2_percentage'
#
# Method 1: discard every row in which this column is empty.
data = data.dropna(axis='index', how='any', subset=['co2_percentage'])

In [39]:
# Give each column a dtype that matches its contents.

# Date/time columns: parse each one with pd.to_datetime.
datetime_columns = ['departure_time', 'arrival_time','scan_date']
data[datetime_columns] = data[datetime_columns].apply(pd.to_datetime)

# Label-like columns: store them as pandas categoricals.
categorical_columns = [
    'from_airport_code', 'dest_airport_code',
    'from_country', 'dest_country',
    'aircraft_type',
    'airline_number', 'airline_name', 'flight_number',
]
data[categorical_columns] = data[categorical_columns].astype('category')

# 'co2_percentage' is text containing a '%' sign: strip the sign, then cast
# what is left to float64.
co2_percentage_text = data['co2_percentage'].str.replace('%', '')
data['co2_percentage'] = co2_percentage_text.astype('float64')


In [40]:
# Select relevant features and target variable by dropping the columns that
# are not needed.
irrelevant_columns = [
    'departure_time', 'flight_number', 'arrival_time', 'scan_date',
    'currency',
    'co2_emissions', 'avg_co2_emission_for_this_route', 'co2_percentage',
]
# Rebind instead of using inplace=True: the cell yields a new frame and the
# columns= keyword makes explicit that column labels (not rows) are dropped.
data = data.drop(columns=irrelevant_columns)

In [41]:
# Print the DataFrame summary: index range, columns, non-null counts, dtypes
# and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5344 entries, 0 to 5998
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   from_airport_code  5344 non-null   category
 1   from_country       5344 non-null   category
 2   dest_airport_code  5344 non-null   category
 3   dest_country       5344 non-null   category
 4   aircraft_type      5344 non-null   category
 5   airline_number     5344 non-null   category
 6   airline_name       5344 non-null   category
 7   duration           5344 non-null   int64   
 8   stops              5344 non-null   int64   
 9   price              5344 non-null   float64 
dtypes: category(7), float64(1), int64(2)
memory usage: 347.5 KB


In [42]:
# Delete the old reduced flight data file and create a new one with the data without missing values

no_null_flight_data_path = '../data/PreProcessedFlightData.csv'

# The reduced file is already gone if this cell has been run before; guard the
# removal so that re-running the cell does not raise FileNotFoundError.
if os.path.exists(reduced_flight_data_path):
    os.remove(reduced_flight_data_path)

# Clear any previously written output before saving the current data.
if os.path.exists(no_null_flight_data_path):
    os.remove(no_null_flight_data_path)

# index=False keeps the DataFrame index out of the CSV.
data.to_csv(no_null_flight_data_path, index=False)

In [43]:
# Display the first rows of the DataFrame as a quick visual check.
data.head()

Unnamed: 0,from_airport_code,from_country,dest_airport_code,dest_country,aircraft_type,airline_number,airline_name,duration,stops,price
0,VIE,Austria,ATH,Greece,Airbus A321|Airbus A320neo,multi,[Austrian| SAS],1700,1,347.0
1,BOG,Columbia,LGW,United Kingdom,Airbus A319|Boeing 737|Boeing 777,multi,[American| British Airways],1535,2,1838.0
2,BOG,Columbia,MIA,United States,Airbus A320|Boeing 737MAX 9 Passenger,multi,[Avianca| COPA],790,1,366.0
3,PVG,China,CDG,France,Airbus A330|Airbus A350|Airbus A319,multi,[Cathay Pacific| British Airways],1790,2,2940.0
4,ATH,Greece,LHR,United Kingdom,Airbus A320|Airbus A320neo,LH,[Lufthansa],375,1,519.0


In [44]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns.
data.describe()

Unnamed: 0,duration,stops,price
count,5344.0,5344.0,5344.0
mean,1432.268525,1.641467,1715.601048
std,678.980227,0.64735,1849.645611
min,55.0,0.0,45.0
25%,965.0,1.0,633.0
50%,1370.0,2.0,1203.0
75%,1830.0,2.0,2098.5
max,6095.0,6.0,19367.0


In [45]:
# Print the DataFrame summary again: columns, non-null counts, dtypes and
# memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5344 entries, 0 to 5998
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   from_airport_code  5344 non-null   category
 1   from_country       5344 non-null   category
 2   dest_airport_code  5344 non-null   category
 3   dest_country       5344 non-null   category
 4   aircraft_type      5344 non-null   category
 5   airline_number     5344 non-null   category
 6   airline_name       5344 non-null   category
 7   duration           5344 non-null   int64   
 8   stops              5344 non-null   int64   
 9   price              5344 non-null   float64 
dtypes: category(7), float64(1), int64(2)
memory usage: 347.5 KB
