## Data Preprocessing

In [None]:
# pip install openap

In [None]:
import pandas as pd
import pandas as pd
from pyproj import Geod
import json
from matplotlib import pyplot as plt
from openap import prop

In [None]:
# Connect to the data source: mount Google Drive at /content/drive so the
# dataset paths under /content/drive/MyDrive resolve. This cell requires the
# Google Colab runtime (google.colab is not available elsewhere).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the training dataset from the mounted Drive folder into `df`, the
# working DataFrame that the following cells clean and extend.
df_path = '/content/drive/MyDrive/CapstoneProject/ryanair_clean_datasets/train_extended.csv'
df = pd.read_csv(df_path)

### Data Cleaning

In [None]:
# Row filter: keep flights with a strictly positive Burnoff and a BlockTime
# below 500 (rows where either value is missing are dropped as well).
# Column filter: discard the fields registered after arrival, which are not
# available when the prediction has to be made.
df = (
    df.loc[df["Burnoff"].gt(0) & df["BlockTime"].lt(500)]
    .drop(
        columns=[
            'ActualRoute',
            'DepartureActual',
            'ArrivalActual',
            'BlockTime',
            'TaxiOut',
            'FlightTime',
        ]
    )
)

### Feature Engineering

FuelWeight (planned take-off weight minus planned zero-fuel weight)

In [None]:
# Fuel weight = planned take-off weight minus planned zero-fuel weight.
df["FuelWeight"] = df['PlannedTOW'].sub(df['PlannedZeroFuelWeight'])

In [None]:
# Preview the first five values of the new feature (head() defaults to 5 rows).
df['FuelWeight'].head()

0    6481.0
1    6730.0
2    4469.0
3    5603.0
4    9374.0
Name: FuelWeight, dtype: float64

Distance (geodesic distance between the origin and destination airports, in metres)

In [None]:
# Import airport data.
# `geod` performs the geodesic computations on the WGS84 ellipsoid.
geod = Geod(ellps="WGS84")
airports_path ='/content/drive/MyDrive/CapstoneProject/ryanair_clean_datasets/airports.json'

# Use a context manager so the file handle is closed as soon as the JSON has
# been parsed (a bare json.load(open(...)) leaves it open).
with open(airports_path) as airports_file:
    airports_raw = json.load(airports_file)

# Lookup table: IATA code -> full airport record. If the same code occurs
# more than once, the last record wins.
airports = {airport["iata_code"]: airport for airport in airports_raw}

In [None]:
def distance_route(route_string):
    """Return the geodesic distance between the first two airports of a route.

    Parameters
    ----------
    route_string : str
        IATA codes joined by "-" (for example "AAA-BBB"). Only the first two
        codes are used for the distance; any further codes must still be
        known airports.

    Returns
    -------
    float or None
        Distance in metres, as returned by ``geod.inv`` (NOT kilometres).
        ``None`` when the route is not a string, contains fewer than two
        codes, or references an IATA code missing from ``airports``.
    """
    # Missing routes show up as NaN (a float) in pandas; they cannot be split.
    if not isinstance(route_string, str):
        return None

    codes = route_string.split("-")
    # A distance needs both an origin and a destination.
    if len(codes) < 2:
        return None

    try:
        stops = [airports[code] for code in codes]
    except KeyError:
        # At least one airport of the route is unknown.
        return None

    origin = stops[0]['_geoloc']
    destination = stops[1]['_geoloc']

    # geod.inv takes (lon1, lat1, lon2, lat2) and returns the forward azimuth,
    # the back azimuth and the distance; only the distance is needed.
    _, _, meters = geod.inv(
        origin["lng"], origin["lat"], destination["lng"], destination["lat"]
    )
    return meters  # geodesic distance in metres


df["Distance"] = df["ScheduledRoute"].apply(distance_route)

In [None]:
# Fill in NaN with the average distance for future model fitting.
# Missing values come from routes for which distance_route returned None.
mean_distance = df['Distance'].mean()
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: the chained in-place form emits a FutureWarning in
# pandas 2.x and does not modify `df` once Copy-on-Write is enabled.
df['Distance'] = df['Distance'].fillna(mean_distance)

In [None]:
# Preview the first five distances (head() defaults to 5 rows).
df['Distance'].head()

0    1.582254e+06
1    1.186851e+06
2    4.939575e+05
3    8.125453e+05
4    1.766642e+06
Name: Distance, dtype: float64

Aircraft Age

In [None]:
# Load the fleet table from the mounted Drive folder into `fleet_df`.
fleet_path = '/content/drive/MyDrive/CapstoneProject/ryanair_clean_datasets/fleet.csv'
fleet_df = pd.read_csv(fleet_path)

In [None]:
# Attach the 'Delivered' field of the fleet table to every flight by matching
# the flight's aircraft registration against the fleet 'Reg' column. A left
# join keeps flights whose aircraft is not listed in the fleet table; the
# join key 'Reg' is dropped afterwards because it duplicates
# 'AircraftRegistration'.
df = (
    df.merge(
        fleet_df[['Reg', 'Delivered']],
        how='left',
        left_on='AircraftRegistration',
        right_on='Reg',
    )
    .drop(columns='Reg')
)

In [None]:
# Year of the scheduled departure.
df['DepartureScheduled'] = pd.to_datetime(df['DepartureScheduled'])
df['DepartureYear'] = df['DepartureScheduled'].dt.year

# Year of delivery: the first run of four digits found in 'Delivered'.
# The pattern is a raw string so that "\d" reaches the regex engine without
# triggering Python's invalid-escape-sequence warning; expand=False returns a
# Series (one capture group) instead of a one-column DataFrame. The result is
# cast to float so that rows without a match can hold NaN.
df['DeliveredYear'] = df['Delivered'].str.extract(r'(\d{4})', expand=False).astype(float)

In [None]:
# Aircraft age in whole calendar years: scheduled departure year minus
# delivery year (NaN when the delivery year is unknown).
df['AircraftAge'] = df['DepartureYear'].sub(df['DeliveredYear'])

In [None]:
# Remove the helper columns that were only needed to compute AircraftAge.
df = df.drop(columns=['Delivered', 'DepartureYear', 'DeliveredYear'])

In [None]:
# Fill in NaN with the average age for future model fitting.
mean_age = df['AircraftAge'].mean()
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: the chained in-place form emits a FutureWarning in
# pandas 2.x and does not modify `df` once Copy-on-Write is enabled.
df['AircraftAge'] = df['AircraftAge'].fillna(mean_age)

In [None]:
# Preview the first five aircraft ages (head() defaults to 5 rows).
df['AircraftAge'].head()

0    0.000000
1    0.000000
2    1.000000
3    0.000000
4    6.229273
Name: AircraftAge, dtype: float64

In [None]:
# Save df with the new features to 'df_fe.csv' in the current working
# directory; index=False keeps the row index out of the file.
df.to_csv('df_fe.csv', index=False)
# Show (rows, columns) of the saved frame as a final sanity check.
df.shape

(774658, 25)