In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the raw flight records from the parquet file into a DataFrame.
df = pd.read_parquet(path='Flight_Info_2018.parquet')

In [3]:
# Show the first five rows as a quick sanity check of the loaded frame.
df.head(n=5)

Unnamed: 0,FlightDate,Airline,Origin,Dest,CRSDepTime,DepTime,AirTime,CRSElapsedTime,ActualElapsedTime,Distance,...,DepartureDelayGroups,DepTimeBlk,TaxiOut,WheelsOff,WheelsOn,TaxiIn,DistanceGroup,DivAirportLandings,Diverted_,Dep_Delay
0,2018-01-18T00:00,Endeavor Air Inc.,ATL,ABY,1037,1233.0,32.0,60.0,72.0,145.0,...,7.0,1000-1059,34.0,1307.0,1339.0,6.0,1,0.0,0,delayed
1,2018-01-13T00:00,Endeavor Air Inc.,ATL,EVV,941,1142.0,54.0,87.0,83.0,350.0,...,8.0,0900-0959,22.0,1204.0,1158.0,7.0,2,0.0,0,delayed
2,2018-01-29T00:00,Endeavor Air Inc.,ATL,EVV,939,1012.0,58.0,89.0,81.0,350.0,...,2.0,0900-0959,19.0,1031.0,1029.0,4.0,2,0.0,0,delayed
3,2018-01-06T00:00,Endeavor Air Inc.,BUF,DTW,1006,1043.0,39.0,84.0,77.0,241.0,...,2.0,1000-1059,29.0,1112.0,1151.0,9.0,1,0.0,0,delayed
4,2018-01-08T00:00,Endeavor Air Inc.,BTV,DTW,1240,1254.0,92.0,135.0,117.0,537.0,...,0.0,1200-1259,15.0,1309.0,1441.0,10.0,3,0.0,0,delayed


In [4]:
# Build the modelling frame by removing the columns listed below.
# drop() returns a new DataFrame, so df itself is left as loaded.
df_clean = df.drop(
    columns=[
        'Flight_Number_Operating_Airline',
        'OriginAirportID',
        'OriginWac',
        'DestWac',
        'DestAirportID',
        'CRSElapsedTime',
        'AirTime',
        'CRSDepTime',
        'DepTime',
        'FlightDate',
        'Origin',
        'Dest',
        'DepartureDelayGroups',
        'Distance',
    ]
)

In [5]:
def clean_data(df):
    """One-hot encode every object-dtype column of ``df``.

    Each object column is replaced by integer 0/1 indicator columns named
    ``<column>_<level>``. The first level of every encoded column is dropped
    (``drop_first=True``). ``Dep_Delay`` is encoded like any other object
    column, so a two-level ``Dep_Delay`` yields a single indicator column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is not modified by this function.

    Returns
    -------
    pandas.DataFrame
        A new frame in which the object columns have been replaced by their
        indicator columns; all other columns are carried over unchanged.
    """
    # Every object-dtype column is treated as categorical.
    cat_columns = df.select_dtypes(include=['object']).columns

    # astype() returns a new frame, so the caller's DataFrame keeps its
    # original dtypes instead of being converted in place.
    categorical_df = df.astype({col: 'category' for col in cat_columns})

    # One-hot encode the categorical columns, dropping the first level of each.
    return pd.get_dummies(
        categorical_df,
        columns=cat_columns,
        drop_first=True,
        prefix=cat_columns,
        prefix_sep='_',
        dtype=int,
    )


# Encode the object columns of the reduced frame to get the model-ready table.
df_cleanmodel = clean_data(df=df_clean)

In [6]:
# Keep only rows with no missing value in any column.
# NOTE(review): get_dummies ignores NaN by default, so a row whose encoded
# column was missing presumably survives here with all-zero indicators —
# confirm that is intended.
df_cleanmodel = df_cleanmodel.dropna(axis=0, how='any')

In [7]:
# Persist the model-ready table; the row index is not written to the file.
df_cleanmodel.to_parquet(
    'modeldata.parquet',
    index=False,
)

In [8]:
# Fit a decision tree on the encoded data and report hold-out metrics.
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Target is the 'Dep_Delay_not delayed' indicator; every other column is a feature.
# NOTE(review): the features still include columns such as TaxiOut and WheelsOff
# (not in the earlier drop list) — presumably recorded after departure, so they
# may leak the label; confirm they are available at prediction time.
y = df_cleanmodel['Dep_Delay_not delayed']
X = df_cleanmodel.drop(columns=['Dep_Delay_not delayed'])

# 80/20 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the fitted estimator, so construction and training are chained.
model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

# classification_report returns plain text, hence print() rather than display.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.81      0.81    100592
           1       0.81      0.81      0.81    100490

    accuracy                           0.81    201082
   macro avg       0.81      0.81      0.81    201082
weighted avg       0.81      0.81      0.81    201082

