In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("../data/Flight delays/flight_delays_train.csv")

In [3]:
df.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y


In [4]:
len(df["Distance"].unique())

1310

In [5]:
df.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y


In [6]:
df["UniqueCarrier"].unique()

array(['AA', 'US', 'XE', 'OO', 'WN', 'NW', 'DL', 'OH', 'AS', 'UA', 'MQ',
       'CO', 'EV', 'DH', 'YV', 'F9', 'AQ', 'TZ', 'HP', 'B6', 'FL', 'HA'],
      dtype=object)

In [7]:
df["Origin"].unique()

array(['ATL', 'PIT', 'RDU', 'DEN', 'MDW', 'MEM', 'PBI', 'MSP', 'ONT',
       'BDL', 'PHX', 'LAS', 'DFW', 'DSM', 'CMH', 'ORF', 'SLC', 'CLT',
       'GSO', 'IAD', 'SMF', 'FLL', 'DAL', 'ORD', 'ITO', 'SAN', 'ROA',
       'LGA', 'SFO', 'GSP', 'SEA', 'DAB', 'SJC', 'LIT', 'LAX', 'OAK',
       'COS', 'OKC', 'GRR', 'JFK', 'BOI', 'MCI', 'BWI', 'BHM', 'CRP',
       'BOS', 'SAT', 'PHL', 'STL', 'CIC', 'AUS', 'IAH', 'COD', 'HNL',
       'RNO', 'BNA', 'TPA', 'MIA', 'EVV', 'PNS', 'EWR', 'RSW', 'ANC',
       'SNA', 'AMA', 'CID', 'DTW', 'DCA', 'LGB', 'MAF', 'MFE', 'BMI',
       'PDX', 'IPL', 'GRB', 'FAR', 'HOU', 'MTJ', 'DRO', 'MLU', 'VPS',
       'TUL', 'CVG', 'SBA', 'PWM', 'IDA', 'MCO', 'ACV', 'CHS', 'BGM',
       'MSY', 'OGG', 'CLE', 'MOB', 'CAK', 'FAY', 'SHV', 'TUS', 'IND',
       'CAE', 'PVD', 'ROC', 'MFR', 'VLD', 'ELP', 'RIC', 'MKE', 'SGF',
       'TYS', 'CHO', 'EGE', 'BIS', 'JAN', 'JAX', 'BUF', 'MSO', 'BGR',
       'CEC', 'ICT', 'MYR', 'ALB', 'LIH', 'SBP', 'AEX', 'GNV', 'SAV',
       'BTM', 'BRO',

In [8]:
df.isna().mean()

Month                0.0
DayofMonth           0.0
DayOfWeek            0.0
DepTime              0.0
UniqueCarrier        0.0
Origin               0.0
Dest                 0.0
Distance             0.0
dep_delayed_15min    0.0
dtype: float64

In [9]:
def engineering(df):   
    
    df["Month"] = df["Month"].str.replace("c-", "")
    df["DayofMonth"] = df["DayofMonth"].str.replace("c-", "")
    df["DayOfWeek"] = df["DayOfWeek"].str.replace("c-", "")
    
    df["Month"] = df["Month"].astype(int)
    df["DayOfWeek"] = df["DayOfWeek"].astype(int)
    df["DayofMonth"] = df["DayofMonth"].astype(int)
    
    df["DepTime"] = df["DepTime"].astype(str).str.zfill(4)
    df["Time"] = pd.to_datetime(df["DepTime"], format="%H%M", errors="coerce")
    df["Hour"] = df["Time"].dt.hour
    df["Minute"] = df["Time"].dt.minute
    df["Time"] = df["Time"].fillna("00:00")
    df["Delayed"] = df["dep_delayed_15min"].map({"Y": 1, "N": 0})

    df.drop(["DepTime", "Time", "dep_delayed_15min"], axis=1, inplace=True)

    return df

In [10]:
df = engineering(df)

In [11]:
df.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,UniqueCarrier,Origin,Dest,Distance,Hour,Minute,Delayed
0,8,21,7,AA,ATL,DFW,732,19.0,34.0,0
1,4,20,3,US,PIT,MCO,834,15.0,48.0,0
2,9,2,5,XE,RDU,CLE,416,14.0,22.0,0
3,11,25,6,OO,DEN,MEM,872,10.0,15.0,0
4,10,7,6,WN,MDW,OMA,423,18.0,28.0,1


In [12]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [13]:
df.shape

(100000, 10)

In [14]:
df.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,UniqueCarrier,Origin,Dest,Distance,Hour,Minute,Delayed
0,8,21,7,AA,ATL,DFW,732,19.0,34.0,0
1,4,20,3,US,PIT,MCO,834,15.0,48.0,0
2,9,2,5,XE,RDU,CLE,416,14.0,22.0,0
3,11,25,6,OO,DEN,MEM,872,10.0,15.0,0
4,10,7,6,WN,MDW,OMA,423,18.0,28.0,1


In [15]:
cat_labels = [c for c in df if not pd.api.types.is_numeric_dtype(df[c]) and c != "dep_delayed_15min"]

In [16]:
cat_labels

['UniqueCarrier', 'Origin', 'Dest']

In [17]:
category_idx = np.where(df.drop("Delayed", axis=1).dtypes == "object")[0]

In [18]:
category_idx

array([3, 4, 5], dtype=int64)

In [19]:
cat_imputer = Pipeline([
    ("ohe", OneHotEncoder(handle_unknown="ignore"))
])

In [20]:
preprocessor = ColumnTransformer([
    ("imputer", cat_imputer, cat_labels)
], remainder="passthrough")

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
X = df.drop("Delayed", axis=1)
y = df["Delayed"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [23]:
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

Lets try use catboost with native preprocessing

In [24]:
from catboost import CatBoostClassifier

In [25]:
model = CatBoostClassifier(verbose=False)

In [26]:
model.fit(X_train_preprocessed, y_train,
          early_stopping_rounds=25, verbose=False, 
          plot=True);

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [27]:
from sklearn.metrics import roc_auc_score

In [28]:
y_preds_proba = model.predict_proba(X_test_preprocessed)[:, 1]

In [29]:
roc_auc_score(y_test, y_preds_proba)

0.7516555270996269