In [1]:
####Dependencies####

import numpy as np
import pandas as pd
import scipy.stats
from sklearn.compose import ColumnTransformer ##Transform our data features
from sklearn.impute import SimpleImputer ##how we handle missing values
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [2]:
fw_data = pd.read_csv("/mnt/c/Users/tbrag/OneDrive/Desktop/bda602venv/rileys_data.csv")

# Columns retained for modelling; anything not named here is discarded.
columns_to_keep = [
    'AIRLINE', 'ORIGIN', 'DEST', 'DISTANCE',
    'ori_TMIN', 'ori_TMAX', 'ori_SNOW', 'ori_SNWD', 'ori_AWND', 'ori_PRCP',
    'dest_TMIN', 'dest_TMAX', 'dest_SNOW', 'dest_SNWD', 'dest_AWND', 'dest_PRCP',
    '15_DELAYED',
    'year', 'month', 'day', 'hour', 'day_of_week',
]

# Drop every column that is not on the keep-list. The surviving columns stay
# in the order they have in the CSV, not in keep-list order.
fw_data_final = fw_data.drop(
    columns=[name for name in fw_data.columns if name not in columns_to_keep]
)

# Sanity check: column names, non-null counts and dtypes
fw_data_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1255864 entries, 0 to 1255863
Data columns (total 22 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   AIRLINE      1255864 non-null  object 
 1   ORIGIN       1255864 non-null  object 
 2   DEST         1255864 non-null  object 
 3   DISTANCE     1255864 non-null  float64
 4   ori_TMIN     1255864 non-null  float64
 5   ori_TMAX     1255864 non-null  float64
 6   ori_SNOW     1255864 non-null  float64
 7   ori_SNWD     1255864 non-null  float64
 8   ori_AWND     1255864 non-null  float64
 9   ori_PRCP     1255864 non-null  float64
 10  dest_TMIN    1255864 non-null  float64
 11  dest_TMAX    1255864 non-null  float64
 12  dest_SNOW    1255864 non-null  float64
 13  dest_SNWD    1255864 non-null  float64
 14  dest_AWND    1255864 non-null  float64
 15  dest_PRCP    1255864 non-null  float64
 16  15_DELAYED   1255864 non-null  int64  
 17  year         1255864 non-null  int64  
 18  mo

In [3]:
# Split the frame by dtype: float/int columns on one side, the rest on the other.
# The numeric side still contains the target column '15_DELAYED' at this point.
numeric_features = fw_data_final.select_dtypes(include=["float", "int"])

# Every column that select_dtypes did not pick up is treated as categorical.
categorical_features = fw_data_final.loc[
    :, ~fw_data_final.columns.isin(numeric_features.columns)
]



In [4]:
# Define a function to replace infrequent categories with 'Other'
def replace_infrequent_categories(df, categorical_cols, threshold=0.05, other_label='Other'):
    """Collapse rare levels of the given columns into a single catch-all label.

    For each column, the relative frequency of every level is computed over the
    non-missing values. Levels whose share is strictly below ``threshold`` are
    overwritten with ``other_label``. Missing values are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process. It is MODIFIED IN PLACE; the listed columns are
        overwritten on the object that was passed in.
    categorical_cols : iterable of column labels
        Columns to process. Anything that yields column labels when iterated
        works (a list, an Index, or a DataFrame, which iterates over its
        column names).
    threshold : float, default 0.05
        Minimum share (0-1) a level needs in order to be kept.
    other_label : default 'Other'
        Replacement value for rare levels.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df`` (not a copy), returned for convenience.
    """
    for col in categorical_cols:
        shares = df[col].value_counts(normalize=True)
        rare_levels = shares.index[shares.to_numpy() < threshold]
        if len(rare_levels) == 0:
            # Nothing falls below the threshold; leave the column as it is.
            continue
        # One membership test over the column instead of Series.replace with a
        # list of values, which compares the column against each value in turn.
        df[col] = df[col].mask(df[col].isin(rare_levels), other_label)
    return df


# Replace infrequent categories with 'Other': any level that makes up less
# than 1% of a categorical column is merged into the catch-all label.
# The helper assigns into the frame it receives and returns that same frame,
# so df_pipeline and fw_data_final refer to one and the same object afterwards.
df_pipeline = replace_infrequent_categories(
    fw_data_final, categorical_features.columns, threshold=0.01
)

df_pipeline.head()

Unnamed: 0,AIRLINE,ORIGIN,DEST,DISTANCE,ori_TMIN,ori_TMAX,ori_SNOW,ori_SNWD,ori_AWND,ori_PRCP,...,dest_SNOW,dest_SNWD,dest_AWND,dest_PRCP,15_DELAYED,year,month,day,day_of_week,hour
0,Spirit Air Lines,MCO,DFW,985.0,117.0,256.0,0.0,0.0,32.0,0.0,...,0.0,0.0,55.0,8.0,0,2020,2,23,6,18
1,Delta Air Lines Inc.,ATL,Other,859.0,222.0,294.0,0.0,0.0,35.0,122.0,...,0.0,0.0,30.0,0.0,1,2021,6,11,4,18
2,American Airlines Inc.,RDU,DFW,1061.0,250.0,333.0,0.0,0.0,40.0,0.0,...,0.0,0.0,45.0,0.0,0,2020,7,31,4,12
3,Delta Air Lines Inc.,Other,ATL,859.0,39.0,67.0,0.0,0.0,35.0,3.0,...,0.0,0.0,36.0,0.0,0,2019,11,20,2,6
4,Southwest Airlines Co.,BWI,Other,283.0,67.0,189.0,0.0,0.0,20.0,56.0,...,0.0,0.0,26.0,0.0,0,2022,5,1,6,17


In [5]:

# Build features/target from df_pipeline (the frame returned by the
# rare-category step) so the data lineage is explicit rather than relying on
# fw_data_final having been modified as a side effect.
X = df_pipeline.drop(columns=["15_DELAYED"])
y = df_pipeline["15_DELAYED"]

# Hold out 15% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.15,
    shuffle=True,
    random_state=777,
)
X_train.head(2)

Unnamed: 0,AIRLINE,ORIGIN,DEST,DISTANCE,ori_TMIN,ori_TMAX,ori_SNOW,ori_SNWD,ori_AWND,ori_PRCP,...,dest_TMAX,dest_SNOW,dest_SNWD,dest_AWND,dest_PRCP,year,month,day,day_of_week,hour
1040677,United Air Lines Inc.,Other,IAD,1571.0,228.0,294.0,0.0,0.0,12.0,0.0,...,83.0,0.0,0.0,8.0,0.0,2023,1,10,1,18
528977,Republic Airline,ORD,MCI,403.0,133.0,278.0,0.0,0.0,52.0,188.0,...,311.0,0.0,0.0,29.0,0.0,2021,5,26,2,15


In [6]:
# Column names routed to each preprocessing branch. The target is removed from
# the numeric list so it is never fed to the model as a feature.
numeric_columns = numeric_features.columns.drop("15_DELAYED").values
categorical_columns = categorical_features.columns.values

# Numeric branch: fill missing values (SimpleImputer defaults), then standardise.
numeric_pipe = Pipeline(
    [
        ("imputer", SimpleImputer()),
        ("standardization", StandardScaler()),
    ]
)

In [7]:
# Categorical branch: one-hot encode. handle_unknown="ignore" means a level
# not seen while fitting does not raise an error at transform time.
categorical_pipe = Pipeline(
    [("one-hot-encoding", OneHotEncoder(handle_unknown="ignore"))]
)

In [29]:
# Baseline classifier: a decision tree with no depth limit and balanced class
# weights. random_state is fixed (same seed as the train/test split) so that
# refitting gives the same tree; without it the fit is not reproducible.
tree = DecisionTreeClassifier(class_weight="balanced", random_state=777)

# Route numeric and categorical columns through their own preprocessing branch.
preprocessing_pipe = ColumnTransformer(
    transformers=[
        ("numeric_pipe", numeric_pipe, numeric_columns),
        ("categorical_pipe", categorical_pipe, categorical_columns),
    ]
)

# Full pipeline: preprocessing followed by the tree.
# NOTE: the final step is labelled "regression_model" although the estimator
# is a classifier; the label is kept unchanged.
model = Pipeline(
    steps=[
        ("Preprocessing", preprocessing_pipe),
        ("regression_model", tree),
    ]
)

In [30]:
# Train the baseline pipeline on the training rows, then predict the held-out
# rows. Pipeline.fit returns the fitted pipeline itself, so the calls chain.
# NOTE: the saved output of this cell shows a KeyboardInterrupt, i.e. the last
# recorded run of this cell was stopped before it finished.
y_pred = model.fit(X_train, y_train).predict(X_test)

KeyboardInterrupt: 

In [10]:
# Score the current predictions against the held-out labels.
# All four scorers are called with their default settings.
accuracy, precision, recall, f1 = [
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
]

# Per-class precision / recall / F1 as a formatted text table
report = classification_report(y_test, y_pred)

In [11]:
# Print each metric on its own line, label first, followed by the text report.
for _label, _value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-score:", f1),
    ("Classification Report:\n", report),
):
    print(_label, _value)

Accuracy: 0.7099108185582333
Precision: 0.34292357555545117
Recall: 0.35378352838554916
F1-score: 0.34826891197271287
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.81      0.81    147109
           1       0.34      0.35      0.35     41271

    accuracy                           0.71    188380
   macro avg       0.58      0.58      0.58    188380
weighted avg       0.71      0.71      0.71    188380



In [12]:
# Feature names as they leave the preprocessing part of the pipeline
# (everything except the final estimator), paired with the tree's importances.
features = model[:-1].get_feature_names_out()
importances = tree.feature_importances_

# Most important features first; show the top 50.
imp_df = (
    pd.DataFrame({"Feature": features, "Importance": importances})
    .sort_values("Importance", ascending=False)
)
imp_df.head(50)



Unnamed: 0,Feature,Importance
17,numeric_pipe__hour,0.087325
0,numeric_pipe__DISTANCE,0.086801
11,numeric_pipe__dest_AWND,0.076935
5,numeric_pipe__ori_AWND,0.072088
2,numeric_pipe__ori_TMAX,0.06271
7,numeric_pipe__dest_TMIN,0.062088
8,numeric_pipe__dest_TMAX,0.062042
1,numeric_pipe__ori_TMIN,0.060766
15,numeric_pipe__day,0.05945
14,numeric_pipe__month,0.033328


In [22]:
# Tuned model: same preprocessing as the baseline pipeline, but the tree's
# depth and minimum split size are chosen by randomized search.
# Unlike the baseline tree, no class_weight is set here.
# random_state is fixed so the fitted trees are reproducible across reruns.
tree = DecisionTreeClassifier(random_state=777)

preprocessing_pipe = ColumnTransformer(
    transformers=[
        ("numeric_pipe", numeric_pipe, numeric_columns),
        ("categorical_pipe", categorical_pipe, categorical_columns),
    ]
)
model = Pipeline(
    steps=[
        ("Preprocessing", preprocessing_pipe),
        ("regression_model", tree),
    ]
)

# Distributions sampled for each candidate. The keys address parameters of the
# pipeline step named "regression_model". randint's upper bound is exclusive,
# so both parameters are drawn from the integers 3..29.
hyperparam_grid = {
    "regression_model__max_depth": scipy.stats.randint(3, 30),
    "regression_model__min_samples_split": scipy.stats.randint(3, 30),
}

# 10 sampled candidates, each evaluated with 5-fold cross-validation, using all
# available cores. No `scoring` is passed, so candidates are ranked by the
# estimator's default score.
ensemble_cv = RandomizedSearchCV(
    model,
    hyperparam_grid,
    n_iter=10,
    cv=5,
    n_jobs=-1,
    random_state=777,
)

ensemble_cv



In [9]:

# Fit the ensemble_cv object to the training data.
# This runs the randomized search: n_iter=10 sampled candidates, each
# cross-validated on cv=5 folds of the training rows.
# Expect this cell to be slow: the saved cv_results_ table reports mean fit
# times of up to roughly 320 seconds for a single candidate fit.
ensemble_cv.fit(X_train, y_train)



In [10]:

# Tabulate the search results, one row per sampled candidate,
# and list them best-ranked first.
cv_results = ensemble_cv.cv_results_
cv_results_df = pd.DataFrame(cv_results)
cv_results_df.sort_values("rank_test_score")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_regression_model__max_depth,param_regression_model__min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
6,39.682478,0.436551,0.337308,0.013913,10,21,"{'regression_model__max_depth': 10, 'regressio...",0.78417,0.784376,0.784999,0.784587,0.784483,0.784523,0.000275,1
0,42.802107,2.51566,0.412039,0.057433,10,18,"{'regression_model__max_depth': 10, 'regressio...",0.784147,0.7844,0.784962,0.784578,0.78446,0.784509,0.000267,2
1,31.651211,0.313702,0.356813,0.023091,9,26,"{'regression_model__max_depth': 9, 'regression...",0.784077,0.783374,0.784484,0.783861,0.784235,0.784006,0.000376,3
8,6.638815,0.059274,0.332931,0.013078,3,4,"{'regression_model__max_depth': 3, 'regression...",0.780283,0.780278,0.780278,0.780278,0.780282,0.78028,2e-06,4
7,125.004568,1.667475,0.34669,0.015661,16,17,"{'regression_model__max_depth': 16, 'regressio...",0.779524,0.778756,0.778217,0.778348,0.779485,0.778866,0.000551,5
2,204.955072,2.225578,0.344408,0.008253,20,10,"{'regression_model__max_depth': 20, 'regressio...",0.769275,0.768821,0.768292,0.767856,0.768263,0.768501,0.000494,6
9,212.97091,2.667021,0.379506,0.019865,21,23,"{'regression_model__max_depth': 21, 'regressio...",0.767959,0.76698,0.767013,0.76536,0.766408,0.766744,0.000853,7
3,261.962967,2.444733,0.357659,0.012269,23,10,"{'regression_model__max_depth': 23, 'regressio...",0.758535,0.757509,0.759074,0.757528,0.759419,0.758413,0.000783,8
5,318.522369,3.85074,0.382641,0.011333,29,27,"{'regression_model__max_depth': 29, 'regressio...",0.750118,0.750854,0.75039,0.749359,0.750347,0.750214,0.000489,9
4,320.351296,3.634713,0.38681,0.015054,28,17,"{'regression_model__max_depth': 28, 'regressio...",0.747018,0.746676,0.747575,0.746128,0.748103,0.7471,0.000688,10


In [15]:
# Take the best pipeline found by the search and predict the held-out rows.
# NOTE: this reuses the name y_pred, replacing the baseline model's predictions.
optimal_model = ensemble_cv.best_estimator_
y_pred = optimal_model.predict(X_test)

# Display the selected pipeline and its chosen hyperparameters
optimal_model

In [16]:
# Re-score using the predictions currently held in y_pred.
# All four scorers are called with their default settings.
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# Per-class precision / recall / F1 as a formatted text table
report = classification_report(y_test, y_pred)

In [17]:
# Print each metric on its own line, label first, followed by the text report.
for _label, _value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-score:", f1),
    ("Classification Report:\n", report),
):
    print(_label, _value)


Accuracy: 0.7860760165622678
Precision: 0.5556701030927835
Recall: 0.11754016137239223
F1-score: 0.19403611927761444
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.97      0.88    147109
           1       0.56      0.12      0.19     41271

    accuracy                           0.79    188380
   macro avg       0.68      0.55      0.54    188380
weighted avg       0.74      0.79      0.73    188380

