In [1]:
#import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

#impore pipeline libraries
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

In [12]:
# Load the dataset
df = pd.read_csv('data/preprocessed_train_data.csv')
df_real_test = pd.read_csv('data/preprocessed_test_data.csv')

In [3]:
y = df['target']
X = df.drop(columns=['target'])

X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y)

In [4]:
def flight_delay_pipline():
    # Define the categorical features
    num_col = ['duration']
    cat_col = ['departure_point', 'arrival_point', 'flight_status', 'aircraft_code']

    # Create the column transformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', 'passthrough', num_col),
            ('cat', OneHotEncoder(handle_unknown='ignore'), cat_col)
        ])

    model = XGBRegressor(objective='reg:squarederror', random_state=42)

    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])

    return pipeline

def hyperparameter():
    param_distributions = {
    'regressor__n_estimators': [50, 100, 200],
    'regressor__max_depth': [3, 4, 5, 6],
    'regressor__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'regressor__subsample': [0.6, 0.8, 1.0],
    'regressor__colsample_bytree': [0.6, 0.8, 1.0]}

    return param_distributions

def RandomizedSearch_function(X_train, y_train):
    pipeline = flight_delay_pipline()
    param_distributions = hyperparameter()

    random_search = RandomizedSearchCV(
        pipeline,
        param_distributions=param_distributions,
        n_iter=20,
        cv=5,
        scoring='neg_mean_squared_error',
        verbose=1,
        n_jobs=-1,
        random_state=42)
        
    random_search.fit(X_train, y_train)

    return random_search

In [5]:
# Best pipeline
random_search = RandomizedSearch_function(X_train, y_train)
best_pipeline = random_search.best_estimator_

# Predict and evaluate
y_pred = best_pipeline.predict(X_test)

y_pred[y_pred < 0] = 0
y_test = y_test.clip(lower=0)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"Best Params: {random_search.best_params_}")
print(f"Test RMSE: {rmse:.2f}")

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Params: {'regressor__subsample': 1.0, 'regressor__n_estimators': 100, 'regressor__max_depth': 5, 'regressor__learning_rate': 0.2, 'regressor__colsample_bytree': 0.8}
Test RMSE: 35.32


In [6]:
best_pipeline

In [7]:

y_pred_real_test = best_pipeline.predict(df_real_test)

In [8]:
y_pred_real_test[y_pred_real_test < 0] = 0


In [9]:
df_real_test

Unnamed: 0.1,Unnamed: 0,id,departure_point,arrival_point,departure_time,arrival_time,flight_status,aircraft_code,duration
0,0,test_id_0,DJE,TUN,2016-05-04 06:40:00,2016-05-04 07:30:00,ATA,TU 32AIMF,3000.0
1,1,test_id_1,TUN,BKO,2016-05-05 15:20:00,2016-05-05 20:05:00,ATA,TU 320IMW,17100.0
2,2,test_id_2,FRA,TUN,2016-05-06 10:00:00,2016-05-06 12:25:00,ATA,TU 32AIMC,8700.0
3,3,test_id_3,BEY,TUN,2016-05-11 09:40:00,2016-05-11 13:10:00,ATA,TU 31BIMO,12600.0
4,4,test_id_4,ORY,MIR,2016-05-11 09:50:00,2016-05-11 12:35:00,ATA,TU 736IOQ,9900.0
...,...,...,...,...,...,...,...,...,...
9328,9328,test_id_9328,TUN,NCE,2018-09-12 14:15:00,2018-09-12 15:45:00,ATA,TU 320IMV,5400.0
9329,9329,test_id_9329,TUN,TUN,2018-09-27 22:00:00,2018-09-28 01:00:00,SCH,TU 32AIMG,10800.0
9330,9330,test_id_9330,SJJ,TUN,2018-09-03 09:20:00,2018-09-03 11:10:00,SCH,TU CR9ISA,6600.0
9331,9331,test_id_9331,TUN,DJE,2018-09-15 14:30:00,2018-09-15 15:30:00,SCH,UG AT7LBD,3600.0


In [10]:
y_pred_real_test

array([ 0.      , 45.49862 , 22.199095, ...,  0.      ,  0.      ,
        0.      ], dtype=float32)

In [14]:
df_pred_real_test

Unnamed: 0,id,prediction
0,test_id_0,0.000000
1,test_id_1,45.498619
2,test_id_2,22.199095
3,test_id_3,6.167076
4,test_id_4,14.571519
...,...,...
9328,test_id_9328,20.543535
9329,test_id_9329,0.000000
9330,test_id_9330,0.000000
9331,test_id_9331,0.000000


In [None]:
df_pred_real_test = pd.DataFrame({
    'id': df_real_test['id'],
    'prediction': y_pred_real_test
})

In [15]:
df_pred_real_test.head()

Unnamed: 0,id,prediction
0,test_id_0,0.0
1,test_id_1,45.498619
2,test_id_2,22.199095
3,test_id_3,6.167076
4,test_id_4,14.571519


In [17]:
df_pred_real_test.describe()

Unnamed: 0,prediction
count,9333.0
mean,16.240578
std,13.792314
min,0.0
25%,0.0
50%,18.565937
75%,24.064833
max,79.540634
