In [None]:
#import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

#import pipeline libraries
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

In [None]:
# Read the two preprocessed CSV files from the local data folder.
# `df` feeds the train/validation split below; `df_real_test` is the frame
# that the final predictions are generated for.
train_csv_path = 'data/preprocessed_train_data.csv'
test_csv_path = 'data/preprocessed_test_data.csv'

df = pd.read_csv(train_csv_path)
df_real_test = pd.read_csv(test_csv_path)

In [None]:
# Show the column names of the training frame.
df.columns

In [None]:
# Show the column names of the frame used for final predictions,
# for comparison with the training frame's columns.
df_real_test.columns

In [None]:
# Split the training frame into predictors and the regression target.
# The target itself and the two raw date columns are excluded from X.
excluded_columns = ['target', 'departure_date', 'arrival_date']
X = df.drop(columns=excluded_columns)
y = df['target']

# 80/20 hold-out split with a fixed seed.
# NOTE(review): `stratify=y` treats every distinct target value as a class;
# scikit-learn raises ValueError when any value occurs only once. Confirm the
# target is discrete enough for this, otherwise stratify on bins or drop it.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# List the predictor columns left after dropping the target and date columns.
X_train.columns

In [None]:
def flight_delay_pipline():
    """Build the unfitted preprocessing + XGBoost regression pipeline.

    The numeric columns are handed to the model unchanged ('passthrough');
    the categorical columns are one-hot encoded with
    ``handle_unknown='ignore'`` so categories unseen during fit do not raise
    at prediction time.

    Returns:
        Pipeline: two steps, ``'preprocessor'`` (a ColumnTransformer) and
        ``'regressor'`` (an XGBRegressor with squared-error objective and
        ``random_state=42``).
    """
    # Columns forwarded as-is.
    numeric_features = [
        'duration', 'dep_temp', 'dep_precip', 'dep_wind',
        'arr_temp', 'arr_precip', 'arr_wind',
    ]
    # Columns that get one-hot encoded (calendar parts are treated as categories).
    categorical_features = [
        'departure_point', 'arrival_point', 'flight_status', 'aircraft_code',
        'dep_hour', 'dep_day', 'dep_month', 'dep_dayofweek', 'dep_quarter',
        'dep_season', 'dep_is_weekend', 'dep_time_of_day',
        'arr_hour', 'arr_day', 'arr_month', 'arr_dayofweek', 'arr_quarter',
        'arr_season', 'arr_is_weekend', 'arr_time_of_day',
        'route',
    ]

    one_hot = OneHotEncoder(handle_unknown='ignore')
    column_prep = ColumnTransformer(
        transformers=[
            ('num', 'passthrough', numeric_features),
            ('cat', one_hot, categorical_features),
        ]
    )

    xgb_regressor = XGBRegressor(objective='reg:squarederror', random_state=42)

    return Pipeline(steps=[
        ('preprocessor', column_prep),
        ('regressor', xgb_regressor),
    ])

def hyperparameter():
    """Return the hyperparameter search space for the pipeline's regressor.

    Every key carries the ``regressor__`` prefix so the values are routed to
    the pipeline step named ``'regressor'``.

    Returns:
        dict: parameter name -> list of candidate values. A new dict is
        created on every call.
    """
    return {
        'regressor__n_estimators': [50, 100, 200],
        'regressor__max_depth': [3, 4, 5, 6],
        'regressor__learning_rate': [0.01, 0.05, 0.1, 0.2],
        'regressor__subsample': [0.6, 0.8, 1.0],
        'regressor__colsample_bytree': [0.6, 0.8, 1.0],
    }

def RandomizedSearch_function(X_train, y_train):
    """Run a randomized hyperparameter search over the flight-delay pipeline.

    Samples 20 configurations from ``hyperparameter()``, scores each with
    5-fold cross-validation on negative mean squared error, and uses all
    available cores (``n_jobs=-1``). The sampling seed is fixed at 42.

    Args:
        X_train: training predictors passed to ``fit``.
        y_train: training target passed to ``fit``.

    Returns:
        RandomizedSearchCV: the search object after ``fit`` has been called.
    """
    estimator = flight_delay_pipline()
    search_space = hyperparameter()

    search_settings = dict(
        param_distributions=search_space,
        n_iter=20,
        cv=5,
        scoring='neg_mean_squared_error',
        verbose=1,
        n_jobs=-1,
        random_state=42,
    )
    searcher = RandomizedSearchCV(estimator, **search_settings)
    searcher.fit(X_train, y_train)

    return searcher

In [None]:
# Run the randomized search and keep the best estimator it found.
random_search = RandomizedSearch_function(X_train, y_train)
best_pipeline = random_search.best_estimator_

# Hold-out evaluation: negative values are floored at zero on both the
# predictions and the reference target before computing the RMSE.
y_pred = np.clip(best_pipeline.predict(X_test), 0, None)
y_test = y_test.clip(lower=0)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"Best Params: {random_search.best_params_}")
print(f"Test RMSE: {rmse:.2f}")

In [None]:
# Display the best estimator selected by the randomized search.
best_pipeline

In [None]:
# Feature-importance based column selection, followed by a retrain on the
# selected columns with the tuned hyperparameters.
#
# pandas, numpy, Pipeline, ColumnTransformer, OneHotEncoder and XGBRegressor are
# already imported in the notebook's first cell. The two imports below are not
# used in this cell; they are kept because no other cell provides these names.
from sklearn.compose import make_column_selector
from sklearn.inspection import permutation_importance

# After training your model and getting best_pipeline:
best_pipeline = random_search.best_estimator_
num_col = ['duration','dep_temp', 'dep_precip', 'dep_wind', 'arr_temp',
    'arr_precip', 'arr_wind']
cat_col = ['departure_point', 'arrival_point', 'flight_status', 'aircraft_code','dep_hour',
    'dep_day', 'dep_month', 'dep_dayofweek', 'dep_quarter', 'dep_season',
    'dep_is_weekend', 'dep_time_of_day', 'arr_hour', 'arr_day', 'arr_month',
    'arr_dayofweek', 'arr_quarter', 'arr_season', 'arr_is_weekend',
    'arr_time_of_day', 'route']

# Step 1: Recover the names of the features the regressor was trained on.
preprocessor = best_pipeline.named_steps['preprocessor']
ohe = preprocessor.named_transformers_['cat']
ohe_feature_names = ohe.get_feature_names_out(cat_col)
# Source column of every one-hot output, aligned with ohe_feature_names: the
# encoder emits one output per learned category, column by column.
ohe_source_cols = np.repeat(cat_col, [len(cats) for cats in ohe.categories_])
assert len(ohe_source_cols) == len(ohe_feature_names)

# The ColumnTransformer emits the numeric block first, then the one-hot block.
feature_names = np.concatenate([num_col, ohe_feature_names])
feature_sources = np.concatenate([num_col, ohe_source_cols])

# Step 2: Get feature importances from the fitted XGBoost model.
booster = best_pipeline.named_steps['regressor']
importances = booster.feature_importances_
importance_series = pd.Series(importances, index=feature_names).sort_values(ascending=False)

# Step 3: Show top features (change N to control selection).
N = 380
top_features = importance_series.head(N).index.tolist()
print("Top features:\n", top_features)

# Step 4: Map each selected feature back to the raw column that produced it.
# An exact lookup is used instead of substring matching, which would wrongly
# select e.g. 'dep_day' whenever any 'dep_dayofweek_*' feature is in the top N.
source_of_feature = dict(zip(feature_names, feature_sources))
selected_sources = {source_of_feature[name] for name in top_features}
selected_num = [col for col in num_col if col in selected_sources]
selected_cat = [col for col in cat_col if col in selected_sources]

new_preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', selected_num),
        ('cat', OneHotEncoder(handle_unknown='ignore'), selected_cat)
    ])

# Same base configuration (objective and seed) as the searched pipeline.
best_model = XGBRegressor(objective='reg:squarederror', random_state=42)

reduced_pipeline = Pipeline(steps=[
    ('preprocessor', new_preprocessor),
    ('regressor', best_model)
])

# best_params_ keys carry the 'regressor__' step prefix, so they must be applied
# through the pipeline. Passing them straight to XGBRegressor(**...) would leave
# the tuned values unused and train the model with default hyperparameters.
reduced_pipeline.set_params(**random_search.best_params_)

# Step 5: Retrain on the full training split with the selected columns.
reduced_pipeline.fit(X_train, y_train)

# Step 6: Evaluate on the hold-out split, flooring negatives at zero as in the
# earlier evaluation cell.
y_pred = reduced_pipeline.predict(X_test)
y_pred[y_pred < 0] = 0
y_test = y_test.clip(lower=0)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("✅ Retrained with top features only")
print(f"Reduced RMSE: {rmse:.2f}")

In [None]:

# Predict for the real test frame with the full-feature `best_pipeline`
# (not the `reduced_pipeline` retrained in the previous cell).
y_pred_real_test = best_pipeline.predict(df_real_test)

In [None]:
# Overwrite negative predictions with zero, in place.
np.putmask(y_pred_real_test, y_pred_real_test < 0, 0)


In [None]:
# Display the frame the final predictions were generated for.
df_real_test

In [None]:
# Display the predictions after negatives were set to zero.
y_pred_real_test

In [None]:
# Pair each test-row id with its prediction in a two-column frame.
prediction_columns = {
    'id': df_real_test['id'],
    'prediction': y_pred_real_test,
}
df_pred_real_test = pd.DataFrame(prediction_columns)

In [None]:
# Preview the first rows of the id/prediction frame.
df_pred_real_test.head()

In [None]:
# Summary statistics of the id/prediction frame, as a sanity check before export.
df_pred_real_test.describe()

In [None]:
# Write the id/prediction frame to CSV without the index column.
predictions_csv_path = 'data/flight_predictions_33_xgb.csv'
df_pred_real_test.to_csv(predictions_csv_path, index=False)