In [27]:
import pandas as pd

# Location of the preprocessed flight CSV. The /content/drive prefix looks like a
# Colab Google Drive mount (assumption) -- adjust when running elsewhere.
csv_path = "/content/drive/MyDrive/my_dataset/shuffled_preprocessed_flight_data_100k.csv"
df = pd.read_csv(csv_path)

# Rich preview of the first rows of the loaded frame.
display(df.head())

Unnamed: 0,year,month,day_of_month,day_of_week,fl_date,op_unique_carrier,op_carrier_fl_num,origin,origin_city_name,origin_state_nm,...,actual_elapsed_time,air_time,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,is_dep_delayed,is_arr_delayed
0,2024,4,19,5,2024-04-19,AA,2491.0,STT,"Charlotte Amalie, VI",U.S. Virgin Islands,...,184.0,163.0,1107.0,0,0,0,0,0,False,False
1,2024,5,20,1,2024-05-20,UA,1321.0,EWR,"Newark, NJ",New Jersey,...,189.0,149.0,1085.0,0,0,0,0,0,False,False
2,2024,7,9,2,2024-07-09,WN,3613.0,HOU,"Houston, TX",Texas,...,53.0,38.0,192.0,0,0,0,0,26,True,True
3,2024,9,24,2,2024-09-24,AS,453.0,IAH,"Houston, TX",Texas,...,284.0,252.0,1874.0,0,0,0,0,0,False,False
4,2024,5,31,5,2024-05-31,AA,2847.0,LGA,"New York, NY",New York,...,222.0,185.0,1389.0,0,0,0,0,0,True,True


In [28]:
from typing import Tuple, List


# Default feature columns selected by load_preprocessed_data().
FEATURES = [
    "month",
    "day_of_month",
    "day_of_week",
    "op_unique_carrier",
    "origin",
    "origin_city_name",
    "origin_state_nm",
    "dest",
    "dest_city_name",
    "dest_state_nm",
    "dep_time",
    "distance"
]

# Name of the target column returned as y.
TARGET = "arr_delay"


def load_preprocessed_data(df, features: List[str] = FEATURES) -> Tuple[pd.DataFrame, pd.Series]:
    """
    Split an already-loaded flight DataFrame into X (features) and y (target).

    The selected feature columns and the target column are returned as-is: no
    encoding, scaling or type conversion is applied, so X may contain
    non-numeric columns. Preprocessing is left to the caller so that those
    choices remain explicit.

    Both return values are independent copies, so the caller can add or
    overwrite columns on X without affecting ``df``.

    Parameters:
        df: DataFrame that contains every column in ``features`` plus TARGET.
        features: Feature column names to select (default: FEATURES). Any
            sequence of names is accepted; it is not modified.

    Returns:
        X: pd.DataFrame with the selected feature columns, in the given order
        y: pd.Series with the target values

    Raises:
        ValueError: if any requested feature column or TARGET is absent from ``df``.
    """
    # Normalise to a list: `df[tuple]` would be treated as a single key, and
    # `tuple + list` raises TypeError.
    feature_cols = list(features)

    # Sorted so the error message is deterministic (set ordering is arbitrary).
    missing = sorted(set(feature_cols + [TARGET]) - set(df.columns), key=str)
    if missing:
        raise ValueError(f"Missing columns in CSV: {missing}")

    X = df[feature_cols].copy()
    y = df[TARGET].copy()
    return X, y


__all__ = ["load_preprocessed_data", "FEATURES", "TARGET"]


In [29]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, r2_score
import numpy as np

def evaluate_model(model, X_test, Y_test):
    """
    Print and return regression metrics for a fitted model on a test set.

    Parameters:
        model: Fitted estimator exposing ``predict(X_test)``.
        X_test: Features passed to ``model.predict``.
        Y_test: True target values compared against the predictions.

    Returns:
        dict with keys 'r2', 'mae', 'mse' and 'rmse'. Every value is a plain
        Python float, so the dict prints uniformly.
    """
    y_pred = model.predict(X_test)

    mae = float(mean_absolute_error(Y_test, y_pred))
    mse = float(mean_squared_error(Y_test, y_pred))
    # np.sqrt returns np.float64; cast so all metrics share one type.
    rmse = float(np.sqrt(mse))
    r2 = float(r2_score(Y_test, y_pred))

    print("Model Evaluation Metrics:")
    print(f"  R-squared (R²): {r2:.4f}")
    print(f"  Mean Absolute Error (MAE): {mae:.4f}")
    print(f"  Mean Squared Error (MSE): {mse:.4f}")
    print(f"  Root Mean Squared Error (RMSE): {rmse:.4f}")

    return {
        'r2': r2,
        'mae': mae,
        'mse': mse,
        'rmse': rmse
    }




In [30]:
# Split the loaded frame into the feature table and the target series.
X, Y = load_preprocessed_data(df)

# Plain-text peek at the first two rows of each.
print(X.head(2), Y.head(2), sep="\n")

   month  day_of_month  day_of_week op_unique_carrier origin  \
0      4            19            5                AA    STT   
1      5            20            1                UA    EWR   

       origin_city_name      origin_state_nm dest dest_city_name  \
0  Charlotte Amalie, VI  U.S. Virgin Islands  MIA      Miami, FL   
1            Newark, NJ           New Jersey  MIA      Miami, FL   

  dest_state_nm  dep_time  distance  
0       Florida  12:11:00    1107.0  
1       Florida  08:50:00    1085.0  
0   -12.0
1   -14.0
Name: arr_delay, dtype: float64


In [31]:
import pandas as pd
import numpy as np


def hhmm_to_minutes(time_val):
    """
    Convert a clock time to minutes since midnight.

    Accepted inputs:
        * strings containing ':' -- the first two ':'-separated fields are
          read as hour and minute (any further field is ignored);
        * anything ``float()`` can parse (int, float, numeric string), read as
          an HHMM number, e.g. ``1211`` or ``"850"``.

    Returns:
        int in [0, 1439] for a valid time. 24:00 (``"24:00"`` / ``2400``) maps
        to 0. ``np.nan`` is returned for missing values, unparseable input,
        non-finite numbers and out-of-range hours/minutes.
    """
    if pd.isna(time_val):
        return np.nan

    try:
        if isinstance(time_val, str) and ':' in time_val:
            parts = time_val.split(':')
            hour = int(parts[0])
            minute = int(parts[1])
        else:
            hhmm = int(float(time_val))
            if hhmm < 0:
                return np.nan
            # Last two digits are minutes, everything before is the hour.
            hour, minute = divmod(hhmm, 100)
    except (ValueError, TypeError, IndexError, OverflowError):
        # OverflowError: int(float('inf')) -- infinity is not caught by pd.isna.
        return np.nan

    # End-of-day spelling of midnight.
    if hour == 24 and minute == 0:
        return 0

    # Reject impossible clock values instead of returning a bogus minute count.
    if not (0 <= hour <= 23 and 0 <= minute <= 59):
        return np.nan

    return hour * 60 + minute


# Remove the city/state name columns when they are present.
cols_to_drop = ['origin_city_name', 'origin_state_nm', 'dest_city_name', 'dest_state_nm']
cols_to_drop_existing = [name for name in cols_to_drop if name in X.columns]
if cols_to_drop_existing:
    X = X.drop(columns=cols_to_drop_existing)


# Departure time: clock value -> minutes since midnight -> sin/cos pair on a
# 1440-minute circle. The raw column is replaced by the pair.
if 'dep_time' in X.columns:
    dep_minutes = X['dep_time'].apply(hhmm_to_minutes)
    dep_angle = 2 * np.pi * dep_minutes / 1440.0
    X = X.drop(columns=['dep_time']).assign(
        dep_time_sin=np.sin(dep_angle),
        dep_time_cos=np.cos(dep_angle),
    )

# Calendar fields get the same sin/cos treatment, each with its own period.
# Order matters: it fixes the order of the appended columns.
for cyc_name, cyc_period in (('month', 12.0), ('day_of_week', 7.0), ('day_of_month', 31.0)):
    if cyc_name in X.columns:
        cyc_angle = 2 * np.pi * X[cyc_name] / cyc_period
        X = X.drop(columns=[cyc_name]).assign(**{
            f'{cyc_name}_sin': np.sin(cyc_angle),
            f'{cyc_name}_cos': np.cos(cyc_angle),
        })


# Cast the remaining string-valued identifiers to pandas 'category' dtype.
categorical_cols = ['op_unique_carrier', 'origin', 'dest']
category_casts = {name: 'category' for name in categorical_cols if name in X.columns}
if category_casts:
    X = X.astype(category_casts)


In [32]:
# Plain-text view of the first five engineered rows.
print(X.head(5))

  op_unique_carrier origin dest  distance  dep_time_sin  dep_time_cos  \
0                AA    STT  MIA    1107.0     -0.047978     -0.998848   
1                UA    EWR  MIA    1085.0      0.737277     -0.675590   
2                WN    HOU  SAT     192.0     -0.450098      0.892979   
3                AS    IAH  SEA    1874.0      0.997250      0.074108   
4                AA    LGA  DFW    1389.0     -0.207912     -0.978148   

   month_sin     month_cos  day_of_week_sin  day_of_week_cos  \
0   0.866025 -5.000000e-01        -0.974928        -0.222521   
1   0.500000 -8.660254e-01         0.781831         0.623490   
2  -0.500000 -8.660254e-01         0.974928        -0.222521   
3  -1.000000 -1.836970e-16         0.974928        -0.222521   
4   0.500000 -8.660254e-01        -0.974928        -0.222521   

   day_of_month_sin  day_of_month_cos  
0     -6.513725e-01         -0.758758  
1     -7.907757e-01         -0.612106  
2      9.680771e-01         -0.250653  
3     -9.884683e

In [33]:
# Show the column index of X after the feature-engineering cell.
X.columns

Index(['op_unique_carrier', 'origin', 'dest', 'distance', 'dep_time_sin',
       'dep_time_cos', 'month_sin', 'month_cos', 'day_of_week_sin',
       'day_of_week_cos', 'day_of_month_sin', 'day_of_month_cos'],
      dtype='object')

In [34]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

In [35]:
from xgboost import XGBRegressor

# Baseline regressor: 100 trees, squared-error objective, categorical columns
# passed through as pandas 'category' dtype.
model = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    random_state=42,
    enable_categorical=True,
)
model.fit(X_train, y_train)

# Test-set predictions of the baseline model.
y_pred = model.predict(X_test)

# Last expression: prints the metrics and displays the returned dict.
evaluate_model(model, X_test, y_test)


Model Evaluation Metrics:
  R-squared (R²): -0.0185
  Mean Absolute Error (MAE): 27.2959
  Mean Squared Error (MSE): 3333.9987
  Root Mean Squared Error (RMSE): 57.7408


{'r2': -0.01851171748288971,
 'mae': 27.295881834726284,
 'mse': 3333.9986660776863,
 'rmse': np.float64(57.74078858205598)}

In [39]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, mean_absolute_error

# Discrete grid that the randomized search samples from.
param_dist = dict(
    n_estimators=[100, 300, 500, 1000],
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    max_depth=[3, 5, 7, 9],
    subsample=[0.7, 0.8, 0.9, 1.0],
    colsample_bytree=[0.7, 0.8, 0.9, 1.0],
)

# Estimator template; the searched hyperparameters are filled in per candidate.
base_model = XGBRegressor(objective='reg:squarederror', random_state=42, enable_categorical=True)

# 25 sampled candidates x 3 folds, ranked by negated MAE (higher is better,
# i.e. lower MAE wins).
random_search = RandomizedSearchCV(
    base_model,
    param_dist,
    n_iter=25,
    cv=3,
    scoring='neg_mean_absolute_error',
    verbose=2,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train, y_train)

print("Best found params:", random_search.best_params_, sep="\n")

print("\nmodel evaluation:")
best_model = random_search.best_estimator_
# Last expression: prints the metrics and displays the returned dict.
evaluate_model(best_model, X_test, y_test)

Fitting 3 folds for each of 25 candidates, totalling 75 fits




Best found params:
{'subsample': 0.7, 'n_estimators': 500, 'max_depth': 9, 'learning_rate': 0.01, 'colsample_bytree': 0.8}

model evaluation:
Model Evaluation Metrics:
  R-squared (R²): 0.0591
  Mean Absolute Error (MAE): 24.9206
  Mean Squared Error (MSE): 3080.0567
  Root Mean Squared Error (RMSE): 55.4983


{'r2': 0.059065670884130395,
 'mae': 24.92055935703451,
 'mse': 3080.056659428381,
 'rmse': np.float64(55.498258165715264)}