In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score
import lightgbm as lgb
from sklearn.metrics import classification_report

In [2]:
# Load the flight dataset, cast the discrete columns to pandas 'category'
# dtype in one pass, and drop the elapsed-time column.
# NOTE(review): 'Operating_Airline ' carries a trailing space; presumably this
# matches the CSV header exactly -- confirm before "fixing" it.
data = (
    pd.read_csv("final_data.csv")
    .astype({
        name: 'category'
        for name in [
            'Year',
            'Month',
            'DayOfWeek',
            'Operating_Airline ',
            'Origin',
            'Dest',
            'DepTimeBlk',
            'ArrTimeBlk',
            'Origin_HourlyWindDirection',
            'Dest_HourlyWindDirection',
            'Holiday',
            'Cancelled',
        ]
    })
    .drop(columns=['ActualElapsedTime'])
)

# Names of the remaining float64/int64 columns (taken before any row filtering).
continuous_columns = data.select_dtypes(include=['float64', 'int64']).columns

# Row filters, applied in this order:
#   1. keep only flights whose 'Cancelled' value is not 1;
#   2. drop every row whose ArrDelay equals the minimum ArrDelay of what is left
#      (all rows tied at the minimum are removed, not just one);
#   3. drop rows with a missing ArrDelay.
data = data[data['Cancelled'] != 1]
min_arr_delay = data['ArrDelay'].min()
data = data[data['ArrDelay'].ne(min_arr_delay)]
data = data[data['ArrDelay'].notna()]

In [3]:
def categorize_delay(delay):
    """Bucket a single arrival-delay value into a coarse label.

    Returns:
        "OnTime"      when ``delay <= 0``
        "SlightDelay" when ``0 < delay <= 30``
        "SevereDelay" for everything else. This includes NaN, because both
        comparisons evaluate to False for NaN.
    """
    arrived_on_time = delay <= 0
    if arrived_on_time:
        return "OnTime"
    return "SlightDelay" if delay <= 30 else "SevereDelay"

# Label every row with its delay bucket using vectorized comparisons instead of
# a per-row Python call (Series.apply), which is slow on a frame of this size.
# The thresholds mirror categorize_delay: <= 0 -> "OnTime", <= 30 -> "SlightDelay",
# anything else (including NaN) -> "SevereDelay". np.select picks the first
# matching condition, so the order of the two conditions matters.
_arr_delay = data['ArrDelay'].to_numpy()
data['DelayCategory'] = pd.Categorical(
    np.select(
        [_arr_delay <= 0, _arr_delay <= 30],
        ["OnTime", "SlightDelay"],
        default="SevereDelay",
    )
)

In [4]:
# Optional: uncomment to work on a reproducible 20% sample of the rows.
# data = data.sample(frac=0.2, random_state=42)

# Targets: the categorical delay bucket and the raw arrival delay.
y_reg = data['ArrDelay']
y_class = data['DelayCategory']

# Features: everything except the two targets plus 'Cancelled' and 'DepDelay'.
X = data.drop(['Cancelled', 'DepDelay', 'ArrDelay', 'DelayCategory'], axis='columns')

# A single call splits features and both targets so the three stay row-aligned
# (80% train / 20% test, fixed seed).
(
    X_train, X_test,
    y_class_train, y_class_test,
    y_reg_train, y_reg_test,
) = train_test_split(X, y_class, y_reg, test_size=0.2, random_state=42)

# Names of the category-dtype feature columns, passed to LightGBM later.
categorical_features = list(X_train.select_dtypes(include=['category']).columns)

In [5]:
# ---------------------------------------------------------------------------
# Stage 1: multiclass LightGBM classifier predicting the delay category.
# ---------------------------------------------------------------------------
lgb_classifier = lgb.LGBMClassifier(
    objective='multiclass',
    num_class=len(y_class.unique()),
    metric='multi_logloss',
    learning_rate=0.1,
    num_leaves=31,
    max_depth=-1,
    random_state=42
)
lgb_classifier.fit(X_train, y_class_train, categorical_feature=categorical_features)

# Predict using exactly the columns the model was trained on. This cell adds a
# 'PredictedCategory' column to X_test further down; selecting the training
# columns explicitly keeps the cell re-runnable, because a second run would
# otherwise pass that extra column to predict().
feature_columns = X_train.columns
y_class_pred = lgb_classifier.predict(X_test[feature_columns])

accuracy = accuracy_score(y_class_test, y_class_pred)
print("Classification Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_class_test, y_class_pred))

# ---------------------------------------------------------------------------
# Stage 2: one LightGBM regressor per delay category.
# Each regressor is trained on the training rows whose TRUE category matches
# and evaluated on the test rows whose PREDICTED category matches.
# ---------------------------------------------------------------------------
# The predicted label is stored on X_test (as before) and used for routing.
X_test['PredictedCategory'] = y_class_pred
results = {}
all_y_true = []
all_y_pred = []

for category in y_class_train.unique():
    # Boolean row selectors, computed once per category.
    train_mask = y_class_train == category
    test_mask = X_test['PredictedCategory'] == category

    X_train_cat = X_train[train_mask]
    y_train_cat = y_reg_train[train_mask]

    # Only the model features, i.e. without the 'PredictedCategory' column.
    X_test_cat = X_test.loc[test_mask, feature_columns]
    y_test_cat = y_reg_test[test_mask]

    # Nothing to train on, or nothing routed to this category: skip it.
    if len(X_train_cat) == 0 or len(X_test_cat) == 0:
        print(f"Skipping category {category} due to insufficient data.")
        continue

    # LightGBM regressor for this category
    lgb_regressor = lgb.LGBMRegressor(
        objective='regression',
        metric='rmse',
        learning_rate=0.1,
        num_leaves=31,
        max_depth=-1,
        random_state=42
    )
    lgb_regressor.fit(X_train_cat, y_train_cat, categorical_feature=categorical_features)

    # Regression predictions; also accumulated for the overall metrics below.
    y_pred_cat = lgb_regressor.predict(X_test_cat)
    all_y_true.extend(y_test_cat)
    all_y_pred.extend(y_pred_cat)

    # Per-category evaluation
    mse = mean_squared_error(y_test_cat, y_pred_cat)
    mae = mean_absolute_error(y_test_cat, y_pred_cat)
    r2 = r2_score(y_test_cat, y_pred_cat)

    results[category] = {
        "Mean Squared Error": mse,
        "Mean Absolute Error": mae,
        "R² Score": r2
    }
    print(f"Results for category '{category}':")
    print(f"  MSE: {mse}")
    print(f"  MAE: {mae}")
    print(f"  R²: {r2}")
    print("-----")

# Overall metrics across every test row that was routed to a fitted regressor.
print("All results:", results)
overall_mse = mean_squared_error(all_y_true, all_y_pred)
overall_rmse = np.sqrt(overall_mse)
overall_mae = mean_absolute_error(all_y_true, all_y_pred)
overall_r2 = r2_score(all_y_true, all_y_pred)
print("Overall Metrics:")
print("Mean Squared Error (MSE):", overall_mse)
print("Root Mean Squared Error (RMSE):", overall_rmse)
print("Mean Absolute Error (MAE):", overall_mae)
print("R² Score:", overall_r2)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.481886 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4623
[LightGBM] [Info] Number of data points in the train set: 7607753, number of used features: 30
[LightGBM] [Info] Start training from score -0.402052
[LightGBM] [Info] Start training from score -2.239300
[LightGBM] [Info] Start training from score -1.493787
Classification Accuracy: 0.6763176947315345
Classification Report:
              precision    recall  f1-score   support

      OnTime       0.68      0.99      0.81   1272279
 SevereDelay       0.56      0.08      0.14    203697
 SlightDelay       0.43      0.02      0.03    425963

    accuracy                           0.68   1901939
   macro avg       0.56      0.36      0.33   1901939
weighted avg       0.61      0.68      0.56   1901939

[LightGBM] [Info] Auto-choosing col