In [60]:
import pandas as pd

#Plot
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
import calendar 
import calplot # actually used

# Score model
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Model
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from xgboost import XGBRegressor
from scipy import stats



# Dataframe setup

In [59]:
# Read the data
url = "https://raw.githubusercontent.com/zhenliangma/Applied-AI-in-Transportation/main/ProjectAssignmentData/Dataset-PT.csv"
df = pd.read_csv(url, header=1)
#df = df.drop(columns=['weather', 'temperature', 'day_of_week', 'time_of_day'])
# Calculate z-scores for the 'arrival_delay' column
z_scores = stats.zscore(df['arrival_delay'])

# Get boolean array indicating the location of outliers
outliers = (z_scores > 7) | (z_scores < -7)

# Count the number of outliers
num_outliers = outliers.sum()

# Print the number of outliers
print(f"Number of outliers removed: {num_outliers}")

# Remove the outliers
df = df[~outliers]

# Verify the new size of the DataFrame
print(f"Size of the original DataFrame: {len(df)}")
#print(f"Size of the DataFrame after removing outliers: {len(df_no_outliers)}")


Number of outliers removed: 228
Size of the original DataFrame: 544875


# stop and daytime model (LR)

In [54]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error
import joblib
import os

def train_evaluate_lr_model(df, n_folds=5):
    df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

    results = {
        'stop_sequence': [],
        'day_type': [],
        'R^2': [],
        'MAE': []
    }

    for stop_seq in df['stop_sequence'].unique():
        for day_type in ['weekday', 'weekend']:
            print(f"Processing stop_sequence {stop_seq} for {day_type}...")
            
            df_train_subset = df_train[(df_train['stop_sequence'] == stop_seq) & (df_train['day_of_week'] == day_type)]

            drop_columns = ['arrival_delay', 'day_of_week', 'time_of_day', "weather", "temperature"]
            x_train = df_train_subset.drop(columns=drop_columns, errors='ignore').astype('float32')
            y_train = df_train_subset['arrival_delay'].astype('float32')

            r2_scores_lr = []
            maes_lr = []

            # K-fold CV
            kf = KFold(n_splits=n_folds)
            for fold, (train_index, val_index) in enumerate(kf.split(x_train)):
                print(f"Running Fold {fold + 1}...")

                x_train_fold, x_val_fold = x_train.iloc[train_index], x_train.iloc[val_index]
                y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

                # Train and evaluate LR
                print("Training Linear Regression...")
                model_lr = LinearRegression().fit(x_train_fold, y_train_fold)
                y_pred_lr = model_lr.predict(x_val_fold)
                r2_scores_lr.append(r2_score(y_val_fold, y_pred_lr))
                maes_lr.append(mean_absolute_error(y_val_fold, y_pred_lr))

            # Save LR model
            best_r2 = np.mean(r2_scores_lr)
            best_mae = np.mean(maes_lr)
            if not os.path.exists("lr_models"):
                os.makedirs("lr_models")
            joblib.dump(model_lr, os.path.join("lr_models", f'lr_model_{stop_seq}_{day_type}.pkl'))

            results['stop_sequence'].append(stop_seq)
            results['day_type'].append(day_type)
            results['R^2'].append(best_r2)
            results['MAE'].append(best_mae)

    return pd.DataFrame(results)

def predict_with_lr_model(df):
    all_groups = []

    grouped = df.groupby(['stop_sequence', 'day_of_week'])
    for (stop_seq, day_type), group in grouped:
        lr_model_path = os.path.join("lr_models", f'lr_model_{stop_seq}_{day_type}.pkl')

        X = group.drop(columns=['arrival_delay', 'day_of_week', 'time_of_day', "weather", "temperature"], errors='ignore').astype('float32')
        
        if os.path.exists(lr_model_path):
            model = joblib.load(lr_model_path)
            group['predicted_delay'] = model.predict(X)
        else:
            raise ValueError(f"No saved model found for stop_sequence: {stop_seq} and day_type: {day_type}")

        all_groups.append(group)

    result_df = pd.concat(all_groups)
    return result_df


In [55]:
results_lr = train_evaluate_lr_model(df)
print(results_lr)

Processing stop_sequence 1 for weekday...
Running Fold 1...
Training Linear Regression...
Running Fold 2...
Training Linear Regression...
Running Fold 3...
Training Linear Regression...
Running Fold 4...
Training Linear Regression...
Running Fold 5...
Training Linear Regression...
Processing stop_sequence 1 for weekend...
Running Fold 1...
Training Linear Regression...
Running Fold 2...
Training Linear Regression...
Running Fold 3...
Training Linear Regression...
Running Fold 4...
Training Linear Regression...
Running Fold 5...
Training Linear Regression...
Processing stop_sequence 2 for weekday...
Running Fold 1...
Training Linear Regression...
Running Fold 2...
Training Linear Regression...
Running Fold 3...
Training Linear Regression...
Running Fold 4...
Training Linear Regression...
Running Fold 5...
Training Linear Regression...
Processing stop_sequence 2 for weekend...
Running Fold 1...
Training Linear Regression...
Running Fold 2...
Training Linear Regression...
Running Fold 3..

Running Fold 1...
Training Linear Regression...
Running Fold 2...
Training Linear Regression...
Running Fold 3...
Training Linear Regression...
Running Fold 4...
Training Linear Regression...
Running Fold 5...
Training Linear Regression...
Processing stop_sequence 16 for weekday...
Running Fold 1...
Training Linear Regression...
Running Fold 2...
Training Linear Regression...
Running Fold 3...
Training Linear Regression...
Running Fold 4...
Training Linear Regression...
Running Fold 5...
Training Linear Regression...
Processing stop_sequence 16 for weekend...
Running Fold 1...
Training Linear Regression...
Running Fold 2...
Training Linear Regression...
Running Fold 3...
Training Linear Regression...
Running Fold 4...
Training Linear Regression...
Running Fold 5...
Training Linear Regression...
Processing stop_sequence 17 for weekday...
Running Fold 1...
Training Linear Regression...
Running Fold 2...
Training Linear Regression...
Running Fold 3...
Training Linear Regression...
Running

In [57]:
pred = predict_with_lr_model(df)
pred.head()
# Extracting the actual and predicted values from the DataFrame
actual_values = pred["arrival_delay"].values
predicted_values = pred["predicted_delay"].values

# Computing R^2
r2 = r2_score(actual_values, predicted_values)

# Computing MAE
mae = mean_absolute_error(actual_values, predicted_values)

print(f"R^2: {r2:f}")
print(f"MAE: {mae:f}")



R^2: 0.990044
MAE: 12.214730


# Only Daytype Model

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_selection import RFECV
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import pandas as pd

def train_and_evaluate(df):
    # Split the dataset
    df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

    # Results placeholder
    results = {
        'day_type': [],
        'best_model': [],
        'MSE': [],
        'MAE': [],
        'R^2': []  
    }

    # Function to train and evaluate Linear Regression with RFECV
    def train_lr_model(df_train_subset):
        x_train = df_train_subset.drop(['arrival_delay', 'day_of_week', 'time_of_day', "weather", "temperature"], axis=1)
        y_train = df_train_subset['arrival_delay']

        model = LinearRegression()
        selector = RFECV(estimator=model, step=1, cv=KFold(5))
        selector = selector.fit(x_train, y_train)

        model.fit(x_train.iloc[:, selector.support_], y_train)
        
        return model, selector

    # Function to train Neural Network
    def train_nn_model(df_train_subset):
        x_train = df_train_subset.drop(['arrival_delay', 'day_of_week', 'time_of_day', "weather", "temperature"], axis=1)
        y_train = df_train_subset['arrival_delay']

        model_nn = Sequential([
            Dense(32, activation='relu', input_dim=x_train.shape[1]),
            Dropout(0.001),
            Dense(64, activation='relu'),
            Dense(1)
        ])
        model_nn.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')
        early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
        model_nn.fit(x_train, y_train, validation_split=0.2, epochs=100, batch_size=32, callbacks=[early_stopping], verbose=0)
        
        return model_nn

    for day_type in ['weekday', 'weekend']:
        df_train_subset = df_train[df_train['day_of_week'] == day_type]
        df_test_subset = df_test[df_test['day_of_week'] == day_type]

        # Train and evaluate Linear Regression model
        model_lr, selector = train_lr_model(df_train_subset)
        x_test = df_test_subset.drop(['arrival_delay', 'day_of_week', 'time_of_day', "weather", "temperature"], axis=1).iloc[:, selector.support_]
        y_pred_lr = model_lr.predict(x_test)
        mse_lr = mean_squared_error(df_test_subset['arrival_delay'], y_pred_lr)
        mae_lr = mean_absolute_error(df_test_subset['arrival_delay'], y_pred_lr)
        r2_lr = r2_score(df_test_subset['arrival_delay'], y_pred_lr)
        # Train and evaluate Neural Network
        model_nn = train_nn_model(df_train_subset)
        y_pred_nn = model_nn.predict(df_test_subset.drop(['arrival_delay', 'day_of_week', 'time_of_day', "weather", "temperature"], axis=1)).flatten()
        mse_nn = mean_squared_error(df_test_subset['arrival_delay'], y_pred_nn)
        mae_nn = mean_absolute_error(df_test_subset['arrival_delay'], y_pred_nn)
        r2_nn = r2_score(df_test_subset['arrival_delay'], y_pred_nn)

        # Choose the best model based on MAE
        if mae_nn < mae_lr:
            best_model = "NN"
            best_mse = mse_nn
            best_mae = mae_nn
            best_r2 = r2_nn
        else:
            best_model = "LR"
            best_mse = mse_lr
            best_mae = mae_lr
            best_r2 = r2_lr

        results['day_type'].append(day_type)
        results['best_model'].append(best_model)
        results['MSE'].append(best_mse)
        results['MAE'].append(best_mae)
        results['R^2'].append(best_r2)
    return pd.DataFrame(results)

# Example usage:
# df = ...  # Your dataframe
# result_df = train_and_evaluate(df)
# print(result_df)


In [11]:
results = train_and_evaluate(df)
print(results)

  day_type best_model         MSE        MAE       R^2
0  weekday         LR  403.634672  13.013826  0.986993
1  weekend         LR  343.030509  11.043053  0.992785


# NN VS LR

In [None]:
df.info()

In [63]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import joblib
import os

def train_evaluate_best_model(df, n_folds=5):
    df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

    results = {
        'stop_sequence': [],
        'day_type': [],
        'best_model': [],
        'R^2': [],
        'MAE': []
    }

    for stop_seq in df['stop_sequence'].unique():
        for day_type in ['weekday', 'weekend']:
            print(f"Processing stop_sequence {stop_seq} for {day_type}...")
            
            df_train_subset = df_train[(df_train['stop_sequence'] == stop_seq) & (df_train['day_of_week'] == day_type)]
            df_test_subset = df_test[(df_test['stop_sequence'] == stop_seq) & (df_test['day_of_week'] == day_type)]

            drop_columns = ['arrival_delay', 'day_of_week', 'time_of_day', "weather", "temperature"]
            x_train = df_train_subset.drop(columns=drop_columns, errors='ignore').astype('float32')
            y_train = df_train_subset['arrival_delay'].astype('float32')

            # Normalize the data for NN
            scaler = StandardScaler()
            x_train_normalized = scaler.fit_transform(x_train)
            
            # Save the scaler
            if not os.path.exists("scalers"):
                os.makedirs("scalers")
            joblib.dump(scaler, os.path.join("scalers", f'scaler_{stop_seq}_{day_type}.pkl'))

            r2_scores_nn = []
            r2_scores_lr = []
            maes_nn = []
            maes_lr = []

            # K-fold CV
            kf = KFold(n_splits=n_folds)
            for fold, (train_index, val_index) in enumerate(kf.split(x_train_normalized)):
                print(f"Running Fold {fold + 1}...")

                x_train_fold, x_val_fold = x_train_normalized[train_index], x_train_normalized[val_index]
                y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

                # Train and evaluate NN
                print("Training Neural Network...")
                model_nn = Sequential([
                    Dense(64, activation='relu', input_dim=x_train_fold.shape[1]),
                    Dense(32, activation='relu'),
                    Dense(16, activation='relu'),
                    Dense(1)
                ])
                model_nn.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')
                early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
                model_nn.fit(x_train_fold, y_train_fold, validation_data=(x_val_fold, y_val_fold), epochs=100, batch_size=16, callbacks=[early_stopping], verbose=0)
                y_pred_nn = model_nn.predict(x_val_fold).flatten()
                r2_scores_nn.append(r2_score(y_val_fold, y_pred_nn))
                maes_nn.append(mean_absolute_error(y_val_fold, y_pred_nn))

                # Train and evaluate LR
                print("Training Linear Regression...")
                model_lr = LinearRegression().fit(x_train_fold, y_train_fold)
                y_pred_lr = model_lr.predict(x_val_fold)
                r2_scores_lr.append(r2_score(y_val_fold, y_pred_lr))
                maes_lr.append(mean_absolute_error(y_val_fold, y_pred_lr))

            # Determine best model and save it
            avg_mae_nn = np.mean(maes_nn)
            avg_mae_lr = np.mean(maes_lr)
            print("LR:",avg_mae_lr)
            print("NN:",avg_mae_nn)
            if avg_mae_nn < avg_mae_lr:
                best_model = "NN"
                best_r2 = np.mean(r2_scores_nn)
                best_mae = avg_mae_nn
                # Save NN model
                if not os.path.exists("models"):
                    os.makedirs("models")
                model_nn.save(os.path.join("models", f'nn_model_{stop_seq}_{day_type}.h5'))
            else:
                best_model = "LR"
                best_r2 = np.mean(r2_scores_lr)
                best_mae = avg_mae_lr
                # Save LR model
                if not os.path.exists("models"):
                    os.makedirs("models")
                joblib.dump(model_lr, os.path.join("models", f'lr_model_{stop_seq}_{day_type}.pkl'))

            results['stop_sequence'].append(stop_seq)
            results['day_type'].append(day_type)
            results['best_model'].append(best_model)
            results['R^2'].append(best_r2)
            results['MAE'].append(best_mae)

    return pd.DataFrame(results)

def predict_new_data(df):
    all_groups = []

    grouped = df.groupby(['stop_sequence', 'day_of_week'])
    for (stop_seq, day_type), group in grouped:
        nn_model_path = os.path.join("models", f'nn_model_{stop_seq}_{day_type}.h5')
        lr_model_path = os.path.join("models", f'lr_model_{stop_seq}_{day_type}.pkl')
        scaler_path = os.path.join("scalers", f'scaler_{stop_seq}_{day_type}.pkl')  # Load the scaler
        
        X = group.drop(columns=['arrival_delay', 'day_of_week', 'time_of_day', "weather", "temperature"], errors='ignore').astype('float32')

        # Normalize the data with the saved scaler
        scaler = joblib.load(scaler_path)
        X_normalized = scaler.transform(X)

        if os.path.exists(nn_model_path):
            model = load_model(nn_model_path)
            group['predicted_delay'] = model.predict(X_normalized).flatten()
        elif os.path.exists(lr_model_path):
            model = joblib.load(lr_model_path)
            group['predicted_delay'] = model.predict(X)
        else:
            raise ValueError(f"No saved model found for stop_sequence: {stop_seq} and day_type: {day_type}")

        all_groups.append(group)

    result_df = pd.concat(all_groups)
    return result_df


### Creating the models

In [None]:
results_df = train_evaluate_best_model(df)
print(results_df)

Processing stop_sequence 1 for weekday...
Running Fold 1...
Training Neural Network...
Training Linear Regression...
Running Fold 2...
Training Neural Network...
Training Linear Regression...
Running Fold 3...
Training Neural Network...
Training Linear Regression...
Running Fold 4...
Training Neural Network...
Training Linear Regression...
Running Fold 5...
Training Neural Network...
Training Linear Regression...
LR: 28.80708
NN: 28.301798
Processing stop_sequence 1 for weekend...
Running Fold 1...
Training Neural Network...


  saving_api.save_model(


Training Linear Regression...
Running Fold 2...
Training Neural Network...
Training Linear Regression...
Running Fold 3...
Training Neural Network...
Training Linear Regression...
Running Fold 4...
Training Neural Network...
Training Linear Regression...
Running Fold 5...
Training Neural Network...
Training Linear Regression...
LR: 26.756924
NN: 27.07781
Processing stop_sequence 2 for weekday...
Running Fold 1...
Training Neural Network...
Training Linear Regression...
Running Fold 2...
Training Neural Network...
Training Linear Regression...
Running Fold 3...
Training Neural Network...
Training Linear Regression...
Running Fold 4...
Training Neural Network...
Training Linear Regression...
Running Fold 5...
Training Neural Network...
Training Linear Regression...
LR: 7.7936544
NN: 8.108984
Processing stop_sequence 2 for weekend...
Running Fold 1...
Training Neural Network...
Training Linear Regression...
Running Fold 2...
Training Neural Network...
Training Linear Regression...
Running

### USING the trained models

In [51]:
pred = predict_new_data(df)
#print(pred)






















In [52]:
pred.head(30000)

Unnamed: 0,Calendar_date,route_id,bus_id,stop_sequence,arrival_delay,dwell_time,travel_time_for_previous_section,scheduled_travel_time,upstream_stop_delay,origin_delay,...,factor(weather)Snow,factor(temperature)Cold,factor(temperature)Extra_cold,factor(temperature)Normal,factor(day_of_week)weekday,factor(day_of_week)weekend,factor(time_of_day)Afternoon_peak,factor(time_of_day)Morning_peak,factor(time_of_day)Off-peak,predicted_delay
3186,20220110,4,44410,1,49,0,0,120,8,8,...,0,1,0,0,1,0,0,1,0,1.267187e+08
3213,20220110,4,41370,1,65,0,0,120,47,47,...,0,1,0,0,1,0,0,1,0,1.267306e+08
3240,20220110,4,41353,1,23,0,0,120,19,19,...,0,1,0,0,1,0,0,1,0,1.267272e+08
3267,20220110,4,44413,1,72,0,0,120,39,39,...,0,1,0,0,1,0,0,1,0,1.267221e+08
3294,20220110,4,45544,1,182,0,0,120,144,144,...,0,1,0,0,1,0,0,1,0,1.267323e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
342469,20220420,4,44059,2,109,83,153,90,60,87,...,0,0,0,1,1,0,1,0,0,-3.911516e+06
342496,20220420,4,44066,2,242,119,101,90,156,235,...,0,0,0,1,1,0,1,0,0,-3.898265e+06
342523,20220420,4,45536,2,267,133,204,90,186,162,...,0,0,0,1,1,0,1,0,0,-3.894295e+06
342550,20220420,4,41354,2,130,68,167,90,93,106,...,0,0,0,1,1,0,1,0,0,-3.907342e+06


In [30]:
# Extracting the actual and predicted values from the DataFrame
actual_values = pred["arrival_delay"].values
predicted_values = pred["predicted_delay"].values

# Computing R^2
r2 = r2_score(actual_values, predicted_values)

# Computing MAE
mae = mean_absolute_error(actual_values, predicted_values)

print(f"R^2: {r2:.4f}")
print(f"MAE: {mae:.4f}")



R^2: -287821276658.2489
MAE: 45913565.2692


# NN VS LR (Day type onlye)

In [None]:
def train_evaluate_day_based_model(df, n_folds=5):
    df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

    results = {
        'day_type': [],
        'best_model': [],
        'R^2': [],
        'MAE': []
    }

    for day_type in ['weekday', 'weekend']:
        print(f"Processing for {day_type}...")
            
        df_train_subset = df_train[df_train['day_of_week'] == day_type]
        df_test_subset = df_test[df_test['day_of_week'] == day_type]

        drop_columns = ['arrival_delay', 'day_of_week', 'time_of_day', "weather", "temperature"]
        x_train = df_train_subset.drop(columns=drop_columns, errors='ignore').astype('float32')
        y_train = df_train_subset['arrival_delay'].astype('float32')

        scaler = StandardScaler()
        x_train_normalized = scaler.fit_transform(x_train)

        r2_scores_nn = []
        r2_scores_lr = []
        maes_nn = []
        maes_lr = []

        kf = KFold(n_splits=n_folds)
        for fold, (train_index, val_index) in enumerate(kf.split(x_train_normalized)):
            print(f"Running Fold {fold + 1}...")

            x_train_fold, x_val_fold = x_train_normalized[train_index], x_train_normalized[val_index]
            y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

            # Neural Network
            print("Training Neural Network...")
            model_nn = Sequential([
                Dense(64, activation='relu', input_dim=x_train_fold.shape[1]),
                Dense(32, activation='relu'),
                Dense(16, activation='relu'),
                Dense(1)
            ])
            model_nn.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')
            early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
            model_nn.fit(x_train_fold, y_train_fold, validation_data=(x_val_fold, y_val_fold), epochs=100, batch_size=16, callbacks=[early_stopping], verbose=0)
            y_pred_nn = model_nn.predict(x_val_fold).flatten()
            r2_scores_nn.append(r2_score(y_val_fold, y_pred_nn))
            maes_nn.append(mean_absolute_error(y_val_fold, y_pred_nn))

            # Linear Regression
            print("Training Linear Regression...")
            model_lr = LinearRegression().fit(x_train_fold, y_train_fold)
            y_pred_lr = model_lr.predict(x_val_fold)
            r2_scores_lr.append(r2_score(y_val_fold, y_pred_lr))
            maes_lr.append(mean_absolute_error(y_val_fold, y_pred_lr))

        avg_mae_nn = np.mean(maes_nn)
        avg_mae_lr = np.mean(maes_lr)
        if avg_mae_nn < avg_mae_lr:
            best_model = "NN"
            best_r2 = np.mean(r2_scores_nn)
            best_mae = avg_mae_nn
            if not os.path.exists("models_day"):
                os.makedirs("models_day")
            model_nn.save(os.path.join("models_day", f'nn_model_{day_type}.h5'))
        else:
            best_model = "LR"
            best_r2 = np.mean(r2_scores_lr)
            best_mae = avg_mae_lr
            if not os.path.exists("models_day"):
                os.makedirs("models_day")
            joblib.dump(model_lr, os.path.join("models_day", f'lr_model_{day_type}.pkl'))

        results['day_type'].append(day_type)
        results['best_model'].append(best_model)
        results['R^2'].append(best_r2)
        results['MAE'].append(best_mae)

    return pd.DataFrame(results)


def predict_day_based_data(df):
    all_groups = []

    grouped = df.groupby(['day_of_week'])
    for day_type, group in grouped:
        nn_model_path = os.path.join("models_day", f'nn_model_{day_type}.h5')
        lr_model_path = os.path.join("models_day", f'lr_model_{day_type}.pkl')

        X = group.drop(columns=['arrival_delay', 'day_of_week', 'time_of_day', "weather", "temperature"], errors='ignore').astype('float32')
        
        if os.path.exists(nn_model_path):
            model = load_model(nn_model_path)
            group['predicted_delay'] = model.predict(X).flatten()
        elif os.path.exists(lr_model_path):
            model = joblib.load(lr_model_path)
            group['predicted_delay'] = model.predict(X)
        else:
            raise ValueError(f"No saved model found for day_type: {day_type}")

        all_groups.append(group)

    result_df = pd.concat(all_groups)
    return result_df


### Creating Model

In [None]:
results_df_day = train_evaluate_day_based_model(df)
print(results_df_day)

### USING the trained model

In [None]:
pred = predict_day_based_data(df)
pred.head()
# Extracting the actual and predicted values from the DataFrame
actual_values = pred["arrival_delay"].values
predicted_values = pred["predicted_delay"].values

# Computing R^2
r2 = r2_score(actual_values, predicted_values)

# Computing MAE
mae = mean_absolute_error(actual_values, predicted_values)

print(f"R^2: {r2:.4f}")
print(f"MAE: {mae:.4f}")

