In [43]:
!pip install calplot pandas



In [44]:
import pandas as pd

#Plot
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
import calendar
import calplot # actually used

# Score model
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

In [45]:
url = "https://raw.githubusercontent.com/zhenliangma/Applied-AI-in-Transportation/main/ProjectAssignmentData/Dataset-PT.csv"
df = pd.read_csv(url,header=1)

In [46]:
df.head(5)
# df.size 545104 rows

Unnamed: 0,Calendar_date,route_id,bus_id,stop_sequence,arrival_delay,dwell_time,travel_time_for_previous_section,scheduled_travel_time,upstream_stop_delay,origin_delay,...,factor(weather)Rain,factor(weather)Snow,factor(temperature)Cold,factor(temperature)Extra_cold,factor(temperature)Normal,factor(day_of_week)weekday,factor(day_of_week)weekend,factor(time_of_day)Afternoon_peak,factor(time_of_day)Morning_peak,factor(time_of_day)Off-peak
0,20220108,4,41344,1,151,0,0,120,100,100,...,0,0,0,0,1,0,1,0,0,1
1,20220108,4,41344,2,185,24,171,45,151,100,...,0,0,0,0,1,0,1,0,0,1
2,20220108,4,41344,3,186,0,55,41,185,100,...,0,0,0,0,1,0,1,0,0,1
3,20220108,4,41344,4,202,12,42,94,186,100,...,0,0,0,0,1,0,1,0,0,1
4,20220108,4,41344,5,242,21,98,86,202,100,...,0,0,0,0,1,0,1,0,0,1


In [47]:
from scipy import stats

# Calculate z-scores for the 'arrival_delay' column
z_scores = stats.zscore(df['arrival_delay'])

# Get boolean array indicating the location of outliers
outliers = (z_scores > 7) | (z_scores < -7)

# Count the number of outliers
num_outliers = outliers.sum()

# Print the number of outliers
print(f"Number of outliers removed: {num_outliers}")

# Remove the outliers
df_no_outliers = df[~outliers]

# Verify the new size of the DataFrame
print(f"Size of the original DataFrame: {len(df)}")
print(f"Size of the DataFrame after removing outliers: {len(df_no_outliers)}")

Number of outliers removed: 228
Size of the original DataFrame: 545103
Size of the DataFrame after removing outliers: 544875


In [48]:
desired_stop_sequences = [0, 1, 5, 8, 12, 14, 16, 19, 20, 21, 23]  # Replace with your specific stop sequence number

# Create a boolean mask for rows with the desired stop sequence
mask = df_no_outliers['stop_sequence'].isin(desired_stop_sequences)

# Filter the DataFrame to keep only the rows with the desired stop sequence
df_selected_stops = df_no_outliers[mask]

df_selected_stops.shape

(201801, 31)

# Model with weather

### Neural Model

In [49]:
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error

def train_and_evaluate_nn(df):
    # Split the dataset into training and test sets
    df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)  # Use df, not df_selected_stops

    if df_train.empty or df_test.empty:
        print("Training or testing dataset is empty.")
        return None  # Return early if the dataset is empty

    sc = StandardScaler()

    y_mean = df_test['arrival_delay'].mean()
    overall_ssr = 0  # Initialize overall_ssr
    overall_tss = 0  # Initialize overall_tss
    overall_mae = 0  # Initialize overall MAE

    results = {
        'stop_sequence': [],
        'day_type': [],
        'R^2': [],
        'MAE': []
    }

    for stop_seq in df['stop_sequence'].unique():
        for day_type in ['weekday', 'weekend']:
            # Filter data
            df_train_subset = df_train[(df_train['stop_sequence'] == stop_seq) & (df_train['day_of_week'] == day_type)]
            df_test_subset = df_test[(df_test['stop_sequence'] == stop_seq) & (df_test['day_of_week'] == day_type)]

            # Prepare data
            x_train = df_train_subset.drop(['arrival_delay', 'day_of_week', 'time_of_day', "weather", "temperature"], axis=1)
            y_train = df_train_subset['arrival_delay']

            x_test = df_test_subset.drop(['arrival_delay', 'day_of_week', 'time_of_day', "weather", "temperature"], axis=1)
            y_test = df_test_subset['arrival_delay']

            # Normalize the input features
            x_train = sc.fit_transform(x_train)
            x_test = sc.transform(x_test)

            # Neural network model
            model = Sequential()
            model.add(Dense(32, activation='linear', input_dim=x_train.shape[1]))
            model.add(Dropout(0.001))
            model.add(Dense(64, activation='linear'))
            model.add(Dense(1))

            model.compile(optimizer='adam', loss='mae', metrics=['mae'])

            early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

            model.fit(x_train, y_train, validation_split=0.2, epochs=100, batch_size=32, callbacks=[early_stopping], verbose=1)

            y_pred = model.predict(x_test).flatten()
            current_r2 = r2_score(y_test, y_pred)
            current_mae = mean_absolute_error(y_test, y_pred)

            residuals = y_test - y_pred
            ssr = sum(residuals**2)
            overall_ssr += ssr
            tss = sum((y_test - y_mean)**2)
            overall_tss += tss

            results['stop_sequence'].append(stop_seq)
            results['day_type'].append(day_type)
            results['R^2'].append(current_r2)
            results['MAE'].append(current_mae)

    overall_r2 = 1 - (overall_ssr / overall_tss)

    print(f'Overall R^2 for Neural Network: {overall_r2}')
    print(f'Overall MAE for Neural Network: {overall_mae}')

    return pd.DataFrame(results)

# Example usage:
results_df = train_and_evaluate_nn(df_selected_stops)
if results_df is not None:
    print(results_df)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 1/100
E

### Regression Model

In [50]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_selection import RFECV

def train_and_evaluate(df):
    # Split the entire dataset into training and test sets
    df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

    y_mean = df_test['arrival_delay'].mean()

    results = {
        'stop_sequence': [],
        'day_type': [],
        'R^2': [],
        'MAE': []
    }

    weighted_mae_sum = 0
    total_samples = 0
    overall_ssr = 0
    overall_tss = 0

    for stop_seq in df['stop_sequence'].unique():
        for day_type in ['weekday', 'weekend']:
            # Filter data by stop sequence and day type
            df_train_subset = df_train[(df_train['stop_sequence'] == stop_seq) & (df_train['day_of_week'] == day_type)]
            df_test_subset = df_test[(df_test['stop_sequence'] == stop_seq) & (df_test['day_of_week'] == day_type)]

            # Train model with RFECV
            x_train = df_train_subset.drop(['arrival_delay', 'day_of_week', 'time_of_day', "weather", "temperature"], axis=1)
            y_train = df_train_subset['arrival_delay']

            model = LinearRegression()
            selector = RFECV(estimator=model, step=1, cv=KFold(5))
            selector = selector.fit(x_train, y_train)

            # Fit model with selected features
            model.fit(x_train.iloc[:, selector.support_], y_train)

            # Evaluate the model
            x_test = df_test_subset.drop(['arrival_delay', 'day_of_week', 'time_of_day', "weather", "temperature"], axis=1).iloc[:, selector.support_]
            y_test = df_test_subset['arrival_delay']

            y_pred = model.predict(x_test)
            mae = mean_absolute_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)

            sample_count = len(y_test)
            weighted_mae_sum += mae * sample_count
            total_samples += sample_count

            residuals = y_test - y_pred
            ssr = sum(residuals**2)
            overall_ssr += ssr
            tss = sum((y_test - y_mean)**2)
            overall_tss += tss

            results['stop_sequence'].append(stop_seq)
            results['day_type'].append(day_type)
            results['R^2'].append(r2)
            results['MAE'].append(mae)


    overall_r2 = 1 - (overall_ssr / overall_tss)
    overall_mae = weighted_mae_sum / total_samples

    print(f'Overall R^2: {overall_r2}')
    print(f'Overall MAE: {overall_mae}')

    return pd.DataFrame(results)

# Example usage:
results_df = train_and_evaluate(df_selected_stops)
print(results_df)

Overall R^2: 0.9887827662407284
Overall MAE: 13.757458257476697
    stop_sequence day_type       R^2        MAE
0               1  weekday  0.906024  28.579119
1               1  weekend  0.962977  27.315545
2               5  weekday  0.981150  14.408368
3               5  weekend  0.991836  11.804386
4               8  weekday  0.979774  16.290035
5               8  weekend  0.993258  12.420537
6              12  weekday  0.991950  12.045683
7              12  weekend  0.995815  10.752941
8              14  weekday  0.987545  14.890875
9              14  weekend  0.991588  15.946099
10             16  weekday  0.975844  20.081454
11             16  weekend  0.992541  14.902667
12             19  weekday  0.995727   8.051218
13             19  weekend  0.997954   7.066374
14             20  weekday  0.994995   9.300995
15             20  weekend  0.998012   7.482150
16             21  weekday  0.991727  13.204515
17             21  weekend  0.996472  10.654456
18             23  weekd

# Model without weather

In [55]:
desired_stop_sequences = [0, 1, 5, 8, 12, 14, 16, 19, 20, 21, 23]  # Replace with your specific stop sequence number

# Create a boolean mask for rows with the desired stop sequence
mask = df_no_outliers['stop_sequence'].isin(desired_stop_sequences)

# Filter the DataFrame to keep only the rows with the desired stop sequence
df_selected_stops2 = df_no_outliers[mask]

df_selected_stops2.shape

(201801, 31)

In [68]:
df2 = df_selected_stops2.drop(['Calendar_date','route_id', 'bus_id'], axis = 1)
columns_to_drop = [
    'weather', 'temperature', 'time_of_day',
    'factor(weather)Light_Rain', 'factor(weather)Light_Snow',
    'factor(weather)Normal', 'factor(weather)Rain', 'factor(weather)Snow',
    'factor(temperature)Cold', 'factor(temperature)Extra_cold',
    'factor(temperature)Normal', 'factor(day_of_week)weekday',
    'factor(day_of_week)weekend', 'factor(time_of_day)Afternoon_peak',
    'factor(time_of_day)Morning_peak', 'factor(time_of_day)Off-peak'
]
for column in columns_to_drop:
    df2.pop(column)

In [69]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 201801 entries, 0 to 545098
Data columns (total 12 columns):
 #   Column                            Non-Null Count   Dtype 
---  ------                            --------------   ----- 
 0   stop_sequence                     201801 non-null  int64 
 1   arrival_delay                     201801 non-null  int64 
 2   dwell_time                        201801 non-null  int64 
 3   travel_time_for_previous_section  201801 non-null  int64 
 4   scheduled_travel_time             201801 non-null  int64 
 5   upstream_stop_delay               201801 non-null  int64 
 6   origin_delay                      201801 non-null  int64 
 7   previous_bus_delay                201801 non-null  int64 
 8   previous_trip_travel_time         201801 non-null  int64 
 9   traffic_condition                 201801 non-null  int64 
 10  recurrent_delay                   201801 non-null  int64 
 11  day_of_week                       201801 non-null  object
dtypes:

### Neural Network Model

In [71]:
def train_and_evaluate_nn(df):
    # Split the dataset into training and test sets
    df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)  # Use df, not df_selected_stops

    if df_train.empty or df_test.empty:
        print("Training or testing dataset is empty.")
        return None  # Return early if the dataset is empty

    sc = StandardScaler()

    y_mean = df_test['arrival_delay'].mean()
    overall_ssr = 0  # Initialize overall_ssr
    overall_tss = 0  # Initialize overall_tss
    overall_mae = 0  # Initialize overall MAE

    results = {
        'stop_sequence': [],
        'day_type': [],
        'R^2': [],
        'MAE': []
    }

    for stop_seq in df['stop_sequence'].unique():
        for day_type in ['weekday', 'weekend']:
            # Filter data
            df_train_subset = df_train[(df_train['stop_sequence'] == stop_seq) & (df_train['day_of_week'] == day_type)]
            df_test_subset = df_test[(df_test['stop_sequence'] == stop_seq) & (df_test['day_of_week'] == day_type)]

            # Prepare data
            x_train = df_train_subset.drop(['arrival_delay', 'day_of_week'], axis=1)
            y_train = df_train_subset['arrival_delay']

            x_test = df_test_subset.drop(['arrival_delay', 'day_of_week'], axis=1)
            y_test = df_test_subset['arrival_delay']

            # Normalize the input features
            x_train = sc.fit_transform(x_train)
            x_test = sc.transform(x_test)

            # Neural network model
            model = Sequential()
            model.add(Dense(32, activation='linear', input_dim=x_train.shape[1]))
            model.add(Dropout(0.001))
            model.add(Dense(64, activation='linear'))
            model.add(Dense(1))

            model.compile(optimizer='adam', loss='mae', metrics=['mae'])

            early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

            model.fit(x_train, y_train, validation_split=0.2, epochs=100, batch_size=32, callbacks=[early_stopping], verbose=1)

            y_pred = model.predict(x_test).flatten()
            current_r2 = r2_score(y_test, y_pred)
            current_mae = mean_absolute_error(y_test, y_pred)

            residuals = y_test - y_pred
            ssr = sum(residuals**2)
            overall_ssr += ssr
            tss = sum((y_test - y_mean)**2)
            overall_tss += tss

            results['stop_sequence'].append(stop_seq)
            results['day_type'].append(day_type)
            results['R^2'].append(current_r2)
            results['MAE'].append(current_mae)

    overall_r2 = 1 - (overall_ssr / overall_tss)

    print(f'Overall R^2 for Neural Network: {overall_r2}')
    print(f'Overall MAE for Neural Network: {overall_mae}')

    return pd.DataFrame(results)

# Example usage:
results_df = train_and_evaluate_nn(df2)
if results_df is not None:
    print(results_df)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/

### Regression Model

In [74]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_selection import RFECV

def train_and_evaluate(df):
    # Split the entire dataset into training and test sets
    df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

    y_mean = df_test['arrival_delay'].mean()

    results = {
        'stop_sequence': [],
        'day_type': [],
        'R^2': [],
        'MAE': []
    }

    weighted_mae_sum = 0
    total_samples = 0
    overall_ssr = 0
    overall_tss = 0

    for stop_seq in df['stop_sequence'].unique():
        for day_type in ['weekday', 'weekend']:
            # Filter data by stop sequence and day type
            df_train_subset = df_train[(df_train['stop_sequence'] == stop_seq) & (df_train['day_of_week'] == day_type)]
            df_test_subset = df_test[(df_test['stop_sequence'] == stop_seq) & (df_test['day_of_week'] == day_type)]

            # Train model with RFECV
            x_train = df_train_subset.drop(['arrival_delay', 'day_of_week'], axis=1)
            y_train = df_train_subset['arrival_delay']

            model = LinearRegression()
            selector = RFECV(estimator=model, step=1, cv=KFold(5))
            selector = selector.fit(x_train, y_train)

            # Fit model with selected features
            model.fit(x_train.iloc[:, selector.support_], y_train)

            # Evaluate the model
            x_test = df_test_subset.drop(['arrival_delay', 'day_of_week'], axis=1).iloc[:, selector.support_]
            y_test = df_test_subset['arrival_delay']

            y_pred = model.predict(x_test)
            mae = mean_absolute_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)

            sample_count = len(y_test)
            weighted_mae_sum += mae * sample_count
            total_samples += sample_count

            residuals = y_test - y_pred
            ssr = sum(residuals**2)
            overall_ssr += ssr
            tss = sum((y_test - y_mean)**2)
            overall_tss += tss

            results['stop_sequence'].append(stop_seq)
            results['day_type'].append(day_type)
            results['R^2'].append(r2)
            results['MAE'].append(mae)


    overall_r2 = 1 - (overall_ssr / overall_tss)
    overall_mae = weighted_mae_sum / total_samples

    print(f'Overall R^2: {overall_r2}')
    print(f'Overall MAE: {overall_mae}')

    return pd.DataFrame(results)

# Example usage:
results_df = train_and_evaluate(df2)
print(results_df)

Overall R^2: 0.9887131991644134
Overall MAE: 13.802761161242874
    stop_sequence day_type       R^2        MAE
0               1  weekday  0.905814  28.669563
1               1  weekend  0.962507  27.498654
2               5  weekday  0.980929  14.488254
3               5  weekend  0.991648  11.942039
4               8  weekday  0.979634  16.346677
5               8  weekend  0.993247  12.418298
6              12  weekday  0.991910  12.081522
7              12  weekend  0.995837  10.709357
8              14  weekday  0.987454  14.903614
9              14  weekend  0.991567  15.987618
10             16  weekday  0.975778  20.084594
11             16  weekend  0.992523  14.931404
12             19  weekday  0.995635   8.189313
13             19  weekend  0.997917   7.042332
14             20  weekday  0.994866   9.377015
15             20  weekend  0.998005   7.479137
16             21  weekday  0.991709  13.200955
17             21  weekend  0.996486  10.622142
18             23  weekd

### LR VS NN

In [75]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 201801 entries, 0 to 545098
Data columns (total 12 columns):
 #   Column                            Non-Null Count   Dtype 
---  ------                            --------------   ----- 
 0   stop_sequence                     201801 non-null  int64 
 1   arrival_delay                     201801 non-null  int64 
 2   dwell_time                        201801 non-null  int64 
 3   travel_time_for_previous_section  201801 non-null  int64 
 4   scheduled_travel_time             201801 non-null  int64 
 5   upstream_stop_delay               201801 non-null  int64 
 6   origin_delay                      201801 non-null  int64 
 7   previous_bus_delay                201801 non-null  int64 
 8   previous_trip_travel_time         201801 non-null  int64 
 9   traffic_condition                 201801 non-null  int64 
 10  recurrent_delay                   201801 non-null  int64 
 11  day_of_week                       201801 non-null  object
dtypes:

In [76]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import joblib
import os

def train_evaluate_best_model(df, n_folds=5):

    df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

    results = {
        'stop_sequence': [],
        'day_type': [],
        'best_model': [],
        'R^2': [],
        'MAE': []
    }

    overall_r2_values = []
    overall_sample_weights = []

    for stop_seq in df['stop_sequence'].unique():
        for day_type in ['weekday', 'weekend']:
            df_train_subset = df_train[(df_train['stop_sequence'] == stop_seq) & (df_train['day_of_week'] == day_type)]
            df_test_subset = df_test[(df_test['stop_sequence'] == stop_seq) & (df_test['day_of_week'] == day_type)]

            drop_columns = ['arrival_delay', 'day_of_week']
            x_train = df_train_subset.drop(columns=drop_columns, errors='ignore').astype('float32')
            y_train = df_train_subset['arrival_delay'].astype('float32')
            x_test = df_test_subset.drop(columns=drop_columns, errors='ignore').astype('float32')
            y_test = df_test_subset['arrival_delay'].astype('float32')

            r2_scores_nn = []
            r2_scores_lr = []
            maes_nn = []
            maes_lr = []

            # K-fold CV
            kf = KFold(n_splits=n_folds)
            for train_index, val_index in kf.split(x_train):
                x_train_fold, x_val_fold = x_train.iloc[train_index], x_train.iloc[val_index]
                y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

                # Train and evaluate NN
                model_nn = Sequential([
                    Dense(32, activation='relu', input_dim=x_train_fold.shape[1]),
                    Dropout(0.001),
                    Dense(64, activation='relu'),
                    Dense(1)
                ])
                model_nn.compile(optimizer='adam', loss='mae', metrics=['mae'])
                early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
                model_nn.fit(x_train_fold, y_train_fold, validation_data=(x_val_fold, y_val_fold), epochs=100, batch_size=32, callbacks=[early_stopping], verbose=0)
                y_pred_nn = model_nn.predict(x_val_fold).flatten()
                r2_scores_nn.append(r2_score(y_val_fold, y_pred_nn))
                maes_nn.append(mean_absolute_error(y_val_fold, y_pred_nn))

                # Train and evaluate LR
                model_lr = LinearRegression().fit(x_train_fold, y_train_fold)
                y_pred_lr = model_lr.predict(x_val_fold)
                r2_scores_lr.append(r2_score(y_val_fold, y_pred_lr))
                maes_lr.append(mean_absolute_error(y_val_fold, y_pred_lr))

            # Average scores
            avg_r2_nn = np.mean(r2_scores_nn)
            avg_r2_lr = np.mean(r2_scores_lr)
            avg_mae_nn = np.mean(maes_nn)
            avg_mae_lr = np.mean(maes_lr)

            # Determine best model
            if avg_r2_nn > avg_r2_lr:
                best_model = "NN"
                best_r2 = avg_r2_nn
                best_mae = avg_mae_nn
            else:
                best_model = "LR"
                best_r2 = avg_r2_lr
                best_mae = avg_mae_lr

            # Save models
            model_dir = "models"
            if not os.path.exists(model_dir):
                os.makedirs(model_dir)
            if best_model == "NN":
                model_nn.save(os.path.join(model_dir, f'nn_model_{stop_seq}_{day_type}.h5'))
            else:
                joblib.dump(model_lr, os.path.join(model_dir, f'lr_model_{stop_seq}_{day_type}.pkl'))

            overall_r2_values.append(best_r2)
            overall_sample_weights.append(len(df_test_subset))
            results['stop_sequence'].append(stop_seq)
            results['day_type'].append(day_type)
            results['best_model'].append(best_model)
            results['R^2'].append(best_r2)
            results['MAE'].append(best_mae)

    overall_r2 = np.average(overall_r2_values, weights=overall_sample_weights)
    print(f"Overall R^2: {overall_r2:.2f}")

    return pd.DataFrame(results)

# predict_new_data function remains mostly the same.
def predict_new_data(df2):
    all_groups = []  # Placeholder for updated groups

    # Group the dataframe by 'stop_sequence' and 'day_of_week'
    grouped = df.groupby(['stop_sequence', 'day_of_week'])

    for (stop_seq, day_type), group in grouped:
        nn_model_path = os.path.join("models", f'nn_model_{stop_seq}_{day_type}.h5')
        lr_model_path = os.path.join("models", f'lr_model_{stop_seq}_{day_type}.pkl')

        X = group.drop(columns=['arrival_delay', 'day_of_week'], errors='ignore').astype('float32')

        if os.path.exists(nn_model_path):
            model = load_model(nn_model_path)
            group['predicted_delay'] = model.predict(X).flatten()

        elif os.path.exists(lr_model_path):
            model = joblib.load(lr_model_path)
            group['predicted_delay'] = model.predict(X)

        else:
            raise ValueError(f"No saved model found for stop_sequence: {stop_seq} and day_type: {day_type}")

        all_groups.append(group)

    # Concatenate all updated groups to get the final dataframe
    result_df = pd.concat(all_groups)

    return result_df

In [78]:
results_df = train_evaluate_best_model(df2)
print(results_df)



  saving_api.save_model(


Overall R^2: 0.98
    stop_sequence day_type best_model       R^2        MAE
0               1  weekday         NN  0.901781  27.110336
1               1  weekend         LR  0.954502  26.964746
2               5  weekday         LR  0.977974  14.600647
3               5  weekend         LR  0.992198  11.490544
4               8  weekday         LR  0.977179  16.290564
5               8  weekend         LR  0.992663  12.721334
6              12  weekday         LR  0.990531  12.285365
7              12  weekend         LR  0.996006  10.378367
8              14  weekday         LR  0.986910  14.954063
9              14  weekend         LR  0.991185  16.074905
10             16  weekday         LR  0.976267  20.499210
11             16  weekend         LR  0.992088  14.911676
12             19  weekday         LR  0.995265   8.105009
13             19  weekend         LR  0.997908   7.037652
14             20  weekday         LR  0.994972   9.331365
15             20  weekend         LR 

In [79]:
for stop_seq in df2['stop_sequence'].unique():
    for day_type in ['weekday', 'weekend']:
        subset = df2[(df2['stop_sequence'] == stop_seq) & (df2['day_of_week'] == day_type)]
        print(f"stop_seq: {stop_seq}, day_type: {day_type}, size: {len(subset)}")

stop_seq: 1, day_type: weekday, size: 15480
stop_seq: 1, day_type: weekend, size: 4702
stop_seq: 5, day_type: weekday, size: 15480
stop_seq: 5, day_type: weekend, size: 4702
stop_seq: 8, day_type: weekday, size: 15479
stop_seq: 8, day_type: weekend, size: 4702
stop_seq: 12, day_type: weekday, size: 15480
stop_seq: 12, day_type: weekend, size: 4702
stop_seq: 14, day_type: weekday, size: 15480
stop_seq: 14, day_type: weekend, size: 4702
stop_seq: 16, day_type: weekday, size: 15480
stop_seq: 16, day_type: weekend, size: 4702
stop_seq: 19, day_type: weekday, size: 15473
stop_seq: 19, day_type: weekend, size: 4702
stop_seq: 20, day_type: weekday, size: 15477
stop_seq: 20, day_type: weekend, size: 4702
stop_seq: 21, day_type: weekday, size: 15476
stop_seq: 21, day_type: weekend, size: 4702
stop_seq: 23, day_type: weekday, size: 15476
stop_seq: 23, day_type: weekend, size: 4702


### Using the trained models

In [80]:
pred = predict_new_data(df2)
print(pred)

KeyError: ignored

In [None]:
pred.head()

In [None]:
# Extracting the actual and predicted values from the DataFrame
actual_values = pred["arrival_delay"].values
predicted_values = pred["predicGQHW§EÖted_delay"].values

# Computing R^2
r2 = r2_score(actual_values, predicted_values)

# Computing MAE
mae = mean_absolute_error(actual_values, predicted_values)

print(f"R^2: {r2:.4f}")
print(f"MAE: {mae:.4f}")