In [1]:
import pandas as pd

#Plot
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
import calendar 
import calplot # actually used

# Score model
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Model
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from xgboost import XGBRegressor
from scipy import stats





In [9]:
# Read the data
url = "https://raw.githubusercontent.com/zhenliangma/Applied-AI-in-Transportation/main/ProjectAssignmentData/Dataset-PT.csv"
df = pd.read_csv(url, header=1)
#df = df.drop(columns=['weather', 'temperature', 'day_of_week', 'time_of_day'])
# Calculate z-scores for the 'arrival_delay' column
z_scores = stats.zscore(df['arrival_delay'])

# Get boolean array indicating the location of outliers
outliers = (z_scores > 7) | (z_scores < -7)

# Count the number of outliers
num_outliers = outliers.sum()

# Print the number of outliers
print(f"Number of outliers removed: {num_outliers}")

# Remove the outliers
df = df[~outliers]

# Verify the new size of the DataFrame
print(f"Size of the original DataFrame: {len(df)}")
#print(f"Size of the DataFrame after removing outliers: {len(df_no_outliers)}")


Number of outliers removed: 228
Size of the original DataFrame: 544875


# stop and daytime models

In [3]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_selection import RFECV

def train_and_evaluate(df):
    # Split the entire dataset into training and test sets
    df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

    y_mean = df_test['arrival_delay'].mean()

    results = {
        'stop_sequence': [],
        'day_type': [],
        'R^2': [],
        'MAE': []
    }

    weighted_mae_sum = 0
    total_samples = 0
    overall_ssr = 0
    overall_tss = 0

    for stop_seq in df['stop_sequence'].unique():
        for day_type in ['weekday', 'weekend']:
            # Filter data by stop sequence and day type
            df_train_subset = df_train[(df_train['stop_sequence'] == stop_seq) & (df_train['day_of_week'] == day_type)]
            df_test_subset = df_test[(df_test['stop_sequence'] == stop_seq) & (df_test['day_of_week'] == day_type)]

            # Train model with RFECV
            x_train = df_train_subset.drop(['arrival_delay', 'day_of_week', 'time_of_day', "weather", "temperature"], axis=1)
            y_train = df_train_subset['arrival_delay']

            model = LinearRegression()
            selector = RFECV(estimator=model, step=1, cv=KFold(5))
            selector = selector.fit(x_train, y_train)
            
            # Fit model with selected features
            model.fit(x_train.iloc[:, selector.support_], y_train)

            # Evaluate the model
            x_test = df_test_subset.drop(['arrival_delay', 'day_of_week', 'time_of_day', "weather", "temperature"], axis=1).iloc[:, selector.support_]
            
            y_test = df_test_subset['arrival_delay']
            
            y_pred = model.predict(x_test)
            mae = mean_absolute_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)

            sample_count = len(y_test)
            weighted_mae_sum += mae * sample_count
            total_samples += sample_count

            residuals = y_test - y_pred
            ssr = sum(residuals**2)
            overall_ssr += ssr
            tss = sum((y_test - y_mean)**2)
            overall_tss += tss

            results['stop_sequence'].append(stop_seq)
            results['day_type'].append(day_type)
            results['R^2'].append(r2)
            results['MAE'].append(mae)
            

    overall_r2 = 1 - (overall_ssr / overall_tss)
    overall_mae = weighted_mae_sum / total_samples

    print(f'Overall R^2: {overall_r2}')
    print(f'Overall MAE: {overall_mae}')

    return pd.DataFrame(results)

# Example usage:
results_df = train_and_evaluate(df)
print(results_df)


Overall R^2: 0.989619067772344
Overall MAE: 12.228547173116066
    stop_sequence day_type       R^2        MAE
0               1  weekday  0.907789  28.079150
1               1  weekend  0.945927  28.133992
2               2  weekday  0.994341   7.961897
3               2  weekend  0.996451   7.032571
4               3  weekday  0.994364   7.562599
5               3  weekend  0.997188   6.564969
6               4  weekday  0.969525  18.877087
7               4  weekend  0.989087  11.836640
8               5  weekday  0.979133  14.260601
9               5  weekend  0.993399  11.236295
10              6  weekday  0.988645  13.048595
11              6  weekend  0.993592  12.180406
12              7  weekday  0.991089  10.166501
13              7  weekend  0.995717   9.197009
14              8  weekday  0.979640  16.051619
15              8  weekend  0.993770  12.026367
16              9  weekday  0.994514   8.123681
17              9  weekend  0.997754   6.787489
18             10  weekda

# Only Daytype Model

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_selection import RFECV
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import pandas as pd

def train_and_evaluate(df):
    # Split the dataset
    df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

    # Results placeholder
    results = {
        'day_type': [],
        'best_model': [],
        'MSE': [],
        'MAE': [],
        'R^2': []  
    }

    # Function to train and evaluate Linear Regression with RFECV
    def train_lr_model(df_train_subset):
        x_train = df_train_subset.drop(['arrival_delay', 'day_of_week', 'time_of_day', "weather", "temperature"], axis=1)
        y_train = df_train_subset['arrival_delay']

        model = LinearRegression()
        selector = RFECV(estimator=model, step=1, cv=KFold(5))
        selector = selector.fit(x_train, y_train)

        model.fit(x_train.iloc[:, selector.support_], y_train)
        
        return model, selector

    # Function to train Neural Network
    def train_nn_model(df_train_subset):
        x_train = df_train_subset.drop(['arrival_delay', 'day_of_week', 'time_of_day', "weather", "temperature"], axis=1)
        y_train = df_train_subset['arrival_delay']

        model_nn = Sequential([
            Dense(32, activation='relu', input_dim=x_train.shape[1]),
            Dropout(0.001),
            Dense(64, activation='relu'),
            Dense(1)
        ])
        model_nn.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')
        early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
        model_nn.fit(x_train, y_train, validation_split=0.2, epochs=100, batch_size=32, callbacks=[early_stopping], verbose=0)
        
        return model_nn

    for day_type in ['weekday', 'weekend']:
        df_train_subset = df_train[df_train['day_of_week'] == day_type]
        df_test_subset = df_test[df_test['day_of_week'] == day_type]

        # Train and evaluate Linear Regression model
        model_lr, selector = train_lr_model(df_train_subset)
        x_test = df_test_subset.drop(['arrival_delay', 'day_of_week', 'time_of_day', "weather", "temperature"], axis=1).iloc[:, selector.support_]
        y_pred_lr = model_lr.predict(x_test)
        mse_lr = mean_squared_error(df_test_subset['arrival_delay'], y_pred_lr)
        mae_lr = mean_absolute_error(df_test_subset['arrival_delay'], y_pred_lr)
        r2_lr = r2_score(df_test_subset['arrival_delay'], y_pred_lr)
        # Train and evaluate Neural Network
        model_nn = train_nn_model(df_train_subset)
        y_pred_nn = model_nn.predict(df_test_subset.drop(['arrival_delay', 'day_of_week', 'time_of_day', "weather", "temperature"], axis=1)).flatten()
        mse_nn = mean_squared_error(df_test_subset['arrival_delay'], y_pred_nn)
        mae_nn = mean_absolute_error(df_test_subset['arrival_delay'], y_pred_nn)
        r2_nn = r2_score(df_test_subset['arrival_delay'], y_pred_nn)

        # Choose the best model based on MAE
        if mae_nn < mae_lr:
            best_model = "NN"
            best_mse = mse_nn
            best_mae = mae_nn
            best_r2 = r2_nn
        else:
            best_model = "LR"
            best_mse = mse_lr
            best_mae = mae_lr
            best_r2 = r2_lr

        results['day_type'].append(day_type)
        results['best_model'].append(best_model)
        results['MSE'].append(best_mse)
        results['MAE'].append(best_mae)
        results['R^2'].append(best_r2)
    return pd.DataFrame(results)

# Example usage:
# df = ...  # Your dataframe
# result_df = train_and_evaluate(df)
# print(result_df)


In [11]:
results = train_and_evaluate(df)
print(results)

  day_type best_model         MSE        MAE       R^2
0  weekday         LR  403.634672  13.013826  0.986993
1  weekend         LR  343.030509  11.043053  0.992785


# NN VS LR

In [None]:
df.info()

In [17]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import joblib
import os

def train_evaluate_best_model(df, n_folds=5):

    df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

    results = {
        'stop_sequence': [],
        'day_type': [],
        'best_model': [],
        'R^2': [],
        'MAE': []
    }

    overall_r2_values = []
    overall_sample_weights = []

    for stop_seq in df['stop_sequence'].unique():
        for day_type in ['weekday', 'weekend']:
            df_train_subset = df_train[(df_train['stop_sequence'] == stop_seq) & (df_train['day_of_week'] == day_type)]
            df_test_subset = df_test[(df_test['stop_sequence'] == stop_seq) & (df_test['day_of_week'] == day_type)]

            drop_columns = ['arrival_delay', 'day_of_week', 'time_of_day', "weather", "temperature"]
            x_train = df_train_subset.drop(columns=drop_columns, errors='ignore').astype('float32')
            y_train = df_train_subset['arrival_delay'].astype('float32')
            x_test = df_test_subset.drop(columns=drop_columns, errors='ignore').astype('float32')
            y_test = df_test_subset['arrival_delay'].astype('float32')

            r2_scores_nn = []
            r2_scores_lr = []
            maes_nn = []
            maes_lr = []

            # K-fold CV
            kf = KFold(n_splits=n_folds)
            for train_index, val_index in kf.split(x_train):
                x_train_fold, x_val_fold = x_train.iloc[train_index], x_train.iloc[val_index]
                y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

                # Train and evaluate NN
                model_nn = Sequential([
                    Dense(32, activation='relu', input_dim=x_train_fold.shape[1]),
                    Dropout(0.001),
                    Dense(64, activation='relu'),
                    Dense(1)
                ])
                model_nn.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')
                early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
                model_nn.fit(x_train_fold, y_train_fold, validation_data=(x_val_fold, y_val_fold), epochs=100, batch_size=32, callbacks=[early_stopping], verbose=0)
                y_pred_nn = model_nn.predict(x_val_fold).flatten()
                r2_scores_nn.append(r2_score(y_val_fold, y_pred_nn))
                maes_nn.append(mean_absolute_error(y_val_fold, y_pred_nn))

                # Train and evaluate LR
                model_lr = LinearRegression().fit(x_train_fold, y_train_fold)
                y_pred_lr = model_lr.predict(x_val_fold)
                r2_scores_lr.append(r2_score(y_val_fold, y_pred_lr))
                maes_lr.append(mean_absolute_error(y_val_fold, y_pred_lr))

            # Average scores
            avg_r2_nn = np.mean(r2_scores_nn)
            avg_r2_lr = np.mean(r2_scores_lr)
            avg_mae_nn = np.mean(maes_nn)
            avg_mae_lr = np.mean(maes_lr)

            # Determine best model
            if avg_r2_nn > avg_r2_lr:
                best_model = "NN"
                best_r2 = avg_r2_nn
                best_mae = avg_mae_nn
            else:
                best_model = "LR"
                best_r2 = avg_r2_lr
                best_mae = avg_mae_lr

            # Save models
            model_dir = "models"
            if not os.path.exists(model_dir):
                os.makedirs(model_dir)
            if best_model == "NN":
                model_nn.save(os.path.join(model_dir, f'nn_model_{stop_seq}_{day_type}.h5'))
            else:
                joblib.dump(model_lr, os.path.join(model_dir, f'lr_model_{stop_seq}_{day_type}.pkl'))

            overall_r2_values.append(best_r2)
            overall_sample_weights.append(len(df_test_subset))
            results['stop_sequence'].append(stop_seq)
            results['day_type'].append(day_type)
            results['best_model'].append(best_model)
            results['R^2'].append(best_r2)
            results['MAE'].append(best_mae)

    overall_r2 = np.average(overall_r2_values, weights=overall_sample_weights)
    print(f"Overall R^2: {overall_r2:.2f}")

    return pd.DataFrame(results)

# predict_new_data function remains mostly the same.
def predict_new_data(df):
    all_groups = []  # Placeholder for updated groups

    # Group the dataframe by 'stop_sequence' and 'day_of_week'
    grouped = df.groupby(['stop_sequence', 'day_of_week'])

    for (stop_seq, day_type), group in grouped:
        nn_model_path = os.path.join("models", f'nn_model_{stop_seq}_{day_type}.h5')
        lr_model_path = os.path.join("models", f'lr_model_{stop_seq}_{day_type}.pkl')

        X = group.drop(columns=['arrival_delay', 'day_of_week', 'time_of_day', "weather", "temperature"], errors='ignore').astype('float32')

        if os.path.exists(nn_model_path):
            model = load_model(nn_model_path)
            group['predicted_delay'] = model.predict(X).flatten()

        elif os.path.exists(lr_model_path):
            model = joblib.load(lr_model_path)
            group['predicted_delay'] = model.predict(X)
            
        else:
            raise ValueError(f"No saved model found for stop_sequence: {stop_seq} and day_type: {day_type}")

        all_groups.append(group)

    # Concatenate all updated groups to get the final dataframe
    result_df = pd.concat(all_groups)

    return result_df


# Creating the models

In [18]:
results_df = train_evaluate_best_model(df)
print(results_df)



Overall R^2: 0.99
    stop_sequence day_type best_model       R^2        MAE
0               1  weekday         LR  0.900133  28.803116
1               1  weekend         LR  0.957862  26.720768
2               2  weekday         LR  0.993873   7.791570
3               2  weekend         LR  0.997195   7.002844
4               3  weekday         LR  0.994518   7.596034
5               3  weekend         LR  0.997320   6.597629
6               4  weekday         LR  0.967181  18.583790
7               4  weekend         LR  0.991606  11.797418
8               5  weekday         LR  0.978429  14.605721
9               5  weekend         LR  0.991654  11.572133
10              6  weekday         LR  0.986974  13.116409
11              6  weekend         LR  0.993378  12.216171
12              7  weekday         LR  0.992004  10.127344
13              7  weekend         LR  0.996052   9.032592
14              8  weekday         LR  0.977516  16.325077
15              8  weekend         LR 

In [16]:
for stop_seq in df['stop_sequence'].unique():
    for day_type in ['weekday', 'weekend']:
        subset = df[(df['stop_sequence'] == stop_seq) & (df['day_of_week'] == day_type)]
        print(f"stop_seq: {stop_seq}, day_type: {day_type}, size: {len(subset)}")


stop_seq: 1, day_type: weekday, size: 15480
stop_seq: 1, day_type: weekend, size: 4702
stop_seq: 2, day_type: weekday, size: 15481
stop_seq: 2, day_type: weekend, size: 4702
stop_seq: 3, day_type: weekday, size: 15481
stop_seq: 3, day_type: weekend, size: 4702
stop_seq: 4, day_type: weekday, size: 15480
stop_seq: 4, day_type: weekend, size: 4702
stop_seq: 5, day_type: weekday, size: 15480
stop_seq: 5, day_type: weekend, size: 4702
stop_seq: 6, day_type: weekday, size: 15478
stop_seq: 6, day_type: weekend, size: 4702
stop_seq: 7, day_type: weekday, size: 15480
stop_seq: 7, day_type: weekend, size: 4702
stop_seq: 8, day_type: weekday, size: 15479
stop_seq: 8, day_type: weekend, size: 4702
stop_seq: 9, day_type: weekday, size: 15480
stop_seq: 9, day_type: weekend, size: 4702
stop_seq: 10, day_type: weekday, size: 15480
stop_seq: 10, day_type: weekend, size: 4702
stop_seq: 11, day_type: weekday, size: 15480
stop_seq: 11, day_type: weekend, size: 4702
stop_seq: 12, day_type: weekday, size: 

# USING the trained models

In [None]:
pred = predict_new_data(df)
#print(pred)


In [None]:
pred.head()

In [None]:
# Extracting the actual and predicted values from the DataFrame
actual_values = pred["arrival_delay"].values
predicted_values = pred["predicGQHW§EÖted_delay"].values

# Computing R^2
r2 = r2_score(actual_values, predicted_values)

# Computing MAE
mae = mean_absolute_error(actual_values, predicted_values)

print(f"R^2: {r2:.4f}")
print(f"MAE: {mae:.4f}")

