In [1]:
import pandas as pd

#Plot
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
import calendar 
import calplot # actually used

# Score model
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Model
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from xgboost import XGBRegressor
from scipy import stats





In [2]:

# Read the data
url = "https://raw.githubusercontent.com/zhenliangma/Applied-AI-in-Transportation/main/ProjectAssignmentData/Dataset-PT.csv"
df = pd.read_csv(url, header=1)

# Calculate z-scores for the 'arrival_delay' column
z_scores = stats.zscore(df['arrival_delay'])

# Get boolean array indicating the location of outliers
outliers = (z_scores > 7) | (z_scores < -7)

# Count the number of outliers
num_outliers = outliers.sum()

# Print the number of outliers
print(f"Number of outliers removed: {num_outliers}")

# Remove the outliers
df = df[~outliers]

# Verify the new size of the DataFrame
print(f"Size of the original DataFrame: {len(df)}")
#print(f"Size of the DataFrame after removing outliers: {len(df_no_outliers)}")


Number of outliers removed: 228
Size of the original DataFrame: 544875


# stop and daytime models

In [20]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_selection import RFECV

def train_and_evaluate(df):
    # Split the entire dataset into training and test sets
    df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

    y_mean = df_test['arrival_delay'].mean()

    results = {
        'stop_sequence': [],
        'day_type': [],
        'R^2': [],
        'MAE': []
    }

    weighted_mae_sum = 0
    total_samples = 0
    overall_ssr = 0
    overall_tss = 0

    for stop_seq in df['stop_sequence'].unique():
        for day_type in ['weekday', 'weekend']:
            # Filter data by stop sequence and day type
            df_train_subset = df_train[(df_train['stop_sequence'] == stop_seq) & (df_train['day_of_week'] == day_type)]
            df_test_subset = df_test[(df_test['stop_sequence'] == stop_seq) & (df_test['day_of_week'] == day_type)]

            # Train model with RFECV
            x_train = df_train_subset.drop(['arrival_delay', 'day_of_week', 'time_of_day', "weather", "temperature"], axis=1)
            y_train = df_train_subset['arrival_delay']

            model = LinearRegression()
            selector = RFECV(estimator=model, step=1, cv=KFold(5))
            selector = selector.fit(x_train, y_train)
            
            # Fit model with selected features
            model.fit(x_train.iloc[:, selector.support_], y_train)

            # Evaluate the model
            x_test = df_test_subset.drop(['arrival_delay', 'day_of_week', 'time_of_day', "weather", "temperature"], axis=1).iloc[:, selector.support_]
            y_test = df_test_subset['arrival_delay']
            
            y_pred = model.predict(x_test)
            mae = mean_absolute_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)

            sample_count = len(y_test)
            weighted_mae_sum += mae * sample_count
            total_samples += sample_count

            residuals = y_test - y_pred
            ssr = sum(residuals**2)
            overall_ssr += ssr
            tss = sum((y_test - y_mean)**2)
            overall_tss += tss

            results['stop_sequence'].append(stop_seq)
            results['day_type'].append(day_type)
            results['R^2'].append(r2)
            results['MAE'].append(mae)
            

    overall_r2 = 1 - (overall_ssr / overall_tss)
    overall_mae = weighted_mae_sum / total_samples

    print(f'Overall R^2: {overall_r2}')
    print(f'Overall MAE: {overall_mae}')

    return pd.DataFrame(results)

# Example usage:
results_df = train_and_evaluate(df)
print(results_df)


Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Done!
Overall R^2: 0.989619067772344
Overall MAE: 12.228547173116066
    stop_sequence day_type       R^2        MAE
0               1  weekday  0.907789  28.079150
1               1  weekend  0.945927  28.133992
2               2  weekday  0.994341   7.961897
3               2  weekend  0.996451   7.032571
4               3  weekday  0.994364   7.562599
5               3  weekend  0.997188   6.564969
6               4  weekday  0.969525  18.877087
7               4  weekend  0.989087  11.836640
8               5  weekday  0.979133  14.260601
9               5  weekend  0.993399  11.236295
10              6  weekday  0.988645  13.048595
11              6  weekend  0.993592 

# Only Daytype Model

In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_selection import RFECV

def train_and_evaluate(df):
    # Split the entire dataset into training and test sets
    df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

    y_mean = df_test['arrival_delay'].mean()

    # Dictionary to store models and selectors
    models = {}
    selectors = {}
    mse_values = {}
    mae_values = {}

    weighted_mae_sum = 0
    total_samples = 0
    overall_ssr = 0
    overall_tss = 0

    # Function to train a model with RFECV
    def train_model(df_train_subset):
        df_train_subset = df_train_subset.copy()
        x_train = df_train_subset.drop(['arrival_delay', 'day_of_week', 'time_of_day', "weather", "temperature"], axis=1)
        y_train = df_train_subset['arrival_delay']

        model = LinearRegression()
        selector = RFECV(estimator=model, step=1, cv=KFold(5))
        selector = selector.fit(x_train, y_train)
        
        # Fit model with selected features
        model.fit(x_train.iloc[:, selector.support_], y_train)
        return model, selector

    # Function to evaluate a model
    def evaluate_model(model, selector, df_test_subset):
        nonlocal weighted_mae_sum, total_samples, overall_ssr, overall_tss
        
        df_test_subset = df_test_subset.copy()
        x_test = df_test_subset.drop(['arrival_delay', 'day_of_week', 'time_of_day', "weather", "temperature"], axis=1).iloc[:, selector.support_]
        y_test = df_test_subset['arrival_delay']
        
        y_pred = model.predict(x_test)
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)

        sample_count = len(y_test)
        weighted_mae_sum += mae * sample_count
        total_samples += sample_count

        residuals = y_test - y_pred
        ssr = sum(residuals**2)
        overall_ssr += ssr
        tss = sum((y_test - y_mean)**2)
        overall_tss += tss

        return mse, mae

    # Train and evaluate model for weekdays
    df_weekday_train = df_train[df_train['day_of_week'] == 'weekday']
    models['weekday'], selectors['weekday'] = train_model(df_weekday_train)
    mse_values['weekday'], mae_values['weekday'] = evaluate_model(models['weekday'], selectors['weekday'], df_test[df_test['day_of_week'] == 'weekday'])

    # Train and evaluate model for weekends
    df_weekend_train = df_train[df_train['day_of_week'] == 'weekend']
    models['weekend'], selectors['weekend'] = train_model(df_weekend_train)
    mse_values['weekend'], mae_values['weekend'] = evaluate_model(models['weekend'], selectors['weekend'], df_test[df_test['day_of_week'] == 'weekend'])

    overall_r2 = 1 - (overall_ssr / overall_tss)
    overall_mae = weighted_mae_sum / total_samples

    print(f'R^2: {overall_r2}')
    print(f'MAE: {overall_mae}')

    return models, mse_values, mae_values, selectors

# Example usage:
# trained_models, mse_values, mae_values, selectors = train_and_evaluate(df)


In [5]:
trained_models, mse_values, mae_values, selectors = train_and_evaluate(df)
print(trained_models, mse_values, mae_values, selectors)

R^2: 0.9892383519558062
MAE: 12.554114894847544
{'weekday': LinearRegression(), 'weekend': LinearRegression()} {'weekday': 403.6346718077232, 'weekend': 343.03050945273117} {'weekday': 13.013826497688935, 'weekend': 11.043052622014638} {'weekday': RFECV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
      estimator=LinearRegression()), 'weekend': RFECV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
      estimator=LinearRegression())}


# Neural network stop and day

In [21]:
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error

def train_and_evaluate_nn(df):
    # Split the dataset into training and test sets
    df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
    sc = StandardScaler()

    y_mean = df_test['arrival_delay'].mean()
    overall_ssr = 0
    overall_tss = 0

    results = {
        'stop_sequence': [],
        'day_type': [],
        'R^2': [],
        'MAE': []
    }

    for stop_seq in df['stop_sequence'].unique():
        for day_type in ['weekday', 'weekend']:
            # Filter data
            df_train_subset = df_train[(df_train['stop_sequence'] == stop_seq) & (df_train['day_of_week'] == day_type)]
            df_test_subset = df_test[(df_test['stop_sequence'] == stop_seq) & (df_test['day_of_week'] == day_type)]

            # Prepare data
            x_train = df_train_subset.drop(['arrival_delay', 'day_of_week', 'time_of_day', "weather", "temperature"], axis=1)
            y_train = df_train_subset['arrival_delay']
            
            x_test = df_test_subset.drop(['arrival_delay', 'day_of_week', 'time_of_day', "weather", "temperature"], axis=1)
            y_test = df_test_subset['arrival_delay']

            # Normalize the input features
            x_train = sc.fit_transform(x_train)
            x_test = sc.transform(x_test)

            # Neural network model
            model = Sequential()
            model.add(Dense(32, activation='relu', input_dim=x_train.shape[1]))
            model.add(Dropout(0.2))
            model.add(Dense(16, activation='relu'))
            model.add(Dense(1))

            model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

            early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

            model.fit(x_train, y_train, validation_split=0.2, epochs=100, batch_size=32, callbacks=[early_stopping], verbose=1)
            
            y_pred = model.predict(x_test).flatten()
            current_r2 = r2_score(y_test, y_pred)
            current_mae = mean_absolute_error(y_test, y_pred)
            
            residuals = y_test - y_pred
            ssr = sum(residuals**2)
            overall_ssr += ssr
            tss = sum((y_test - y_mean)**2)
            overall_tss += tss

            results['stop_sequence'].append(stop_seq)
            results['day_type'].append(day_type)
            results['R^2'].append(current_r2)
            results['MAE'].append(current_mae)

    overall_r2 = 1 - (overall_ssr / overall_tss)
    print(f'Overall R^2 for Neural Network: {overall_r2}')

    return pd.DataFrame(results)

# Example usage:
results_df = train_and_evaluate_nn(df)
print(results_df)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100

KeyboardInterrupt: 

In [None]:
trained_models, mse_values, mae_values = train_and_evaluate(df)

# NN VS LR

In [28]:
# Assuming df is already loaded and available
unique_stops = df['stop_sequence'].unique()
selected_stops = unique_stops[:4]

df_test = df[df['stop_sequence'].isin(selected_stops)]


In [30]:
def train_evaluate_best_model(df):

    # Split the dataset into training and test sets
    df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
    sc = StandardScaler()
    
    # Create a placeholder for results
    results = {
        'stop_sequence': [],
        'day_type': [],
        'best_model': [],
        'R^2': [],
        'MAE': []
    }

    for stop_seq in df['stop_sequence'].unique():
        for day_type in ['weekday', 'weekend']:
            # Filter data
            df_train_subset = df_train[(df_train['stop_sequence'] == stop_seq) & (df_train['day_of_week'] == day_type)]
            df_test_subset = df_test[(df_test['stop_sequence'] == stop_seq) & (df_test['day_of_week'] == day_type)]
            
            # Common data preparation
            x_train = df_train_subset.drop(['arrival_delay', 'day_of_week', 'time_of_day', "weather", "temperature"], axis=1)
            y_train = df_train_subset['arrival_delay']
            x_test = df_test_subset.drop(['arrival_delay', 'day_of_week', 'time_of_day', "weather", "temperature"], axis=1)
            y_test = df_test_subset['arrival_delay']

            # Normalize the input features for NN
            x_train_nn = sc.fit_transform(x_train)
            x_test_nn = sc.transform(x_test)

            # Train and evaluate NN
            model_nn = Sequential()
            model_nn.add(Dense(32, activation='relu', input_dim=x_train_nn.shape[1]))
            model_nn.add(Dropout(0.2))
            model_nn.add(Dense(16, activation='relu'))
            model_nn.add(Dense(1))
            model_nn.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')
            early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
            model_nn.fit(x_train_nn, y_train, validation_split=0.2, epochs=100, batch_size=32, callbacks=[early_stopping], verbose=0)
            y_pred_nn = model_nn.predict(x_test_nn).flatten()
            r2_nn = r2_score(y_test, y_pred_nn)
            mae_nn = mean_absolute_error(y_test, y_pred_nn)

            # Train and evaluate Linear Regression
            model_lr = LinearRegression()
            selector = RFECV(estimator=model_lr, step=1, cv=KFold(5))
            selector = selector.fit(x_train, y_train)
            model_lr.fit(x_train.iloc[:, selector.support_], y_train)
            y_pred_lr = model_lr.predict(x_test.iloc[:, selector.support_])
            r2_lr = r2_score(y_test, y_pred_lr)
            mae_lr = mean_absolute_error(y_test, y_pred_lr)

            # Compare and store results
            if r2_nn > r2_lr:
                best_model = "NN"
                best_r2 = r2_nn
                best_mae = mae_nn
            else:
                best_model = "LR"
                best_r2 = r2_lr
                best_mae = mae_lr

            results['stop_sequence'].append(stop_seq)
            results['day_type'].append(day_type)
            results['best_model'].append(best_model)
            results['R^2'].append(best_r2)
            results['MAE'].append(best_mae)
            print("Done")
            
    return pd.DataFrame(results)




In [None]:
results_df = train_evaluate_best_model(df_test)
print(results_df)

Done
Done
Done
Done
