In [2]:
import pandas as pd

#Plot
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
import calendar 
import calplot # actually used

# Score model
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Model
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from xgboost import XGBRegressor



In [12]:
import pandas as pd
from scipy import stats

# Load the dataset straight from GitHub; header=1 makes pandas take the
# column names from the second line of the file.
url = "https://raw.githubusercontent.com/zhenliangma/Applied-AI-in-Transportation/main/ProjectAssignmentData/Dataset-PT.csv"
df = pd.read_csv(url, header=1)

# Standardise the target column so every delay is expressed as a z-score
z_scores = stats.zscore(df['arrival_delay'])

# A row is an outlier when its z-score lies more than 7 away from zero,
# in either direction
outliers = (z_scores < -7) | (z_scores > 7)

# Report how many rows the mask flags
num_outliers = outliers.sum()
print(f"Number of outliers removed: {num_outliers}")

# Keep only the rows that were not flagged; `df` itself is left untouched
df_no_outliers = df.loc[~outliers]

# Show the row counts before and after filtering as a sanity check
original_rows = len(df)
remaining_rows = len(df_no_outliers)
print(f"Size of the original DataFrame: {original_rows}")
print(f"Size of the DataFrame after removing outliers: {remaining_rows}")


Number of outliers removed: 228
Size of the original DataFrame: 545103
Size of the DataFrame after removing outliers: 544875


In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_selection import RFECV

def train_and_evaluate(df):
    """Fit one linear model per (stop_sequence, day type) pair and print pooled metrics.

    The frame is split 80/20 with ``random_state=42``. For every distinct
    ``stop_sequence`` value and for each of the day types ``'weekday'`` and
    ``'weekend'`` (matched against the ``day_of_week`` column), features are
    selected with ``RFECV`` (``step=1``, 5-fold ``KFold``) around a
    ``LinearRegression``, and a ``LinearRegression`` is then refitted on the
    selected columns only. Each model is scored on the matching slice of the
    test split.

    Two pooled metrics are printed:

    * ``R^2`` = 1 - (sum of squared residuals over all slices) /
      (sum of squared deviations from the mean of the *whole* test split).
    * ``MAE`` = per-slice MAE averaged with the slice sizes as weights.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``arrival_delay`` (the target), ``stop_sequence``,
        ``day_of_week``, ``time_of_day``, ``weather`` and ``temperature``.
        ``arrival_delay``, ``day_of_week``, ``time_of_day``, ``weather`` and
        ``temperature`` are dropped before fitting; every other column is a
        candidate feature.

    Returns
    -------
    tuple
        ``(models, mse_values, mae_values, selectors)``; each is a dict keyed
        ``[stop_sequence][day_type]``. A pair whose training slice has fewer
        rows than the number of CV folds is skipped entirely; a pair whose
        test slice is empty gets a model and selector but no MSE/MAE entry.
    """
    target_col = 'arrival_delay'
    # Columns that never enter the design matrix (target plus excluded descriptors).
    non_feature_cols = [target_col, 'day_of_week', 'time_of_day', "weather", "temperature"]
    n_splits = 5

    # Split the entire dataset into training and test sets
    df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

    # Reference mean for the total sum of squares: taken over the whole test
    # split so the per-slice TSS terms add up to one pooled R^2.
    y_mean = df_test[target_col].mean()

    # Dictionaries to store models, selectors and per-slice scores
    models = {}
    selectors = {}
    mse_values = {}
    mae_values = {}

    weighted_mae_sum = 0.0
    total_samples = 0
    overall_ssr = 0.0
    overall_tss = 0.0

    for stop_seq in df['stop_sequence'].unique():
        models[stop_seq] = {}
        selectors[stop_seq] = {}
        mse_values[stop_seq] = {}
        mae_values[stop_seq] = {}

        # Filter by stop once; the day-type filter below works on these smaller frames.
        train_at_stop = df_train[df_train['stop_sequence'] == stop_seq]
        test_at_stop = df_test[df_test['stop_sequence'] == stop_seq]

        for day_type in ['weekday', 'weekend']:
            df_train_subset = train_at_stop[train_at_stop['day_of_week'] == day_type]
            df_test_subset = test_at_stop[test_at_stop['day_of_week'] == day_type]

            # K-fold CV cannot be run with fewer rows than folds.
            if len(df_train_subset) < n_splits:
                print(f'Skipping stop_sequence={stop_seq} ({day_type}): '
                      f'{len(df_train_subset)} training rows, need at least {n_splits}')
                continue

            # Train model with RFECV
            x_train = df_train_subset.drop(non_feature_cols, axis=1)
            y_train = df_train_subset[target_col]

            model = LinearRegression()
            selector = RFECV(estimator=model, step=1, cv=KFold(n_splits))
            selector = selector.fit(x_train, y_train)

            # Fit the final model on the selected features only
            model.fit(x_train.iloc[:, selector.support_], y_train)

            # Store the trained model and selector
            models[stop_seq][day_type] = model
            selectors[stop_seq][day_type] = selector

            # Nothing to score against: keep the model but record no metrics.
            if df_test_subset.empty:
                print(f'No test rows for stop_sequence={stop_seq} ({day_type}); model not evaluated')
                continue

            # Evaluate the model on the same selected columns
            x_test = df_test_subset.drop(non_feature_cols, axis=1).iloc[:, selector.support_]
            y_test = df_test_subset[target_col]

            y_pred = model.predict(x_test)
            mse = mean_squared_error(y_test, y_pred)
            mae = mean_absolute_error(y_test, y_pred)

            sample_count = len(y_test)
            weighted_mae_sum += mae * sample_count
            total_samples += sample_count

            # Vectorised sums (Series.sum) instead of the builtin sum(), which
            # would iterate over the Series one element at a time in Python.
            residuals = y_test - y_pred
            overall_ssr += (residuals ** 2).sum()
            overall_tss += ((y_test - y_mean) ** 2).sum()

            mse_values[stop_seq][day_type] = mse
            mae_values[stop_seq][day_type] = mae

    # Guard the degenerate cases (nothing evaluated, or a constant target).
    overall_r2 = 1 - (overall_ssr / overall_tss) if overall_tss else float('nan')
    overall_mae = weighted_mae_sum / total_samples if total_samples else float('nan')

    print(f'R^2: {overall_r2}')
    print(f'MAE: {overall_mae}')

    return models, mse_values, mae_values, selectors

# Example usage: train the models and print the pooled R^2 / MAE.
# NOTE(review): this passes the unfiltered `df`; the outlier-filtered
# `df_no_outliers` built in an earlier cell is not used here — confirm
# that training and scoring on the raw data is intended.
trained_models, mse_values, mae_values, selectors = train_and_evaluate(df)


R^2: 0.9898332108526456
MAE: 12.253244291154639


In [18]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_selection import RFECV

def train_and_evaluate(df):
    """Fit one linear model per day type and print pooled metrics.

    The frame is split 80/20 with ``random_state=42``. For each of the day
    types ``'weekday'`` and ``'weekend'`` (matched against the ``day_of_week``
    column), features are selected with ``RFECV`` (``step=1``, 5-fold
    ``KFold``) around a ``LinearRegression``, and a ``LinearRegression`` is
    then refitted on the selected columns only. Each model is scored on the
    matching slice of the test split.

    Two pooled metrics are printed:

    * ``R^2`` = 1 - (sum of squared residuals over both slices) /
      (sum of squared deviations from the mean of the *whole* test split).
    * ``MAE`` = per-slice MAE averaged with the slice sizes as weights.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``arrival_delay`` (the target), ``day_of_week``,
        ``time_of_day``, ``weather`` and ``temperature``. Those five columns
        are dropped before fitting; every other column is a candidate feature.

    Returns
    -------
    tuple
        ``(models, mse_values, mae_values, selectors)``; each is a dict keyed
        by day type. A day type whose training slice has fewer rows than the
        number of CV folds is skipped entirely; a day type whose test slice
        is empty gets a model and selector but no MSE/MAE entry.
    """
    target_col = 'arrival_delay'
    # Columns that never enter the design matrix (target plus excluded descriptors).
    non_feature_cols = [target_col, 'day_of_week', 'time_of_day', "weather", "temperature"]
    n_splits = 5

    # Split the entire dataset into training and test sets
    df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

    # Reference mean for the total sum of squares: taken over the whole test
    # split so the per-slice TSS terms add up to one pooled R^2.
    y_mean = df_test[target_col].mean()

    # Dictionaries to store models, selectors and per-slice scores
    models = {}
    selectors = {}
    mse_values = {}
    mae_values = {}

    weighted_mae_sum = 0.0
    total_samples = 0
    overall_ssr = 0.0
    overall_tss = 0.0

    def train_model(df_train_subset):
        """Run RFECV on the slice, then refit a LinearRegression on the kept columns."""
        # drop() already returns a new frame, so no defensive copy is needed.
        x_train = df_train_subset.drop(non_feature_cols, axis=1)
        y_train = df_train_subset[target_col]

        model = LinearRegression()
        selector = RFECV(estimator=model, step=1, cv=KFold(n_splits))
        selector = selector.fit(x_train, y_train)

        # Fit model with selected features
        model.fit(x_train.iloc[:, selector.support_], y_train)
        return model, selector

    def evaluate_model(model, selector, df_test_subset):
        """Score the model on one test slice and add its terms to the pooled totals."""
        nonlocal weighted_mae_sum, total_samples, overall_ssr, overall_tss

        x_test = df_test_subset.drop(non_feature_cols, axis=1).iloc[:, selector.support_]
        y_test = df_test_subset[target_col]

        y_pred = model.predict(x_test)
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)

        sample_count = len(y_test)
        weighted_mae_sum += mae * sample_count
        total_samples += sample_count

        # Vectorised sums (Series.sum) instead of the builtin sum(), which
        # would iterate over the Series one element at a time in Python.
        residuals = y_test - y_pred
        overall_ssr += (residuals ** 2).sum()
        overall_tss += ((y_test - y_mean) ** 2).sum()

        return mse, mae

    # Train and evaluate one model per day type
    for day_type in ['weekday', 'weekend']:
        df_train_subset = df_train[df_train['day_of_week'] == day_type]
        df_test_subset = df_test[df_test['day_of_week'] == day_type]

        # K-fold CV cannot be run with fewer rows than folds.
        if len(df_train_subset) < n_splits:
            print(f'Skipping {day_type}: {len(df_train_subset)} training rows, '
                  f'need at least {n_splits}')
            continue

        models[day_type], selectors[day_type] = train_model(df_train_subset)

        # Nothing to score against: keep the model but record no metrics.
        if df_test_subset.empty:
            print(f'No test rows for {day_type}; model not evaluated')
            continue

        mse_values[day_type], mae_values[day_type] = evaluate_model(
            models[day_type], selectors[day_type], df_test_subset
        )

    # Guard the degenerate cases (nothing evaluated, or a constant target).
    overall_r2 = 1 - (overall_ssr / overall_tss) if overall_tss else float('nan')
    overall_mae = weighted_mae_sum / total_samples if total_samples else float('nan')

    print(f'R^2: {overall_r2}')
    print(f'MAE: {overall_mae}')

    return models, mse_values, mae_values, selectors

# Example usage:
# trained_models, mse_values, mae_values, selectors = train_and_evaluate(df)


In [19]:
# Run train_and_evaluate on the unfiltered `df`. The four result names are the
# same ones assigned by the earlier call cell, so those earlier results are
# overwritten here.
trained_models, mse_values, mae_values, selectors = train_and_evaluate(df)
# Dump the four returned containers as plain text.
print(trained_models, mse_values, mae_values, selectors)

R^2: 0.9894137852157262
MAE: 12.57815776661747
{'weekday': LinearRegression(), 'weekend': LinearRegression()} {'weekday': 407.542846370922, 'weekend': 357.58299995041546} {'weekday': 13.061066358193196, 'weekend': 11.000269491156867} {'weekday': RFECV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
      estimator=LinearRegression()), 'weekend': RFECV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
      estimator=LinearRegression())}
