In [1]:
import pandas as pd

#Plot
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
import calendar 
import calplot # actually used

# Score model
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Model
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from xgboost import XGBRegressor





In [2]:
url = "https://raw.githubusercontent.com/zhenliangma/Applied-AI-in-Transportation/main/ProjectAssignmentData/Dataset-PT.csv"
df = pd.read_csv(url,header=1)
df.head(5)
# df.size 545104 rows

Unnamed: 0,Calendar_date,route_id,bus_id,stop_sequence,arrival_delay,dwell_time,travel_time_for_previous_section,scheduled_travel_time,upstream_stop_delay,origin_delay,...,factor(weather)Rain,factor(weather)Snow,factor(temperature)Cold,factor(temperature)Extra_cold,factor(temperature)Normal,factor(day_of_week)weekday,factor(day_of_week)weekend,factor(time_of_day)Afternoon_peak,factor(time_of_day)Morning_peak,factor(time_of_day)Off-peak
0,20220108,4,41344,1,151,0,0,120,100,100,...,0,0,0,0,1,0,1,0,0,1
1,20220108,4,41344,2,185,24,171,45,151,100,...,0,0,0,0,1,0,1,0,0,1
2,20220108,4,41344,3,186,0,55,41,185,100,...,0,0,0,0,1,0,1,0,0,1
3,20220108,4,41344,4,202,12,42,94,186,100,...,0,0,0,0,1,0,1,0,0,1
4,20220108,4,41344,5,242,21,98,86,202,100,...,0,0,0,0,1,0,1,0,0,1


In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

def stop_model(df):
    weighted_mae_sum=0
    total_samples=0
    overall_ssr=0
    overall_tss=0
    # Split the entire dataset into training and test sets
    df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
    y_mean = df_test['arrival_delay'].mean()
    # Dictionary to store models
    models = {}
    mse_values = {}
    mae_values = {}

    for i in range(1, 28):

        desired_stop_sequence = i
        mask_train = df_train['stop_sequence'] == desired_stop_sequence
        df_stop_train = df_train[mask_train]

        df_stop_weekday_train = df_stop_train[df_stop_train['day_of_week'] == 'weekday'].copy()

        subset_M1_RG_train = df_stop_weekday_train[['traffic_condition', 'recurrent_delay']].copy()
        subset_M1_RG_train['traffic_condition'] = subset_M1_RG_train['traffic_condition'].astype(int)
        subset_M1_RG_train['recurrent_delay'] = subset_M1_RG_train['recurrent_delay'].astype(int)

        df_stop_weekday_train.loc[:, ['traffic_condition', 'recurrent_delay']] = subset_M1_RG_train.values
        x_train = df_stop_weekday_train.drop(['arrival_delay', 'day_of_week', 'time_of_day', "weather", "temperature"], axis=1)
        y_train = df_stop_weekday_train['arrival_delay']

        # Train the model
        model = LinearRegression()
        model.fit(x_train, y_train)

        # Store the trained model in the dictionary
        models[desired_stop_sequence] = model

        # Now predict and evaluate on test data specific to this stop
        mask_test = df_test['stop_sequence'] == desired_stop_sequence
        df_stop_test = df_test[mask_test]

        df_stop_weekday_test = df_stop_test[df_stop_test['day_of_week'] == 'weekday'].copy()

        subset_M1_RG_test = df_stop_weekday_test[['traffic_condition', 'recurrent_delay']].copy()
        subset_M1_RG_test['traffic_condition'] = subset_M1_RG_test['traffic_condition'].astype(int)
        subset_M1_RG_test['recurrent_delay'] = subset_M1_RG_test['recurrent_delay'].astype(int)

        df_stop_weekday_test.loc[:, ['traffic_condition', 'recurrent_delay']] = subset_M1_RG_test.values
        x_test = df_stop_weekday_test.drop(['arrival_delay', 'day_of_week', 'time_of_day', "weather", "temperature"], axis=1)
        y_test = df_stop_weekday_test['arrival_delay']
        
        
        y_pred = model.predict(x_test)
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)

        mse_values[desired_stop_sequence] = mse
        mae_values[desired_stop_sequence] = mae

        #print(f'Stop Sequence {i} - MSE: {mse}, MAE: {mae}')
        sample_count = len(y_test)
        weighted_mae_sum += mae * sample_count
        
        total_samples += sample_count
        residuals = y_test - y_pred
        ssr = sum(residuals**2)
        tss = sum((y_test - y_mean)**2)
        
        overall_ssr += ssr
        overall_tss += tss

    overall_r2 = 1 - (overall_ssr / overall_tss)
    overall_mae = weighted_mae_sum / total_samples

    print(f'R^2: {overall_r2}')
    print(f'MAE: {overall_mae}')
    return models, mse_values, mae_values

# Example usage:
# trained_models, mse_values, mae_values = train_and_evaluate(df)


In [4]:
# After training...
trained_models, mse_values, mae_values = stop_model(df)

# Save the models to a file
import joblib
joblib.dump(trained_models, 'all_stop_model.pkl')


R^2: 0.9881725458416755
MAE: 12.742722547238778


['all_stop_model.pkl']

In [5]:
import joblib

def predict_for_stop(input_data, stop_sequence):

    # Load the models
    loaded_models = joblib.load('trained_models.pkl')

    # Ensure the model for the given stop_sequence exists
    if stop_sequence not in loaded_models:
        raise ValueError(f"No model trained for stop sequence {stop_sequence}")

    # Predict using the model for the specific stop_sequence
    predictions = loaded_models[stop_sequence].predict(input_data)

    return predictions

# Example usage:
# Let's say you have a DataFrame 'new_data' for stop_sequence 5:
# predicted_delays = predict_for_stop(new_data, 5)


In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
import numpy as np

def quantile_model(df):
    weighted_mae_sum = 0
    total_samples = 0
    overall_ssr = 0
    overall_tss = 0
    
    # Split the entire dataset into training and test sets
    df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
    y_mean = df_test['arrival_delay'].mean()
    
    # Dictionary to store models
    models = {}
    mse_values = {}
    mae_values = {}

    # Get the mean delay for each stop
    mean_delays = df.groupby('stop_sequence')['arrival_delay'].mean()

    # Divide stops into four quantiles based on mean delay
    quantiles = mean_delays.quantile([0.25, 0.5, 0.75, 1.0]).values

    prev_q = 0.0
    for q in quantiles:
        stops_in_quantile = mean_delays[(mean_delays <= q) & (mean_delays > prev_q)].index.tolist()

        # Aggregate training data for all stops in the quantile
        mask_train = df_train['stop_sequence'].isin(stops_in_quantile)
        df_quantile_train = df_train[mask_train]

        df_quantile_weekday_train = df_quantile_train[df_quantile_train['day_of_week'] == 'weekday'].copy()
        subset_M1_RG_train = df_quantile_weekday_train[['traffic_condition', 'recurrent_delay']]
        subset_M1_RG_train['traffic_condition'] = subset_M1_RG_train['traffic_condition'].astype(int)
        subset_M1_RG_train['recurrent_delay'] = subset_M1_RG_train['recurrent_delay'].astype(int)
        df_quantile_weekday_train.loc[:, ['traffic_condition', 'recurrent_delay']] = subset_M1_RG_train.values

        x_train = df_quantile_weekday_train.drop(['arrival_delay', 'day_of_week', 'time_of_day', "weather", "temperature"], axis=1)
        y_train = df_quantile_weekday_train['arrival_delay']

        # Train the model for the quantile
        model = LinearRegression()
        model.fit(x_train, y_train)

        # Store the trained model in the dictionary
        quantile_label = f"{prev_q}-{q}"
        models[quantile_label] = model

        # Aggregate test data for all stops in the quantile
        mask_test = df_test['stop_sequence'].isin(stops_in_quantile)
        df_quantile_test = df_test[mask_test]

        df_quantile_weekday_test = df_quantile_test[df_quantile_test['day_of_week'] == 'weekday'].copy()
        subset_M1_RG_test = df_quantile_weekday_test[['traffic_condition', 'recurrent_delay']]
        subset_M1_RG_test['traffic_condition'] = subset_M1_RG_test['traffic_condition'].astype(int)
        subset_M1_RG_test['recurrent_delay'] = subset_M1_RG_test['recurrent_delay'].astype(int)
        df_quantile_weekday_test.loc[:, ['traffic_condition', 'recurrent_delay']] = subset_M1_RG_test.values

        x_test = df_quantile_weekday_test.drop(['arrival_delay', 'day_of_week', 'time_of_day', "weather", "temperature"], axis=1)
        y_test = df_quantile_weekday_test['arrival_delay']
        
        y_pred = model.predict(x_test)
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        mse_values[quantile_label] = mse
        mae_values[quantile_label] = mae

        # Weighted MAE computation
        sample_count = len(y_test)
        weighted_mae_sum += mae * sample_count
        total_samples += sample_count
        
        # R^2 computation
        residuals = y_test - y_pred
        ssr = sum(residuals**2)
        tss = sum((y_test - y_mean)**2)
        overall_ssr += ssr
        overall_tss += tss

        # Update for the next iteration
        prev_q = q

    # Compute overall metrics
    overall_r2 = 1 - (overall_ssr / overall_tss)
    overall_mae = weighted_mae_sum / total_samples
    print(f'R^2: {overall_r2}')
    print(f'MAE: {overall_mae}')
    
    return models, mse_values, mae_values

# Example usage:
# trained_models, mse_values, mae_values = quantile_model(df)


In [14]:
# After training...
trained_models, mse_values, mae_values = quantile_model(df)
print(trained_models)
# Save the models to a file
joblib.dump(trained_models, 'quantile_model.pkl')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_M1_RG_train['traffic_condition'] = subset_M1_RG_train['traffic_condition'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_M1_RG_train['recurrent_delay'] = subset_M1_RG_train['recurrent_delay'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_M1_RG_test['tr

R^2: 0.9877781599008366
MAE: 13.016567759493558
{'0.0-106.82042201198672': LinearRegression(), '106.82042201198672-138.43167071177373': LinearRegression(), '138.43167071177373-158.43756501064937': LinearRegression(), '158.43756501064937-208.88776066174648': LinearRegression()}


['quantile_model.pkl']

In [21]:
# Dropping redundant or non-informative columns
columns_to_drop = ['weather', 'temperature', 'day_of_week', 'time_of_day']
df = df.drop(columns=columns_to_drop)

# Defining features and target
X = df.drop('arrival_delay', axis=1)
y = df['arrival_delay']

# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



model = LinearRegression()
model.fit(X_train, y_train)
predictions = model.predict(X_test)

mae = mean_absolute_error(y_test, predictions)
print(f"Mean Absolute Error for Linear Regression: {mae}")
from sklearn.ensemble import RandomForestRegressor

model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
model_rf.fit(X_train, y_train)
predictions_rf = model_rf.predict(X_test)

mae_rf = mean_absolute_error(y_test, predictions_rf)
print(f"Mean Absolute Error for Random Forest: {mae_rf}")
import xgboost as xgb

model_xgb = xgb.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75, colsample_bytree=1, max_depth=7)
model_xgb.fit(X_train,y_train)
predictions_xgb = model_xgb.predict(X_test)

mae_xgb = mean_absolute_error(y_test, predictions_xgb)
print(f"Mean Absolute Error for XGBoost: {mae_xgb}")


Mean Absolute Error for Linear Regression: 12.586594498676495
Mean Absolute Error for Random Forest: 12.989709780684455
Mean Absolute Error for XGBoost: 12.697480997451464
