In [None]:
!pip install calplot pandas



In [None]:
import pandas as pd

#Plot
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
import calendar
import calplot # actually used

# Score model
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Model
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor


# **Load and visualize data**

In [None]:
# Location of the raw CSV in the course repository.
url = (
    "https://raw.githubusercontent.com/zhenliangma/Applied-AI-in-Transportation/"
    "main/ProjectAssignmentData/Dataset-PT.csv"
)

# header=1: column names are read from the second line of the file.
df = pd.read_csv(url, header=1)


In [None]:
# Author's notes on the raw data: `df.size` noted as 545104 rows; the records
# run from Jan 8 to June 30.
# NOTE(review): `DataFrame.size` counts cells, not rows - confirm the row count
# with `len(df)`.
df.head()

In [None]:
# Number of distinct bus identifiers (missing values are not counted).
unique_count = len(df['bus_id'].dropna().unique())

# Report the count.
print(f"Number of different numbers in the column: {unique_count}")

In [None]:
# All rows whose arrival delay equals the largest delay in the frame.
max_delay_row = df.loc[df['arrival_delay'].eq(df['arrival_delay'].max())]
print(max_delay_row)

In [None]:
# Print a summary of the frame: column names, dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Keep only rows whose arrival delay lies in [-200, 1000] (both ends inclusive).
df_filtered = df[(df['arrival_delay'] >= -200) & (df['arrival_delay'] <= 1000)]

plt.figure(figsize=(12, 6))

# Split violins: one half per day-of-week category, for each time-of-day bucket.
sns.violinplot(
    data=df_filtered,
    x='time_of_day',
    y='arrival_delay',
    hue='day_of_week',
    order=['OP', 'MP', 'AP'],
    hue_order=['weekday', 'weekend'],
    split=True,
    inner=None,
    palette="pastel",
)

# Overlay the group means as unconnected diamond markers, without error bars.
sns.pointplot(
    data=df_filtered,
    x='time_of_day',
    y='arrival_delay',
    hue='day_of_week',
    order=['OP', 'MP', 'AP'],
    hue_order=['weekday', 'weekend'],
    dodge=0.532,
    join=False,
    palette="dark",
    markers="D",
    scale=0.75,
    ci=None,
)

plt.title('Arrival Delay Distribution by Time of Day and Day of Week with Mean')
plt.legend(title='Day of Week')
plt.show()


In [None]:
# Show the categories present in the two raw columns before removing them.
print(df["weather"].unique())
print(df["temperature"].unique())

# Drop both raw columns in one call; per the original note the same information
# is already available elsewhere in the frame (presumably the `factor(...)`
# indicator columns - confirm).
df = df.drop(columns=['temperature', 'weather'])


In [None]:
# Distinct identifiers and time categories present in the data.
print("routeid",df["route_id"].unique())
print("bus id",df["bus_id"].unique())

print("time of day",df["time_of_day"].unique())
print("day of week",df["day_of_week"].unique())

# Drop both raw categorical columns in one call; per the original note the same
# information is already available elsewhere in the frame (presumably the
# `factor(...)` indicator columns - confirm).
df = df.drop(columns=['time_of_day', 'day_of_week'])



In [None]:
# Remaining column names after the drops above, as a plain Python list.
column_titles_list = list(df.columns)
print(column_titles_list)

In [None]:
# Apply seaborn's default theme to the figures that follow.
sns.set_theme()

# Keep only delays within [-1000, 1000] (both ends inclusive).
filtered_df = df[df['arrival_delay'].between(-1000, 1000)]

# 30-bin histogram of the arrival delay with a KDE curve on top.
plt.figure(figsize=(5,3))
sns.histplot(filtered_df['arrival_delay'], bins=30, kde=True, color="#ff5722")
plt.xlabel('Arrival Delay')
plt.ylabel('Frequency')
plt.title('Arrival Delay Histogram')
plt.show()


In [None]:
# Count the distinct calendar days on which snow was recorded.
# The frame holds many rows per day (the author's notes mention ~545k rows over
# roughly six months), so summing the indicator column would count snowy
# *records*, not snowy *days*.
# Assumes `factor(weather)Snow` is a 0/1 indicator - TODO confirm.
snow_days_count = df.loc[df['factor(weather)Snow'] == 1, 'Calendar_date'].nunique()

print(f"Number of snow days: {snow_days_count}")

In [None]:
# Preview the first rows of the frame after the raw categorical columns were dropped.
df.head()

In [None]:
# Parse `Calendar_date` with the %Y%m%d format into a new `Date` column and make
# it the index, giving the frame the datetime index used for the calendar plots.
df = df.assign(Date=pd.to_datetime(df['Calendar_date'], format='%Y%m%d')).set_index('Date')

# Calendar heatmap of the cold-temperature indicator.
calplot.calplot(
    df['factor(temperature)Cold'],
    cmap='Blues',
    edgecolor='lightgray',
    suptitle="Cold Days",
    linewidth=1,
)


In [None]:
# Calendar heatmap of arrival delay, one cell per calendar day.
# NOTE(review): no `how=` is passed, so calplot's default per-day aggregation is
# used (documented as a sum, which grows with the number of records on a day) -
# confirm whether `how='mean'` was intended.
calplot.calplot(df['arrival_delay'], cmap='Reds', edgecolor='lightgray', suptitle="Delay", linewidth=1)


# **Create a baseline model (Mean)**
    Using the whole dataset and its features

In [None]:

# Baseline model: predict the overall mean arrival delay for every row of `df`.
mean_delay = df['arrival_delay'].mean()

# Store the constant prediction alongside the actual values.
df['predicted_delay'] = mean_delay

# Score the constant prediction against the actual delays.
mse = mean_squared_error(df['arrival_delay'], df['predicted_delay'])
mae = mean_absolute_error(df['arrival_delay'], df['predicted_delay'])
r2 = r2_score(df['arrival_delay'], df['predicted_delay'])

print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
# R^2 is not very informative for a constant predictor, but is reported for completeness.
print(f"R^2 Score: {r2}")

# COMMENT FOR REPORT
# R^2 = 1 - SS_res / SS_tot, where
#   SS_res = sum((actual - predicted)^2)
#   SS_tot = sum((actual - mean)^2)
# The prediction is the mean itself, so SS_res equals SS_tot and R^2 is 0.


### Feature engineering

In [None]:
# Per-bus average arrival delay, broadcast back onto every row of that bus.
# NOTE(review): this is derived from the target column over the whole frame,
# i.e. before any train/test split, so it carries target information.
df['mean_bus_delay'] = df['arrival_delay'].groupby(df['bus_id']).transform('mean')

### Regression

In [None]:
# Feature matrix and target.
# `predicted_delay` is the constant output of the mean baseline, not an input
# feature, so it is excluded. errors='ignore' keeps the cell working when that
# column is absent; a missing `arrival_delay` still fails on the next line.
X = df.drop(columns=['arrival_delay', 'predicted_delay'], errors='ignore')
y = df['arrival_delay']

# Hold out 30% of the rows for testing; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# `mean_bus_delay` was computed from the target over *all* rows, so held-out
# targets leak into the feature. Re-derive it from the training rows only and
# map those per-bus means onto both splits; buses unseen in training fall back
# to the overall training mean.
if 'mean_bus_delay' in X_train.columns:
    train_bus_means = y_train.groupby(X_train['bus_id'].to_numpy()).mean()
    X_train = X_train.assign(
        mean_bus_delay=X_train['bus_id'].map(train_bus_means).to_numpy()
    )
    X_test = X_test.assign(
        mean_bus_delay=X_test['bus_id'].map(train_bus_means).fillna(y_train.mean()).to_numpy()
    )

# Define Ridge regression model
ridge = Ridge(alpha=1.0)  # Change alpha as needed

# Not used in the cells visible here; kept in case a later cell relies on it.
model = LinearRegression()

# Recursive feature elimination with 5-fold cross-validation, removing one
# feature per step.
selector = RFECV(estimator=ridge, step=1, cv=5)
selector = selector.fit(X_train, y_train)

# Training matrix restricted to the features RFECV kept.
X_train_selected = selector.transform(X_train)

print("Number of best features: ", selector.n_features_)
print("Best features: ", X_train.columns[selector.support_])


In [None]:

# Fit and score Ridge on the feature subset chosen by RFECV. Fitting on every
# column of X_train would make the selection step above have no effect on the
# reported scores.
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

ridge.fit(X_train_selected, y_train)
predictions = ridge.predict(X_test_selected)

mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

# Adjusted R^2 penalises R^2 by the number of predictors actually used (k)
# relative to the number of test observations (n).
n = len(y_test)
k = X_test_selected.shape[1]
adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - k - 1)

print(f'Mean Squared Error (MSE): {mse}')
print(f'R-squared (R2 ): {r2}')
print(f'Adjusted R-squared: {adjusted_r2}')


### Decision Tree Regression

In [None]:
# Fit a decision tree (fixed seed) on the training split and predict the test split.
regressor = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
y_pred = regressor.predict(X_test)

# Score the predictions on the held-out rows.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")


### Split weekend & weekday

In [None]:
# # Splitting the DataFrame into weekend and weekday DataFrames
# # For weekend
# df_weekend = df[df['factor(day_of_week)weekend'] == 1].copy()
# df_weekend.drop(columns=['factor(day_of_week)weekend', 'factor(day_of_week)weekday'], inplace=True)
# # For weekday
# df_weekday = df[df['factor(day_of_week)weekday'] == 1].copy()
# df_weekday.drop(columns=['factor(day_of_week)weekend', 'factor(day_of_week)weekday'], inplace=True)

# **Model 1: Weekday Model**
    Using only data during weekdays and specified particular variables

In [None]:
# Keep only weekday observations for the weekday model.
# The raw `day_of_week` column is dropped earlier in the notebook, so filtering
# on it raises KeyError; select on its indicator column instead.
# Assumes `factor(day_of_week)weekday` is a 0/1 indicator - TODO confirm.
# `.copy()` gives an independent frame that can be modified without touching `df`.
df_weekday = df[df['factor(day_of_week)weekday'] == 1].copy()

# **Model 2: Weekend Model**
    Using only data during weekend and specified particular variables