In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from datetime import datetime

import warnings

# Suppress every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# Read the passenger-flow table from disk.
metro_dataset = pd.read_csv("../dataset/metro_passenger_flow.csv")

# One-hot encode the two categorical columns as 0/1 integer indicator columns.
encoded_columns = pd.get_dummies(
    metro_dataset[['station_name', 'line_number']],
    dtype=int,
)

# Swap the original categorical columns for their indicator columns.
metro_dataset = pd.concat(
    [metro_dataset.drop(columns=['station_name', 'line_number']), encoded_columns],
    axis=1,
)

In [3]:
# Read the train status table from disk.
train_status = pd.read_csv("../dataset/train_status.csv")

# Total 'current_passenger' over all rows that share a 'current_timestamp';
# the result is a Series indexed by 'current_timestamp'.
sum_passenger = (
    train_status
    .groupby('current_timestamp')['current_passenger']
    .sum()
)

Unnamed: 0,timestamp,input_count,output_count,crowed_time_rate,is_crowed_station,is_weekend,is_holiday,station_name_Azadi,station_name_Basij,station_name_Beheshti,...,station_name_Shariati,station_name_Shemiran,station_name_Tajrish,station_name_Theather,station_name_Valiasr,station_name_ferdowsi,line_number_Line1,line_number_Line2,line_number_Line3,line_number_Line4
0,2024-01-01 06:00:00,31,0,0.8,True,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
1,2024-01-01 06:06:00,9,6,0.8,False,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
2,2024-01-01 06:12:00,5,10,0.8,False,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
3,2024-01-01 06:18:00,11,12,0.8,True,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,2024-01-01 06:24:00,18,17,0.8,True,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3739,2024-01-01 21:36:00,8,9,0.8,True,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
3740,2024-01-01 21:42:00,10,12,0.8,False,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
3741,2024-01-01 21:48:00,5,19,0.8,True,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3742,2024-01-01 21:54:00,10,11,0.8,False,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1


In [23]:
# Collapse all rows sharing a timestamp into a single row: the in/out counts are
# summed, while the rate and the weekend/holiday flags are averaged.
grouped_metro_data = metro_dataset.groupby('timestamp').agg(
    input_count=('input_count', 'sum'),
    output_count=('output_count', 'sum'),
    crowed_time_rate=('crowed_time_rate', 'mean'),
    is_weekend=('is_weekend', 'mean'),
    is_holiday=('is_holiday', 'mean'),
)

In [31]:
# Columns carried forward into the modelling dataset (features plus the target).
columns_to_keep = [
    'timestamp',
    'input_count',
    'output_count',
    'crowed_time_rate',
    'is_weekend',
    'is_holiday',
    'current_passenger',
]

# Expose the passenger totals as a frame whose key column is named 'timestamp'.
sum_passenger_df = sum_passenger.reset_index()
sum_passenger_df = sum_passenger_df.rename(columns={'current_timestamp': 'timestamp'})

# Join the aggregated flow data with the passenger totals on 'timestamp'.
# pd.merge defaults to an inner join, so only timestamps present in both remain.
merged_data = pd.merge(grouped_metro_data, sum_passenger_df, on='timestamp')
passenger_dataset = merged_data[columns_to_keep]

In [35]:
from datetime import datetime
def update_crowded_rate(timestamp):
    """Return the hour-of-day rate for a timestamp string.

    Parameters
    ----------
    timestamp : str
        Timestamp formatted as "%Y-%m-%d %H:%M:%S".

    Returns
    -------
    float or int
        1.2 for hours 8-11 and 16-19, 1 for hours 12-15, and 0.8 for every
        other hour of the day.

    Raises
    ------
    ValueError
        If the string does not match the expected format.
    """
    hour = datetime.strptime(timestamp, "%Y-%m-%d %H:%M:%S").hour
    if 8 <= hour < 12 or 16 <= hour < 20:
        return 1.2
    if 12 <= hour < 16:
        return 1
    # Remaining hours: 0-7 and 20-23.
    return 0.8

# Rebuild the modelling frame from merged_data rather than assigning into the
# existing passenger_dataset slice. This avoids pandas' SettingWithCopyWarning and
# keeps the cell re-runnable: the source 'timestamp' strings are never overwritten,
# so a second execution does not feed floats to update_crowded_rate.
# NOTE(review): the hour-based rate is stored in the 'timestamp' column while
# 'crowed_time_rate' is left untouched - confirm this is the intended feature layout.
passenger_dataset = merged_data[columns_to_keep].assign(
    timestamp=merged_data['timestamp'].apply(update_crowded_rate)
)

In [36]:
# Hold out 20% of the rows for testing, then 20% of the remainder for validation.
# Both splits are seeded so the partition is the same on every run.
train_data, test_data = train_test_split(
    passenger_dataset, test_size=0.2, random_state=42
)
train_data, val_data = train_test_split(
    train_data, test_size=0.2, random_state=42
)

# Degree-2 polynomial expansion of every non-target column. The transformer is
# fitted on the training split only and then applied to validation and test.
poly = PolynomialFeatures(degree=2)
train_data_poly = poly.fit_transform(train_data.drop(columns='current_passenger'))
val_data_poly = poly.transform(val_data.drop(columns='current_passenger'))
test_data_poly = poly.transform(test_data.drop(columns='current_passenger'))

# Target column ('current_passenger') for each split.
train_target = train_data['current_passenger']
val_target = val_data['current_passenger']
test_target = test_data['current_passenger']
# Linear Regression: ordinary least squares on the polynomial features,
# scored on the validation split.
linear_model = LinearRegression().fit(train_data_poly, train_target)
linear_predictions = linear_model.predict(val_data_poly)
linear_r2 = r2_score(val_target, linear_predictions)
linear_rmse = np.sqrt(mean_squared_error(val_target, linear_predictions))
# XGBoost: 3-fold grid search over tree count and depth, selected by RMSE.
xgb_model = xgb.XGBRegressor()
xgb_params = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 4, 5],
)
xgb_grid = GridSearchCV(
    estimator=xgb_model,
    param_grid=xgb_params,
    scoring='neg_root_mean_squared_error',
    cv=3,
)
xgb_grid.fit(train_data_poly, train_target)

# Keep the best configuration found by the search and score it on the validation split.
xgb_model = xgb_grid.best_estimator_
xgb_predictions = xgb_model.predict(val_data_poly)
xgb_r2 = r2_score(val_target, xgb_predictions)
xgb_rmse = np.sqrt(mean_squared_error(val_target, xgb_predictions))
# Random Forest Regression: 3-fold grid search over tree count and depth, selected by RMSE.
# The forest is seeded (matching the seed used for the data splits) so that the selected
# hyperparameters and the reported validation metrics are reproducible across runs;
# an unseeded forest draws different bootstrap samples on every execution.
rf_model = RandomForestRegressor(random_state=42)
rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5]
}
rf_grid = GridSearchCV(rf_model, rf_params, scoring='neg_root_mean_squared_error', cv=3)
rf_grid.fit(train_data_poly, train_target)

# Keep the best configuration found by the search and score it on the validation split.
rf_model = rf_grid.best_estimator_
rf_predictions = rf_model.predict(val_data_poly)
rf_rmse = np.sqrt(mean_squared_error(val_target, rf_predictions))
rf_r2 = r2_score(val_target, rf_predictions)

# Collect the fitted models and their validation metrics by display name.
# (Nothing is written to disk here; these are in-memory lookups only.)
models = {
    'XGBoost': xgb_model,
    'Linear Regression': linear_model,
    'Random Forest Regression': rf_model,
}
rmse_values = {
    'XGBoost': xgb_rmse,
    'Linear Regression': linear_rmse,
    'Random Forest Regression': rf_rmse,
}
r2_scores = {
    'Linear Regression': linear_r2,
    'Random Forest Regression': rf_r2,
    'XGBoost': xgb_r2,
}

# Report validation RMSE and R-squared per model, in rmse_values insertion order.
for model_name in rmse_values:
    rmse, r2 = rmse_values[model_name], r2_scores[model_name]
    print(f"{model_name} RMSE: {rmse}")
    print(f"{model_name} R-squared: {r2}")

XGBoost RMSE: 51.65299131592512
XGBoost R-squared: 0.8486811157062791
Linear Regression RMSE: 53.612635848832205
Linear Regression R-squared: 0.8369816492029062
Random Forest Regression RMSE: 48.36787373133901
Random Forest Regression R-squared: 0.8673167294790962


In [37]:
import pickle

file_name = "current_passengers_model_xgb.pkl"

# Persist the tuned XGBoost model. The context manager guarantees the file handle is
# flushed and closed even if pickling raises, instead of leaving an open handle behind.
# The model was fitted on the degree-2 polynomial features, so callers must apply the
# same PolynomialFeatures transform before predicting; the transformer is not part of
# this pickle.
# NOTE(review): the recorded validation output shows Random Forest with the lower RMSE,
# yet the XGBoost model is the one saved - confirm this choice is intentional.
with open(file_name, "wb") as model_file:
    pickle.dump(xgb_model, model_file)