In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings(action="ignore")

In [15]:
metro_dataset = pd.read_csv("../dataset/metro_passenger_flow.csv")
# Perform one-hot encoding on station_name and line_number columns
encoded_columns = pd.get_dummies(metro_dataset[['station_name', 'line_number']], dtype=int)

# Replace the original columns with the encoded columns
metro_dataset = pd.concat([metro_dataset.drop(['station_name', 'line_number'], axis=1), encoded_columns], axis=1,)

# Print the updated dataset
metro_dataset['is_crowed_station'] = metro_dataset['is_crowed_station'].astype(int)

In [16]:
metro_dataset

Unnamed: 0,timestamp,input_count,output_count,current_in_line_passengers,crowed_time_rate,is_crowed_station,is_weekend,is_holiday,station_name_Azadi,station_name_Basij,...,station_name_Shariati,station_name_Shemiran,station_name_Tajrish,station_name_Theather,station_name_Valiasr,station_name_ferdowsi,line_number_Line1,line_number_Line2,line_number_Line3,line_number_Line4
0,2024-01-01 06:00:00,27,0,27,0.8,1,1,1,0,0,...,0,0,1,0,0,0,1,0,0,0
1,2024-01-01 06:06:00,15,4,38,0.8,0,1,1,0,0,...,1,0,0,0,0,0,1,0,0,0
2,2024-01-01 06:12:00,12,14,36,0.8,0,1,1,0,0,...,0,0,0,0,0,0,1,0,0,0
3,2024-01-01 06:18:00,6,14,28,0.8,1,1,1,0,0,...,0,0,0,0,0,0,1,0,0,0
4,2024-01-01 06:24:00,8,19,17,0.8,1,1,1,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4747,2024-01-01 22:06:00,8,15,30,0.8,1,1,1,0,0,...,0,0,0,1,0,0,0,0,0,1
4748,2024-01-01 22:12:00,13,8,35,0.8,0,1,1,0,0,...,0,0,0,0,0,1,0,0,0,1
4749,2024-01-01 22:18:00,8,7,36,0.8,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4750,2024-01-01 22:24:00,9,9,36,0.8,0,1,1,0,0,...,0,1,0,0,0,0,0,0,0,1


In [31]:
from datetime import datetime
def update_crowded_rate(timestamp):
    timestamp = datetime.strptime(timestamp, "%Y-%m-%d %H:%M:%S")
    if timestamp.hour > 6 and timestamp.hour < 8:
        return 0.8
    elif timestamp.hour >= 8 and timestamp.hour < 12:
        return 1.2
    elif timestamp.hour >= 12 and timestamp.hour < 16:
        return 1
    elif timestamp.hour >= 16 and timestamp.hour < 20:
        return 1.2
    else:
        return 0.8

metro_dataset['timestamp'] = metro_dataset['timestamp'].apply(update_crowded_rate)

In [35]:
train_data, test_data = train_test_split(metro_dataset, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)

# Extract the target column (input_count)
train_target = train_data['input_count']
val_target = val_data['input_count']
test_target = test_data['input_count']

In [36]:
# Perform polynomial feature transformation
poly = PolynomialFeatures(degree=2)
train_data_poly = poly.fit_transform(train_data.drop('input_count', axis=1))
val_data_poly = poly.transform(val_data.drop('input_count', axis=1))
test_data_poly = poly.transform(test_data.drop('input_count', axis=1))

In [None]:
# Linear Regression
linear_model = LinearRegression()
linear_model.fit(train_data_poly, train_target)
linear_predictions = linear_model.predict(val_data_poly)
linear_rmse = np.sqrt(mean_squared_error(val_target, linear_predictions))

In [39]:
# XGBoost
xgb_model = xgb.XGBRegressor()
xgb_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5]
}
xgb_grid = GridSearchCV(xgb_model, xgb_params, scoring='neg_root_mean_squared_error', cv=3)
xgb_grid.fit(train_data_poly, train_target)
xgb_model = xgb_grid.best_estimator_
xgb_predictions = xgb_model.predict(val_data_poly)
xgb_rmse = np.sqrt(mean_squared_error(val_target, xgb_predictions))

In [None]:
# SVM
svm_model = SVR()
svm_params = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf']
}
svm_grid = GridSearchCV(svm_model, svm_params, scoring='neg_root_mean_squared_error', cv=3)
svm_grid.fit(train_data_poly, train_target)
svm_model = svm_grid.best_estimator_
svm_predictions = svm_model.predict(val_data_poly)
svm_rmse = np.sqrt(mean_squared_error(val_target, svm_predictions))

In [None]:
# KNN Regression
knn_model = KNeighborsRegressor()
knn_params = {
    'n_neighbors': [3, 5, 7]
}
knn_grid = GridSearchCV(knn_model, knn_params, scoring='neg_root_mean_squared_error', cv=3)
knn_grid.fit(train_data_poly, train_target)
knn_model = knn_grid.best_estimator_
knn_predictions = knn_model.predict(val_data_poly)
knn_rmse = np.sqrt(mean_squared_error(val_target, knn_predictions))

In [None]:
# Random Forest Regression
rf_model = RandomForestRegressor()
rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5]
}
rf_grid = GridSearchCV(rf_model, rf_params, scoring='neg_root_mean_squared_error', cv=3)
rf_grid.fit(train_data_poly, train_target)
rf_model = rf_grid.best_estimator_
rf_predictions = rf_model.predict(val_data_poly)
rf_rmse = np.sqrt(mean_squared_error(val_target, rf_predictions))

In [32]:


# Save the models and their RMSE values
models = {
    'XGBoost': xgb_model,
    'SVM': svm_model,
    'Linear Regression': linear_model,
    'KNN Regression': knn_model,
    'Random Forest Regression': rf_model
}

rmse_values = {
    'XGBoost': xgb_rmse,
    'SVM': svm_rmse,
    'Linear Regression': linear_rmse,
    'KNN Regression': knn_rmse,
    'Random Forest Regression': rf_rmse
}

# Print the RMSE values
for model_name, rmse in rmse_values.items():
    print(f"{model_name} RMSE: {rmse}")

# Save the models and their RMSE values
for model_name, model in models.items():
    model.save_model(f"{model_name}.model")

Unnamed: 0,timestamp,input_count,output_count,current_in_line_passengers,crowed_time_rate,is_crowed_station,is_weekend,is_holiday,station_name_Azadi,station_name_Basij,...,station_name_Shariati,station_name_Shemiran,station_name_Tajrish,station_name_Theather,station_name_Valiasr,station_name_ferdowsi,line_number_Line1,line_number_Line2,line_number_Line3,line_number_Line4
0,0.8,27,0,27,0.8,1,1,1,0,0,...,0,0,1,0,0,0,1,0,0,0
1,0.8,15,4,38,0.8,0,1,1,0,0,...,1,0,0,0,0,0,1,0,0,0
2,0.8,12,14,36,0.8,0,1,1,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0.8,6,14,28,0.8,1,1,1,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0.8,8,19,17,0.8,1,1,1,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4747,0.8,8,15,30,0.8,1,1,1,0,0,...,0,0,0,1,0,0,0,0,0,1
4748,0.8,13,8,35,0.8,0,1,1,0,0,...,0,0,0,0,0,1,0,0,0,1
4749,0.8,8,7,36,0.8,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4750,0.8,9,9,36,0.8,0,1,1,0,0,...,0,1,0,0,0,0,0,0,0,1
