In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from datetime import datetime

import warnings

# Suppress every warning category for the remainder of the session.
warnings.filterwarnings(action="ignore")

# Load the raw passenger-flow records from the project dataset directory.
metro_dataset = pd.read_csv("../dataset/metro_passenger_flow.csv")

# Expand the station / line identifiers into integer indicator columns.
# NOTE(review): get_dummies only expands object/string/category columns by
# default, so a numeric 'line_number' would pass through unchanged — confirm
# the column dtype in the CSV.
categorical_columns = ['station_name', 'line_number']
encoded_columns = pd.get_dummies(metro_dataset[categorical_columns], dtype=int)

# Swap the original identifier columns for their encoded counterparts.
metro_dataset = pd.concat(
    [metro_dataset.drop(columns=categorical_columns), encoded_columns],
    axis=1,
)

# Store the crowded-station flag as an integer so it can be used as a feature.
metro_dataset['is_crowed_station'] = metro_dataset['is_crowed_station'].astype(int)

def update_crowded_rate(timestamp):
    """Map a timestamp string to a time-of-day crowding multiplier.

    Parameters
    ----------
    timestamp : str
        Date-time text in the form ``"%Y-%m-%d %H:%M:%S"``.

    Returns
    -------
    float or int
        ``1.2`` for hours 8-11 and 16-19, ``1`` for hours 12-15,
        and ``0.8`` for every other hour of the day.

    Raises
    ------
    ValueError
        If ``timestamp`` does not match the expected format.
    """
    hour = datetime.strptime(timestamp, "%Y-%m-%d %H:%M:%S").hour

    # Morning (8-11) and late-afternoon (16-19) windows share the top rate.
    if 8 <= hour < 12 or 16 <= hour < 20:
        return 1.2
    # Midday window.
    if 12 <= hour < 16:
        return 1
    # Everything else (early morning, evening, night) gets the low rate.
    return 0.8


# Convert each raw timestamp string into its time-of-day crowding multiplier.
# The column keeps the name 'timestamp' but holds a numeric rate afterwards.
metro_dataset['timestamp'] = metro_dataset['timestamp'].apply(update_crowded_rate)

# Hold out 20% of the rows for testing, then 20% of the remainder for validation.
train_data, test_data = train_test_split(metro_dataset, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)

# Separate the regression target from each split.
target_column = 'current_in_line_passengers'
train_target, val_target, test_target = (
    split[target_column] for split in (train_data, val_data, test_data)
)

# Expand the remaining columns into degree-2 polynomial features. The
# transformer is fitted on the training split only and then reused as-is
# for the validation and test splits.
poly = PolynomialFeatures(degree=2)
train_data_poly = poly.fit_transform(train_data.drop(columns=target_column))
val_data_poly = poly.transform(val_data.drop(columns=target_column))
test_data_poly = poly.transform(test_data.drop(columns=target_column))
# Linear Regression
# Ordinary least squares on the polynomial features, scored on the validation split.
linear_model = LinearRegression().fit(train_data_poly, train_target)
linear_predictions = linear_model.predict(val_data_poly)

linear_mse = mean_squared_error(val_target, linear_predictions)
linear_rmse = np.sqrt(linear_mse)
linear_r2 = r2_score(val_target, linear_predictions)
# XGBoost
# Candidate hyper-parameters explored by the 3-fold grid search below.
xgb_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
}
xgb_grid = GridSearchCV(
    estimator=xgb.XGBRegressor(),
    param_grid=xgb_params,
    scoring='neg_root_mean_squared_error',
    cv=3,
)
xgb_grid.fit(train_data_poly, train_target)

# Keep the best-scoring configuration and evaluate it on the validation split.
xgb_model = xgb_grid.best_estimator_
xgb_predictions = xgb_model.predict(val_data_poly)

xgb_mse = mean_squared_error(val_target, xgb_predictions)
xgb_rmse = np.sqrt(xgb_mse)
xgb_r2 = r2_score(val_target, xgb_predictions)
# Random Forest Regression
# The forest is seeded with the same value used for the data splits so that
# bootstrap sampling, the selected hyper-parameters, and the reported
# validation metrics are reproducible across kernel restarts.
rf_model = RandomForestRegressor(random_state=42)

# Candidate hyper-parameters explored by the 3-fold grid search below.
rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5]
}
rf_grid = GridSearchCV(rf_model, rf_params, scoring='neg_root_mean_squared_error', cv=3)
rf_grid.fit(train_data_poly, train_target)

# Keep the best-scoring configuration and evaluate it on the validation split.
rf_model = rf_grid.best_estimator_
rf_predictions = rf_model.predict(val_data_poly)
rf_rmse = np.sqrt(mean_squared_error(val_target, rf_predictions))
rf_r2 = r2_score(val_target, rf_predictions)

# Collect the fitted models and their validation metrics, keyed by display name.
models = {
    'XGBoost': xgb_model,
    'Linear Regression': linear_model,
    'Random Forest Regression': rf_model,
}

rmse_values = {
    'XGBoost': xgb_rmse,
    'Linear Regression': linear_rmse,
    'Random Forest Regression': rf_rmse,
}

r2_scores = {
    'Linear Regression': linear_r2,
    'Random Forest Regression': rf_r2,
    'XGBoost': xgb_r2,
}

# Report validation RMSE and R-squared per model, in the order of rmse_values.
for model_name in rmse_values:
    rmse = rmse_values[model_name]
    r2 = r2_scores[model_name]
    print(f"{model_name} RMSE: {rmse}")
    print(f"{model_name} R-squared: {r2}")

XGBoost RMSE: 8.099963660691428
XGBoost R-squared: 0.7433607399686464
Linear Regression RMSE: 7.695367968727143
Linear Regression R-squared: 0.7683588373855125
Random Forest Regression RMSE: 8.374192732173674
Random Forest Regression R-squared: 0.725689231175915


In [2]:
import pickle
file_name = "current_passengers_model_xgb.pkl"
# Persist the tuned XGBoost model. The context manager guarantees the file
# handle is flushed and closed even if pickling raises, instead of leaving an
# anonymous open() handle for the garbage collector.
with open(file_name, "wb") as model_file:
    pickle.dump(xgb_model, model_file)