In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score

import warnings

# Suppress every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Load the passenger-flow table from the project's dataset folder.
metro_dataset = pd.read_csv("../dataset/metro_passenger_flow_normal_rate.csv")

# One-hot encode the station/line columns as 0/1 integers.
# NOTE(review): get_dummies only expands object/category columns by default, so a
# numeric line_number would pass through unchanged — confirm its dtype.
categorical_columns = ['station_name', 'line_number']
encoded_columns = pd.get_dummies(metro_dataset[categorical_columns], dtype=int)

# Swap the original categorical columns for their encoded counterparts,
# which are appended on the right-hand side of the frame.
metro_dataset = pd.concat(
    [metro_dataset.drop(columns=categorical_columns), encoded_columns],
    axis=1,
)

# Store the crowded-station flag as an integer column.
metro_dataset['is_crowed_station'] = metro_dataset['is_crowed_station'].astype(int)
# Bare expression: only renders the frame when it is the last statement of a cell.
metro_dataset
from datetime import datetime


def update_crowded_rate(timestamp):
    """Map a timestamp string to a crowding-rate multiplier based on its hour.

    Args:
        timestamp: A string formatted as "%Y-%m-%d %H:%M:%S".

    Returns:
        0.8 for hours before 08:00 and from 20:00 onwards,
        1.2 for 08:00-11:59 and 16:00-19:59,
        1 for 12:00-15:59.

    Raises:
        ValueError: If the string does not match the expected format
            (raised by datetime.strptime).
    """
    hour = datetime.strptime(timestamp, "%Y-%m-%d %H:%M:%S").hour

    # Ordered (exclusive upper hour bound, rate) pairs; the first bound the hour
    # falls under decides the rate.
    rate_by_upper_bound = ((8, 0.8), (12, 1.2), (16, 1), (20, 1.2))
    for upper_bound, rate in rate_by_upper_bound:
        if hour < upper_bound:
            return rate

    # 20:00 and later.
    return 0.8


# Replace each raw timestamp string with the crowding multiplier for its hour of day.
# The column keeps the name 'timestamp' although it now holds a numeric rate.
metro_dataset['timestamp'] = metro_dataset['timestamp'].apply(update_crowded_rate)

# Single seed shared by both splits and the random forest, so that a
# Restart & Run All reproduces the same partitions and the same forest.
RANDOM_STATE = 42

# Hold out 20% as the test split, then 20% of the remainder as the validation split.
train_data, test_data = train_test_split(metro_dataset, test_size=0.2, random_state=RANDOM_STATE)
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=RANDOM_STATE)

# Extract the target column (input_count)
train_target = train_data['input_count']
val_target = val_data['input_count']
test_target = test_data['input_count']

# Perform polynomial feature transformation (degree 2), fitted on the training split only.
# NOTE(review): every remaining column, output_count included, is used as a
# feature — confirm that is intended when predicting input_count.
poly = PolynomialFeatures(degree=2)
train_data_poly = poly.fit_transform(train_data.drop('input_count', axis=1))
val_data_poly = poly.transform(val_data.drop('input_count', axis=1))
# The test split is transformed here but not scored anywhere in this cell.
test_data_poly = poly.transform(test_data.drop('input_count', axis=1))

# Linear Regression
linear_model = LinearRegression()
linear_model.fit(train_data_poly, train_target)
linear_predictions = linear_model.predict(val_data_poly)
linear_rmse = np.sqrt(mean_squared_error(val_target, linear_predictions))
linear_r2 = r2_score(val_target, linear_predictions)

# XGBoost: 3-fold grid search over tree count and depth, scored by (negated) RMSE.
xgb_model = xgb.XGBRegressor()
xgb_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5]
}
xgb_grid = GridSearchCV(xgb_model, xgb_params, scoring='neg_root_mean_squared_error', cv=3)
xgb_grid.fit(train_data_poly, train_target)
xgb_model = xgb_grid.best_estimator_
xgb_predictions = xgb_model.predict(val_data_poly)
xgb_rmse = np.sqrt(mean_squared_error(val_target, xgb_predictions))
xgb_r2 = r2_score(val_target, xgb_predictions)

# Random Forest Regression: same grid and scoring as above.
# The forest is seeded; without a seed the bootstrap sampling differs on every
# run, so the selected hyper-parameters and the reported metrics would drift.
rf_model = RandomForestRegressor(random_state=RANDOM_STATE)
rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5]
}
rf_grid = GridSearchCV(rf_model, rf_params, scoring='neg_root_mean_squared_error', cv=3)
rf_grid.fit(train_data_poly, train_target)
rf_model = rf_grid.best_estimator_
rf_predictions = rf_model.predict(val_data_poly)
rf_rmse = np.sqrt(mean_squared_error(val_target, rf_predictions))
rf_r2 = r2_score(val_target, rf_predictions)

# Collect the fitted models and their validation metrics by display name.
models = {
    'XGBoost': xgb_model,
    'Linear Regression': linear_model,
    'Random Forest Regression': rf_model
}

rmse_values = {
    'XGBoost': xgb_rmse,
    'Linear Regression': linear_rmse,
    'Random Forest Regression': rf_rmse
}

r2_scores = {
    'Linear Regression': linear_r2,
    'Random Forest Regression': rf_r2,
    'XGBoost': xgb_r2
}

# Print validation RMSE and R-squared for each model (order follows rmse_values).
for model_name, rmse in rmse_values.items():
    r2 = r2_scores[model_name]
    print(f"{model_name} RMSE: {rmse}")
    print(f"{model_name} R-squared: {r2}")

XGBoost RMSE: 4.189786217322767
XGBoost R-squared: 0.8494530582315127
Linear Regression RMSE: 4.14897281750039
Linear Regression R-squared: 0.8523717780563899
Random Forest Regression RMSE: 4.403256383005105
Random Forest Regression R-squared: 0.8337214775465287


In [39]:
# Estimate one input rate per station: average the station's polynomial feature
# rows into a single vector and ask the tuned random forest for its prediction.
# NOTE(review): only `Lines` is used from this wildcard import; switch to an
# explicit import once it is confirmed nothing else relies on the wildcard.
from dataset.stations import *

# NOTE(review): this empty frame is never filled or read in this cell; it is kept
# only in case a later cell expects the name to exist.
poisson_input_estimation = pd.DataFrame(columns=['station_name', 'input_poisson'])

poisson_input_estimation_dict = {"station_name": [], "input_poisson": []}
for line_name, stations in Lines.items():
    for station in stations:
        # Name of the one-hot indicator column produced for this station.
        s = "station_name_" + station
        # A station listed in Lines may have no indicator column or no rows in the
        # dataset. The column lookup / transform of an empty frame would abort the
        # whole loop, so such a station is recorded with NaN instead.
        if s not in metro_dataset.columns or not (metro_dataset[s] == 1).any():
            input_poisson_value = float("nan")
        else:
            df = metro_dataset[metro_dataset[s] == 1]
            poly_ = poly.transform(df.drop('input_count', axis=1))
            # NOTE(review): the model is queried once at the mean feature vector;
            # for a non-linear model this is not the mean of per-row predictions.
            average_station = poly_.mean(axis=0).reshape(1, -1)
            input_poisson = rf_model.predict(average_station)
            input_poisson_value = input_poisson.item()
        # Stations that appear on several lines are appended once per line.
        poisson_input_estimation_dict['station_name'].append(s)
        poisson_input_estimation_dict['input_poisson'].append(input_poisson_value)
poisson_input_estimation_df = pd.DataFrame(poisson_input_estimation_dict)
poisson_input_estimation_df

Unnamed: 0,station_name,input_poisson
0,station_name_Tajrish,34.575453
1,station_name_Shariati,11.737076
2,station_name_Beheshti,11.737076
3,station_name_Dowlat,11.737076
4,station_name_Mohamadieh,11.737076
5,station_name_Kahrizak,0.0
6,station_name_Sanat,34.782065
7,station_name_Valiasr,11.737076
8,station_name_Theather,11.737076
9,station_name_Mohamadieh,11.737076


In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score

import warnings

# Suppress every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Reload the passenger-flow table from the project's dataset folder.
metro_dataset = pd.read_csv("../dataset/metro_passenger_flow_normal_rate.csv")

# One-hot encode the station/line columns as 0/1 integers.
# NOTE(review): get_dummies only expands object/category columns by default, so a
# numeric line_number would pass through unchanged — confirm its dtype.
categorical_columns = ['station_name', 'line_number']
encoded_columns = pd.get_dummies(metro_dataset[categorical_columns], dtype=int)

# Swap the original categorical columns for their encoded counterparts,
# which are appended on the right-hand side of the frame.
metro_dataset = pd.concat(
    [metro_dataset.drop(columns=categorical_columns), encoded_columns],
    axis=1,
)

# Store the crowded-station flag as an integer column.
metro_dataset['is_crowed_station'] = metro_dataset['is_crowed_station'].astype(int)
# Bare expression: only renders the frame when it is the last statement of a cell.
metro_dataset
from datetime import datetime


def update_crowded_rate(timestamp):
    """Map a timestamp string to a crowding-rate multiplier based on its hour.

    Args:
        timestamp: A string formatted as "%Y-%m-%d %H:%M:%S".

    Returns:
        0.8 for hours before 08:00 and from 20:00 onwards,
        1.2 for 08:00-11:59 and 16:00-19:59,
        1 for 12:00-15:59.

    Raises:
        ValueError: If the string does not match the expected format
            (raised by datetime.strptime).
    """
    hour = datetime.strptime(timestamp, "%Y-%m-%d %H:%M:%S").hour

    # Ordered (exclusive upper hour bound, rate) pairs; the first bound the hour
    # falls under decides the rate.
    rate_by_upper_bound = ((8, 0.8), (12, 1.2), (16, 1), (20, 1.2))
    for upper_bound, rate in rate_by_upper_bound:
        if hour < upper_bound:
            return rate

    # 20:00 and later.
    return 0.8


# Replace each raw timestamp string with the crowding multiplier for its hour of day.
# The column keeps the name 'timestamp' although it now holds a numeric rate.
metro_dataset['timestamp'] = metro_dataset['timestamp'].apply(update_crowded_rate)

# Single seed shared by both splits and the random forest, so that a
# Restart & Run All reproduces the same partitions and the same forest.
RANDOM_STATE = 42

# Hold out 20% as the test split, then 20% of the remainder as the validation split.
train_data, test_data = train_test_split(metro_dataset, test_size=0.2, random_state=RANDOM_STATE)
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=RANDOM_STATE)

# Extract the target column (output_count)
train_target = train_data['output_count']
val_target = val_data['output_count']
test_target = test_data['output_count']

# Perform polynomial feature transformation (degree 2), fitted on the training split only.
# NOTE(review): every remaining column, input_count included, is used as a
# feature — confirm that is intended when predicting output_count.
poly = PolynomialFeatures(degree=2)
train_data_poly = poly.fit_transform(train_data.drop('output_count', axis=1))
val_data_poly = poly.transform(val_data.drop('output_count', axis=1))
# The test split is transformed here but not scored anywhere in this cell.
test_data_poly = poly.transform(test_data.drop('output_count', axis=1))

# Linear Regression
linear_model = LinearRegression()
linear_model.fit(train_data_poly, train_target)
linear_predictions = linear_model.predict(val_data_poly)
linear_rmse = np.sqrt(mean_squared_error(val_target, linear_predictions))
linear_r2 = r2_score(val_target, linear_predictions)

# XGBoost: 3-fold grid search over tree count and depth, scored by (negated) RMSE.
xgb_model = xgb.XGBRegressor()
xgb_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5]
}
xgb_grid = GridSearchCV(xgb_model, xgb_params, scoring='neg_root_mean_squared_error', cv=3)
xgb_grid.fit(train_data_poly, train_target)
xgb_model = xgb_grid.best_estimator_
xgb_predictions = xgb_model.predict(val_data_poly)
xgb_rmse = np.sqrt(mean_squared_error(val_target, xgb_predictions))
xgb_r2 = r2_score(val_target, xgb_predictions)

# Random Forest Regression: same grid and scoring as above.
# The forest is seeded; without a seed the bootstrap sampling differs on every
# run, so the selected hyper-parameters and the reported metrics would drift.
rf_model = RandomForestRegressor(random_state=RANDOM_STATE)
rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5]
}
rf_grid = GridSearchCV(rf_model, rf_params, scoring='neg_root_mean_squared_error', cv=3)
rf_grid.fit(train_data_poly, train_target)
rf_model = rf_grid.best_estimator_
rf_predictions = rf_model.predict(val_data_poly)
rf_rmse = np.sqrt(mean_squared_error(val_target, rf_predictions))
rf_r2 = r2_score(val_target, rf_predictions)

# Collect the fitted models and their validation metrics by display name.
models = {
    'XGBoost': xgb_model,
    'Linear Regression': linear_model,
    'Random Forest Regression': rf_model
}

rmse_values = {
    'XGBoost': xgb_rmse,
    'Linear Regression': linear_rmse,
    'Random Forest Regression': rf_rmse
}

r2_scores = {
    'Linear Regression': linear_r2,
    'Random Forest Regression': rf_r2,
    'XGBoost': xgb_r2
}

# Print validation RMSE and R-squared for each model (order follows rmse_values).
for model_name, rmse in rmse_values.items():
    r2 = r2_scores[model_name]
    print(f"{model_name} RMSE: {rmse}")
    print(f"{model_name} R-squared: {r2}")

XGBoost RMSE: 5.549399328741542
XGBoost R-squared: 0.6452154635011762
Linear Regression RMSE: 5.418928775386373
Linear Regression R-squared: 0.6617018571619979
Random Forest Regression RMSE: 5.469949181067125
Random Forest Regression R-squared: 0.6553015654935451


In [41]:
# Estimate one output rate per station: average the station's polynomial feature
# rows into a single vector and ask the tuned random forest for its prediction.
# NOTE(review): only `Lines` is used from this wildcard import; switch to an
# explicit import once it is confirmed nothing else relies on the wildcard.
from dataset.stations import *

# Collected under an output-specific name so the input-rate dict built earlier in
# the notebook (poisson_input_estimation_dict) is not overwritten by output data.
poisson_output_estimation_dict = {"station_name": [], "output_poisson": []}
for line_name, stations in Lines.items():
    for station in stations:
        # Name of the one-hot indicator column produced for this station.
        s = "station_name_" + station
        # A station listed in Lines may have no indicator column or no rows in the
        # dataset. The column lookup / transform of an empty frame would abort the
        # whole loop, so such a station is recorded with NaN instead.
        if s not in metro_dataset.columns or not (metro_dataset[s] == 1).any():
            output_poisson_value = float("nan")
        else:
            df = metro_dataset[metro_dataset[s] == 1]
            poly_ = poly.transform(df.drop('output_count', axis=1))
            # NOTE(review): the model is queried once at the mean feature vector;
            # for a non-linear model this is not the mean of per-row predictions.
            average_station = poly_.mean(axis=0).reshape(1, -1)
            output_poisson = rf_model.predict(average_station)
            output_poisson_value = output_poisson.item()
        # Stations that appear on several lines are appended once per line.
        poisson_output_estimation_dict['station_name'].append(s)
        poisson_output_estimation_dict['output_poisson'].append(output_poisson_value)
poisson_output_estimation = pd.DataFrame(poisson_output_estimation_dict)
poisson_output_estimation

Unnamed: 0,station_name,output_poisson
0,station_name_Tajrish,0.027595
1,station_name_Shariati,9.489689
2,station_name_Beheshti,16.942271
3,station_name_Dowlat,17.068766
4,station_name_Mohamadieh,17.047023
5,station_name_Kahrizak,22.092112
6,station_name_Sanat,0.027595
7,station_name_Valiasr,17.256691
8,station_name_Theather,17.082848
9,station_name_Mohamadieh,17.047023
