Analysis of model performance (training and prediction time) and accuracy (R², MAE, cross-validation, and RMSE).

In [None]:
import time
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import pickle


# Data preparation: load the processed rent dataset, keep only the rows
# whose city is "wroclaw", one-hot encode the categorical columns and
# split the frame into features (x) and target (y).
# time.perf_counter() is used rather than time.time(): it is a monotonic,
# high-resolution clock meant for measuring short intervals, whereas
# time.time() can report 0.0 for fast steps on coarse system clocks.
star_data = time.perf_counter()
path = "./../data/processed/1_analysis_rent.csv"
data = pd.read_csv(path)
# These columns are removed before modelling and never reach the feature set.
data = data.drop(columns=["id", "Unnamed: 0", "latitude", "longitude", "Period"])
data = data[data["city"] == "wroclaw"]
# drop_first=True drops the first level of every encoded categorical column.
data = pd.get_dummies(data, drop_first=True)
x = data.drop(columns="price")
y = data["price"]
end_data = time.perf_counter()

# Models preparation: split features and target into train and test sets.
# `x` and `y` are already built from the one-hot encoded frame in the data
# preparation step, so the encoding is not repeated here (re-encoding an
# already encoded frame only adds time to this measurement).
# perf_counter() gives a high-resolution, monotonic reading for the interval.
star_preparation = time.perf_counter()

# Fixed random_state keeps the 80/20 split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=99
)
end_preparation = time.perf_counter()

# Each model is fitted on the training split and its fit time is measured
# with time.perf_counter(), a monotonic high-resolution clock suited to
# timing short intervals (time.time() may be too coarse for fast fits).

# Linear Regression
start_LR = time.perf_counter()
model_LR = LinearRegression()
model_LR.fit(x_train, y_train)
end_LR = time.perf_counter()

# Random Forest
start_RF = time.perf_counter()
model_RF = RandomForestRegressor(
    n_estimators=100,
    random_state=99,
    max_depth=15,
    min_samples_leaf=1,
    min_samples_split=2,
)
model_RF.fit(x_train, y_train)
end_RF = time.perf_counter()

# XGBoost
start_XGB = time.perf_counter()
model_XGB = XGBRegressor(
    n_estimators=100,
    learning_rate=0.05,
    max_depth=7,
    random_state=99,
    subsample=0.6,
)
model_XGB.fit(x_train, y_train)
end_XGB = time.perf_counter()

# Preparation timings.
# The data-preparation line must use its own start/end pair; reusing the
# model-preparation pair would print the same interval twice.
print("Data preparation time: ", end_data - star_data)
print("Model preparation time: ", end_preparation - star_preparation)
# Per-model training times are written to the comparison table; the prints
# below are kept disabled for ad-hoc inspection.
# print("Linear Regression training time: ", end_LR-start_LR)
# print("Random Forest training time: ",end_RF-start_RF)
# print("XGBoost training time: ",end_XGB-start_XGB)

# Prediction time: predict on the held-out test split with every model and
# time each call. time.perf_counter() is used because predictions can finish
# faster than the resolution of time.time() on some systems.
start_LR_prediction = time.perf_counter()
y_pred_LR = model_LR.predict(x_test)
end_LR_prediction = time.perf_counter()

start_RF_prediction = time.perf_counter()
y_pred_RF = model_RF.predict(x_test)
end_RF_prediction = time.perf_counter()

start_XGB_prediction = time.perf_counter()
y_pred_XGB = model_XGB.predict(x_test)
end_XGB_prediction = time.perf_counter()

# Per-model prediction times are written to the comparison table; the prints
# below are kept disabled for ad-hoc inspection.
# print("Linear Regression prediction time: ", end_LR_prediction-start_LR_prediction)
# print("Random Forest prediction time: ",end_RF_prediction-start_RF_prediction)
# print("XGBoost prediction time: ",end_XGB_prediction-start_XGB_prediction)


# Accuracy metrics. R^2, RMSE and MAE are computed on the held-out test
# predictions; the cross-validation scores are 5-fold R^2 over the full
# feature/target set (x, y).

# R^2 on the test split
r2_LR = r2_score(y_test, y_pred_LR)
r2_RF = r2_score(y_test, y_pred_RF)
r2_XGB = r2_score(y_test, y_pred_XGB)

# 5-fold cross-validated R^2. Each model must be scored with its own
# estimator: passing model_RF for the XGBoost scores would just duplicate
# the Random Forest result in the comparison table.
scores_LR = cross_val_score(model_LR, x, y, cv=5, scoring="r2")
scores_RF = cross_val_score(model_RF, x, y, cv=5, scoring="r2")
scores_XGB = cross_val_score(model_XGB, x, y, cv=5, scoring="r2")

# Root mean squared error on the test split
rmse_LR = np.sqrt(mean_squared_error(y_test, y_pred_LR))
rmse_RF = np.sqrt(mean_squared_error(y_test, y_pred_RF))
rmse_XGB = np.sqrt(mean_squared_error(y_test, y_pred_XGB))

# Mean absolute error on the test split
mae_LR = mean_absolute_error(y_test, y_pred_LR)
mae_RF = mean_absolute_error(y_test, y_pred_RF)
mae_XGB = mean_absolute_error(y_test, y_pred_XGB)


# Assemble the per-model comparison table (one row per model, one column per
# metric), round it to four decimals and write it to the reports folder.
# Column insertion order determines the column order of the CSV.
data = dict()
data["Model"] = ["Linear Regression", "Random Forest", "XGBoost"]
data["R² Score"] = [r2_LR, r2_RF, r2_XGB]
data["RMSE (PLN)"] = [rmse_LR, rmse_RF, rmse_XGB]
# Mean of the cross-validation fold scores for each model.
data["Cross-Validation"] = [
    np.mean(fold_scores) for fold_scores in (scores_LR, scores_RF, scores_XGB)
]
# Elapsed time = end timestamp minus start timestamp for each model.
data["Training Time (s)"] = [
    finished - started
    for started, finished in (
        (start_LR, end_LR),
        (start_RF, end_RF),
        (start_XGB, end_XGB),
    )
]
data["Prediction Time (s)"] = [
    finished - started
    for started, finished in (
        (start_LR_prediction, end_LR_prediction),
        (start_RF_prediction, end_RF_prediction),
        (start_XGB_prediction, end_XGB_prediction),
    )
]
data["MAE"] = [mae_LR, mae_RF, mae_XGB]

model_comparison = pd.DataFrame(data).round(4)
model_comparison.to_csv("./../reports/ML_models_comparison.csv")

# Serialize each fitted model to its own pickle file under ML_models.
for model_path, fitted_model in (
    ("./../ML_models/LR_model.pkl", model_LR),
    ("./../ML_models/RF_model.pkl", model_RF),
    ("./../ML_models/XGB_model.pkl", model_XGB),
):
    with open(model_path, "wb") as f:
        pickle.dump(fitted_model, f)


Data preparation time:  0.0
Model preparation time:  0.0
