In [1]:
import sys
import os

# Make the project root (the parent of the notebook's working directory)
# importable, so the project-local imports below resolve.
current_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
# Guard the append so re-running this cell does not pile up duplicate
# entries in sys.path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

import joblib
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from typing import Callable, List, Tuple

from data_processing.dtype_mapping import dtype_mapping
from training.training_process import training_process
from models.price_evaluator_xgboost_regression import (
    PriceRegressorXGBoostModel,
    PriceRegressorXGBoostModelHyperparams
)
from evaluation.evaluate_classification import evaluate_classification

from utils.classify import classify
from utils.classification_range_generator import generate_price_intervals
from utils.class_reduction import class_reduction


In [2]:
# Settings for the XGBoost price-regressor training run.
# Judging by the recorded log, each iteration evaluates one set of
# hyperparameters and the best-scoring set is tracked across iterations.
# NOTE(review): presumably the result is stored under `model_name`, since a
# later cell loads "xgboost_regressor" - confirm in training_process.
xgboost_training_settings = {
    "model_name": "xgboost_regressor",
    "model_class": PriceRegressorXGBoostModel,
    "hyperparameters_class": PriceRegressorXGBoostModelHyperparams,
    "max_iters": 10,
    "gpu_mode": False,
}
training_process(**xgboost_training_settings)

Eval for PriceRegressorXGBoostModelHyperparams(learning_rate=0.24099746618946757, reg_alpha=1.8343478986616382, reg_lambda=7.796910002727695, max_depth=np.int64(10), n_estimators=np.int64(251), min_child_weight=np.int64(2), gamma=2.296244459829336, subsample=0.666854305569511, colsample_bytree=0.5714334089609704, max_delta_step=6.50888472948853, colsample_bynode=0.5282057895135501, colsample_bylevel=0.8609993861334124, objective='reg:tweedie')
Actual score: RegressionEvaluationResults(mae=7804.54296875, mse=360353696.0, rmse=np.float64(18982.984380755308), r2=0.9306839108467102, mape=np.float64(17.343592465605642))
Best score: RegressionEvaluationResults(mae=7804.54296875, mse=360353696.0, rmse=np.float64(18982.984380755308), r2=0.9306839108467102, mape=np.float64(17.343592465605642))
Best score hyperparams: PriceRegressorXGBoostModelHyperparams(learning_rate=0.24099746618946757, reg_alpha=1.8343478986616382, reg_lambda=7.796910002727695, max_depth=np.int64(10), n_estimators=np.int64(2

None


In [3]:
from utils.export_model import load_model


def cast_regression_to_classification(
    model_path: str,
    interval_func: Callable,
    data_path: str = "../data/processed_car_sale_ads.csv",
    test_size: float = 0.3,
    random_state: int = 42,
):
    """Score a price regressor as if it were a price-range classifier.

    Loads a model, refits it on a train split of the car-ads data, predicts
    prices for the held-out split, maps both predicted and true prices onto
    price intervals, and evaluates the resulting class labels.

    Args:
        model_path: Identifier passed to ``load_model``.
        interval_func: Callable forwarded to ``generate_price_intervals``
            together with the minimum and maximum observed price.
        data_path: CSV file to read; defaults to the processed car-ads file.
        test_size: Fraction of rows held out for evaluation.
        random_state: Seed for the train/test split.

    Returns:
        The object returned by ``evaluate_classification``. It is also
        printed, as before.
    """
    model = load_model(model_path)

    data = pd.read_csv(data_path, low_memory=False)
    data = dtype_mapping(data)

    # NOTE(review): features are "every column except the first", while the
    # target is selected by name. This only excludes the target from X if
    # "Price" is the first column - confirm against the CSV layout.
    X = data.iloc[:, 1:]
    y = data["Price"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # The loaded model is (re)fitted on this split before predicting.
    model.fit(X_train, y_train)
    y_pred_regression = model.predict(X_test)

    # Intervals span the full observed price range (train and test rows).
    intervals = generate_price_intervals(y.min(), y.max(), interval_func)

    y_pred_class = classify(pd.Series(y_pred_regression), intervals)
    y_test_class = classify(pd.Series(y_test), intervals)

    # NOTE(review): the second call receives the intervals returned by the
    # first reduction, not the ones used to classify the predictions -
    # confirm this is what class_reduction expects.
    y_test_class, intervals = class_reduction(pd.Series(y_test_class), intervals)
    y_pred_class, _ = class_reduction(pd.Series(y_pred_class), intervals)

    # NOTE(review): predictions are passed first, true labels second -
    # verify this matches evaluate_classification's parameter order, since
    # precision and recall swap if it does not.
    metrics = evaluate_classification(
        np.array(y_pred_class), np.array(y_test_class)
    )

    print("Metryki klasyfikacji:", metrics)
    return metrics

In [4]:
def interval_func(x):
    """Return the value of the price-interval curve at ``x``.

    Sum of an exponential term ``10 ** (0.4 * x)`` and two power-law terms
    ``500 * (x + 1) ** 1.2`` and ``500 * x ** 0.8``.

    NOTE(review): presumably called with non-negative values; a negative
    ``x`` would make ``x ** 0.8`` complex - confirm against
    generate_price_intervals.
    """
    return 10 ** (0.4 * x) + 500 * (x + 1) ** 1.2 + 500 * x ** 0.8
# Evaluate the saved regressor as a price-range classifier. The function
# prints the metrics itself; the trailing semicolon keeps any return value
# out of the cell's output.
cast_regression_to_classification(
    model_path="xgboost_regressor",
    interval_func=interval_func,
);

Metryki klasyfikacji: ClassificationEvaluationResults(accuracy=0.6346764346764346, precision=0.6357625459221087, recall=0.6346764346764346, f1=0.6347910170892275)
