In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import plot_importance

In [2]:
import sys
import os

# Put the parent of the current working directory on sys.path so that modules
# located there can be imported by the cells below.
current_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
# Notebook cells get re-run: guard the append so repeated executions do not
# keep adding duplicate entries to sys.path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [3]:
from models.price_evaluator_xgboost_clasifier import (
    PriceClassifierXGBoostModel,
    PriceClassifierXGBoostModelHyperparams,
)
from training.hyperparameter_tuning import hyperparameter_tuning
from utils.classification_range_generator import generate_price_intervals
from utils.classify import classify
from utils.class_reduction import class_reduction
from data_processing.dtype_mapping import dtype_mapping
from utils.export_model import save_model, load_model

In [None]:
# Read the processed car-sale ads CSV and pass the frame straight through the
# project's dtype_mapping helper; `data` is whatever that helper returns.
data = dtype_mapping(
    pd.read_csv(
        "../data/processed_car_sale_ads.csv",
        low_memory=False,
    )
)
# Preview the first ten rows.
data.head(10)

In [None]:
# Columns used as model inputs; the target is derived from the "Price" column.
feature_columns = [
    "Production_year",
    "Power_HP",
    "Mileage_km",
    "Vehicle_brand",
    "Vehicle_model",
    "Displacement_cm3",
    "Vehicle_generation",
    "Type",
    "Fuel_type",
    "Drive",
]
X = data[feature_columns]
y = data["Price"]

# Build price intervals from the observed min/max price and the function below,
# then turn each price into a class label.
# NOTE(review): presumably the lambda controls interval boundaries/widths per
# class index -- confirm against generate_price_intervals.
intervals = generate_price_intervals(
    y.min(),
    y.max(),
    lambda x: 10 ** (0.4 * x) + 500 * (x + 1) ** 1.2 + 500 * x ** 0.8,
)
# class_reduction returns an updated (labels, intervals) pair that replaces the
# initial classification.
labels, intervals = class_reduction(classify(y, intervals), intervals)

# 70/30 train/test split on the class labels, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    labels,
    test_size=0.3,
    random_state=42,
)
# Display the final intervals.
intervals

In [6]:
# Hyperparameter grid handed to the tuning step. Every entry lists a single
# candidate value, so the grid describes exactly one configuration.
param_grid = dict(
    learning_rate=[0.1],
    max_depth=[10],
    n_estimators=[300],
    min_child_weight=[6],
    gamma=[2],
    subsample=[1.0],
    colsample_bytree=[0.8],
    reg_alpha=[2],
    reg_lambda=[5],
)

In [None]:
# Run the project's tuning routine for the XGBoost price classifier over
# `param_grid`; the object it returns is kept as `best_model`.
# NOTE(review): the held-out split (X_test, y_test) is passed into tuning and is
# also what the later cells score on -- confirm hyperparameter_tuning does not
# select hyperparameters on it.
model_spec = (PriceClassifierXGBoostModel, PriceClassifierXGBoostModelHyperparams)
train_split = (X_train, y_train)
test_split = (X_test, y_test)

best_model = hyperparameter_tuning(*model_spec, param_grid, *train_split, *test_split)

In [None]:
# Persist the tuned model under the name "xgboost_classification".
save_model(best_model, "xgboost_classification")

# Plot the ten highest-ranked features by "weight" importance.
plot_importance(
    best_model,
    max_num_features=10,
    importance_type="weight",
)

# Print the model, its parameters and its evaluation on the test split.
print(best_model)
print(best_model.params)
best_model_predictions = best_model.predict(X_test)
print(best_model.eval(best_model_predictions, y_test))

# Print every scored feature name, ordered from highest to lowest "weight" score.
importance = best_model.get_booster().get_score(importance_type="weight")
sorted_importance = sorted(
    importance.items(),
    key=lambda item: item[1],
    reverse=True,
)
print([feature for feature, _score in sorted_importance])


In [None]:
# Load the model stored under "xgboost_classification" and print the same
# summary as for the freshly tuned model (repr, params, test-split evaluation),
# so the two outputs can be compared.
loaded_model = load_model("xgboost_classification")

print(loaded_model)
print(loaded_model.params)
loaded_model_predictions = loaded_model.predict(X_test)
print(loaded_model.eval(loaded_model_predictions, y_test))