In [4]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
from src.preprocessing import Preprocessor
from sklearn.model_selection import train_test_split, KFold
import pandas as pd
from catboost import CatBoostRegressor
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [7]:
dataset = pd.read_csv("data/pf_suvs_i302_1s2025.csv")

In [8]:
train, test = train_test_split(dataset, test_size=0.2, random_state=42)
val, test = train_test_split(test, test_size=0.5, random_state=42)

## Cross-validation

In [9]:
depths=[4, 6, 8]
learning_rates = [0.1]
iterations = 1500
subsamples = [0.75]
min_data_in_leafs = [10, 100]
colsample_bylevels = [0.75]

k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)

results = []
total_models = len(depths) * len(learning_rates) * len(subsamples) * len(min_data_in_leafs) * len(colsample_bylevels)
i = 0
for depth in depths:
    for lr in learning_rates:
        for subsample in subsamples:
            for min_data_in_leaf in min_data_in_leafs:
                for colsample_bylevel in colsample_bylevels:
                  fold_scores = []
                  print(f"Modelo {i+1}/{total_models}")
                  i += 1
                  print(f"depth={depth}, learning_rate={lr}, subsample={subsample}, min_data_in_leaf={min_data_in_leaf}, colsample_bylevel={colsample_bylevel}")
                  for train_index, val_index in kf.split(train):
                      train_fold = train.iloc[train_index].copy()
                      val_fold = train.iloc[val_index].copy()
                      preprocessor = Preprocessor(train_fold)
                      train_fold_proc = preprocessor.preprocess(train_fold)
                      val_fold_proc = preprocessor.preprocess(val_fold)
                      X_tr = train_fold_proc.drop(columns='Precio')
                      y_tr = train_fold_proc['Precio']
                      X_val = val_fold_proc.drop(columns='Precio')
                      y_val = val_fold_proc['Precio']
                      model = CatBoostRegressor(
                          depth=depth,
                          learning_rate=lr,
                          iterations=iterations,
                          loss_function='RMSE',
                          eval_metric='RMSE',
                          random_seed=42,
                          verbose=0,
                          text_features=['Título', 'Versión', 'Descripción'],
                          bootstrap_type='Bernoulli',
                          subsample=subsample,
                          min_data_in_leaf=min_data_in_leaf,
                          colsample_bylevel=colsample_bylevel, 
                      )
                      model.fit(
                          X_tr, y_tr,
                          eval_set=(X_val, y_val),
                          early_stopping_rounds=100,
                          use_best_model=True
                      )
                      preds = model.predict(X_val)
                      rmse = mean_squared_error(y_val, preds)
                      rmse = np.sqrt(rmse)
                      fold_scores.append(rmse)
                  print(f"Promedio RMSE: {np.mean(fold_scores)}")
                  avg_rmse = np.mean(fold_scores)
                  results.append({
                      'depth': depth,
                      'learning_rate': lr,
                      'score': avg_rmse
                  })
results.sort(key=lambda x: x['score'])
best = results[0]
print("Mejor configuración:")
print(f"depth={best['depth']}, learning_rate={best['learning_rate']}, RMSE={best['score']:.4f}")

Modelo 1/6
depth=4, learning_rate=0.1, subsample=0.75, min_data_in_leaf=10, colsample_bylevel=0.75
Promedio RMSE: 7307.149304239581
Modelo 2/6
depth=4, learning_rate=0.1, subsample=0.75, min_data_in_leaf=100, colsample_bylevel=0.75
Promedio RMSE: 7307.149304239581
Modelo 3/6
depth=6, learning_rate=0.1, subsample=0.75, min_data_in_leaf=10, colsample_bylevel=0.75
Promedio RMSE: 7066.017924165086
Modelo 4/6
depth=6, learning_rate=0.1, subsample=0.75, min_data_in_leaf=100, colsample_bylevel=0.75
Promedio RMSE: 7066.017924165086
Modelo 5/6
depth=8, learning_rate=0.1, subsample=0.75, min_data_in_leaf=10, colsample_bylevel=0.75
Promedio RMSE: 7023.228135067833
Modelo 6/6
depth=8, learning_rate=0.1, subsample=0.75, min_data_in_leaf=100, colsample_bylevel=0.75
Promedio RMSE: 7023.228135067833
Mejor configuración:
depth=8, learning_rate=0.1, RMSE=7023.2281
