In [21]:
from catboost import CatBoostRegressor
import pandas as pd
from sklearn.metrics import mean_squared_error
import numpy as np
import os

In [22]:
#define paths and names
def get_datapaths(basedir):
    this_path = os.getcwd()
    datadir = this_path.split(basedir)[0] + basedir + '/data'
    datadir_in = datadir + '/tvt'
    datadir_raw = datadir + '/raw'
    processed_name = datadir + '/processed_data'
    return {'tvt': datadir_in, 'raw': datadir_raw, 'processed': processed_name}

In [23]:
splits = ['train', 'val']
target = 'SalePrice'
data_paths = get_datapaths('HousePrices')
data = [pd.read_parquet(data_paths['processed'] + '/' + split +".parquet", engine="pyarrow") for split in splits]
X_train, X_val = [d.drop(columns=target) for d in data]
y_train, y_val = [np.log(d[target]) for d in data]

In [24]:
print(y_train)

0       11.877569
1       11.407565
2       11.736069
3       11.813030
4       12.240474
          ...    
1089    12.588191
1090    11.728037
1091    11.571194
1092    12.206073
1093    12.100712
Name: SalePrice, Length: 1094, dtype: float64


In [25]:
model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    l2_leaf_reg=3,
    bagging_temperature=1.0,
    random_seed=42,
    task_type="CPU",         # or "GPU" if you have one
    od_type="Iter",          # “Iter” means use early stopping
    od_wait=50               # stop after 50 rounds without improvement on val
)

In [26]:
model.fit(
    X_train, y_train,
    eval_set=(X_val, y_val),
    use_best_model=True,
    verbose=100
)

0:	learn: 0.3705620	test: 0.3868023	best: 0.3868023 (0)	total: 3.1ms	remaining: 3.09s
100:	learn: 0.0923665	test: 0.1330684	best: 0.1330684 (100)	total: 218ms	remaining: 1.94s
200:	learn: 0.0660517	test: 0.1293761	best: 0.1293761 (200)	total: 415ms	remaining: 1.65s
300:	learn: 0.0483035	test: 0.1276026	best: 0.1275167 (291)	total: 655ms	remaining: 1.52s
400:	learn: 0.0361145	test: 0.1269715	best: 0.1269619 (382)	total: 885ms	remaining: 1.32s
500:	learn: 0.0279780	test: 0.1267043	best: 0.1267021 (499)	total: 1.13s	remaining: 1.12s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.1265635717
bestIteration = 533

Shrink model to first 534 iterations.


<catboost.core.CatBoostRegressor at 0x163e4a010>

In [27]:
# After fitting, you can check metrics on validation:
val_preds = model.predict(X_val)
rmse = mean_squared_error(y_val, val_preds)
print(f"Validation RMSE: {rmse:.4f}")


Validation RMSE: 0.0160


In [20]:
print(val_preds)
print(y_val)

[12.57545082 11.89048661 11.67039884 11.75952663 12.0589625  11.4002455
 11.81118915 11.41402298 12.34863693 11.95306259 12.23985392 12.58763286
 12.05291662 11.76845628 12.28853801 12.48095266 11.30353885 12.08819921
 12.09912907 11.43763962 11.75283646 12.40328059 11.98210298 12.14238176
 11.99316821 12.14253151 11.85141933 12.12415769 11.87043567 12.16346359
 11.8730629  11.76546588 11.49080804 12.08355471 12.11640484 12.70130837
 11.59967336 11.85532376 11.79110122 11.65309531 12.00992417 11.88687732
 12.22007424 11.4054544  11.90139496 12.6212294  12.00878195 11.81664532
 12.08217472 12.18822156 12.29307771 11.66267816 11.98613277 12.98520409
 12.2206505  11.86777887 12.28121613 12.23112257 11.9907558  12.91219241
 11.21664825 12.10299386 11.48732836 11.91093873 12.52320364 11.34149721
 12.4032251  12.17092499 11.76044497 11.65899875 11.74817648 12.16259635
 12.2934628  11.86516419 11.85576189 12.27923083 11.03960178 11.9626505
 12.18142    12.67208212 11.85129752 12.18090247 11.5