In [None]:
from catboost import CatBoostRegressor
import pandas as pd
from sklearn.metrics import mean_squared_error
import numpy as np
import os

In [None]:
# Define the project's data paths and names.
def get_datapaths(basedir):
    """Build the data-directory paths of the project rooted at ``basedir``.

    The project root is the part of the current working directory up to and
    including the first occurrence of ``basedir``, so the notebook can be run
    from the root itself or from any directory below it.

    Args:
        basedir: Name of the project root directory (e.g. ``'HousePrices'``).
            It must occur in the current working directory path.

    Returns:
        dict: ``'tvt'`` -> ``<root>/data/tvt``, ``'raw'`` -> ``<root>/data/raw``
        and ``'processed'`` -> ``<root>/data/processed_data``.

    Raises:
        ValueError: If ``basedir`` is empty or does not occur in the current
            working directory.
    """
    this_path = os.getcwd()
    # partition() yields an empty separator slot when basedir is absent; without
    # this check the function would silently glue basedir onto an unrelated
    # directory (e.g. '/tmp' + 'HousePrices' + '/data').
    prefix, matched, _ = this_path.partition(basedir)
    if not matched:
        raise ValueError(
            f"Base directory {basedir!r} not found in current working "
            f"directory {this_path!r}"
        )
    datadir = prefix + matched + '/data'
    datadir_in = datadir + '/tvt'
    datadir_raw = datadir + '/raw'
    processed_name = datadir + '/processed_data'
    return {'tvt': datadir_in, 'raw': datadir_raw, 'processed': processed_name}

In [None]:
# Column to predict, and the dataset splits stored as <split>.parquet files.
target = 'SalePrice'
splits = ['train', 'val']
data_paths = get_datapaths('HousePrices')

# One DataFrame per split, read from the processed-data directory.
data = [
    pd.read_parquet(data_paths['processed'] + '/' + name + ".parquet", engine="pyarrow")
    for name in splits
]

# Features are every column except the target; labels are the natural log of the target.
X_train, X_val = (frame.drop(columns=target) for frame in data)
y_train, y_val = (np.log(frame[target]) for frame in data)

In [None]:
# Sanity check: show the training labels, i.e. the natural log of SalePrice.
print(y_train)

In [None]:
# CatBoost regressor with a fixed seed; the overfitting detector ends training early.
model = CatBoostRegressor(
    # Reproducibility and hardware
    random_seed=42,
    task_type="CPU",          # switch to "GPU" if one is available
    # Boosting schedule
    iterations=1000,
    learning_rate=0.1,
    # Tree shape and regularisation
    depth=6,
    l2_leaf_reg=3,
    bagging_temperature=1.0,
    # Overfitting detector: "Iter" stops once the eval metric has not
    # improved for `od_wait` consecutive iterations.
    od_type="Iter",
    od_wait=50,
)

In [None]:
# Train on the training split while monitoring the validation split; with
# use_best_model the model keeps the iteration that scored best on eval_set.
model.fit(
    X_train,
    y_train,
    verbose=100,
    use_best_model=True,
    eval_set=(X_val, y_val),
)

In [None]:
# Evaluate the fitted model on the validation split.
# mean_squared_error returns the MSE, so take its square root to report a true
# RMSE. The targets are log-transformed, so this is the RMSE of log(SalePrice).
val_preds = model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, val_preds))
print(f"Validation RMSE: {rmse:.4f}")


In [None]:
# Show the validation predictions followed by the true (log-scale) targets.
print(val_preds, y_val, sep="\n")