In [9]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from catboost import CatBoostRegressor

import pandas as pd
import optuna

In [10]:
df = pd.read_csv('train_preprocessed.csv', index_col=False)

In [11]:
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

y_train = train_set['price']
y_test = test_set['price'] 

X_train = train_set.drop('price', axis=1)
X_test = test_set.drop('price', axis=1)

In [12]:
def objective(trial):
    params = {
        "iterations": trial.suggest_int("iterations", 3000, 12000),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "depth": trial.suggest_int("depth", 1, 12),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 500),
        "max_bin": trial.suggest_int("max_bin", 1, 200),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
        "bagging_temperature": trial.suggest_int("bagging_temperature", 1, 20),
        "rsm": trial.suggest_float("rsm", 0.1, 1.0)
    }

    model = CatBoostRegressor(**params, silent=True, loss_function="MAE", boost_from_average=True)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    mae = mean_absolute_error(y_test, predictions)
    return mae

In [13]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)

[I 2024-03-15 09:38:40,844] A new study created in memory with name: no-name-86261941-c161-45dc-a173-846d75dd0291


[I 2024-03-15 09:39:36,075] Trial 0 finished with value: 138939.70111483155 and parameters: {'iterations': 4151, 'learning_rate': 0.0011573083735205115, 'depth': 8, 'subsample': 0.3438912300284733, 'min_data_in_leaf': 163, 'max_bin': 154, 'l2_leaf_reg': 5.282326711543188, 'bagging_temperature': 9, 'rsm': 0.2407146632901481}. Best is trial 0 with value: 138939.70111483155.
[I 2024-03-15 09:39:59,889] Trial 1 finished with value: 134522.15649909247 and parameters: {'iterations': 3114, 'learning_rate': 0.003650831346965866, 'depth': 5, 'subsample': 0.3335760630959645, 'min_data_in_leaf': 150, 'max_bin': 186, 'l2_leaf_reg': 6.6121072716092355, 'bagging_temperature': 1, 'rsm': 0.7149959424400893}. Best is trial 1 with value: 134522.15649909247.
[I 2024-03-15 09:42:11,475] Trial 2 finished with value: 96034.58866704766 and parameters: {'iterations': 10492, 'learning_rate': 0.015365178995429064, 'depth': 7, 'subsample': 0.900450746109598, 'min_data_in_leaf': 214, 'max_bin': 72, 'l2_leaf_reg':

In [23]:
params = study.best_params

In [24]:
model = CatBoostRegressor(**params, silent=True, loss_function='MAE', boost_from_average=True)

model.fit(X_train, y_train)

<catboost.core.CatBoostRegressor at 0x15fe36bbc20>

In [25]:
pred = model.predict(X_test)

mean_absolute_error(y_test, pred)

95528.47081994236

In [26]:
df = pd.read_csv('train_preprocessed.csv', index_col=False)
y = df['price']
df.drop(['price'], inplace=True, axis=1)
X = df

In [27]:
model = CatBoostRegressor(**params, silent=True, loss_function='MAE', boost_from_average=True)

model.fit(X, y)

<catboost.core.CatBoostRegressor at 0x15fd2a8cfe0>

In [28]:
test = pd.read_csv('test_preprocessed.csv', index_col=False)
sub = pd.DataFrame({'id': test.index})
test.drop(['id'], inplace=True, axis=1)

In [29]:
preds = model.predict(test)
sub['price'] = preds
sub.to_csv('submission.csv', index=False)