## Imports

In [86]:
import pandas as pd
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer

# Load the training and test sets. Spaces in the column names are replaced with
# underscores so both frames share the same space-free headers.
df = pd.read_csv('train.csv').rename(columns=lambda name: name.replace(' ', '_'))
df_test = pd.read_csv('test.csv').rename(columns=lambda name: name.replace(' ', '_'))


## Pipelines

In [109]:
# Every pipeline below is configured to emit pandas DataFrames.

# Nominal categoricals: missing values become an explicit 'Unknown' label, then
# the column is one-hot encoded into a dense frame (unknown categories at
# transform time are ignored rather than raising).
impute_onehot_cols = ['Style', 'Color', 'Brand', 'Material']
impute_onehot = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]).set_output(transform="pandas")

# Size: missing values default to 'Medium', then the labels are mapped to
# integer codes (-1 for labels not seen during fit).
# NOTE(review): no explicit `categories=` is passed, so the codes follow the
# encoder's default category ordering, which is presumably not a
# small-to-large scale — confirm this is intended.
size_pipeline_cols = ['Size']
size_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Medium')),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
]).set_output(transform="pandas")

# Yes/No flags: missing values get their own 'Unknown' level before being
# mapped to integer codes (-1 for labels not seen during fit).
yes_no_pipeline_cols = ['Waterproof', 'Laptop_Compartment']
yes_no_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
]).set_output(transform="pandas")

# Weight capacity: median imputation followed by standardisation.
weight_pipeline_cols = ['Weight_Capacity_(kg)']
weight_pipeline = Pipeline(steps=[
    ('medianimputer', SimpleImputer(strategy="median")),
    ('scaler', StandardScaler()),
]).set_output(transform="pandas")

# Compartments: KNN imputation (3 neighbours) followed by standardisation.
# NOTE(review): this imputer only ever sees the single 'Compartments' column,
# so a row with a missing value has no other feature to measure distance on —
# verify that KNN adds anything over a plain mean/median imputer here.
compartments_pipeline_cols = ['Compartments']
compartments_pipeline = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=3)),
    ('scaler', StandardScaler()),
]).set_output(transform='pandas')

# Bare expression so the cell still renders the last pipeline as its output.
compartments_pipeline


## Column Transformer

In [110]:
# Route each group of columns through its dedicated pipeline and return the
# combined result as a pandas DataFrame.
preprocessor = ColumnTransformer(transformers=[
    ('style_color_brand_material', impute_onehot, impute_onehot_cols),
    ('size', size_pipeline, size_pipeline_cols),
    ('waterproof_laptopcompartment', yes_no_pipeline, yes_no_pipeline_cols),
    ('weight_pipeline', weight_pipeline, weight_pipeline_cols),
    ('compartments', compartments_pipeline, compartments_pipeline_cols),
]).set_output(transform='pandas')

# Bare expression so the cell still renders the transformer as its output.
preprocessor

## Split

In [114]:
from sklearn.model_selection import train_test_split

id_col = 'id'
target_col = 'Price'
y = df[target_col]
X = df.drop(columns=[id_col, target_col])

# Split the RAW rows first. The same test_size/random_state as before selects
# the same train/validation rows; only the order of operations changes.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=13
)

# Fit the preprocessing on the training part only, then apply it to the
# validation part. Fitting on all rows before splitting would leak validation
# statistics (imputation values, scaler mean/std, encoder categories) into
# training and make validation scores optimistic.
X_train = preprocessor.fit_transform(X_train_raw)
X_val = preprocessor.transform(X_val_raw)

# Kept for backward compatibility: all rows in their original order, now
# transformed with the statistics learned from the training part only.
X_transformed = pd.concat([X_train, X_val]).loc[X.index]

# Simple models
### Model selection:
- XGBRegressor
- LightGBM
- XGBoost
- CatBoost
- KNN Imputer (?)


# Prepare Test Data

In [113]:
# Keep the ids for the submission files and remove them from the features.
test_id_col = df_test['id']
X_test = df_test.drop(columns='id')

# Apply the preprocessor that was already fitted on the training features.
# Use `transform`, NOT `fit_transform`: re-fitting on the test set would learn
# different imputation values, scaling and category sets, so the test features
# would no longer match what the models were trained on.
X_test_transformed = preprocessor.transform(X_test)

# XGBRegressor

In [96]:
from xgboost import XGBRegressor

# Baseline model: 50 trees of depth 4 with a learning rate of 0.05.
xgb = XGBRegressor(
    learning_rate=0.05,
    max_depth=4,
    n_estimators=50,
)
xgb.fit(X_train, y_train)

# Predict on the prepared test features and save an (id, Price) submission.
test_predicted = xgb.predict(X_test_transformed)
submission = test_id_col.to_frame(name='id').assign(Price=test_predicted)
submission.to_csv('submission3.csv', index=False)

# XGBRegressor + Optuna (Basic)

In [98]:
import optuna
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import root_mean_squared_error


def objective(trial):
    """Optuna objective for XGBRegressor.

    Samples one hyperparameter configuration from `trial`, evaluates it with
    5-fold cross-validation on (X_train, y_train), and returns the mean
    negative RMSE (higher is better, hence the 'maximize' study below).
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 800),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_int('gamma', 0, 5),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'alpha': trial.suggest_int('alpha', 0, 10),
    }
    fold_scores = cross_val_score(
        XGBRegressor(**params),
        X_train,
        y_train,
        cv=5,
        scoring='neg_root_mean_squared_error',
    )
    return fold_scores.mean()


# Run 30 trials, maximising the negative RMSE.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30, show_progress_bar=True)

# [I 2025-02-07 13:51:51,563] Trial 0 finished with value: -39.03383156751836 and parameters: {'n_estimators': 153, 'max_depth': 3, 'learning_rate': 0.06991325421006729}. Best is trial 0 with value: -39.03383156751836.



[I 2025-02-07 18:10:44,684] A new study created in memory with name: no-name-f54edc4c-ef9e-4732-9f34-7eee2bd3a84f


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2025-02-07 18:10:49,572] Trial 0 finished with value: -39.23496907608175 and parameters: {'n_estimators': 173, 'max_depth': 7, 'learning_rate': 0.1134426252021002, 'min_child_weight': 5, 'gamma': 2, 'subsample': 0.9693246923575476, 'alpha': 0}. Best is trial 0 with value: -39.23496907608175.
[I 2025-02-07 18:11:04,628] Trial 1 finished with value: -40.12614553367782 and parameters: {'n_estimators': 572, 'max_depth': 6, 'learning_rate': 0.2620415522516099, 'min_child_weight': 5, 'gamma': 4, 'subsample': 0.7244127354052011, 'alpha': 7}. Best is trial 0 with value: -39.23496907608175.
[I 2025-02-07 18:11:11,834] Trial 2 finished with value: -39.48850770572005 and parameters: {'n_estimators': 280, 'max_depth': 6, 'learning_rate': 0.2038577039858573, 'min_child_weight': 7, 'gamma': 5, 'subsample': 0.6881516460693603, 'alpha': 3}. Best is trial 0 with value: -39.23496907608175.
[I 2025-02-07 18:11:21,856] Trial 3 finished with value: -39.28970337641586 and parameters: {'n_estimators': 508

In [99]:
# Refit XGBoost with hard-coded hyperparameters (presumably the best trial of
# the search above — the log shown is truncated, so this cannot be confirmed).
xgb = XGBRegressor(
    n_estimators=190,
    max_depth=3,
    learning_rate=0.05905641751878818,
    min_child_weight=2,
    gamma=0,
    subsample=0.9040943570422548,
    alpha=2,
)
xgb.fit(X_train, y_train)

# Predict on the prepared test features and save an (id, Price) submission.
test_predicted = xgb.predict(X_test_transformed)
submission = test_id_col.to_frame(name='id').assign(Price=test_predicted)
submission.to_csv('submission2.csv', index=False)

# LightGBM

In [None]:
from lightgbm import LGBMRegressor
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.metrics import root_mean_squared_error


def objective(trial):
    """Optuna objective for LGBMRegressor.

    Samples one hyperparameter configuration from `trial`, evaluates it with
    5-fold cross-validation on (X_train, y_train), and returns the mean
    negative RMSE (higher is better, hence the 'maximize' study below).
    """
    n_estimators = trial.suggest_int('n_estimators', 100, 800)
    max_depth = trial.suggest_int('max_depth', 3, 15)
    # LightGBM requires num_leaves > 1, so the search starts at 2 (a sampled
    # value of 1 would make the trial fail).
    num_leaves = trial.suggest_int('num_leaves', 2, 100)
    lambda_l1 = trial.suggest_int('lambda_l1', 0, 5)
    lambda_l2 = trial.suggest_int('lambda_l2', 0, 10)
    subsample = trial.suggest_float('subsample', 0.5, 1.0)
    learning_rate = trial.suggest_float('learning_rate', 0.001, 0.1)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1.0)

    model = LGBMRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        num_leaves=num_leaves,
        # The L1/L2 penalties were sampled but never handed to the model;
        # reg_alpha / reg_lambda are the estimator's names for them.
        reg_alpha=lambda_l1,
        reg_lambda=lambda_l2,
        subsample=subsample,
        # Row subsampling is disabled unless subsample_freq > 0, so without
        # this the sampled `subsample` value has no effect.
        subsample_freq=1,
        colsample_bytree=colsample_bytree,
    )
    score = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error')
    return score.mean()


study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30, show_progress_bar=True)
# Best parameters reported by an earlier run of this search:
# {'n_estimators': 240, 'max_depth': 11, 'num_leaves': 6, 'lambda_l1': 1, 'lambda_l2': 5, 'subsample': 0.8568471776574907, 'learning_rate': 0.05564102730320633, 'colsample_bytree': 0.9518026829544922}

In [102]:
# Fit LightGBM with hard-coded hyperparameters and write its submission.
model = LGBMRegressor(n_estimators=240,
                      max_depth=11,
                      learning_rate=0.0556,
                      num_leaves=6,
                      subsample=0.8568,
                      colsample_bytree=0.9518)
model.fit(X_train, y_train)

# Store THIS model's predictions. Previously the result of predict() was
# discarded and the submission silently reused `test_predicted` left over from
# an earlier XGBoost cell.
test_predicted = model.predict(X_test_transformed)
submission = pd.DataFrame({
    'id': test_id_col,
    'Price': test_predicted
})
submission.to_csv('submission_lgbm.csv', index=False)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002073 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 319
[LightGBM] [Info] Number of data points in the train set: 240000, number of used features: 27
[LightGBM] [Info] Start training from score 81.419619


# CatBoost

In [None]:
from catboost import CatBoostRegressor
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.metrics import root_mean_squared_error


def objective(trial):
    """Optuna objective for CatBoostRegressor.

    Samples one hyperparameter configuration from `trial`, evaluates it with
    5-fold cross-validation on (X_train, y_train), and returns the mean
    negative RMSE (higher is better, hence the 'maximize' study below).
    """
    # NOTE(review): min_data_in_leaf may only take effect with non-default
    # CatBoost grow policies; no grow_policy is set here — confirm it is used.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'depth': trial.suggest_int('depth', 4, 12),
        'l2_leaf_reg': trial.suggest_int('l2_leaf_reg', 1, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 50),
    }
    fold_scores = cross_val_score(
        CatBoostRegressor(**params),
        X_train,
        y_train,
        cv=5,
        scoring='neg_root_mean_squared_error',
    )
    return fold_scores.mean()


# Run 30 trials, maximising the negative RMSE.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30, show_progress_bar=True)
# Best parameters reported by an earlier run of this search:
# {'n_estimators': 487, 'depth': 4, 'l2_leaf_reg': 4, 'learning_rate': 0.05228794998549306, 'subsample': 0.510891754041839, 'min_data_in_leaf': 22}

In [105]:
# Fit CatBoost with hard-coded hyperparameters and write its submission.
model = CatBoostRegressor(n_estimators=487,
                          depth=4,
                          learning_rate=0.05228,
                          l2_leaf_reg=4,
                          subsample=0.51089,
                          min_data_in_leaf=22)
model.fit(X_train, y_train)

# Store THIS model's predictions. Previously the result of predict() was
# discarded and the submission silently reused `test_predicted` left over from
# an earlier cell, so the file did not contain CatBoost output at all.
test_predicted = model.predict(X_test_transformed)
submission = pd.DataFrame({
    'id': test_id_col,
    'Price': test_predicted
})
submission.to_csv('submission_catboost.csv', index=False)

0:	learn: 39.0587670	total: 4.54ms	remaining: 2.21s
1:	learn: 39.0570636	total: 9ms	remaining: 2.18s
2:	learn: 39.0558944	total: 14.4ms	remaining: 2.33s
3:	learn: 39.0543419	total: 19.2ms	remaining: 2.31s
4:	learn: 39.0531532	total: 23.6ms	remaining: 2.27s
5:	learn: 39.0521239	total: 28.3ms	remaining: 2.27s
6:	learn: 39.0511513	total: 33.6ms	remaining: 2.31s
7:	learn: 39.0502561	total: 37.9ms	remaining: 2.27s
8:	learn: 39.0494805	total: 42.7ms	remaining: 2.27s
9:	learn: 39.0485617	total: 47.2ms	remaining: 2.25s
10:	learn: 39.0476456	total: 51.8ms	remaining: 2.24s
11:	learn: 39.0468037	total: 56.5ms	remaining: 2.23s
12:	learn: 39.0460798	total: 61.1ms	remaining: 2.23s
13:	learn: 39.0450575	total: 66ms	remaining: 2.23s
14:	learn: 39.0444575	total: 70.4ms	remaining: 2.22s
15:	learn: 39.0438520	total: 74.9ms	remaining: 2.2s
16:	learn: 39.0434616	total: 79.4ms	remaining: 2.19s
17:	learn: 39.0427571	total: 83.6ms	remaining: 2.18s
18:	learn: 39.0420946	total: 88.1ms	remaining: 2.17s
19:	learn

# Better XGB Hparams + median

In [115]:
import optuna
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import root_mean_squared_error


def objective(trial):
    """Optuna objective for XGBRegressor with a fixed budget of 100 trees.

    Samples tree-shape, sampling and regularisation hyperparameters from
    `trial`, evaluates the configuration with 5-fold cross-validation on
    (X_train, y_train), and returns the mean negative RMSE (higher is better).
    """
    sampled = {
        'max_depth': trial.suggest_int('max_depth', 3, 7),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.01, 1),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'colsample_bylevel': trial.suggest_float("colsample_bylevel", 0.1, 1),
        'colsample_bytree': trial.suggest_float("colsample_bytree", 0.1, 1),
        'colsample_bynode': trial.suggest_float("colsample_bynode", 0.1, 1),
        'reg_alpha': trial.suggest_float("reg_alpha", 0.01, 1),
        'reg_lambda': trial.suggest_float("reg_lambda", 0.01, 1),
    }
    fold_scores = cross_val_score(
        XGBRegressor(n_estimators=100, **sampled),
        X_train,
        y_train,
        cv=5,
        scoring='neg_root_mean_squared_error',
    )
    return fold_scores.mean()


# Run 30 trials, maximising the negative RMSE.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30, show_progress_bar=True)

[I 2025-02-11 00:39:16,790] A new study created in memory with name: no-name-24a99283-6e38-4eb2-8a19-6ffd2e30e566


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2025-02-11 00:39:19,686] Trial 0 finished with value: -39.038745243661126 and parameters: {'max_depth': 4, 'learning_rate': 0.09224412259830052, 'min_child_weight': 0.3730596964683165, 'subsample': 0.5005370492614284, 'colsample_bylevel': 0.11880277163075587, 'colsample_bytree': 0.5572255781113449, 'colsample_bynode': 0.8892721164788003, 'reg_alpha': 0.045955873017652624, 'reg_lambda': 0.9882100358967048}. Best is trial 0 with value: -39.038745243661126.
[I 2025-02-11 00:39:21,880] Trial 1 finished with value: -39.041311502368224 and parameters: {'max_depth': 3, 'learning_rate': 0.04689342308148681, 'min_child_weight': 0.8496130726292312, 'subsample': 0.6228409686648572, 'colsample_bylevel': 0.3974406914391523, 'colsample_bytree': 0.38700152124333154, 'colsample_bynode': 0.9624514499401214, 'reg_alpha': 0.28933034407033864, 'reg_lambda': 0.5653685493441296}. Best is trial 0 with value: -39.038745243661126.
[I 2025-02-11 00:39:24,044] Trial 2 finished with value: -39.046402036518714 

In [None]:
from sklearn.metrics import root_mean_squared_error

# Refit XGBoost (100 trees) with hard-coded hyperparameters.
xgb = XGBRegressor(
    n_estimators=100,
    max_depth=4,
    learning_rate=0.047088991461490495,
    min_child_weight=0.15280074473565008,
    subsample=0.8823981209895368,
    colsample_bytree=0.8872938771302274,
    colsample_bylevel=0.38194915435768984,
    colsample_bynode=0.8920539233861519,
    reg_alpha=0.5677685341501635,
    reg_lambda=0.519321143067285,
)
xgb.fit(X_train, y_train)

# Build the (id, Price) submission frame from the test-set predictions.
test_predicted = xgb.predict(X_test_transformed)
submission = test_id_col.to_frame(name='id').assign(Price=test_predicted)

# The save is currently commented out; uncomment the line below to write it.
# submission.to_csv('submission-xgb-weight_median-testing-hparams.csv', index=False)