## Imports

In [55]:
import pandas as pd
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer

df = pd.read_csv('train.csv')
df = df.dropna(subset=['Style', 'Color', 'Brand', 'Material'])
df_test = pd.read_csv('test.csv')
df.columns = df.columns.str.replace(' ', '_')
df_test.columns = df_test.columns.str.replace(' ', '_')


In [56]:
print(df.isna().sum())
print(df.shape)

id                         0
Brand                      0
Material                   0
Size                    6101
Compartments               0
Laptop_Compartment      6722
Waterproof              6510
Style                      0
Color                      0
Weight_Capacity_(kg)      48
Price                      0
dtype: int64
(265771, 11)


## Pipelines

In [57]:
impute_onehot = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])
impute_onehot.set_output(transform="pandas")
impute_onehot_cols = ['Style', 'Color', 'Brand', 'Material']

size_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Medium')),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
]) 
size_pipeline.set_output(transform="pandas")
size_pipeline_cols = ['Size']

yes_no_pipeline = Pipeline([
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
]) 
yes_no_pipeline.set_output(transform="pandas")
yes_no_pipeline_cols = ['Waterproof', 'Laptop_Compartment']

weight_pipeline = Pipeline([
    ('medianimputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
weight_pipeline.set_output(transform="pandas")
weight_pipeline_cols = ['Weight_Capacity_(kg)']

compartments_pipeline = Pipeline([
    ('imputer', KNNImputer(n_neighbors=3)), 
    ('scaler', StandardScaler())
])
compartments_pipeline.set_output(transform='pandas')
compartments_pipeline_cols = ['Compartments']


## Column Transformer

In [58]:
preprocessor = ColumnTransformer([
    ('style_color_brand_material', impute_onehot, impute_onehot_cols),
    ('size', size_pipeline, size_pipeline_cols),
    ('waterproof_laptopcompartment', yes_no_pipeline, yes_no_pipeline_cols),
    ('weight_pipeline', weight_pipeline, weight_pipeline_cols),
    ('compartments', compartments_pipeline, compartments_pipeline_cols)
])
preprocessor.set_output(transform='pandas')

## Split

In [59]:
from sklearn.model_selection import train_test_split

id_col = 'id'
target_col = 'Price'
y = df[target_col]
X = df.drop([id_col,target_col],axis=1)

X_transformed = preprocessor.fit_transform(X)

X_train, X_val, y_train, y_val = train_test_split(X_transformed, y, test_size=0.2, random_state=13)

# Simple models
### Wybór modeli:
- XGBRegressor
- LightGBM
- XGBoost
- CatBoost
- KNN Imputer (?)


# Prepare Test Data

In [60]:
test_id_col=df_test.id
X_test= df_test.drop('id',axis=1)
X_test_transformed = preprocessor.fit_transform(X_test)

# XGBRegressor + Optuna (Basic)

In [33]:
import optuna
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import root_mean_squared_error

def objective(trial):
    max_depth=trial.suggest_int('max_depth',3,7)
    learning_rate=trial.suggest_float('learning_rate', 0.01,0.1)
    min_child_weight = trial.suggest_float('min_child_weight', 0.01,1)
    subsample = trial.suggest_float('subsample', 0.1,1.0)
    colsample_bylevel= trial.suggest_float("colsample_bylevel", 0.1, 1)
    colsample_bytree= trial.suggest_float("colsample_bytree", 0.1, 1)
    colsample_bynode= trial.suggest_float("colsample_bynode", 0.1, 1)
    reg_alpha= trial.suggest_float("reg_alpha", 0.01, 1)
    reg_lambda= trial.suggest_float("reg_lambda", 0.01, 1)

    model = XGBRegressor(n_estimators=100,
                         max_depth=max_depth,
                         learning_rate=learning_rate,
                         min_child_weight=min_child_weight,
                         subsample=subsample,
                         colsample_bylevel=colsample_bylevel,
                         colsample_bynode=colsample_bynode,
                         colsample_bytree=colsample_bytree,
                         reg_alpha=reg_alpha,
                         reg_lambda=reg_lambda)
    score = cross_val_score(model, X_train,y_train, cv=5, scoring='neg_root_mean_squared_error')
    return score.mean()

study =optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30, show_progress_bar=True )

# [I 2025-02-11 00:06:14,659] Trial 23 finished with value: -39.0235015574814 and parameters: {'max_depth': 4, 'learning_rate': 0.047088991461490495, 'min_child_weight': 0.15280074473565008, 'subsample': 0.8823981209895368, 'colsample_bylevel': 0.38194915435768984, 'colsample_bytree': 0.8872938771302274, 'colsample_bynode': 0.8920539233861519, 'reg_alpha': 0.5677685341501635, 'reg_lambda': 0.519321143067285}. Best is trial 23 with value: -39.0235015574814.



[I 2025-02-11 00:18:03,831] A new study created in memory with name: no-name-f536a329-e97a-4d2a-b4ae-c75f120062d0


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2025-02-11 00:18:06,180] Trial 0 finished with value: -38.93352499681244 and parameters: {'max_depth': 4, 'learning_rate': 0.09738165129336883, 'min_child_weight': 0.2953756913063758, 'subsample': 0.7582721732175878, 'colsample_bylevel': 0.9834579108629342, 'colsample_bytree': 0.7699614459671464, 'colsample_bynode': 0.9782418105940006, 'reg_alpha': 0.9574220931075091, 'reg_lambda': 0.30182695674706067}. Best is trial 0 with value: -38.93352499681244.
[I 2025-02-11 00:18:08,246] Trial 1 finished with value: -38.924711406752 and parameters: {'max_depth': 3, 'learning_rate': 0.03299787637727764, 'min_child_weight': 0.7797357630455212, 'subsample': 0.5710795201240312, 'colsample_bylevel': 0.9271931247022445, 'colsample_bytree': 0.9842634846356743, 'colsample_bynode': 0.2911023112880643, 'reg_alpha': 0.06746931789427571, 'reg_lambda': 0.9619605957355585}. Best is trial 1 with value: -38.924711406752.
[I 2025-02-11 00:18:10,524] Trial 2 finished with value: -38.92962777603618 and paramete

In [61]:
xgb = XGBRegressor(n_estimators=100,
                         max_depth=4,
                         learning_rate=0.047088991461490495,
                         min_child_weight=0.15280074473565008,
                         subsample=0.8823981209895368,
                         colsample_bylevel=0.38194915435768984,
                         colsample_bynode=0.8920539233861519,
                         colsample_bytree=0.8872938771302274,
                         reg_alpha=0.5677685341501635,
                         reg_lambda=0.519321143067285)
xgb.fit(X_train, y_train)
test_predicted = xgb.predict(X_test_transformed)
submission = pd.DataFrame({
    'id': test_id_col,
    'Price': test_predicted
})
submission.to_csv('submission-xgb-weight_median-testing-hparams.csv', index=False)

ValueError: feature_names mismatch: ['style_color_brand_material__Style_Backpack', 'style_color_brand_material__Style_Messenger', 'style_color_brand_material__Style_Tote', 'style_color_brand_material__Color_Black', 'style_color_brand_material__Color_Blue', 'style_color_brand_material__Color_Gray', 'style_color_brand_material__Color_Green', 'style_color_brand_material__Color_Pink', 'style_color_brand_material__Color_Red', 'style_color_brand_material__Brand_Adidas', 'style_color_brand_material__Brand_Jansport', 'style_color_brand_material__Brand_Nike', 'style_color_brand_material__Brand_Puma', 'style_color_brand_material__Brand_Under Armour', 'style_color_brand_material__Material_Canvas', 'style_color_brand_material__Material_Leather', 'style_color_brand_material__Material_Nylon', 'style_color_brand_material__Material_Polyester', 'size__Size', 'waterproof_laptopcompartment__Waterproof', 'waterproof_laptopcompartment__Laptop_Compartment', 'weight_pipeline__Weight_Capacity_(kg)', 'compartments__Compartments'] ['style_color_brand_material__Style_Backpack', 'style_color_brand_material__Style_Messenger', 'style_color_brand_material__Style_Tote', 'style_color_brand_material__Style_nan', 'style_color_brand_material__Color_Black', 'style_color_brand_material__Color_Blue', 'style_color_brand_material__Color_Gray', 'style_color_brand_material__Color_Green', 'style_color_brand_material__Color_Pink', 'style_color_brand_material__Color_Red', 'style_color_brand_material__Color_nan', 'style_color_brand_material__Brand_Adidas', 'style_color_brand_material__Brand_Jansport', 'style_color_brand_material__Brand_Nike', 'style_color_brand_material__Brand_Puma', 'style_color_brand_material__Brand_Under Armour', 'style_color_brand_material__Brand_nan', 'style_color_brand_material__Material_Canvas', 'style_color_brand_material__Material_Leather', 'style_color_brand_material__Material_Nylon', 'style_color_brand_material__Material_Polyester', 'style_color_brand_material__Material_nan', 'size__Size', 'waterproof_laptopcompartment__Waterproof', 'waterproof_laptopcompartment__Laptop_Compartment', 'weight_pipeline__Weight_Capacity_(kg)', 'compartments__Compartments']
training data did not have the following fields: style_color_brand_material__Style_nan, style_color_brand_material__Color_nan, style_color_brand_material__Brand_nan, style_color_brand_material__Material_nan

# LightGBM

In [19]:
from lightgbm import LGBMRegressor
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.metrics import root_mean_squared_error

def objective(trial):
    n_estimators=trial.suggest_int('n_estimators',100,800)
    max_depth=trial.suggest_int('max_depth',3,15)
    num_leaves=trial.suggest_int('num_leaves', 2,100)
    subsample=trial.suggest_float('subsample',0.5,1.0)
    learning_rate=trial.suggest_float('learning_rate',0.001,0.1)
    colsample_bytree=trial.suggest_float('colsample_bytree',0.5,1.0)



    model = LGBMRegressor(n_estimators=n_estimators,
                         max_depth=max_depth,
                         learning_rate=learning_rate,
                         num_leaves=num_leaves,
                         subsample=subsample,
                         colsample_bytree=colsample_bytree)
    score = cross_val_score(model, X_train,y_train, cv=5, scoring='neg_root_mean_squared_error')
    return score.mean()

study =optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=40, show_progress_bar=True )
# 'n_estimators': 356, 'max_depth': 10, 'num_leaves': 7, 'subsample': 0.7426142421600143, 'learning_rate': 0.02684749252385173, 'colsample_bytree': 0.655631790885152}. Best is trial 39 with value: -39.01980710390345.

[I 2025-02-10 23:29:47,626] A new study created in memory with name: no-name-314eff12-7d27-48fa-8f0e-4bfdb3cf1e2a


  0%|          | 0/40 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002018 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 320
[LightGBM] [Info] Number of data points in the train set: 192000, number of used features: 27
[LightGBM] [Info] Start training from score 81.378871
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002171 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 320
[LightGBM] [Info] Number of data points in the train set: 192000, number of used features: 27
[LightGBM] [Info] Start training from score 81.491740
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002116 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not eno

In [20]:
model = LGBMRegressor(n_estimators=356,
                         max_depth=10,
                         learning_rate=0.02684749252385173,
                         num_leaves=7,
                         subsample=0.7426142421600143,
                         colsample_bytree=0.655631790885152)
model.fit(X_train,y_train)
model.predict(X_test_transformed)
submission = pd.DataFrame({
    'id': test_id_col,
    'Price': test_predicted
})
submission.to_csv('submission_lgbm_deletednulls.csv', index=False)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002173 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 320
[LightGBM] [Info] Number of data points in the train set: 240000, number of used features: 27
[LightGBM] [Info] Start training from score 81.419619


# CatBoost

In [None]:
from catboost import CatBoostRegressor
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.metrics import root_mean_squared_error

def objective(trial):
    n_estimators=trial.suggest_int('n_estimators',100,1000)
    depth=trial.suggest_int('depth',4,12)
    l2_leaf_reg=trial.suggest_int('l2_leaf_reg',1,10)
    learning_rate=trial.suggest_float('learning_rate', 0.001,0.3)
    subsample=trial.suggest_float('subsample', 0.5,1.0)
    min_data_in_leaf=trial.suggest_int('min_data_in_leaf',1,50)



    model = CatBoostRegressor(n_estimators=n_estimators,
                         depth=depth,
                         learning_rate=learning_rate,
                         l2_leaf_reg=l2_leaf_reg,
                         subsample=subsample,
                         min_data_in_leaf=min_data_in_leaf)
    score = cross_val_score(model, X_train,y_train, cv=5, scoring='neg_root_mean_squared_error')
    return score.mean()

study =optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=60, show_progress_bar=True )
# Trial 54 finished with value: -39.01813791433403 and parameters: {'n_estimators': 159, 'depth': 4, 'l2_leaf_reg': 3, 'learning_rate': 0.10037409967024077, 'subsample': 0.9759101072912689, 'min_data_in_leaf': 44}. Best is trial 54 with value: -39.01813791433403.

In [23]:
model = CatBoostRegressor(n_estimators=159,
                         depth=4,
                         learning_rate=0.10037409967024077,
                         l2_leaf_reg=3,
                         subsample=0.9759101072912689,
                         min_data_in_leaf=44)
model.fit(X_train,y_train)
model.predict(X_test_transformed)
submission = pd.DataFrame({
    'id': test_id_col,
    'Price': test_predicted
})
submission.to_csv('submission_catboost_nulldropping.csv', index=False)

0:	learn: 39.0558433	total: 7.61ms	remaining: 1.2s
1:	learn: 39.0519520	total: 14.3ms	remaining: 1.12s
2:	learn: 39.0491271	total: 20ms	remaining: 1.04s
3:	learn: 39.0459779	total: 26ms	remaining: 1s
4:	learn: 39.0426335	total: 32.3ms	remaining: 996ms
5:	learn: 39.0400634	total: 38.2ms	remaining: 975ms
6:	learn: 39.0376313	total: 43.9ms	remaining: 953ms
7:	learn: 39.0361997	total: 49.7ms	remaining: 938ms
8:	learn: 39.0341793	total: 55.1ms	remaining: 919ms
9:	learn: 39.0328555	total: 60.9ms	remaining: 907ms
10:	learn: 39.0316032	total: 66.8ms	remaining: 898ms
11:	learn: 39.0306221	total: 72.6ms	remaining: 890ms
12:	learn: 39.0294985	total: 78.4ms	remaining: 880ms
13:	learn: 39.0282743	total: 83.8ms	remaining: 868ms
14:	learn: 39.0272559	total: 89.3ms	remaining: 857ms
15:	learn: 39.0262282	total: 95.8ms	remaining: 856ms
16:	learn: 39.0253401	total: 102ms	remaining: 851ms
17:	learn: 39.0249375	total: 107ms	remaining: 841ms
18:	learn: 39.0239069	total: 113ms	remaining: 835ms
19:	learn: 39.