## Imports

In [9]:
import pandas as pd
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer

# Load the training data, discard every row that contains a missing value,
# and replace spaces in the column names with underscores.
df = (
    pd.read_csv('train.csv')
    .dropna()
    .rename(columns=lambda name: name.replace(' ', '_'))
)

# The test data keeps all of its rows; only the column names are normalised.
df_test = pd.read_csv('test.csv').rename(
    columns=lambda name: name.replace(' ', '_')
)


In [13]:
# Sanity check on the cleaned training frame: per-column count of missing
# values, followed by the (rows, columns) shape.
print(df.isna().sum(), df.shape, sep='\n')

id                      0
Brand                   0
Material                0
Size                    0
Compartments            0
Laptop_Compartment      0
Waterproof              0
Style                   0
Color                   0
Weight_Capacity_(kg)    0
Price                   0
dtype: int64
(246686, 11)


## Pipelines

In [3]:
# Per-column-group preprocessing. Every group gets a single-step Pipeline that
# is configured to return pandas DataFrames, plus a list naming its columns.

# Nominal categoricals -> dense one-hot columns; unknown categories are ignored
# at transform time. Despite the name, this pipeline has no imputation step.
impute_onehot_cols = ['Style', 'Color', 'Brand', 'Material']
impute_onehot = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))]
).set_output(transform="pandas")

# Size -> integer codes; unknown categories are encoded as -1.
# NOTE(review): no explicit `categories` order is given, so the codes follow
# whatever order the encoder infers — confirm that matches the intended size order.
size_pipeline_cols = ['Size']
size_pipeline = Pipeline(
    steps=[('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))]
).set_output(transform="pandas")

# Two-valued flags -> integer codes; unknown categories are encoded as -1.
yes_no_pipeline_cols = ['Waterproof', 'Laptop_Compartment']
yes_no_pipeline = Pipeline(
    steps=[('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))]
).set_output(transform="pandas")

# Weight capacity -> standardised.
weight_pipeline_cols = ['Weight_Capacity_(kg)']
weight_pipeline = Pipeline(
    steps=[('scaler', StandardScaler())]
).set_output(transform="pandas")

# Number of compartments -> standardised.
compartments_pipeline_cols = ['Compartments']
compartments_pipeline = Pipeline(
    steps=[('scaler', StandardScaler())]
).set_output(transform='pandas')


## Column Transformer

In [4]:
# Combine the per-group pipelines into one preprocessing step.
# Each entry is (name, transformer, columns the transformer is applied to).
preprocessor = ColumnTransformer(
    transformers=[
        ('style_color_brand_material', impute_onehot, impute_onehot_cols),
        ('size', size_pipeline, size_pipeline_cols),
        ('waterproof_laptopcompartment', yes_no_pipeline, yes_no_pipeline_cols),
        ('weight_pipeline', weight_pipeline, weight_pipeline_cols),
        ('compartments', compartments_pipeline, compartments_pipeline_cols),
    ]
)
# Ask for DataFrame output from the combined transformer as well.
preprocessor.set_output(transform='pandas')

## Split

In [5]:
from sklearn.model_selection import train_test_split

# Column roles: `id` is dropped from the features, `Price` is the target.
id_col = 'id'
target_col = 'Price'

X = df.drop(columns=[id_col, target_col])
y = df[target_col]

# NOTE(review): the preprocessor is fitted on all training rows before the
# split, so the validation rows also contribute to the fitted encoders/scalers.
X_transformed = preprocessor.fit_transform(X)

# Hold out 20% of the rows for validation, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X_transformed,
    y,
    test_size=0.2,
    random_state=13,
)

# Simple models
### Wybór modeli:
- XGBRegressor
- LightGBM
- XGBoost
- CatBoost
- KNN Imputer (?)


# Prepare Test Data

In [6]:
# Keep the test ids for the submission files and build the test feature frame.
test_id_col = df_test.id
X_test = df_test.drop('id', axis=1)

# Use transform, not fit_transform: the preprocessor has already been fitted on
# the training features. Re-fitting it here would learn the encoder categories
# and scaler statistics from the test set, so the resulting columns and scales
# would no longer match the features the models are trained on.
X_test_transformed = preprocessor.transform(X_test)

# XGBRegressor + Optuna (Basic)

In [None]:
import optuna
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import root_mean_squared_error

def objective(trial):
    """Optuna objective for XGBRegressor.

    Samples one hyperparameter configuration from `trial`, evaluates it with
    5-fold cross-validation on (X_train, y_train) and returns the mean of the
    `neg_root_mean_squared_error` scores (higher is better).
    """
    # Suggestion order is kept stable: n_estimators, max_depth, learning_rate,
    # min_child_weight, gamma, subsample, alpha.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 800),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_int('gamma', 0, 5),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'alpha': trial.suggest_int('alpha', 0, 10),
    }
    cv_scores = cross_val_score(
        XGBRegressor(**params),
        X_train,
        y_train,
        cv=5,
        scoring='neg_root_mean_squared_error',
    )
    return cv_scores.mean()


# The score is a negated RMSE, so the study maximises it.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30, show_progress_bar=True)

# Trial 25 finished with value: -39.02103061463565 and parameters: {'n_estimators': 307, 'max_depth': 3, 'learning_rate': 0.027818281440946092, 'min_child_weight': 6, 'gamma': 2, 'subsample': 0.7384478387420391, 'alpha': 3}. Best is trial 25 with value: -39.02103061463565.


In [15]:
# Refit XGBoost with the hyperparameters hard-coded from the best Optuna trial,
# predict the test set and write the submission file.
xgb = XGBRegressor(
    n_estimators=307,
    max_depth=3,
    learning_rate=0.027818281440946092,
    min_child_weight=6,
    gamma=2,
    subsample=0.7384478387420391,
    alpha=3,
)
xgb.fit(X_train, y_train)
test_predicted = xgb.predict(X_test_transformed)

submission = pd.DataFrame({'id': test_id_col, 'Price': test_predicted})
submission.to_csv('submission-xgb-deletenulls.csv', index=False)

# LightGBM

In [19]:
from lightgbm import LGBMRegressor
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.metrics import root_mean_squared_error

def objective(trial):
    """Optuna objective for LGBMRegressor.

    Samples one hyperparameter configuration from `trial`, evaluates it with
    5-fold cross-validation on (X_train, y_train) and returns the mean of the
    `neg_root_mean_squared_error` scores (higher is better).

    NOTE(review): LightGBM documents that bagging (`subsample`) is only applied
    when `subsample_freq` is non-zero; it is not set here — confirm whether
    tuning `subsample` has any effect.
    """
    # Suggestion order is kept stable: n_estimators, max_depth, num_leaves,
    # subsample, learning_rate, colsample_bytree.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 800),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'num_leaves': trial.suggest_int('num_leaves', 2, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
    }
    cv_scores = cross_val_score(
        LGBMRegressor(**params),
        X_train,
        y_train,
        cv=5,
        scoring='neg_root_mean_squared_error',
    )
    return cv_scores.mean()


# The score is a negated RMSE, so the study maximises it.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=40, show_progress_bar=True)
# 'n_estimators': 356, 'max_depth': 10, 'num_leaves': 7, 'subsample': 0.7426142421600143, 'learning_rate': 0.02684749252385173, 'colsample_bytree': 0.655631790885152}. Best is trial 39 with value: -39.01980710390345.

[I 2025-02-10 23:29:47,626] A new study created in memory with name: no-name-314eff12-7d27-48fa-8f0e-4bfdb3cf1e2a


  0%|          | 0/40 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002018 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 320
[LightGBM] [Info] Number of data points in the train set: 192000, number of used features: 27
[LightGBM] [Info] Start training from score 81.378871
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002171 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 320
[LightGBM] [Info] Number of data points in the train set: 192000, number of used features: 27
[LightGBM] [Info] Start training from score 81.491740
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002116 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not eno

In [20]:
# Refit LightGBM with the hyperparameters hard-coded from the Optuna search,
# predict the test set and write the submission file.
model = LGBMRegressor(n_estimators=356,
                      max_depth=10,
                      learning_rate=0.02684749252385173,
                      num_leaves=7,
                      subsample=0.7426142421600143,
                      colsample_bytree=0.655631790885152)
model.fit(X_train, y_train)

# Keep this model's predictions. Previously the result of predict() was
# discarded and the file was written from `test_predicted`, i.e. the XGBoost
# predictions left over from an earlier cell.
lgbm_predicted = model.predict(X_test_transformed)
submission = pd.DataFrame({
    'id': test_id_col,
    'Price': lgbm_predicted
})
submission.to_csv('submission_lgbm_deletednulls.csv', index=False)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002173 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 320
[LightGBM] [Info] Number of data points in the train set: 240000, number of used features: 27
[LightGBM] [Info] Start training from score 81.419619


# CatBoost

In [21]:
from catboost import CatBoostRegressor
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.metrics import root_mean_squared_error

def objective(trial):
    """Optuna objective for CatBoostRegressor.

    Samples one hyperparameter configuration from `trial`, evaluates it with
    5-fold cross-validation on (X_train, y_train) and returns the mean of the
    `neg_root_mean_squared_error` scores (higher is better).
    """
    # Suggestion order is kept stable: n_estimators, depth, l2_leaf_reg,
    # learning_rate, subsample, min_data_in_leaf.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'depth': trial.suggest_int('depth', 4, 12),
        'l2_leaf_reg': trial.suggest_int('l2_leaf_reg', 1, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 50),
    }
    cv_scores = cross_val_score(
        CatBoostRegressor(**params),
        X_train,
        y_train,
        cv=5,
        scoring='neg_root_mean_squared_error',
    )
    return cv_scores.mean()


# The score is a negated RMSE, so the study maximises it.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=60, show_progress_bar=True)
# n_estimators': 487, 'depth': 4, 'l2_leaf_reg': 4, 'learning_rate': 0.05228794998549306, 'subsample': 0.510891754041839, 'min_data_in_leaf': 22

[I 2025-02-10 23:37:06,413] A new study created in memory with name: no-name-d7f35a79-fc70-4e64-a4fb-7cc6c7e19b2d


  0%|          | 0/60 [00:00<?, ?it/s]

0:	learn: 39.0356310	total: 12.2ms	remaining: 3.19s
1:	learn: 39.0258007	total: 25.6ms	remaining: 3.33s
2:	learn: 39.0146605	total: 38.2ms	remaining: 3.29s
3:	learn: 39.0061509	total: 50.2ms	remaining: 3.24s
4:	learn: 38.9960321	total: 62.7ms	remaining: 3.22s
5:	learn: 38.9879645	total: 75ms	remaining: 3.2s
6:	learn: 38.9791668	total: 87ms	remaining: 3.17s
7:	learn: 38.9696825	total: 98.7ms	remaining: 3.13s
8:	learn: 38.9582612	total: 110ms	remaining: 3.1s
9:	learn: 38.9519462	total: 122ms	remaining: 3.09s
10:	learn: 38.9459036	total: 135ms	remaining: 3.09s
11:	learn: 38.9364107	total: 147ms	remaining: 3.06s
12:	learn: 38.9274137	total: 159ms	remaining: 3.05s
13:	learn: 38.9160884	total: 171ms	remaining: 3.02s
14:	learn: 38.9044351	total: 182ms	remaining: 3s
15:	learn: 38.8979017	total: 194ms	remaining: 2.99s
16:	learn: 38.8900387	total: 206ms	remaining: 2.98s
17:	learn: 38.8840245	total: 218ms	remaining: 2.95s
18:	learn: 38.8728625	total: 231ms	remaining: 2.95s
19:	learn: 38.8663088	t

In [None]:
# Refit CatBoost with the (rounded) hyperparameters hard-coded from the Optuna
# search, predict the test set and write the submission file.
model = CatBoostRegressor(n_estimators=487,
                          depth=4,
                          learning_rate=0.05228,
                          l2_leaf_reg=4,
                          subsample=0.51089,
                          min_data_in_leaf=22)
model.fit(X_train, y_train)

# Keep this model's predictions. Previously the result of predict() was
# discarded and the file was written from `test_predicted`, i.e. predictions
# left over from an earlier cell rather than from CatBoost.
catboost_predicted = model.predict(X_test_transformed)
submission = pd.DataFrame({
    'id': test_id_col,
    'Price': catboost_predicted
})
submission.to_csv('submission_catboost.csv', index=False)

0:	learn: 39.0587670	total: 4.54ms	remaining: 2.21s
1:	learn: 39.0570636	total: 9ms	remaining: 2.18s
2:	learn: 39.0558944	total: 14.4ms	remaining: 2.33s
3:	learn: 39.0543419	total: 19.2ms	remaining: 2.31s
4:	learn: 39.0531532	total: 23.6ms	remaining: 2.27s
5:	learn: 39.0521239	total: 28.3ms	remaining: 2.27s
6:	learn: 39.0511513	total: 33.6ms	remaining: 2.31s
7:	learn: 39.0502561	total: 37.9ms	remaining: 2.27s
8:	learn: 39.0494805	total: 42.7ms	remaining: 2.27s
9:	learn: 39.0485617	total: 47.2ms	remaining: 2.25s
10:	learn: 39.0476456	total: 51.8ms	remaining: 2.24s
11:	learn: 39.0468037	total: 56.5ms	remaining: 2.23s
12:	learn: 39.0460798	total: 61.1ms	remaining: 2.23s
13:	learn: 39.0450575	total: 66ms	remaining: 2.23s
14:	learn: 39.0444575	total: 70.4ms	remaining: 2.22s
15:	learn: 39.0438520	total: 74.9ms	remaining: 2.2s
16:	learn: 39.0434616	total: 79.4ms	remaining: 2.19s
17:	learn: 39.0427571	total: 83.6ms	remaining: 2.18s
18:	learn: 39.0420946	total: 88.1ms	remaining: 2.17s
19:	learn