# Imports

In [2]:
import pandas as pd
import numpy as np

# Read the training data and replace spaces in column names with underscores
# so every column can be referenced with one consistent naming scheme.
df = (
    pd.read_csv('train.csv')
    .rename(columns=lambda name: name.replace(' ', '_'))
)

In [3]:
# Summarise dtypes and non-null counts per column to see which columns contain
# missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    300000 non-null  int64  
 1   Brand                 290295 non-null  object 
 2   Material              291653 non-null  object 
 3   Size                  293405 non-null  object 
 4   Compartments          300000 non-null  float64
 5   Laptop_Compartment    292556 non-null  object 
 6   Waterproof            292950 non-null  object 
 7   Style                 292030 non-null  object 
 8   Color                 290050 non-null  object 
 9   Weight_Capacity_(kg)  299862 non-null  float64
 10  Price                 300000 non-null  float64
dtypes: float64(3), int64(1), object(7)
memory usage: 25.2+ MB


# Preprocessing function

In [4]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [5]:
# Object-typed columns that are mode-imputed and one-hot encoded by preprocess().
cat_cols = [
    'Brand',
    'Material',
    'Size',
    'Laptop_Compartment',
    'Waterproof',
    'Style',
    'Color',
]

def preprocess(df, fit_df=None):
    """Impute, one-hot encode and scale a raw frame.

    Steps, in order:
      1. ``Weight_Capacity_(kg)`` is median-imputed, the ``cat_cols`` columns are
         imputed with their most frequent value, ``Compartments`` is
         mean-imputed.
      2. The ``cat_cols`` columns are one-hot encoded as integer 0/1 columns.
      3. ``Weight_Capacity_(kg)`` is standardised.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to transform. It is not modified; a processed copy is returned.
    fit_df : pandas.DataFrame, optional
        Frame the imputers, the scaler and the set of dummy columns are learned
        from. Defaults to ``df`` itself. Pass the training frame here when
        transforming held-out data so that it is filled and scaled with the
        training statistics and ends up with the same dummy columns.

    Returns
    -------
    pandas.DataFrame
        Processed copy of ``df``; columns not listed above are passed through
        unchanged.
    """
    reference = df if fit_df is None else fit_df
    weight_col = ['Weight_Capacity_(kg)']
    compartments_col = ['Compartments']

    data_processed = df.copy()

    # Learn every fill value from the reference frame only.
    weight_imputer = SimpleImputer(strategy='median').fit(reference[weight_col])
    frequent_imputer = SimpleImputer(strategy='most_frequent').fit(reference[cat_cols])
    average_imputer = SimpleImputer(strategy='mean').fit(reference[compartments_col])

    data_processed[weight_col] = weight_imputer.transform(data_processed[weight_col])
    data_processed[cat_cols] = frequent_imputer.transform(data_processed[cat_cols])
    data_processed[compartments_col] = average_imputer.transform(data_processed[compartments_col])

    # Pin each categorical column to the categories present in the reference
    # frame so get_dummies emits the same columns, in the same (sorted) order,
    # no matter which categories happen to occur in `df`. Values unseen in the
    # reference frame become NaN and therefore encode as an all-zero row.
    for col in cat_cols:
        known_categories = sorted(reference[col].dropna().unique())
        data_processed[col] = pd.Categorical(data_processed[col], categories=known_categories)

    data_processed = pd.get_dummies(data_processed, columns=cat_cols, drop_first=False, dtype=int)

    # The scaler is fitted on the raw (un-imputed) reference column, exactly as
    # before, and applied to the imputed values.
    weight_scaler = StandardScaler().fit(reference[weight_col])
    data_processed[weight_col] = weight_scaler.transform(data_processed[weight_col])

    return data_processed

# Preprocess and split

In [6]:
# Run the preprocessing on the training frame, then separate the target
# ('Price') from the features; 'id' is dropped from the features as well.
processed_train = preprocess(df)
Y = processed_train['Price']
X = processed_train.drop(columns=['id', 'Price'])

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=13,
)

# Models

In [7]:
from xgboost import XGBRegressor
from sklearn.metrics import root_mean_squared_error
import optuna

def objective_xgboost(trial):
    """Optuna objective: fit an XGBRegressor and return its hold-out RMSE.

    The number of trees and the eval metric are fixed; every other
    hyperparameter below is sampled from ``trial``. The model is fitted on
    ``X_train``/``Y_train`` and scored on ``X_test``/``Y_test``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Root mean squared error of the predictions on ``X_test``.
    """
    params = {
        "n_estimators": 70,
        "eval_metric": "rmse",
        "max_depth": trial.suggest_int("max_depth", 3, 7),
        # suggest_loguniform is deprecated; suggest_float(log=True) samples
        # from the same log-uniform range.
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
        # Must be a float suggestion: suggest_int would truncate the
        # 0.01..1 bounds to integers and only ever try 0 or 1.
        "min_child_weight": trial.suggest_float("min_child_weight", 0.01, 1),
        "subsample": trial.suggest_float("subsample", 0.1, 1.0, log=True),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.1, 1),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1),
        "colsample_bynode": trial.suggest_float("colsample_bynode", 0.1, 1),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.01, 1),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.01, 1),
    }

    xgb = XGBRegressor(**params, enable_categorical=True)
    xgb.fit(X_train, Y_train)
    Y_pred = xgb.predict(X_test)

    return root_mean_squared_error(Y_test, Y_pred)

In [None]:
# Lower Optuna's log level before the study exists so that the study-creation
# message is suppressed along with the per-trial messages.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Seed the sampler so the search proposes the same hyperparameters on every
# Restart & Run All (TPE is also Optuna's default sampler).
study_xgb = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=13),
)
study_xgb.optimize(objective_xgboost, n_trials=50, show_progress_bar=True)

# Only the values suggested through `trial` are stored here; settings fixed
# inside the objective are not part of best_params.
best_params_xgb = study_xgb.best_params

# Submission

In [14]:
# Load the test set and normalise its column names the same way as the
# training data.
df_test = pd.read_csv('test.csv')
df_test.columns = df_test.columns.str.replace(' ', '_')

# best_params_xgb only contains the values suggested during the search, so the
# settings that were fixed inside the objective have to be passed again;
# otherwise the final model would silently fall back to XGBoost's defaults
# (e.g. a different number of trees than the one the search was tuned for).
model_xgb = XGBRegressor(
    n_estimators=70,
    eval_metric="rmse",
    enable_categorical=True,
    **best_params_xgb,
)
model_xgb.fit(X_train, Y_train)

# Keep the ids for the submission file; they are not a model feature.
test_id_col = df_test['id']
df_test = df_test.drop(columns='id')

# Align the test features with the training features: same columns, same
# order. Dummy columns missing from the test set are filled with 0 and columns
# the model never saw are dropped, so predict() gets exactly what fit() got.
df_test_preproc = preprocess(df_test).reindex(columns=X_train.columns, fill_value=0)

test_predicted = model_xgb.predict(df_test_preproc)
submission = pd.DataFrame({
    'id': test_id_col,
    'Price': test_predicted,
})
submission.to_csv('submission-without-pipelines.csv', index=False)