<h2>XGB_OP01 Submission 01 (39.13624)</h2>

In [1]:
import pandas as pd
import numpy as np
import optuna
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the training and test sets from CSV files in the working directory
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [3]:
# Handle missing values
# Impute every column with its most frequent value. The fill values are
# computed once from the training data and reused for the test data, so both
# frames are imputed identically (using the test set's own modes could replace
# the same missing category with a different value than in training).
train_modes = train_df.mode().iloc[0]
train_df = train_df.fillna(train_modes)
# Only pass fill values for columns the test frame actually has
# (presumably it lacks the 'Price' target - verify against test.csv).
test_df = test_df.fillna(train_modes[train_modes.index.intersection(test_df.columns)])

In [4]:
# Encode categorical features
categorical_cols = ['Brand', 'Material', 'Size', 'Laptop Compartment', 'Waterproof', 'Style', 'Color']

# One fitted encoder per column, kept so integer codes can be mapped back to
# the original labels later via encoders[col].inverse_transform(...).
encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    # Fit on the train and test values together: an encoder fitted on train
    # alone raises ValueError in transform() as soon as the test column holds
    # a category that never appears in train. When test has no unseen
    # categories the resulting codes are the same as fitting on train only.
    encoder.fit(pd.concat([train_df[col], test_df[col]], ignore_index=True))
    train_df[col] = encoder.transform(train_df[col])
    test_df[col] = encoder.transform(test_df[col])
    encoders[col] = encoder

In [5]:
# Separate the target from the predictors, then hold out 20% of the rows
# for validation (fixed seed so the split is repeatable).
y = train_df['Price']
X = train_df.drop(columns=['id', 'Price'])
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Optuna optimization function
def objective(trial):
    """Train one XGBoost candidate and score it on the validation split.

    Samples n_estimators, learning_rate, max_depth, subsample and
    colsample_bytree from ``trial``, fits an XGBRegressor on
    (X_train, y_train) and returns the root mean squared error of its
    predictions on (X_val, y_val).
    """
    # Keyword arguments are evaluated in order, so the trial is asked for
    # its suggestions in the sequence listed here.
    search_space = dict(
        objective='reg:squarederror',
        n_estimators=trial.suggest_int('n_estimators', 100, 1000),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
        max_depth=trial.suggest_int('max_depth', 3, 10),
        subsample=trial.suggest_float('subsample', 0.5, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.5, 1.0),
    )
    candidate = xgb.XGBRegressor(**search_space)
    candidate.fit(X_train, y_train)
    val_mse = mean_squared_error(y_val, candidate.predict(X_val))
    return np.sqrt(val_mse)

# Seed the sampler so the sequence of suggested hyperparameters (and therefore
# the best trial) is the same on every Restart & Run All. TPESampler is the
# sampler Optuna uses by default; only the seed is new.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=20)

[I 2025-02-26 21:41:29,780] A new study created in memory with name: no-name-b1689b9e-432d-4c0a-967a-697f192334d4
[I 2025-02-26 21:41:39,135] Trial 0 finished with value: 39.14952148137665 and parameters: {'n_estimators': 493, 'learning_rate': 0.10789640416607911, 'max_depth': 6, 'subsample': 0.549892037593511, 'colsample_bytree': 0.6738772378466602}. Best is trial 0 with value: 39.14952148137665.
[I 2025-02-26 21:41:50,119] Trial 1 finished with value: 38.919568772740924 and parameters: {'n_estimators': 824, 'learning_rate': 0.03021434343321336, 'max_depth': 3, 'subsample': 0.9761122754175016, 'colsample_bytree': 0.9667063596286796}. Best is trial 1 with value: 38.919568772740924.
[I 2025-02-26 21:42:03,736] Trial 2 finished with value: 39.826521953442864 and parameters: {'n_estimators': 712, 'learning_rate': 0.29642275051880757, 'max_depth': 6, 'subsample': 0.5034110893107357, 'colsample_bytree': 0.6070878892067743}. Best is trial 1 with value: 38.919568772740924.
[I 2025-02-26 21:42

In [7]:
# Refit an XGBoost regressor on the training split using the
# hyperparameters of the best Optuna trial.
best_params = dict(study.best_params)
model = xgb.XGBRegressor(**best_params)
model.fit(X=X_train, y=y_train)

In [8]:
# Score the fitted model on the hold-out validation split
y_pred = model.predict(X_val)
mse = mean_squared_error(y_true=y_val, y_pred=y_pred)
rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)

Root Mean Squared Error: 38.91568463999751


In [9]:
# Generate predictions for the test set; 'id' is an identifier, not a feature
test_X = test_df.drop('id', axis=1)
test_preds = model.predict(test_X)

In [10]:
# Build the two-column submission table (id, predicted Price) and write it out
submission = pd.DataFrame({'id': test_df['id']})
submission['Price'] = test_preds
submission.to_csv("XGB_OP01.csv", index=False)

print("Submission file created successfully.")

Submission file created successfully.
