In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e2/sample_submission.csv
/kaggle/input/playground-series-s5e2/train.csv
/kaggle/input/playground-series-s5e2/test.csv
/kaggle/input/playground-series-s5e2/training_extra.csv


In [2]:
import torch

# Check if GPU is available
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"GPU is available: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print("GPU not available, using CPU")


GPU is available: Tesla P100-PCIE-16GB


In [3]:
import pandas as pd
import numpy as np
import optuna
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error

# Load datasets
train_df = pd.read_csv("/kaggle/input/playground-series-s5e2/train.csv")
extra_df = pd.read_csv("/kaggle/input/playground-series-s5e2/training_extra.csv")
test_df = pd.read_csv("/kaggle/input/playground-series-s5e2/test.csv")

# Merge train data with extra training data
train_df = pd.concat([train_df, extra_df], ignore_index=True)

# Handle missing values for categorical columns using assignment (avoid chained assignment)
for col in train_df.select_dtypes(include=['object']).columns:
    mode_val = train_df[col].mode()[0]
    train_df[col] = train_df[col].fillna(mode_val)
    if col in test_df.columns:
        test_mode = test_df[col].mode()[0]
        test_df[col] = test_df[col].fillna(test_mode)

# Handle missing values for numeric columns
for col in train_df.select_dtypes(include=['number']).columns:
    median_val = train_df[col].median()
    train_df[col] = train_df[col].fillna(median_val)
    if col in test_df.columns:
        test_median = test_df[col].median()
        test_df[col] = test_df[col].fillna(test_median)

# Encode categorical features
encoder = LabelEncoder()
categorical_cols = ['Brand', 'Material', 'Size', 'Laptop Compartment', 'Waterproof', 'Style', 'Color']

for col in categorical_cols:
    train_df[col] = encoder.fit_transform(train_df[col])
    test_df[col] = encoder.transform(test_df[col])

# Selecting features and target
X = train_df.drop(columns=['id', 'Price'])
y = train_df['Price']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Optuna optimization function
def objective(trial):
    params = {
        'objective': 'reg:squarederror',
        'n_estimators': trial.suggest_int('n_estimators', 200, 1500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-2, 10.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-2, 10.0),
        'tree_method': 'gpu_hist'  # Use GPU acceleration
    }
    
    model = xgb.XGBRegressor(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    mse = mean_squared_error(y_val, y_pred)
    return np.sqrt(mse)

# Run Optuna optimization
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30)

# Train best model with GPU support
best_params = study.best_params
best_params['tree_method'] = 'gpu_hist'  # Ensure GPU usage

model = xgb.XGBRegressor(**best_params)
model.fit(X_train, y_train)

# Evaluate model
y_pred = model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print("Root Mean Squared Error:", rmse)

# Predict on test data
test_X = test_df.drop(columns=['id'])
test_preds = model.predict(test_X)

# Prepare submission file
submission = pd.DataFrame({'id': test_df['id'], 'Price': test_preds})
submission.to_csv("submission.csv", index=False)

print("Submission file created successfully.")


[I 2025-02-28 13:19:44,833] A new study created in memory with name: no-name-4415026a-0da9-4db9-b728-f9210ee8b039

    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


[I 2025-02-28 13:20:11,761] Trial 0 finished with value: 38.87853380762206 and parameters: {'n_estimators': 1455, 'learning_rate': 0.016161798438431244, 'max_depth': 3, 'subsample': 0.8099958014019082, 'colsample_bytree': 0.5469125173603444, 'reg_lambda': 4.028947027170934, 'reg_alpha': 5.6596409974760755}. Best is trial 0 with value: 38.87853380762206.

    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"

[I 2025-02-28 13:20:25,565] Trial 1 finished with value: 39.00830711659979 and parameters: {'n_estimators': 386, 'learning_rate': 0.13468313218007483, 'max_depth': 9, 'subsample':

Root Mean Squared Error: 38.87594995696349
Submission file created successfully.
