In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import seaborn as sns

# Data Preprocessing

In [2]:
df1 = pd.read_csv("/kaggle/input/playground-series-s5e2/training_extra.csv")
df2 = pd.read_csv("/kaggle/input/playground-series-s5e2/train.csv")
df = pd.concat([df1, df2])

In [3]:
df.head()

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,500000,Under Armour,Canvas,Small,10.0,Yes,Yes,Tote,Blue,23.882052,114.11068
1,500001,Puma,Polyester,Small,4.0,No,Yes,Backpack,Green,11.869095,129.74972
2,500002,Jansport,Polyester,Small,8.0,Yes,Yes,Tote,Red,8.092302,21.3737
3,500003,Nike,Nylon,Large,7.0,No,No,Messenger,Pink,7.719581,48.09209
4,500004,Nike,Leather,Large,9.0,No,Yes,Tote,Green,22.741826,77.32461


In [4]:
for col in df.columns.tolist():
    df[col] = df[col].fillna(df[col].mode())

In [5]:
def feature_engineer(df):
    df["is_affordable"] = (df["Brand"] == "Jansport").astype(int)
    df["is_expensive_material"] = df["Material"].isin(["Leather", "Polyester"]).astype(int)
    df["has_laptop_compartment"] = df["Laptop Compartment"].map({"Yes": 1, "No": 0})
    df["is_waterproof"] = df["Waterproof"].map({"Yes": 1, "No": 0})
    df["utility_score"] = df["Compartments"] + df["has_laptop_compartment"]
    df["size_encoded"] = df["Size"].map({"Large": 3, "Medium": 2, "Small": 1})
    return df

In [6]:
df = feature_engineer(df)

In [7]:
features = df.columns.tolist()
features.remove("id")
features.remove("Price")
X = df[features]
y = df.Price

In [8]:
X = pd.get_dummies(X, dtype=float)

In [9]:
y = y.fillna(y.mode())

In [10]:
to_be_normalized = ["Weight Capacity (kg)", "Compartments"]
max_vals = [X[col].max() for col in to_be_normalized]
min_vals = [X[col].min() for col in to_be_normalized]
for i, col in enumerate(to_be_normalized):
    X[col] = (X[col] - min_vals[i]) / (max_vals[i] - min_vals[i])

In [11]:
X_train, X_inter, y_train, y_inter = train_test_split(X, y, test_size=0.3)
X_valid, X_cv, y_valid, y_cv = train_test_split(X_inter, y_inter, test_size=0.5)

# Training a model

In [12]:
model = xgb.XGBRegressor(n_estimators=10_000, early_stopping_rounds=5, max_depth=4, colsample_bytree=0.7, learning_rate=0.1, subsample=0.7)

In [13]:
model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)])

[0]	validation_0-rmse:38.97755
[1]	validation_0-rmse:38.97410
[2]	validation_0-rmse:38.97118
[3]	validation_0-rmse:38.96929
[4]	validation_0-rmse:38.96696
[5]	validation_0-rmse:38.96426
[6]	validation_0-rmse:38.96201
[7]	validation_0-rmse:38.96014
[8]	validation_0-rmse:38.95857
[9]	validation_0-rmse:38.95726
[10]	validation_0-rmse:38.95608
[11]	validation_0-rmse:38.95494
[12]	validation_0-rmse:38.95359
[13]	validation_0-rmse:38.95257
[14]	validation_0-rmse:38.95137
[15]	validation_0-rmse:38.95069
[16]	validation_0-rmse:38.94991
[17]	validation_0-rmse:38.94901
[18]	validation_0-rmse:38.94849
[19]	validation_0-rmse:38.94795
[20]	validation_0-rmse:38.94735
[21]	validation_0-rmse:38.94645
[22]	validation_0-rmse:38.94576
[23]	validation_0-rmse:38.94516
[24]	validation_0-rmse:38.94471
[25]	validation_0-rmse:38.94406
[26]	validation_0-rmse:38.94352
[27]	validation_0-rmse:38.94301
[28]	validation_0-rmse:38.94255
[29]	validation_0-rmse:38.94211
[30]	validation_0-rmse:38.94181
[31]	validation_0-

In [14]:
X_test = pd.read_csv("/kaggle/input/playground-series-s5e2/test.csv")
id_col = X_test.id
X_test = feature_engineer(X_test)
X_test = X_test[features]

In [15]:
X_test = pd.get_dummies(X_test, dtype=float)

In [16]:
for col in X_test.columns.tolist():
    X_test[col] = X_test[col].fillna(X_test[col].mode())

In [17]:
max_vals = [X_test[col].max() for col in to_be_normalized]
min_vals = [X_test[col].min() for col in to_be_normalized]
for i, col in enumerate(to_be_normalized):
    X_test[col] = (X_test[col] - min_vals[i]) / (max_vals[i] - min_vals[i])

In [18]:
preds = model.predict(X_test)

In [19]:
output = pd.DataFrame({
    "id": id_col,
    "Price": preds
})
output.to_csv("submission.csv", index=False)

In [20]:
cv_score = np.sqrt(mean_squared_error(model.predict(X_cv), y_cv))

In [21]:
cv_score

38.85270410848297