In [1]:
import pandas as pd
import numpy as np

from xgboost import XGBRegressor

from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split

In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

n_train = df_train.shape[0]

df = pd.concat([df_train, df_test], axis=0).reset_index(drop=True)

In [3]:
excluded_columns = ['id']
target_column = 'Price'

onehotencode_columns = [
    "Brand",
    "Material",
    "Style",
    "Color",

]

ordinal_columns = {
    "Size": ["Small", "Medium", "Large"],
}

# Generally use for columns with large range, but still relatively small number of unique values
labelencode_columns = [
    "Laptop Compartment",
    "Waterproof",
]

standardscale_columns = [
    "Compartments",
    "Weight Capacity (kg)"
]

encoders = {}


In [4]:
X = df.drop(columns=excluded_columns + [target_column])

for column in onehotencode_columns:
    encoders[column] = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoders[column].fit(df[column].values.reshape(-1, 1))

for column in ordinal_columns:
    encoders[column] = OrdinalEncoder(categories=[ordinal_columns[column]], handle_unknown='use_encoded_value', unknown_value=-1)
    encoders[column].fit(df[column].values.reshape(-1, 1))

for column in labelencode_columns:
    encoders[column] = LabelEncoder()
    encoders[column].fit(df[column].values)

for column in standardscale_columns:
    encoders[column] = StandardScaler()
    encoders[column].fit(df[column].values.reshape(-1, 1))


for column in onehotencode_columns:
    encoded = encoders[column].transform(X[column].values.reshape(-1, 1))
    encoded = pd.DataFrame(encoded, columns=encoders[column].get_feature_names_out())
    X = pd.concat([X, encoded], axis=1)
    X.drop(columns=[column], inplace=True)

for column in ordinal_columns:
    encoded = encoders[column].transform(X[column].values.reshape(-1, 1))
    encoded = pd.DataFrame(encoded, columns=[column])
    X = pd.concat([X, encoded], axis=1)
    X.drop(columns=[column], inplace=True)

for column in labelencode_columns:
    X[column] = encoders[column].transform(X[column].values)

for column in standardscale_columns:
    X[column] = encoders[column].transform(X[column].values.reshape(-1, 1))


# Additionally, label encode the target column
encoders[target_column] = StandardScaler()
encoders[target_column].fit(df[target_column].values.reshape(-1, 1))

y = encoders[target_column].transform(df[target_column].values.reshape(-1, 1))

X_train = X.values[:n_train]
X_test = X.values[n_train:]

y_train = y[:n_train]

In [5]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [6]:
model = XGBRegressor(n_estimators=1000, max_depth=5, learning_rate=0.01, n_jobs=-1)
model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True)

[0]	validation_0-rmse:0.99755
[1]	validation_0-rmse:0.99754
[2]	validation_0-rmse:0.99752
[3]	validation_0-rmse:0.99751
[4]	validation_0-rmse:0.99749
[5]	validation_0-rmse:0.99748
[6]	validation_0-rmse:0.99747
[7]	validation_0-rmse:0.99746
[8]	validation_0-rmse:0.99744
[9]	validation_0-rmse:0.99743
[10]	validation_0-rmse:0.99742
[11]	validation_0-rmse:0.99741
[12]	validation_0-rmse:0.99740
[13]	validation_0-rmse:0.99739
[14]	validation_0-rmse:0.99738
[15]	validation_0-rmse:0.99737
[16]	validation_0-rmse:0.99736
[17]	validation_0-rmse:0.99735
[18]	validation_0-rmse:0.99734
[19]	validation_0-rmse:0.99733
[20]	validation_0-rmse:0.99732
[21]	validation_0-rmse:0.99732
[22]	validation_0-rmse:0.99731
[23]	validation_0-rmse:0.99730
[24]	validation_0-rmse:0.99729
[25]	validation_0-rmse:0.99728
[26]	validation_0-rmse:0.99727
[27]	validation_0-rmse:0.99726
[28]	validation_0-rmse:0.99726
[29]	validation_0-rmse:0.99725
[30]	validation_0-rmse:0.99724
[31]	validation_0-rmse:0.99723
[32]	validation_0-

In [24]:
y_pred = model.predict(X_test)

y_pred = encoders[target_column].inverse_transform(y_pred.reshape(-1, 1))

df_test[target_column] = y_pred

df_test[['id', target_column]].to_csv('submission.csv', index=False)

In [26]:
features = X.columns
importances = model.feature_importances_

feature_importances = pd.DataFrame({'feature': features, 'importance': importances}).sort_values(by='importance', ascending=False)
feature_importances

Unnamed: 0,feature,importance
11,x0_Leather,0.056355
2,Waterproof,0.053377
4,x0_Adidas,0.049641
12,x0_Nylon,0.047824
3,Weight Capacity (kg),0.045094
22,x0_Green,0.042232
8,x0_Under Armour,0.040835
9,x0_nan,0.040236
19,x0_Black,0.03963
23,x0_Pink,0.037595


In [29]:
y_val_pred = model.predict(X_val)

np.mean((y_val_pred - y_val) ** 2) ** 0.5

: 

: 

In [8]:
model.predict(X_val) - y_val

KeyboardInterrupt: 