In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Load the train and test splits.
train_data = pd.read_csv("./data/train.csv")
test_data = pd.read_csv("./data/test.csv")

# Detach the target so that train_data only holds feature columns.
y_train = train_data.pop("price")

# Stack both splits so exploration and feature engineering see every row.
# Train rows come first, so positions [0, n_train) always map back to train_data.
# ignore_index=True gives the stacked frame unique row labels; without it the
# test rows would reuse the labels 0..len(test_data)-1 already used by the
# train rows, making any label-based operation on all_data ambiguous.
n_train = len(train_data)
all_data = pd.concat([train_data, test_data], axis=0, ignore_index=True)


In [None]:
# Preview the first five rows of the stacked train + test frame.
all_data.head(n=5)

In [None]:
# Histogram of the raw target with a kernel density estimate on top.
ax = sns.histplot(data=y_train, kde=True)

In [None]:
from scipy import stats

# Work with log(1 + price) for the remaining exploration.
y_train_log = np.log1p(y_train)

# Histogram of the transformed target with a kernel density estimate on top.
sns.histplot(data=y_train_log, kde=True)
plt.show()

# Probability plot of the transformed target against a normal distribution;
# probplot draws through the pyplot module handed over as `plot`.
ax = stats.probplot(x=y_train_log, dist="norm", plot=plt)
plt.show()


In [None]:
# Histogram of every numeric feature (all rows of all_data) on a 3-row grid.
numeric_columns = all_data.select_dtypes(include=[np.number]).columns.drop("id")
n_cols = (len(numeric_columns) + 2) // 3  # ceil(len / 3) with integer arithmetic

fig, ax = plt.subplots(nrows=3, ncols=n_cols, figsize=(10, 10), squeeze=False)
# ax.flat walks the grid row by row, so panels fill in the order of numeric_columns.
for col_name, panel in zip(numeric_columns, ax.flat):
    sns.histplot(all_data[col_name], kde=False, ax=panel)
    panel.set_title(col_name)
plt.tight_layout()
plt.show()


In [None]:
# Box plot of every numeric training feature on a 2-row grid, split by whether
# the log-price lies above its median.
numeric_columns = all_data.select_dtypes(include=[np.number]).columns.drop("id")
n_cols = (len(numeric_columns) + 1) // 2  # ceil(len / 2) with integer arithmetic

# 1 where the log-price is above the median log-price, 0 otherwise.
above_median = (y_train_log > y_train_log.median()).astype(int)

fig, ax = plt.subplots(nrows=2, ncols=n_cols, figsize=(8, 8), squeeze=False)
# ax.flat walks the grid row by row, so panels fill in the order of numeric_columns.
for col_name, panel in zip(numeric_columns, ax.flat):
    sns.boxplot(y=train_data[col_name], x=above_median, ax=panel)
    panel.set_title(col_name)
plt.tight_layout()
plt.show()


In [None]:
# Display order of the levels of each categorical feature
# (presumably best grade first -- TODO confirm against the data description).
orders = {
    "cut": ["Ideal", "Premium", "Very Good", "Good", "Fair"],
    "color": ["D", "E", "F", "G", "H", "I", "J"],
    "clarity": ["FL", "IF", "VVS1", "VVS2", "VS1", "VS2", "SI1", "SI2", "I1", "I2", "I3"],
}

categorical_columns = all_data.select_dtypes(include="object").columns

# One panel per categorical feature: price distribution for each level.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(14, 4), squeeze=True)
for ax, col_name in zip(axes, categorical_columns):
    sns.boxplot(x=train_data[col_name], y=y_train, order=orders[col_name], ax=ax)
    ax.set_title(col_name)
    ax.set_axis_on()

plt.tight_layout()
plt.show()


Interestingly, worse cuts, colors and clarities tend to show higher prices, which is counterintuitive. This is likely explained by interactions with other variables: for instance, gemstones with good clarity may be smaller on average, which drives their price down.

In [None]:
carat = train_data["carat"]

# Hex-binned joint distribution of carat and log-price.
ax = sns.jointplot(x=carat, y=y_train_log, kind="hex")
plt.show()

# Same pair of variables, with points coloured by clarity level.
ax = sns.jointplot(
    x=carat,
    y=y_train_log,
    hue=train_data["clarity"],
    hue_order=orders["clarity"],
    alpha=0.5,
)


In [None]:
# Size proxy: product of the three dimensions x, y and z.
# (A bare `all_data["volume"].round(2)` used to follow this line; its result was
# neither assigned nor displayed, so it had no effect and has been removed.)
all_data["volume"] = all_data["x"] * all_data["y"] * all_data["z"]

# The first n_train rows of all_data are the training rows, which line up with
# y_train_log.
train_rows = all_data.iloc[:n_train]

# Joint distribution of volume and log-price, with points coloured by clarity level.
ax = sns.jointplot(
    x=train_rows["volume"],
    y=y_train_log,
    hue=train_rows["clarity"],
    hue_order=orders["clarity"],
    alpha=0.5,
)

For the same volume or carat, gemstones with worse clarity tend to have lower prices.

Let's encode the ordinal variables and add new variables derived from the existing ones. See https://www.kaggle.com/competitions/playground-series-s3e8/discussion/389207 for more details about the new variables.

In [None]:
# Feature ideas taken from:
# https://www.kaggle.com/competitions/playground-series-s3e8/discussion/389207

# Ordinal encoding: every label becomes its position in the matching `orders` list.
orders_dict = {
    k: {vv: i for i, vv in enumerate(v)}
    for k, v in orders.items()
}

for col_name, mapping in orders_dict.items():
    # Columns that are already numeric were encoded by a previous run of this
    # cell; skipping them makes the cell safe to re-run.
    if pd.api.types.is_numeric_dtype(all_data[col_name]):
        continue
    encoded = all_data[col_name].map(mapping)
    unmapped = encoded.isna().to_numpy()
    if unmapped.any():
        # Fail loudly, as a plain dict lookup would, instead of leaving NaNs behind.
        unknown = all_data[col_name][unmapped].unique().tolist()
        raise KeyError(f"unexpected {col_name!r} labels: {unknown}")
    all_data[col_name] = encoded

# Geometric features built from the three dimensions. Every aggregate below
# (min, max, mean, sum) is taken over all rows of all_data.
length, width, height = all_data["x"], all_data["y"], all_data["z"]
dim_total = length + width + height

all_data["volume"] = length * width * height
all_data["surface_area"] = 2 * (length * width + width * height + height * length)
# Disabled features, kept for reference:
# all_data["aspect_ratio_xy"] = all_data["x"] / all_data["y"]
# all_data["aspect_ratio_yz"] = all_data["y"] / all_data["z"]
# all_data["aspect_ratio_zx"] = all_data["z"] / all_data["x"]
all_data["diagonal_distance"] = np.sqrt(length ** 2 + width ** 2 + height ** 2)
# Min-max scaling of z.
all_data["relative_height"] = (height - height.min()) / (height.max() - height.min())
# Share of each row in the grand total of x + y + z.
all_data["relative_position"] = dim_total / dim_total.sum()
# Each quantity relative to its column mean(s).
all_data["volume_ratio"] = all_data["volume"] / (length.mean() * width.mean() * height.mean())
all_data["length_ratio"] = length / length.mean()
all_data["width_ratio"] = width / width.mean()
all_data["height_ratio"] = height / height.mean()
# NOTE(review): 1.4641 presumably stands in for pi ** (1 / 3) (about 1.4646) from
# the sphericity formula pi^(1/3) * (6V)^(2/3) / A -- confirm against the linked
# discussion. The value is kept unchanged; 1e-4 keeps the division finite when
# the surface area is zero.
all_data["sphericity"] = 1.4641 * (6 * all_data["volume"]) ** (2 / 3) / (1e-4 + all_data["surface_area"])
# all_data["compactness"] = all_data["volume"]**(1/3) / all_data["x"]

In [None]:
import multiprocessing

import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

def rmse(y, pred):
    """Return the root mean squared error between targets ``y`` and predictions ``pred``."""
    mse = mean_squared_error(y, pred)
    return np.sqrt(mse)


# Grid search over tree depth and number of boosting rounds.
# Each XGBoost fit gets half of the cores and GridSearchCV runs two fits at a time.
model = xgb.XGBRegressor(n_jobs=multiprocessing.cpu_count() // 2, eval_metric=rmse)
clf = GridSearchCV(
    model,
    {"max_depth": [3, 4], "n_estimators": [1_000, 2_500, 5_000, 10_000]},
    verbose=1,
    n_jobs=2,
    scoring="neg_root_mean_squared_error",
)

# The first n_train rows of all_data are the training rows. "id" is presumably a
# row identifier rather than a property of the stone (the plotting cells already
# exclude it), so it is dropped from the features instead of being fed to the model.
X_train = all_data.iloc[:n_train].drop(columns="id")
clf.fit(X_train, y_train)

# With the "neg_root_mean_squared_error" scorer, best_score_ is the negated RMSE:
# values closer to zero are better.
print(clf.best_score_)
print(clf.best_params_)