In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

from xgboost import XGBRegressor

In [2]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

# We concatenate the training and test sets to perform the same transformations on both datasets. If this wasn't the case then the LabelEncoder would have to be refitted on the test set. This might sometimes work, but often we see new categories of data in the test set that we still want to encode.
df_concat = pd.concat([df_train, df_test])

In [3]:
# 1 - Seperate features and target
X = df_concat.drop(["SalePrice", "Id"], axis=1)
y = df_concat["SalePrice"]

# 2 - Encode categorical features
categorical_columns = X.select_dtypes(include=["object"]).columns
numerical_columns = X.select_dtypes(include=["int64", "float64"]).columns

# Label encode the categorical columns
label_encoders = {}
for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    X[col] = X[col].fillna("missing")  # Fill missing values with 'missing'
    X[col] = label_encoders[col].fit_transform(X[col])  # Label encode the column

imputer = SimpleImputer(strategy="mean")
X[numerical_columns] = imputer.fit_transform(X[numerical_columns])

# 3 - Normalise features and target
X_scaler = StandardScaler()
y_scaler = StandardScaler()

X = pd.DataFrame(X_scaler.fit_transform(X), columns=X.columns)
y = pd.Series(y_scaler.fit_transform(y.values.reshape(-1, 1)).ravel())


In [4]:
X, X_test = X.iloc[:df_train.shape[0]], X.iloc[df_train.shape[0]:]
y, y_test = y.iloc[:df_train.shape[0]], y.iloc[df_train.shape[0]:]

In [5]:
# You can see that we haven't magically created a new set of y values for the test set.

print(y.head(10))
print()
print(y_test.head(10))

0    0.347273
1    0.007288
2    0.536154
3   -0.515281
4    0.869843
5   -0.477505
6    1.587588
7    0.240241
8   -0.642461
9   -0.792306
dtype: float64

1460   NaN
1461   NaN
1462   NaN
1463   NaN
1464   NaN
1465   NaN
1466   NaN
1467   NaN
1468   NaN
1469   NaN
dtype: float64


In [6]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((1168, 79), (292, 79), (1168,), (292,))

In [8]:
model = XGBRegressor(
    learning_rate = 0.01,
    max_depth=5,
    n_estimators=1000,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    objective="reg:squarederror"
)

model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=50,
)

[0]	validation_0-rmse:1.09496
[50]	validation_0-rmse:0.78567
[100]	validation_0-rmse:0.59598
[150]	validation_0-rmse:0.48295
[200]	validation_0-rmse:0.41601
[250]	validation_0-rmse:0.37607
[300]	validation_0-rmse:0.35110
[350]	validation_0-rmse:0.33914
[400]	validation_0-rmse:0.33160
[450]	validation_0-rmse:0.32602
[500]	validation_0-rmse:0.32233
[550]	validation_0-rmse:0.31946
[600]	validation_0-rmse:0.31827
[650]	validation_0-rmse:0.31695
[700]	validation_0-rmse:0.31578
[750]	validation_0-rmse:0.31468
[800]	validation_0-rmse:0.31421
[850]	validation_0-rmse:0.31355
[900]	validation_0-rmse:0.31284
[950]	validation_0-rmse:0.31228
[999]	validation_0-rmse:0.31166


In [9]:
y_pred = pd.Series(model.predict(X_val))

y_val_actual = y_scaler.inverse_transform(y_val.values.reshape(-1, 1))
y_pred_actual = y_scaler.inverse_transform(y_pred.values.reshape(-1,1))

In [10]:
df = pd.DataFrame({
    "y_val_actual": y_val_actual[:,0],
    "y_pred_actual": y_pred_actual[:,0]
})
df.head()

Unnamed: 0,y_val_actual,y_pred_actual
0,154500.0,141652.328125
1,325000.0,338284.71875
2,115000.0,112501.429688
3,159000.0,164046.78125
4,315500.0,331366.34375


In [11]:
y_test_pred = pd.Series(model.predict(X_test))
y_submission = y_scaler.inverse_transform(y_test_pred.values.reshape(-1,1))

df_submission = pd.DataFrame({
    "Id": df_test["Id"],
    "SalePrice": y_submission[:,0]
})
df_submission.to_csv("submission.csv", index=False)