In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the two CSV splits from the local data/ directory into DataFrames.
train_path = "data/train.csv"
test_path = "data/test.csv"

train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

In [None]:
# Preview the first five rows of the training split.
train_data.head(n=5)

Let's first take a look at our target variable:

In [None]:
# Histogram of the target variable with a kernel density estimate on top.
sale_price = train_data["SalePrice"]
ax = sns.histplot(sale_price, kde=True)

It's clear that the distribution of SalePrice has a positive skew and is not exactly normal.

In [None]:
# One histogram per numeric column of the training data, on a fixed six-row grid.
numeric_columns = train_data.select_dtypes(include=[np.number]).columns
n_rows = 6
n_cols = int(np.ceil(len(numeric_columns) / n_rows))

# squeeze=False guarantees a 2-D array of axes whatever the grid size is.
fig, ax = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(20, 30), squeeze=False)

# ravel() walks the grid row by row, so the k-th column is drawn on
# row k // n_cols, column k % n_cols.
for panel, col_name in zip(ax.ravel(), numeric_columns):
    sns.histplot(train_data[col_name], kde=False, ax=panel)
    panel.set_title(col_name)

plt.tight_layout()
plt.show()

Well, many variables are not normally distributed. Also, many of them seem to be count variables with discrete values. Some have very strong skew and kurtosis, while others have zero-inflated distributions. These facts are important to understand before fitting a model.

Let's also have a look at correlation matrix plots to see if we can spot some obvious or interesting correlations. We'll use Spearman correlation since most variables are not normally distributed.

In [None]:
# Heatmap of the pairwise rank correlations between all numeric columns.
numeric_columns = train_data.select_dtypes(include=[np.number]).columns

# DataFrame.corr() defaults to Pearson; request Spearman explicitly because
# most of these variables are not normally distributed.
spearman_corr = train_data[numeric_columns].corr(method="spearman")

fig, ax = plt.subplots(figsize=(12, 9))
# Draw on the axes created above rather than relying on the implicit current axes.
sns.heatmap(spearman_corr, ax=ax)
ax.set_title("Spearman correlation between numeric variables")
plt.show()

Some variables seem to provide little extra information over others, for example, YearBuilt and GarageYrBlt. This means there is quite a lot of multicollinearity in the data. This is very relevant if we want to fit a linear model.

Let's finally take a look at the variables with the highest correlation with the target variable.

In [None]:
# Rank the numeric columns by their Spearman correlation with SalePrice and
# show the 20 strongest positive ones. SalePrice itself comes first, since its
# correlation with itself is 1.
# method="spearman" is passed explicitly: DataFrame.corr() defaults to Pearson.
saleprice_corr = train_data[numeric_columns].corr(method="spearman")["SalePrice"]
saleprice_corr.sort_values(ascending=False).head(20)

Overall quality, ground living area, year built, garage capacity and the number of full bathrooms are the variables most correlated with the target variable.

One critical issue for our sale price predictor is (potentially) missing data. Let's take a look at it:

In [None]:
# Stack the train and test rows into one frame so that cleaning steps are
# applied to both splits; the first n_train rows are the training rows.
n_train = len(train_data)
y_train = train_data["SalePrice"].values

# Both splits were read with a default 0-based index, so a plain concat would
# repeat row labels. ignore_index=True gives the stacked frame unique labels,
# which keeps label-based selection and index alignment unambiguous.
all_data = pd.concat((train_data, test_data), ignore_index=True)
# The target was saved in y_train above; keep only the features here.
all_data = all_data.drop(columns=["SalePrice"])

# Percentage of missing values per column, restricted to columns that have at
# least one missing value and sorted from most to least affected.
na_proportion = all_data.isnull().sum() / len(all_data) * 100
na_proportion = na_proportion[na_proportion != 0]
na_proportion = na_proportion.sort_values(ascending=False)
na_proportion = pd.DataFrame({"NA proportion": na_proportion})
na_proportion.head(50)

According to the data description, many of those NA values are actually "not present" values. Therefore we can replace them:

In [None]:
# Columns whose missing values are replaced by the literal category "None".
fillna_cols = [
    # pool, miscellaneous feature, alley, fence, fireplace
    "PoolQC", "MiscFeature", "Alley", "Fence", "FireplaceQu",
    # garage
    "GarageType", "GarageFinish", "GarageQual", "GarageCond",
    # basement
    "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
    # masonry veneer and building class
    "MasVnrType", "MSSubClass",
]

# Fill all listed columns in one vectorised call instead of column by column.
all_data[fillna_cols] = all_data[fillna_cols].fillna("None")

For the other variables, we'll use either the median/mean/mode or 0:

In [None]:
def _fill_with_own_median(values):
    """Return `values` with its missing entries replaced by its own median."""
    return values.fillna(values.median())


# LotFrontage: impute with the median LotFrontage of the same Neighborhood.
lot_frontage_by_neighborhood = all_data.groupby("Neighborhood")["LotFrontage"]
all_data["LotFrontage"] = lot_frontage_by_neighborhood.transform(_fill_with_own_median)

# GarageYrBlt: impute with the overall mean of the column.
garage_year_mean = all_data["GarageYrBlt"].mean()
all_data["GarageYrBlt"] = all_data["GarageYrBlt"].fillna(garage_year_mean)

We could fill GarageYrBlt with 0 since there's no garage and, in practice, the variable is not defined. However, that would zero-inflate the variable and potentially distort its distribution - especially since the actual years start no earlier than the 1800s, making 0 a very strong outlier.

In [None]:
# Distribution of MasVnrArea with a kernel density estimate; the returned
# Axes is bound to `_` so the cell shows only the figure.
masonry_veneer_area = all_data["MasVnrArea"]
_ = sns.histplot(masonry_veneer_area, kde=True)

MasVnrArea is already severely zero-inflated, so we'll fill the missing values with zero.

In [None]:
# Replace missing MasVnrArea values with zero.
masonry_veneer_area = all_data["MasVnrArea"]
all_data["MasVnrArea"] = masonry_veneer_area.fillna(value=0)

In [None]:
# Columns whose missing values are replaced by the column's most frequent value.
mode_filled_cols = [
    "BsmtFullBath",
    "Utilities",
    "Functional",
    "MSZoning",
    "Exterior1st",
    "Exterior2nd",
    "Electrical",
    "KitchenQual",
    "SaleType",
]

for col in mode_filled_cols:
    # mode() returns every tied value in sorted order; [0] takes the first.
    most_frequent = all_data[col].mode()[0]
    all_data[col] = all_data[col].fillna(most_frequent)

In [None]:
# Garage and basement measurements whose missing values are replaced by zero.
zero_filled_cols = [
    "GarageArea", "GarageCars",
    "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF",
    "BsmtFullBath", "BsmtHalfBath",
]

# Fill all listed columns in one vectorised call.
all_data[zero_filled_cols] = all_data[zero_filled_cols].fillna(0)

In [None]:
# Recompute the percentage of missing values per column to check what is left
# after imputation. Columns without missing values are left out, so an empty
# table means nothing is missing.
missing_pct = all_data.isnull().sum() / len(all_data) * 100
na_proportion = (
    missing_pct[missing_pct != 0]
    .sort_values(ascending=False)
    .to_frame(name="NA proportion")
)
na_proportion.head(50)

Many categorical variables are ordinal, i.e. their levels have a natural order. Let's encode them so that this order is preserved:

In [None]:
from category_encoders import OrdinalEncoder

# Level orderings shared by several columns. Each level is encoded as its
# position in the list, so the first level listed becomes 0.
quality_levels = ["Ex", "Gd", "TA", "Fa", "Po", "None"]
basement_finish_levels = ["GLQ", "ALQ", "BLQ", "Rec", "LwQ", "Unf", "None"]

# Column name -> ordered levels. Insertion order is the order in which the
# mappings are handed to the encoder.
ordered_levels = {
    "FireplaceQu": quality_levels,
    "BsmtQual": quality_levels,
    "BsmtCond": quality_levels,
    "GarageQual": quality_levels,
    "GarageCond": quality_levels,
    "ExterQual": quality_levels,
    "ExterCond": quality_levels,
    "HeatingQC": quality_levels,
    "PoolQC": quality_levels,
    "KitchenQual": quality_levels,
    "BsmtFinType1": basement_finish_levels,
    "BsmtFinType2": basement_finish_levels,
    "Functional": ["Typ", "Min1", "Min2", "Mod", "Maj1", "Maj2", "Sev", "Sal"],
    "Fence": ["GdPrv", "MnPrv", "GdWo", "MnWw", "None"],
    "BsmtExposure": ["Gd", "Av", "Mn", "No", "None"],
    "GarageFinish": ["Fin", "RFn", "Unf", "None"],
    "LandSlope": ["Gtl", "Mod", "Sev"],
    "LotShape": ["Reg", "IR1", "IR2", "IR3"],
}

# One {"col": ..., "mapping": {level: rank}} entry per column. A fresh dict is
# built for every column, so columns sharing a level list do not share a mapping.
col_mapping = [
    {
        "col": column,
        "mapping": {level: rank for rank, level in enumerate(levels)},
    }
    for column, levels in ordered_levels.items()
]

ordinal_encoder = OrdinalEncoder(mapping=col_mapping)

ordinal_encoder.fit(all_data)
all_data_encoded = ordinal_encoder.transform(all_data)