In [None]:
#clear the interactive namespace (-f skips the confirmation prompt) so the notebook starts from a clean state
%reset -f
#render matplotlib figures inline, below the cell that creates them
%matplotlib inline

In [None]:
import calendar
import numpy as np
import pandas as pd
import seaborn as sns
import sebaba.ml as sbbml
import sebaba.utils as utils
import matplotlib.pyplot as plt
import matplotlib.ticker as tkr
from scipy.stats import norm
from missingpy import KNNImputer

In [None]:
#show floats with four decimals whenever pandas renders a frame or series
pd.set_option("display.float_format", "{:.4f}".format)

In [None]:
#pandas >= 2.0 added the literal string "None" to read_csv's default missing-value tokens;
#this dataset presumably uses "None" as a real category (e.g. MasVnrType) - confirm against the data description.
#pin the tokens explicitly so a later dropna() does not discard those rows on newer pandas versions
na_tokens = [
    "", "#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", "-nan",
    "1.#IND", "1.#QNAN", "<NA>", "N/A", "NA", "NULL", "NaN", "n/a", "nan", "null"
]

df = pd.read_csv("data/house-prices/train.csv", sep = ",", keep_default_na = False, na_values = na_tokens)
#the Id column is dropped before any analysis
df = df.drop("Id", axis = 1)

In [None]:
#inspect the missing values of the raw frame before any cleaning
#(helper from sebaba.utils; presumably reports the share of missing values per variable - TODO confirm)
utils.missing_var_pct(df)

In [None]:
#presumably drops the variables whose share of missing values exceeds the 0.8 threshold
#TODO confirm the exact comparison against sebaba.utils.drop_missing_var
df = utils.drop_missing_var(df, threshold = 0.8)

In [None]:
#remove the GarageYrBlt variable, then discard every row that lacks
#a value in any of MasVnrType, MasVnrArea or Electrical
df = (
    df
    .drop(columns = ["GarageYrBlt"])
    .dropna(subset = ["MasVnrType", "MasVnrArea", "Electrical"], how = "any")
)

In [None]:
#for these variables a missing value is replaced by an explicit placeholder category
basement = ["BsmtFinType2", "BsmtExposure", "BsmtFinType1", "BsmtCond", "BsmtQual"]
garage   = ["GarageType", "GarageFinish", "GarageQual", "GarageCond"]

for columns, placeholder in (
    (["FireplaceQu"], "NoFirePlace"),
    (basement, "NoBasement"),
    (garage, "NoGarage"),
):
    df[columns] = df[columns].fillna(placeholder)

In [None]:
#rows missing MasVnrType / MasVnrArea / Electrical are already removed by an identical dropna earlier in the notebook,
#so repeating it here was a no-op; verify the invariant instead of silently re-running the same statement
required_vars = ["MasVnrType", "MasVnrArea", "Electrical"]
assert df[required_vars].notna().all().all(), "unexpected missing values in MasVnrType/MasVnrArea/Electrical"

In [None]:
#impute the missing LotFrontage values with a distance-weighted 5-nearest-neighbours imputer
#NOTE(review): the imputer is only given LotFrontage itself, so there is no other feature
#to measure neighbour distance on - confirm that single-column imputation is intended
imputer = KNNImputer(n_neighbors = 5, weights = "distance", metric = "masked_euclidean")

#the imputer works on a 2-D (n_samples, 1) array; flatten the result explicitly
#so that a plain 1-D vector is assigned back to the column
lot_frontage      = df[["LotFrontage"]].values
df["LotFrontage"] = imputer.fit_transform(lot_frontage).ravel()

In [None]:
#re-run the missing value report now that variables were dropped, filled and imputed
utils.missing_var_pct(df)

In [None]:
#changing numeric variables to categorical
#MSSubClass codes become "SC<code>" labels, MoSold month numbers become abbreviated month names
df.MSSubClass = "SC" + df.MSSubClass.astype(str)
df.MoSold     = df.MoSold.map(lambda month: calendar.month_abbr[month])

In [None]:
#converting cat variables to an interval scale as they are ordinal in nature
#the scales shared by several variables are written once and reused below
quality_scale  = {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
garage_quality = {"NoGarage": 0, **quality_scale}
bsmt_quality   = {"NoBasement": 0, **quality_scale}
bsmt_finish    = {"NoBasement": 0, "Unf": 1, "LwQ": 2, "Rec": 3, "BLQ": 4, "ALQ": 5, "GLQ": 6}

ordinal_maps = {
    "GarageFinish": {"NoGarage": 0, "Unf": 1, "RFn": 2, "Fin": 3},
    "GarageQual"  : garage_quality,
    "GarageCond"  : garage_quality,
    "BsmtQual"    : bsmt_quality,
    "BsmtCond"    : bsmt_quality,
    "BsmtExposure": {"NoBasement": 0, "No": 1, "Mn": 2, "Av": 3, "Gd": 4},
    "BsmtFinType1": bsmt_finish,
    "BsmtFinType2": bsmt_finish,
    "ExterQual"   : quality_scale,
    "ExterCond"   : quality_scale,
    "HeatingQC"   : quality_scale,
    "KitchenQual" : quality_scale,
    "FireplaceQu" : {"NoFirePlace": 0, **quality_scale}
}
df = df.replace(ordinal_maps)

#creating a list of our ordinal variables
ordinal_vars = [
    "ExterQual", "ExterCond", "BsmtQual", "BsmtCond", "BsmtExposure",
    "BsmtFinType1", "BsmtFinType2", "HeatingQC", "KitchenQual",
    "FireplaceQu", "GarageFinish", "GarageQual", "GarageCond"
]

In [None]:
#changing features to their correct data types
#whether replace() turns a fully mapped column into an integer column depends on the pandas version,
#so every ordinal variable is cast explicitly instead of only a hand-picked few;
#a leftover unmapped label makes the cast fail loudly rather than leave an object column behind
df = df.astype({var: "int64" for var in ordinal_vars})

In [None]:
#one hot encoding is currently disabled:
#df = pd.get_dummies(df)

#swap the SalePrice variable for its natural logarithm, stored as LogSalePrice
df = (
    df
    .assign(LogSalePrice = np.log(df.SalePrice))
    .drop(columns = ["SalePrice"])
)

In [None]:
#sns.distplot is deprecated (seaborn 0.11) and scheduled for removal, so the same figure
#(density histogram plus a fitted normal curve) is drawn with matplotlib and scipy directly
log_price = df.LogSalePrice.values
mu, sigma = norm.fit(log_price)
grid      = np.linspace(log_price.min(), log_price.max(), 200)

fig, ax = plt.subplots(figsize = (10.0, 6.5))
#density = True puts the histogram on the same scale as the probability density curve
ax.hist(log_price, bins = "fd", density = True, color = "darkblue", alpha = 0.4, edgecolor = "silver")
ax.plot(grid, norm.pdf(grid, mu, sigma), color = "#282828")
ax.set_title("Distribution of LogSalePrice", fontsize = 20)
ax.set_xlabel("LogSalePrice", fontsize = 18)
ax.margins(0.05)
ax.axis("tight")
ax.grid(True)
fig.tight_layout()

plt.show()

In [None]:
#split the frame into a numerical part and a categorical part (helper from sebaba.utils)
#presumably the split is made on column dtype - TODO confirm
num_df, cat_df = utils.split_numerical_categorical(df)

In [None]:
#pairwise Pearson correlation between all numerical variables
corr = num_df.corr(method = "pearson")

In [None]:
#generate a mask for the upper triangle (diagonal included) so every pair is drawn only once
#the builtin bool is used because the np.bool alias was removed from NumPy (1.24) and raises AttributeError there
mask = np.triu(np.ones_like(corr, dtype = bool))

cmap = sns.diverging_palette(220, 10, as_cmap = True)

fig, ax = plt.subplots(figsize = (20, 16))
#draw on the axes created above instead of relying on the implicit current axes
ax = sns.heatmap(corr, mask = mask, cmap = cmap, fmt = ".2f", vmin = -1, vmax = 1.0, center = 0, square = True, linewidths = .5, cbar_kws = {"shrink": .7}, annot = True, annot_kws = {"size": 8}, ax = ax)
ax.set_title("Correlation Heatmap", fontsize = 20)
ax.margins(0.05)
ax.axis("tight")
ax.grid(False)
fig.tight_layout()

plt.show()

In [None]:
#target vector: LogSalePrice; feature matrix: every other numerical variable
y = num_df["LogSalePrice"].values
x = num_df.drop(columns = ["LogSalePrice"]).values

In [None]:
#80/20 train/test split
#the seed is fixed so the split is reproducible and consistent with the seeded split used for the PCA search later on
x_train, x_test, y_train, y_test = utils.split_train_test(x, y, prop_train = 80.0, seed = 0)

In [None]:
#grid search over alpha: fit one model per value and keep the one with the lowest RMSE
#NOTE(review): the RMSE is measured on x_test / y_test, so alpha is selected on the test split - confirm this is intended
opt_model  = None
min_rmse   = np.inf
alpha_list = np.arange(0, 10, step = 0.01)

for alpha in alpha_list:
    model = sbbml.LinearRegression(alpha, iterations = 10000, normalize = True)
    model.fit(x_train, y_train)

    y_prime = model.predict(x_test)
    rmse    = utils.root_mean_squared_error(y_prime, y_test)

    #skip anything that does not strictly improve on the best score so far (a NaN score never does)
    if not rmse < min_rmse:
        continue

    print(f"rmse: {rmse: <20} \t alpha: {alpha}")
    min_rmse, opt_model = rmse, model

In [None]:
#plot the cost values stored on the best model found by the alpha search above
#(opt_model stays None if no candidate improved on the initial infinite RMSE, in which case this line fails)
utils.plot_cost_function(cost = opt_model.cost, width = 10.0, height = 6.5)

In [None]:
#joint grid search over the number of principal components and alpha;
#the model with the lowest RMSE is kept together with its number of components
#NOTE(review): the PCA is fitted on the full x before the split and the RMSE is measured on the test split,
#so both the projection and the selection see test rows - confirm this is intended
opt_model  = None
opt_n_comp = None
min_rmse   = np.inf

alpha_list = np.arange(0, 10, step = 0.01)
#np.linspace(0, 10, num = 10, dtype = int) truncates to 0..8 and 10 and silently skips 9;
#an integer range gives the intended grid 0..10 exactly
ncomp_list = np.arange(0, 11)

#only candidates with more than 3 components are evaluated
for n_comp in ncomp_list[ncomp_list > 3]:
    pca = sbbml.PCA(n_comp)
    pca.fit(x)
    x_reduced = pca.transform(x)
    #note: this overwrites the x_train / x_test / y_train / y_test created by the earlier split cell
    x_train, x_test, y_train, y_test = utils.split_train_test(x_reduced, y, prop_train = 80.0, seed = 0)

    for alpha in alpha_list:
        model = sbbml.LinearRegression(alpha, iterations = 10000, normalize = True)
        model.fit(x_train, y_train)

        y_prime = model.predict(x_test)
        rmse    = utils.root_mean_squared_error(y_prime, y_test)

        if rmse < min_rmse:
            print(f"rmse: {rmse: <20} \t alpha: {alpha: <20} \t n_comp: {n_comp}")
            min_rmse   = rmse
            opt_model  = model
            opt_n_comp = n_comp

In [None]:
#plot the cost values stored on the best model found by the PCA + alpha search above
#(opt_model stays None if no candidate improved on the initial infinite RMSE, in which case this line fails)
utils.plot_cost_function(cost = opt_model.cost, width = 10.0, height = 6.5)