In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Move one directory up so that the relative 'data/...' path and the `lib.*`
# imports used further down resolve (presumably the project root -- confirm).
# NOTE: the path is relative, so re-running this cell moves up one more level.
os.chdir("../")

In [None]:
# Colour used for every text element, and the colours of the default palette.
TEXT_COLOR = '#313131'
LINE_COLORS = ['#00A082', '#F2CC38', '#9B59B6', '#3498DB', '#F39C12']

# Matplotlib rc overrides handed to seaborn: figure geometry, white
# backgrounds, font family, and a single colour for labels and ticks.
rc_overrides = {
    'figure.figsize': (6, 4),
    'figure.dpi': 150,
    'figure.facecolor': 'w',
    'legend.facecolor': 'w',
    'text.color': TEXT_COLOR,
    'font.family': 'Microsoft Sans Serif',  # alternative: 'Open Sans'
    'axes.labelcolor': TEXT_COLOR,
    'xtick.color': TEXT_COLOR,
    'ytick.color': TEXT_COLOR,
}
sns.set(style='darkgrid', rc=rc_overrides)

# Make the custom colours the default plotting palette.
line_palette = sns.color_palette(LINE_COLORS)
sns.set_palette(line_palette)

# 1. Load the data

In [None]:
# Load train.csv from the house-prices dataset folder. The path is relative,
# so it depends on the current working directory.
data = pd.read_csv('data/house-prices-dataset/train.csv')

In [None]:
# Remove the outlier: the single row whose index label is 1298.
# `index=` already targets row labels, so no `axis` argument is needed.
# Re-running this cell raises KeyError because the label is gone by then.
data = data.drop(index=1298)

In [None]:
# Hand-picked columns the author considers the strongest predictors,
# followed by the target column 'SalePrice'.
cols = [
    'OverallQual', 'GrLivArea', 'ExterQual', 'GarageCars',
    'YearBuilt', 'YearRemodAdd', 'TotRmsAbvGrd', 'Foundation',
    'Fireplaces', 'FireplaceQu', 'HeatingQC',
    'SalePrice',
]

In [None]:
# Keep only the selected columns. `.copy()` makes `data` an independent frame,
# so the column assignments in the following cells do not write into a slice
# of the full dataset (which triggers pandas' SettingWithCopyWarning).
data = data[cols].copy()

In [None]:
def col_to_dummies(df, col):
    """Replace a categorical column with one-hot (dummy) columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col``. It is not modified.
    col : str
        Name of the column to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``col`` has been removed and its dummy columns,
        prefixed with ``col``, have been appended. The first category is
        dropped (``drop_first=True``).
    """
    # Operate on the `df` argument rather than the notebook-level `data`
    # variable, so the function works for any frame passed in.
    dummies = pd.get_dummies(df[col], prefix=col, drop_first=True)
    return pd.concat([df, dummies], axis=1).drop(col, axis=1)

In [None]:
# Ordinal encoding of fireplace quality: 'Po' -> 1 up to 'Ex' -> 5.
# Missing values map to 0 (presumably "no fireplace" -- confirm).
fireplace_quality_codes = {np.nan: 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
data['FireplaceQu'] = data['FireplaceQu'].map(fireplace_quality_codes)

In [None]:
# Ordinal encoding of heating quality: 'Po' -> 0 up to 'Ex' -> 4.
# Any value not listed here becomes NaN.
heating_quality_codes = {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4}
data['HeatingQC'] = data['HeatingQC'].map(heating_quality_codes)

In [None]:
# Ordinal encoding of exterior quality: 'Po' -> 0 up to 'Ex' -> 4.
# Any value not listed here becomes NaN.
exterior_quality_codes = {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4}
data['ExterQual'] = data['ExterQual'].map(exterior_quality_codes)

In [None]:
# One-hot encode the 'Foundation' column via col_to_dummies and rebind `data`
# to the returned frame.
data = col_to_dummies(data, 'Foundation')

In [None]:
# Feature names: every column of `data` except the target 'SalePrice'.
x_cols = list(data.columns.drop("SalePrice"))

In [None]:
# Feature matrix, taken as an independent copy of the feature columns.
X = data[x_cols].copy()

In [None]:
# Target vector, taken as an independent copy of the 'SalePrice' column.
y = data["SalePrice"].copy()

# 2. Apply k-means and PCA in cross-validation

In [None]:
from lib.clustering.k_means import apply_k_means
from lib.preprocessing.pca import apply_pca
from lib.preprocessing.scale import standard_scale
from lib.evaluation.cross_validate import perform_cross_validation
from lib.evaluation.regression import evaluate_regression
from sklearn.ensemble import RandomForestRegressor

In [None]:
def preprocess(X_train, X_valid, y_train, y_valid, kmeans=True, pca=True, target_log=True):
    """Preprocess one train/validation split.

    Steps, in order: convert all four inputs to NumPy arrays via ``.values``,
    standard-scale the features, optionally apply PCA, optionally apply
    k-means, and optionally log-transform the training target.

    Every helper receives the training and validation features together and
    returns both. Presumably each one is fitted on the training part only,
    which is what would prevent data leakage -- confirm in ``lib``.

    Parameters
    ----------
    X_train, X_valid : objects exposing ``.values`` and ``.shape``
        Feature tables (e.g. pandas DataFrames).
    y_train, y_valid : objects exposing ``.values``
        Targets (e.g. pandas Series).
    kmeans : bool, default True
        Call ``apply_k_means`` when truthy.
    pca : bool, default True
        Call ``apply_pca`` when truthy.
    target_log : bool, default True
        Replace ``y_train`` with ``np.log(y_train)`` when truthy.
        ``y_valid`` is returned on its original scale.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_valid, y_valid)``. Note that this order
        differs from the parameter order.
    """
    # Number of input feature columns, captured before any transform.
    ncols = X_train.shape[1]

    X_train, X_valid = X_train.values, X_valid.values
    y_train, y_valid = y_train.values, y_valid.values

    X_train, X_valid = standard_scale(X_train, X_valid)

    # Flags are tested for truthiness (not `is True`) so that values such as
    # numpy booleans are honoured, consistent with `target_log` below.
    if pca:
        X_train, X_valid = apply_pca(X_train, X_valid, ncols=ncols)

    if kmeans:
        X_train, X_valid = apply_k_means(X_train, X_valid, ncols=ncols)

    if target_log:
        # NOTE(review): only the training target is log-transformed; presumably
        # the evaluation code maps predictions back to the original scale --
        # confirm against perform_cross_validation / evaluate_regression.
        y_train = np.log(y_train)

    return X_train, y_train, X_valid, y_valid

In [None]:
# All four on/off combinations of the k-means and PCA preprocessing steps,
# in the order: neither, k-means only, PCA only, both.
cv_options = [
    {"kmeans": use_kmeans, "pca": use_pca}
    for use_pca in (False, True)
    for use_kmeans in (False, True)
]

Apply to a NON-linear model

In [None]:
# Random forest with 500 trees and max_features=0.5; the fixed random_state
# keeps results repeatable between runs.
model = RandomForestRegressor(n_estimators=500, max_features=0.5, random_state=42)

In [None]:
# Cross-validate `model` once per preprocessing configuration and collect one
# row of metrics per configuration. The configuration is forwarded as keyword
# arguments (presumably on to `preprocess` -- confirm in lib.evaluation).
frames = []

for kwargs in cv_options:
    print(kwargs)  # progress marker: which configuration is running
    metrics = perform_cross_validation(
        X,
        y,
        model,
        preprocess_func=preprocess,
        eval_func=evaluate_regression,
        cv_folds=10,
        **kwargs,
    )
    # Label the row with the configuration so it is identifiable later.
    result = pd.DataFrame(data=metrics, index=[str(kwargs)])
    frames.append(result)

# Concatenate once instead of growing `results` inside the loop, which
# re-copies the accumulated rows on every iteration and starts from an empty
# frame.
results = pd.concat(frames, axis=0) if frames else pd.DataFrame()

In [None]:
# Annotated heatmap of the metrics table, with the colour scale pinned to
# [0, 40000] and a reversed red-yellow-green colormap.
heatmap_ax = sns.heatmap(
    data=results,
    vmin=0,
    vmax=40000,
    annot=True,
    fmt='g',
    cmap='RdYlGn_r',
)
heatmap_ax.set_title("Metric performance")

Apply to a linear model

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Rebind `model` to a linear regression with default settings; the
# cross-validation loop below uses this instead of the random forest.
model = LinearRegression()

In [None]:
# Cross-validate `model` once per preprocessing configuration and collect one
# row of metrics per configuration. The configuration is forwarded as keyword
# arguments (presumably on to `preprocess` -- confirm in lib.evaluation).
frames = []

for kwargs in cv_options:
    print(kwargs)  # progress marker: which configuration is running
    metrics = perform_cross_validation(
        X,
        y,
        model,
        preprocess_func=preprocess,
        eval_func=evaluate_regression,
        cv_folds=10,
        **kwargs,
    )
    # Label the row with the configuration so it is identifiable later.
    result = pd.DataFrame(data=metrics, index=[str(kwargs)])
    frames.append(result)

# Concatenate once instead of growing `results` inside the loop, which
# re-copies the accumulated rows on every iteration and starts from an empty
# frame.
results = pd.concat(frames, axis=0) if frames else pd.DataFrame()

In [None]:
# Annotated heatmap of the metrics table, with the colour scale pinned to
# [0, 40000] and a reversed red-yellow-green colormap.
heatmap_ax = sns.heatmap(
    data=results,
    vmin=0,
    vmax=40000,
    annot=True,
    fmt='g',
    cmap='RdYlGn_r',
)
heatmap_ax.set_title("Metric performance")

# Conclusion
Linear regression explodes when using PCA as features!

Using k-means as a feature improves performance only minimally, at least for this specific data set.

---

# 3. Explore how k-means works

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

### 3.1. Apply to raw data

In [None]:
# Three clusters; the fixed random_state keeps the clustering repeatable.
km = KMeans(n_clusters=3, random_state=42)

In [None]:
# Fit on the feature columns as they are in `data` (no scaling applied here).
km.fit(data[x_cols])

In [None]:
# Store each row's cluster label as a new 'cluster' column. This mutates
# `data`; `x_cols` was built earlier and therefore does not include it.
data['cluster'] = km.labels_

In [None]:
# Living area against sale price, coloured by the raw-data cluster label.
sns.scatterplot(data=data, x='GrLivArea', y='SalePrice', hue='cluster', alpha=0.5)

It seems that GrLivArea was the most decisive factor in assigning clusters, because its values are numerically large and the data has not been scaled.

### 3.2. Apply to scaled data

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standard scaler with default settings, used on the feature columns below.
scaler = StandardScaler()

In [None]:
# Work on a separate frame without the raw-data 'cluster' column, so that
# `data` itself is left untouched by the scaling.
scaled_data = data.drop(columns="cluster").copy()

In [None]:
# Fit the scaler on the feature columns and overwrite them with the scaled
# values. 'SalePrice' is not in `x_cols`, so it keeps its original scale.
scaled_data[x_cols] = scaler.fit_transform(scaled_data[x_cols])

In [None]:
# Fresh estimator with the same settings as before; rebinding `km` discards
# the model fitted on the raw data.
km = KMeans(n_clusters=3, random_state=42)

In [None]:
# Fit on the scaled feature columns.
km.fit(scaled_data[x_cols])

In [None]:
# Store the cluster labels obtained from the scaled features.
scaled_data['cluster'] = km.labels_

In [None]:
# Same scatter as for the raw data, now coloured by the scaled-data clusters.
# The x axis shows the scaled GrLivArea values.
sns.scatterplot(data=scaled_data, x='GrLivArea', y='SalePrice', hue='cluster', alpha=0.5)

### Plot principal components of PCA vs clusters

In [None]:
from sklearn.decomposition import PCA

In [None]:
# PCA with default settings; no number of components is specified.
pca = PCA()

In [None]:
# Fit PCA on the scaled feature columns and project them onto the components.
pcomps = pca.fit_transform(scaled_data[x_cols])

In [None]:
# One column name per principal component, numbered from 1: 'PC1', 'PC2', ...
n_components = pcomps.shape[1]
pc_cols = [f"PC{idx}" for idx in range(1, n_components + 1)]

In [None]:
# Attach the principal components to the frame as columns 'PC1', 'PC2', ...
scaled_data[pc_cols] = pcomps

In [None]:
# First two principal components, coloured by the scaled-data cluster label.
sns.scatterplot(data=scaled_data, x='PC1', y='PC2', hue='cluster', alpha=0.8)

In [None]:
# First principal component against sale price, coloured by cluster label.
sns.scatterplot(data=scaled_data, x='PC1', y='SalePrice', hue='cluster', alpha=0.8)

### Apply k-means on principal components

In [None]:
# Fresh estimator with the same settings, this time for the principal
# components.
km = KMeans(n_clusters=3, random_state=42)

In [None]:
# Fit on all principal-component columns.
km.fit(scaled_data[pc_cols])

In [None]:
# Keep these labels in their own column so they can be compared with the
# 'cluster' column obtained from the scaled features.
scaled_data['pca_cluster'] = km.labels_

In [None]:
# First two principal components, coloured by the PCA-based cluster label.
sns.scatterplot(data=scaled_data, x='PC1', y='PC2', hue='pca_cluster', alpha=0.8)

If we apply clustering with all principal components, we get the same resulting clustering labels.