In [50]:
import src.utils as u

from importnb import Notebook

# Import the shared project notebook as a module inside importnb's Notebook context.
# NOTE(review): presumably Notebook() is what makes the .ipynb importable here and
# u.load_notebook prepares it — confirm against src.utils.
# NOTE(review): the wildcard import hides where names used further down without an explicit
# import (pd, np, plt, sns) come from — presumably notebooks.common; confirm.
with Notebook():
    u.load_notebook("notebooks.common")
    from notebooks.common import *     

### Load data

In [70]:
import os

# Build the relative path with os.path.join so it resolves on POSIX as well as on Windows
# (the previous literal hard-coded backslash separators, which only work on Windows).
# NOTE: read_pickle can execute arbitrary code while unpickling; only load files produced
# by this project's own preprocessing step.
df = pd.read_pickle(
    u.full_path_name(os.path.join("data", "interim", "1_0-mzh-data-preprocessing.pickle"))
)

## 1. Analysis of SalePrice distribution

In [52]:
import src.visualization.stats as st

In [53]:
# Distribution of the raw SalePrice values.
st.plot_feature_distribution(df["SalePrice"])

In [54]:
# Same plot after np.log1p, i.e. log(1 + SalePrice), for comparison with the raw distribution.
st.plot_feature_distribution(np.log1p(df["SalePrice"]))

### Sale price transformation

In [55]:
import src.features.engineering as eng

In [56]:
# Build the project's target pipeline, then print summary statistics of SalePrice
# before and after it is applied.
pipeline = eng.create_y_pipeline()
print(f"SalePrice before transformation:\n{df['SalePrice'].describe()}\n")

# The pipeline receives an independent single-column DataFrame holding only the target.
y = pipeline.fit_transform(df[["SalePrice"]].copy())
print(f"SalePrice after transformation:\n{y.describe()}\n")

## 2. Base transformations

In [71]:
# Feature frame: an independent copy of df with the SalePrice target column removed.
df_X = df.drop(columns="SalePrice").copy()

In [72]:
from sklearn.preprocessing import StandardScaler

# Work on a copy so df_X stays available, unscaled, for the pipeline run further down.
X = df_X.copy()

# Every column that is not of object dtype is treated as numerical here.
numerical_features = X.select_dtypes(exclude=['object']).columns
scaler = StandardScaler()

# Replace the columns outright instead of writing through .loc[:, cols]: .loc assignment tries
# to keep each column's existing dtype, so storing the scaler's float output into integer
# columns triggers pandas' incompatible-dtype FutureWarning (slated to become an error).
X[numerical_features] = scaler.fit_transform(X[numerical_features])
print(f"Scaler scales:\n{list(zip(numerical_features, scaler.scale_))}")

In [73]:
# Pass X through the project's categorical-encoding helper and rebind X to the result.
# NOTE(review): because X is rebound, re-running this cell on its own feeds the already
# processed frame back into the helper — re-run the scaling cell first.
X = eng.encode_categorical_features(X)

In [74]:
# Run the project's common X pipeline on a fresh copy of the untouched features so its output
# can be compared with the manually built X.
base_pipeline = eng.create_X_common_pipeline()
X_pipeline = base_pipeline.fit_transform(df_X.copy())

In [75]:
# Displays True only when the manual result and the pipeline result match exactly
# (DataFrame.equals compares shape, values and column dtypes).
X.equals(X_pipeline)

## 3. PCA

In [76]:
from sklearn.decomposition import PCA

# Fit PCA without an n_components argument so the variance of every component can be inspected
# in the following cells.
pca = PCA()
pca.fit(X)

In [77]:
# Running total of the explained-variance ratios, drawn against the zero-based component index.
cumulative_explained_variance = pca.explained_variance_ratio_.cumsum()
plt.plot(np.arange(cumulative_explained_variance.size), cumulative_explained_variance)

In [78]:
# Zero-based index of the first component whose explained variance falls below 0.05.
# NOTE(review): the code reads explained_variance_ (absolute variance), whereas the previous
# comment here named explained_variance_ratio_ — confirm which one was intended.
# next() raises StopIteration if no component is below the threshold.
next(i for (i, e) in enumerate(pca.explained_variance_) if e < 0.05)

In [79]:
# Refit keeping 83 components and show the shape of the projected data.
# NOTE(review): 83 is presumably the index reported by the threshold search in the previous
# cell — confirm, and keep it in sync with the value given to create_X_pca_pipeline.
pca = PCA(n_components=83)
X_pca = pca.fit_transform(X)
X_pca.shape

In [80]:
# Label the components PC0, PC1, ... and wrap the projected data in a DataFrame.
# NOTE(review): no index is passed, so the frame does not inherit X's index — confirm this is
# acceptable if X is not indexed 0..n-1.
columns = ["PC" + str(component) for component in range(X_pca.shape[1])]
X_pca = pd.DataFrame(X_pca, columns=columns)

In [81]:
# Show the generated component labels.
X_pca.columns

In [82]:
# Run the project's PCA pipeline with the same component count (83) on a copy of X and show
# its column labels for comparison with the manually built X_pca.
X_pca_pipeline = eng.create_X_pca_pipeline(83).fit_transform(X.copy())
X_pca_pipeline.columns

## 4. Correlation

In [86]:
# Place the features and the transformed target side by side (column-wise concatenation).
Xy = pd.concat([X, y], axis=1)

In [87]:
# Pairwise correlation of every column with every other (DataFrame.corr defaults to Pearson).
corr_matrix = Xy.corr()

In [89]:
# Candidate columns: those whose absolute correlation with SalePrice exceeds 0.4, ordered by the
# signed correlation value in ascending order.
potential_features = (
    corr_matrix.loc[corr_matrix["SalePrice"].abs() > 0.4, "SalePrice"]
    .sort_values()
    .index
    .tolist()
)

# Shrink the matrix to the candidates only, with rows and columns in that same order.
corr_matrix = corr_matrix.loc[potential_features, potential_features]
potential_features

In [91]:
# choose_diverging_palette() opens an interactive ipywidgets control, so the figure depended on
# manual slider state and could not be reproduced by a plain re-run. Build the palette in code
# instead. The arguments mirror the widget's initial slider positions — adjust them if a
# different palette was picked interactively.
cmap = sns.diverging_palette(220, 10, s=74, l=50, sep=10, as_cmap=True)

# Explicit figure/axes so the heatmap is drawn on a known target.
fig, ax = plt.subplots(figsize=(11, 9))
hm = sns.heatmap(
    corr_matrix,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 10},
    linewidths=0.5,  # documented heatmap parameter name (previously passed as `linewidth`)
    cmap=cmap,
    ax=ax)
# Set the y-limits explicitly (inverted axis, one unit per feature).
# NOTE(review): presumably a workaround for matplotlib versions that clipped the first and last
# heatmap rows — confirm whether it is still needed.
hm.set_ylim(len(potential_features), 0)

Remove columns with high pair-wise correlation

In [92]:
# Walk the candidates from the last position towards the first, skipping the final entry
# (presumably SalePrice itself, since the list is sorted by correlation with SalePrice).
# For each one, flag every earlier-listed candidate whose absolute correlation with it
# exceeds 0.6.
# NOTE(review): a candidate that has already been flagged is still used to flag others —
# confirm this matches the intended selection rule.
cols_to_remove = set()
for position in range(len(potential_features) - 2, -1, -1):
    col = potential_features[position]
    # Correlations of `col` with the candidates listed before it.
    earlier_corr = corr_matrix[col].iloc[:position]
    to_remove = earlier_corr.index[earlier_corr.abs() > 0.6].tolist()
    if not to_remove:
        continue
    print(f"{col}: {to_remove}")
    cols_to_remove.update(to_remove)
cols_to_remove

In [93]:
# Keep the existing ordering while dropping every name flagged in cols_to_remove.
potential_features = [name for name in potential_features if name not in cols_to_remove]
potential_features

In [None]:
# Run the project's correlation pipeline with the same thresholds as the manual selection
# (0.4 against the target, 0.6 pair-wise) and show the resulting columns for comparison with
# potential_features.
X_corr = eng.create_X_correlation_pipeline(target_correlation_threshold=0.4, pair_correlation_threshold=0.6).fit_transform(X.copy(), y.copy())

X_corr.columns