In [11]:
## Import the libraries this notebook needs

import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import cross_val_score
import pandas as pd
import numpy as np
from xgboost import XGBRegressor

In [16]:
## Notebook-wide matplotlib defaults: whitegrid theme, automatic layout,
## and bold, slightly larger axis labels and titles.

plt.style.use('seaborn-v0_8-whitegrid')
plt.rc('figure', autolayout=True)
plt.rc(
    'axes',
    **{
        'labelweight': 'bold',
        'labelsize': 'large',
        'titleweight': 'bold',
        'titlesize': 14,
        'titlepad': 10,
    },
)


In [2]:
# Putting each function in its own cell makes the code much cleaner and easier to debug,
# so forgive me when you see me do this :)

def apply_pca(X, standardize=True):
    """Fit a PCA on ``X`` and return the model, the component scores and the loadings.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. Its column labels are used as the index of the
        loadings table. Presumably all columns are numeric -- confirm at
        the call site.
    standardize : bool, default True
        When true, every column is centred on its mean and divided by its
        standard deviation before the PCA is fitted.

    Returns
    -------
    pca : PCA
        The fitted estimator.
    X_pca : pandas.DataFrame
        Output of ``pca.fit_transform`` with columns ``PC1``, ``PC2``, ...
        It is built without an explicit index, so it gets a fresh default
        index rather than the index of ``X``.
    loadings : pandas.DataFrame
        ``pca.components_`` transposed: one row per original feature, one
        column per principal component.
    """
    if standardize:
        X = (X - X.mean(axis=0)) / X.std(axis=0)

    # Create the principal components
    pca = PCA()
    X_pca = pca.fit_transform(X)

    # Wrap the component scores in a DataFrame with readable column names.
    # (The original read `.chape`, which raised AttributeError on every call.)
    component_names = [f"PC{i+1}" for i in range(X_pca.shape[1])]
    X_pca = pd.DataFrame(X_pca, columns=component_names)

    # Loadings: rows are the original features, columns are the components
    loadings = pd.DataFrame(
        pca.components_.T,
        columns=component_names,
        index=X.columns,
    )
    return pca, X_pca, loadings

In [8]:
def plot_variance(pca, width=8, dpi=100):
    """Plot explained and cumulative explained variance of a fitted PCA.

    Parameters
    ----------
    pca : fitted PCA-like object
        Must expose ``n_components_`` and ``explained_variance_ratio_``.
    width : float, default 8
        Figure width, passed to the figure as ``figwidth``.
    dpi : int, default 100
        Figure resolution, passed to the figure as ``dpi``.

    Returns
    -------
    axs
        The two axes created by ``plt.subplots(1, 2)``: the left one holds
        the per-component bar chart, the right one the cumulative curve.
    """
    # `plt.subplots` (plural) returns the (figure, axes) pair unpacked here;
    # the original called `plt.subplot`, which does not.
    fig, axs = plt.subplots(1, 2)
    n = pca.n_components_
    grid = np.arange(1, n + 1)

    # Explained variance
    evr = pca.explained_variance_ratio_
    axs[0].bar(grid, evr)
    axs[0].set(
        xlabel='Component', title='% Explained Variance', ylim=(0.0, 1.0)
    )

    # Cumulative variance; a leading (0, 0) point anchors the curve at the origin
    cv = np.cumsum(evr)
    axs[1].plot(np.r_[0, grid], np.r_[0, cv], 'o-')
    axs[1].set(
        xlabel='Component', title='% Cumulative Variance', ylim=(0.0, 1.0)
    )

    # Honour the caller's sizing arguments (previously hard-coded to 8 / 100)
    fig.set(figwidth=width, dpi=dpi)
    return axs

In [9]:
def make_mi_scores(X, y):
    """Return mutual-information scores of every column of ``X`` against ``y``.

    Works on a copy of ``X``. Object and category columns are replaced by
    their ``factorize`` integer codes, and every integer-typed column is
    reported to ``mutual_info_regression`` as a discrete feature.

    Returns a ``pandas.Series`` named ``'MI Scores'``, indexed by column
    name and sorted from highest to lowest score.
    """
    features = X.copy()

    # Label-encode the non-numeric columns
    categorical_cols = features.select_dtypes(['object', 'category'])
    for col in categorical_cols:
        codes, _uniques = features[col].factorize()
        features[col] = codes

    # After encoding, an integer dtype marks a discrete feature
    is_discrete = [
        pd.api.types.is_integer_dtype(dtype) for dtype in features.dtypes
    ]
    raw_scores = mutual_info_regression(
        features, y, discrete_features=is_discrete, random_state=0
    )
    ranked = pd.Series(raw_scores, name='MI Scores', index=features.columns)
    return ranked.sort_values(ascending=False)

In [12]:
def score_dataset(X, y, model=XGBRegressor()):
    """Cross-validate ``model`` on ``(X, y)`` and return the RMSLE.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. Object and category columns are label-encoded on an
        internal copy, so the caller's frame is left untouched.
    y : array-like
        Regression target.
    model : estimator, default ``XGBRegressor()``
        Estimator passed to ``cross_val_score``. The default instance is
        created once, when this function is defined.

    Returns
    -------
    float
        Square root of the mean, over 5 folds, of the mean squared log error.
    """
    # Work on a copy: the original encoded the caller's DataFrame in place,
    # silently changing it for every later cell.
    X = X.copy()

    # Label encoding for categoricals
    for colname in X.select_dtypes(['category', 'object']):
        X[colname], _ = X[colname].factorize()

    # Metric for Housing competition is RMSLE (Root Mean Squared Log Error)
    score = cross_val_score(
        model, X, y, cv=5, scoring='neg_mean_squared_log_error',
    )
    # The scorer returns negated errors, so flip the sign before the root
    score = -1 * score.mean()
    score = np.sqrt(score)
    return score

In [15]:
# Load the Ames housing data from CSV and preview the first three rows
df = pd.read_csv('./input/data/ames.csv')
print(df.head(3))

                            MSSubClass  ... SalePrice
0  One_Story_1946_and_Newer_All_Styles  ...    215000
1  One_Story_1946_and_Newer_All_Styles  ...    105000
2  One_Story_1946_and_Newer_All_Styles  ...    172000

[3 rows x 79 columns]
