In [1]:
# kaggle competition src:
# https://www.kaggle.com/c/house-prices-advanced-regression-techniques/

In [2]:
# Load libraries

import os

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

import numpy as np
from scipy import stats

from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import r2_score, mean_squared_error, mean_squared_log_error
from sklearn.model_selection import cross_val_score, KFold, train_test_split, GridSearchCV
    
from sklearn.linear_model import LassoCV, RidgeCV, LinearRegression, ElasticNetCV, BayesianRidge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

from xgboost import XGBRegressor

In [3]:
# Load datasets

data_dir_path = os.path.join(os.getcwd(), 'data')

# Both CSV files are read from the same 'data' directory under the current working directory.
train_df, test_df = (
    pd.read_csv(os.path.join(data_dir_path, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)

In [4]:
# Stack train and test rows into one frame (fresh 0..n-1 index) and discard the 'Id' column.
concat_train_test_df = pd.concat(
    [train_df, test_df], ignore_index=True, sort=False
).drop(columns='Id')

In [5]:
# Explore loaded data.

def display_dataset_overview(dataset_df):
    """Show a quick overview of a dataset.

    Displays, in order: the first 3 rows, the last 3 rows, the shape
    and the transposed ``describe(include='all')`` summary.
    """
    # Peek at both ends of the frame.
    for preview in (dataset_df.head(3), dataset_df.tail(3)):
        display(preview)

    # (n_rows, n_columns)
    display(dataset_df.shape)

    # Summary statistics, one row per column.
    summary = dataset_df.describe(include='all')
    display(summary.T)
    

def display_dataset_col_dtypes(dataset_df):
    """Show the distinct dtypes of a dataset and its column names grouped by dtype.

    After the unique dtypes, the column names are displayed for the
    'int64', 'float64', 'object' and 'number' selections, in that order.
    """
    display(dataset_df.dtypes.unique())
    for dtype_selector in ('int64', 'float64', 'object', 'number'):
        matching_cols = dataset_df.select_dtypes(include=dtype_selector)
        display(matching_cols.columns.values)

In [6]:
# display_dataset_overview(train_df)

# display_dataset_overview(test_df)

# display_dataset_overview(concat_train_test_df)

In [7]:
# Explore distributions of continuous features in certain dataset

def display_hist(dataset_df, col_name, n_bins=25):
    """Plot a histogram of ``dataset_df[col_name]`` on a 15x10 figure.

    ``n_bins`` is forwarded as the number of histogram bins.
    """
    fig, ax = plt.subplots(figsize=(15, 10))
    column_values = dataset_df[col_name]
    column_values.hist(bins=n_bins, ax=ax)
    plt.show()

    
def display_all_numerical_hist(set1_df, set2_df, n_bins=25):
    """Display histograms for every numerical feature from set1_df and set2_df.

    For each numeric column of the concatenated frame a row of three
    histograms is drawn: set1, set2 and the concatenation of both.

    A column that exists in only one of the inputs (for example a target
    column present in the train set only) no longer raises ``KeyError``:
    the panel of the set that lacks the column is left empty, with its
    title still shown.
    """
    concat_df = pd.concat([set1_df, set2_df], ignore_index=True, sort=False)
    numeric_col_names = concat_df.select_dtypes(include='number').columns.values

    # (title prefix, frame) for the three panels, left to right.
    panels = (
        ('set1', set1_df),
        ('set2', set2_df),
        ('concat [set1, set2]', concat_df),
    )

    for col_name in numeric_col_names:
        fig, axes = plt.subplots(1, 3, figsize=(15, 5))
        for ax, (panel_label, panel_df) in zip(axes, panels):
            ax.set_title('{0} {1}'.format(panel_label, col_name))
            # Skip the plot when this frame does not carry the column at all.
            if col_name in panel_df.columns:
                panel_df[col_name].hist(ax=ax, bins=n_bins)
        fig.tight_layout()
        plt.show()


def display_colx_coly_scatter(dataset_df, x_col_name, y_col_name, color=None):
    """Scatter-plot ``dataset_df[x_col_name]`` against ``dataset_df[y_col_name]``.

    ``color`` is forwarded as matplotlib's ``c`` argument together with
    ``data=dataset_df``, so the callers in this notebook pass a column name.
    The figure is 10x10 and titled "<x> - <y>".
    """
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.scatter(x_col_name, y_col_name, data=dataset_df, c=color)
    ax.set_title("{0} - {1}".format(x_col_name, y_col_name))
    ax.set_xlabel(x_col_name)
    ax.set_ylabel(y_col_name)
    plt.show()


def display_all_numerical_scatter(set1_df, col_to_compare):
    """Scatter every numeric column of ``set1_df`` against ``col_to_compare``.

    Each plot is coloured by the same column that is on its x axis.
    """
    numeric_df = set1_df.select_dtypes(include='number')
    for x_col in numeric_df.columns.values:
        display_colx_coly_scatter(set1_df, x_col, col_to_compare, color=x_col)

In [8]:
# display_hist(train_df, 'SalePrice', 100)

# display_all_numerical_hist(
#     train_df.drop(['SalePrice', 'Id'], axis=1),
#     test_df.drop('Id', axis=1)
# )

# display_colx_coly_scatter(train_df, 'GrLivArea', 'SalePrice', color='YearBuilt')

# display_all_numerical_scatter(
#     train_df.drop(['Id'], axis=1),
#     'SalePrice'
# )

In [9]:
# Explore distributions of categorical features in certain dataset

def display_col_freqtable(dataset_df, col_name):
    """Show how often each value of ``dataset_df[col_name]`` occurs.

    The table has a single 'count' column and is sorted from the most
    to the least frequent value.
    """
    freq_table = pd.crosstab(index=dataset_df[col_name], columns="count")
    display(freq_table.sort_values(by='count', ascending=False))


def display_all_categorical_freq_bar(set1_df, set2_df):
    """Show a frequency table and a bar plot for every object-dtype column.

    For each such column of the concatenated frame, three bar plots are
    drawn side by side (set1, set2, concatenation), and the matching
    frequency table is displayed right after each plot is drawn.
    """
    concat_df = pd.concat([set1_df, set2_df], ignore_index=True, sort=False)
    categorical_col_names = concat_df.select_dtypes(include='object').columns.values

    # (title template, frame) for the three panels, left to right.
    panels = (
        ('set1 {0}', set1_df),
        ('set2 {0}', set2_df),
        ('concat_df [set1, set2] {0}', concat_df),
    )

    for col_name in categorical_col_names:
        fig, axes = plt.subplots(1, 3, figsize=(15, 5))
        for ax, (title_template, panel_df) in zip(axes, panels):
            ax.set_title(title_template.format(col_name))
            panel_df[col_name].value_counts().plot(kind='bar', ax=ax)
            display_col_freqtable(panel_df, col_name)
        fig.tight_layout()
        plt.show()
        
    
def display_col_categorical_sns_countplot(dataset_df, col_name):
    """Draw a seaborn countplot of ``col_name`` with "count | percent" on each bar.

    The percentage is relative to the total number of rows of the column
    (``index.size``), i.e. rows with NaN are part of the denominator.
    """
    plt.figure(figsize=(10, 5))
    ax = sns.countplot(x=col_name, data=dataset_df)

    # Twin axes with grid and both axis objects hidden.
    twin_ax = ax.twinx()
    twin_ax.grid(None)
    twin_ax.get_yaxis().set_visible(False)
    twin_ax.get_xaxis().set_visible(False)

    n_rows = dataset_df[col_name].index.size
    for bar in ax.patches:
        corners = bar.get_bbox().get_points()
        bar_top = corners[1, 1]
        bar_center_x = corners[:, 0].mean()
        label = '{} | {:.1f}%'.format(int(bar_top), 100. * bar_top / n_rows)
        ax.annotate(label, (bar_center_x, bar_top), ha='center', va='bottom')

    plt.title('Distribution of {0}'.format(col_name))
    plt.xlabel('Number of {0}'.format(col_name))
    plt.show()

In [10]:
# display_all_categorical_freq_bar(train_df, test_df)

# print("HouseStyle: train_df")
# display_col_categorical_sns_countplot(train_df, 'HouseStyle')

In [11]:
# Explore NaN values

def display_nan_values(dataset_df):
    """Show the NaN count per column, listing only columns that have at least one NaN."""
    nan_counts = dataset_df.isnull().sum()
    has_nan = nan_counts != 0
    display(nan_counts[has_nan])


def display_all_nan_percentage(dataset_df):
    """Display count and percentage of NaN values per column, as a table and a bar plot.

    Only columns that contain at least one NaN are reported, sorted by
    ascending NaN percentage. The table is shown transposed
    (rows 'pct_nan' and 'cnt_nan', one column per feature).

    When no column contains NaN, the (empty) table is still displayed but
    the bar plot is skipped, because plotting an empty frame raises.
    """
    missing_values_cnt = dataset_df.isnull().sum()
    missing_values_pct = missing_values_cnt * 100 / len(dataset_df)
    missing_values_pct_df = pd.DataFrame({'pct_nan': missing_values_pct, 'cnt_nan': missing_values_cnt})
    missing_values_pct_df = missing_values_pct_df.sort_values('pct_nan')

    # Filter on the count rather than the percentage: for a zero-row frame
    # the percentage is NaN, which would compare as "not zero".
    with_nan_df = missing_values_pct_df[missing_values_pct_df['cnt_nan'] != 0]

    if with_nan_df.empty:
        # Nothing to plot.
        display(with_nan_df.T)
        return

    with_nan_df.plot(kind='bar')
    display(with_nan_df.T)
    plt.show()

    
def get_rows_with_nan(dataset_df, col_name, max_values=10):
    """Get rows with np.nan in col_name column values.

    Parameters
    ----------
    dataset_df : DataFrame to inspect.
    col_name : column whose missing values select the rows.
    max_values : maximum number of rows to return (first rows in frame order).

    Returns
    -------
    DataFrame with at most ``max_values`` rows of ``dataset_df`` whose
    ``col_name`` value is missing, all columns kept.

    The rows are selected with a boolean mask on the single column rather
    than by re-looking up index labels, so frames with repeated index
    labels return exactly the NaN rows, and only one column is scanned.
    """
    is_missing = dataset_df[col_name].isnull()
    return dataset_df[is_missing].head(max_values)

In [12]:
# display_all_nan_percentage(train_df)

# display_all_nan_percentage(test_df)

# display_all_nan_percentage(concat_train_test_df)

In [13]:
# Fix NaN values

# display_nan_values(train_df)

# display_nan_values(test_df)

In [14]:
# Intermediate arrays

# Work on copies so the loaded train_df / test_df frames stay unchanged.
train_nonan_df, test_nonan_df = train_df.copy(), test_df.copy()

In [15]:
# MSZoning

def _local_disp():
    """Explore MSZoning: NaN rows, count plots and a GrLivArea/SalePrice scatter coloured by zoning."""
    frames = (train_nonan_df, test_nonan_df)

    for frame in frames:
        display(get_rows_with_nan(frame, 'MSZoning'))

    for frame in frames:
        display_col_categorical_sns_countplot(frame, 'MSZoning')

    # Encode the zoning labels as integers on a copy so they can be used as scatter colours.
    encoded_train_df = train_nonan_df.copy()
    encoded_train_df['MSZoning'] = LabelEncoder().fit_transform(encoded_train_df['MSZoning'])
    display_colx_coly_scatter(encoded_train_df, 'GrLivArea', 'SalePrice', color='MSZoning')

def _local_fix():
    """Fill missing MSZoning in the test frame with the label 'Other' (train frame untouched)."""
    # Assumption: the missing rows belong to some "other" zoning.
    # todo: try replacing with .mode()
    # todo: features['MSZoning'] = features.groupby('MSSubClass')['MSZoning'].transform(lambda x: x.fillna(x.mode()[0]))
    zoning = test_nonan_df['MSZoning']
    test_nonan_df['MSZoning'] = zoning.fillna('Other')

def _local_check():
    """Display the rows that still have NaN in MSZoning (train first, then test)."""
    for frame in (train_nonan_df, test_nonan_df):
        display(get_rows_with_nan(frame, 'MSZoning'))

# Explore
# _local_disp()

# Fix
_local_fix()

# Check
# _local_check()

In [16]:
# LotFrontage

def _local_disp():
    """Explore LotFrontage: NaN rows, histograms and scatter plots against LotArea / SalePrice."""
    frames = (train_nonan_df, test_nonan_df)
    labelled_frames = (("train", train_nonan_df), ("test", test_nonan_df))

    # Number of NaN rows (capped at 3000 per frame), then the first 10 of them.
    for label, frame in labelled_frames:
        display(label, get_rows_with_nan(frame, 'LotFrontage', 3000).shape)
    for label, frame in labelled_frames:
        display(label, get_rows_with_nan(frame, 'LotFrontage', 3000).head(10))

    for frame in frames:
        display_hist(frame, 'LotFrontage', n_bins=100)

    for frame in frames:
        display_colx_coly_scatter(frame, 'LotFrontage', 'LotArea')

    # Colour the price scatters by an integer-encoded categorical column.
    encoded_train_df = train_nonan_df.copy()
    for hue_col in ('MSZoning', 'Neighborhood'):
        encoded_train_df[hue_col] = LabelEncoder().fit_transform(encoded_train_df[hue_col])
        for x_col in ('LotFrontage', 'LotArea'):
            display_colx_coly_scatter(encoded_train_df, x_col, 'SalePrice', color=hue_col)

def _local_fix():
    """Replace missing LotFrontage with 0.0 in both the train and the test frame."""
    # Assumption: there might be houses without frontage at all.
    # todo: try replacing with .mean()
    # todo: replace by neighborhood / MSZoning
    #   features['LotFrontage'] = features.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))
    for frame in (train_nonan_df, test_nonan_df):
        frame['LotFrontage'] = frame['LotFrontage'].fillna(0.0)

def _local_check():
    """Display the rows that still have NaN in LotFrontage (train first, then test)."""
    for frame in (train_nonan_df, test_nonan_df):
        display(get_rows_with_nan(frame, 'LotFrontage'))


# Explore
# _local_disp()

# Fix
_local_fix()

# Check
# _local_check()

In [17]:
# Alley

def _local_disp():
    """Explore Alley: NaN row counts, distinct train values and count plots."""
    for label, frame in (("train", train_nonan_df), ("test", test_nonan_df)):
        display(label, get_rows_with_nan(frame, 'Alley', 3000).shape)

    display(pd.unique(train_nonan_df['Alley']))

    for frame in (train_nonan_df, test_nonan_df):
        display_col_categorical_sns_countplot(frame, 'Alley')
    
def _local_fix():
    """Fill missing Alley with the label 'NoAccess' in both frames."""
    # Assumption: a missing value means the house has no alley access.
    # todo: try replacing with .mode() (by dataset, NOT by concatenated)
    for frame in (train_nonan_df, test_nonan_df):
        frame['Alley'] = frame['Alley'].fillna('NoAccess')

def _local_check():
    """Display how many rows still have NaN in Alley (shape of the NaN-row frame per set)."""
    for label, frame in (("train", train_nonan_df), ("test", test_nonan_df)):
        display(label, get_rows_with_nan(frame, 'Alley', 3000).shape)

# Explore
# _local_disp()

# Fix
_local_fix()

# Check
# _local_check()

In [18]:
# Utilities

def _local_disp():
    """Explore Utilities: NaN rows, distinct values, count plots and a scatter against SalePrice.

    The test-set NaN rows are displayed once (the original showed the
    same table twice in a row).
    """
    display("train", get_rows_with_nan(train_nonan_df, 'Utilities', 3000))
    display("test", get_rows_with_nan(test_nonan_df, 'Utilities', 3000))

    display(pd.unique(train_nonan_df['Utilities']))
    display(pd.unique(test_nonan_df['Utilities']))

    display_col_categorical_sns_countplot(train_nonan_df, 'Utilities')
    display_col_categorical_sns_countplot(test_nonan_df, 'Utilities')

    display_colx_coly_scatter(train_nonan_df, 'Utilities', 'SalePrice')

def _local_fix():
    """Fill missing Utilities with the label 'Other' in both frames."""
    # Assumption: such houses have an "Other" set of utilities, which might mean no utilities at all.
    # todo: try replacing with .mode()
    for frame in (train_nonan_df, test_nonan_df):
        frame['Utilities'] = frame['Utilities'].fillna('Other')

def _local_check():
    """Display the rows that still have NaN in Utilities, labelled per set."""
    for label, frame in (("train", train_nonan_df), ("test", test_nonan_df)):
        display(label, get_rows_with_nan(frame, 'Utilities', 3000))

# Explore
# _local_disp()

# Fix
_local_fix()

# Check
# _local_check()

In [19]:
# Exterior1st and Exterior2nd 

def _local_disp():
    """Explore Exterior1st / Exterior2nd: NaN rows, distinct values, count plots, price scatters."""
    # Author's note: both columns miss the same row, Id=2152 in the test set.
    exterior_cols = ('Exterior1st', 'Exterior2nd')
    frames = (train_nonan_df, test_nonan_df)
    labelled_frames = (("train", train_nonan_df), ("test", test_nonan_df))

    for col in exterior_cols:
        for label, frame in labelled_frames:
            display(label, get_rows_with_nan(frame, col, 3000))

    for col in exterior_cols:
        for frame in frames:
            display(pd.unique(frame[col]))

    for col in exterior_cols:
        for frame in frames:
            display_col_categorical_sns_countplot(frame, col)

    for col in exterior_cols:
        display_colx_coly_scatter(train_nonan_df, col, 'SalePrice')

def _local_fix():
    """Fill missing Exterior1st / Exterior2nd in the test frame with 'NoExterior'."""
    # Assumption: there might be no exterior covering at all.
    for col in ('Exterior1st', 'Exterior2nd'):
        test_nonan_df[col] = test_nonan_df[col].fillna('NoExterior')

def _local_check():
    """Display the rows that still have NaN in Exterior1st, then Exterior2nd, labelled per set."""
    for col in ('Exterior1st', 'Exterior2nd'):
        for label, frame in (("train", train_nonan_df), ("test", test_nonan_df)):
            display(label, get_rows_with_nan(frame, col, 3000))

# Explore
# _local_disp()

# Fix
_local_fix()

# Check
# _local_check()

In [20]:
# MasVnrType 

def _local_disp():
    """Explore MasVnrType: NaN rows, distinct values, count plots and MasVnrArea/SalePrice scatters."""
    frames = (train_nonan_df, test_nonan_df)

    for label, frame in (("train", train_nonan_df), ("test", test_nonan_df)):
        display(label, get_rows_with_nan(frame, 'MasVnrType', 3000))

    for frame in frames:
        display(pd.unique(frame['MasVnrType']))

    for frame in frames:
        display_col_categorical_sns_countplot(frame, 'MasVnrType')

    # NOTE: some points have MasVnrType==None BUT MasVnrArea != 0
    no_veneer_rows = train_nonan_df[train_nonan_df['MasVnrType'] == 'None']
    display_col_categorical_sns_countplot(no_veneer_rows, 'MasVnrArea')

    # Colour the area/price scatter by an integer-encoded categorical column;
    # NaN is replaced by a placeholder label before encoding.
    encoded_train_df = train_nonan_df.copy()
    for hue_col, nan_label in (('MasVnrType', 'None'), ('FireplaceQu', 'NoQual')):
        filled = encoded_train_df[hue_col].fillna(nan_label)
        encoded_train_df[hue_col] = LabelEncoder().fit_transform(filled)
        display_colx_coly_scatter(
            encoded_train_df,
            'MasVnrArea',
            'SalePrice',
            color=hue_col
        )
    
def _local_fix():
    """Fill missing MasVnrType with the label 'OtherMasVnr' in both frames."""
    # Assumption: there might be walls with some "Other" masonry veneer type.
    for frame in (train_nonan_df, test_nonan_df):
        frame['MasVnrType'] = frame['MasVnrType'].fillna('OtherMasVnr')

def _local_check():
    """Display the rows that still have NaN in MasVnrType, labelled per set."""
    for label, frame in (("train", train_nonan_df), ("test", test_nonan_df)):
        display(label, get_rows_with_nan(frame, 'MasVnrType', 3000))

# Explore
# _local_disp()

# Fix
_local_fix()

# Check
# _local_check()

In [21]:
# MasVnrArea

def _local_disp():
    """Explore MasVnrArea: NaN rows, histograms (with and without zeros) and a price scatter."""
    frames = (train_nonan_df, test_nonan_df)

    for frame in frames:
        display(get_rows_with_nan(frame, 'MasVnrArea', 3000))

    # Full distribution first ...
    for frame in frames:
        display_hist(frame, 'MasVnrArea', n_bins=100)

    # ... then only the rows whose area is not exactly 0.
    for frame in frames:
        display_hist(frame[frame['MasVnrArea'] != 0], 'MasVnrArea', n_bins=100)

    encoded_train_df = train_nonan_df.copy()
    encoded_train_df['MasVnrType'] = LabelEncoder().fit_transform(encoded_train_df['MasVnrType'])
    display_colx_coly_scatter(encoded_train_df, 'MasVnrArea', 'SalePrice', color='MasVnrType')

def _local_fix():
    """Replace missing MasVnrArea with 0.0 in both frames."""
    # Assumption: there is no masonry veneer, so the area equals 0.0.
    for frame in (train_nonan_df, test_nonan_df):
        frame['MasVnrArea'] = frame['MasVnrArea'].fillna(0.0)

def _local_check():
    """Display the rows that still have NaN in MasVnrArea (train first, then test)."""
    for frame in (train_nonan_df, test_nonan_df):
        display(get_rows_with_nan(frame, 'MasVnrArea', 3000))

# Explore
# _local_disp()

# Fix
_local_fix()

# Check
# _local_check()

In [22]:
# BsmtQual 

def _local_disp():
    """Explore BsmtQual: NaN rows and their count, distinct values, count plots, price scatter."""
    col = 'BsmtQual'
    frames = (train_nonan_df, test_nonan_df)

    for frame in frames:
        display(get_rows_with_nan(frame, col, 3000))

    for frame in frames:
        display(get_rows_with_nan(frame, col, 3000).shape)

    for frame in frames:
        display(frame[col].unique())

    for frame in frames:
        display_col_categorical_sns_countplot(frame, col)

    # Every NaN of the train frame becomes the placeholder 'dbg' before plotting.
    display_colx_coly_scatter(train_nonan_df.fillna('dbg'), col, 'SalePrice')

def _local_fix():
    """Fill missing BsmtQual with the label 'NoBsmt' in both frames."""
    # Assumption: there is no basement in the house.
    for frame in (train_nonan_df, test_nonan_df):
        frame['BsmtQual'] = frame['BsmtQual'].fillna('NoBsmt')

def _local_check():
    """Display the rows that still have NaN in BsmtQual (train first, then test)."""
    for frame in (train_nonan_df, test_nonan_df):
        display(get_rows_with_nan(frame, 'BsmtQual', 3000))

# Explore
# _local_disp()

# Fix
_local_fix()

# Check
# _local_check()

In [23]:
# BsmtCond

def _local_disp():
    """Explore BsmtCond: NaN rows and their count, distinct values, count plots, price scatter."""
    col = 'BsmtCond'
    frames = (train_nonan_df, test_nonan_df)

    for frame in frames:
        display(get_rows_with_nan(frame, col, 3000))

    for frame in frames:
        display(get_rows_with_nan(frame, col, 3000).shape)

    for frame in frames:
        display(frame[col].unique())

    for frame in frames:
        display_col_categorical_sns_countplot(frame, col)

    # Every NaN of the train frame becomes the placeholder 'dbg' before plotting.
    display_colx_coly_scatter(train_nonan_df.fillna('dbg'), col, 'SalePrice')

def _local_fix():
    """Fill missing BsmtCond with the label 'NoBsmt' in both frames."""
    # Assumption: there is no basement in the house.
    for frame in (train_nonan_df, test_nonan_df):
        frame['BsmtCond'] = frame['BsmtCond'].fillna('NoBsmt')

def _local_check():
    """Display the rows that still have NaN in BsmtCond (train first, then test)."""
    for frame in (train_nonan_df, test_nonan_df):
        display(get_rows_with_nan(frame, 'BsmtCond', 3000))

# Explore
# _local_disp()

# Fix
_local_fix()

# Check
# _local_check()

In [24]:
# BsmtExposure 

def _local_disp():
    """Explore BsmtExposure: NaN rows and their count, distinct values, count plots, price scatter."""
    col = 'BsmtExposure'
    frames = (train_nonan_df, test_nonan_df)

    for frame in frames:
        display(get_rows_with_nan(frame, col, 3000))

    for frame in frames:
        display(get_rows_with_nan(frame, col, 3000).shape)

    for frame in frames:
        display(frame[col].unique())

    for frame in frames:
        display_col_categorical_sns_countplot(frame, col)

    # Every NaN of the train frame becomes the placeholder 'dbg' before plotting.
    display_colx_coly_scatter(train_nonan_df.fillna('dbg'), col, 'SalePrice')

def _local_fix():
    """Fill missing BsmtExposure with the label 'NoBsmt' in both frames."""
    # Assumption: there is no basement at all.
    for frame in (train_nonan_df, test_nonan_df):
        frame['BsmtExposure'] = frame['BsmtExposure'].fillna('NoBsmt')

def _local_check():
    """Display the rows that still have NaN in BsmtExposure (train first, then test)."""
    for frame in (train_nonan_df, test_nonan_df):
        display(get_rows_with_nan(frame, 'BsmtExposure', 3000))

# Explore
# _local_disp()

# Fix
_local_fix()

# Check
# _local_check()

In [25]:
# BsmtFinType1 and BsmtFinType2

def _local_disp():
    """Explore BsmtFinType1, then BsmtFinType2.

    For each column: NaN rows and their count, distinct values,
    count plots and a scatter against SalePrice.
    """
    frames = (train_nonan_df, test_nonan_df)

    for col in ('BsmtFinType1', 'BsmtFinType2'):
        for frame in frames:
            display(get_rows_with_nan(frame, col, 3000))

        for frame in frames:
            display(get_rows_with_nan(frame, col, 3000).shape)

        for frame in frames:
            display(frame[col].unique())

        for frame in frames:
            display_col_categorical_sns_countplot(frame, col)

        # Every NaN of the train frame becomes the placeholder 'dbg' before plotting.
        display_colx_coly_scatter(train_nonan_df.fillna('dbg'), col, 'SalePrice')

def _local_fix():
    """Fill missing BsmtFinType1 / BsmtFinType2 with the label 'NoBsmt' in both frames."""
    # Assumption: there is no basement at all.
    for col in ('BsmtFinType1', 'BsmtFinType2'):
        for frame in (train_nonan_df, test_nonan_df):
            frame[col] = frame[col].fillna('NoBsmt')

def _local_check():
    """Display the rows that still have NaN in BsmtFinType1, then BsmtFinType2."""
    for col in ('BsmtFinType1', 'BsmtFinType2'):
        for frame in (train_nonan_df, test_nonan_df):
            display(get_rows_with_nan(frame, col, 3000))

# Explore
# _local_disp()

# Fix
_local_fix()

# Check
# _local_check()

In [26]:
# BsmtFinSF1 and BsmtFinSF2

# same idx: id=2121

def _local_disp():
    """Explore BsmtFinSF1, then BsmtFinSF2.

    For each area column: NaN rows and their count, then a scatter against
    SalePrice coloured by the integer-encoded matching BsmtFinType column.
    """
    def _show_missing(col):
        # NaN rows of both frames, followed by the shapes of those selections.
        for frame in (train_nonan_df, test_nonan_df):
            display(get_rows_with_nan(frame, col, 3000))
        for frame in (train_nonan_df, test_nonan_df):
            display(get_rows_with_nan(frame, col, 3000).shape)

    encoded_train_df = None
    for area_col, type_col in (('BsmtFinSF1', 'BsmtFinType1'), ('BsmtFinSF2', 'BsmtFinType2')):
        _show_missing(area_col)
        if encoded_train_df is None:
            # One shared copy: the second scatter sees both encoded type columns.
            encoded_train_df = train_nonan_df.copy()
        encoded_train_df[type_col] = LabelEncoder().fit_transform(encoded_train_df[type_col])
        display_colx_coly_scatter(encoded_train_df, area_col, 'SalePrice', color=type_col)

def _local_fix():
    """Replace missing BsmtFinSF1 / BsmtFinSF2 with 0.0 in the test frame."""
    # Assumption: there is no basement at all.
    for col in ('BsmtFinSF1', 'BsmtFinSF2'):
        test_nonan_df[col] = test_nonan_df[col].fillna(0.0)

def _local_check():
    """Display the rows that still have NaN in BsmtFinSF1, then BsmtFinSF2."""
    for col in ('BsmtFinSF1', 'BsmtFinSF2'):
        for frame in (train_nonan_df, test_nonan_df):
            display(get_rows_with_nan(frame, col, 3000))

# Explore
# _local_disp()

# Fix
_local_fix()

# Check
# _local_check()

In [27]:
# BsmtUnfSF and TotalBsmtSF

# same idx: id=2121

def _local_disp():
    """Explore BsmtUnfSF, then TotalBsmtSF.

    For each column: NaN rows and their count, then a scatter against
    SalePrice coloured by the integer-encoded BsmtCond column.
    """
    def _show_missing(col):
        # NaN rows of both frames, followed by the shapes of those selections.
        for frame in (train_nonan_df, test_nonan_df):
            display(get_rows_with_nan(frame, col, 3000))
        for frame in (train_nonan_df, test_nonan_df):
            display(get_rows_with_nan(frame, col, 3000).shape)

    _show_missing('BsmtUnfSF')

    # BsmtCond is encoded once; the same copy is used for both scatters.
    encoded_train_df = train_nonan_df.copy()
    encoded_train_df['BsmtCond'] = LabelEncoder().fit_transform(encoded_train_df['BsmtCond'])
    display_colx_coly_scatter(encoded_train_df, 'BsmtUnfSF', 'SalePrice', color='BsmtCond')

    _show_missing('TotalBsmtSF')
    display_colx_coly_scatter(encoded_train_df, 'TotalBsmtSF', 'SalePrice', color='BsmtCond')

def _local_fix():
    """Replace missing BsmtUnfSF / TotalBsmtSF with 0.0 in the test frame."""
    # Assumption: there is no basement at all.
    for col in ('BsmtUnfSF', 'TotalBsmtSF'):
        test_nonan_df[col] = test_nonan_df[col].fillna(0.0)

def _local_check():
    """Display the rows that still have NaN in BsmtUnfSF, then TotalBsmtSF."""
    for col in ('BsmtUnfSF', 'TotalBsmtSF'):
        for frame in (train_nonan_df, test_nonan_df):
            display(get_rows_with_nan(frame, col, 3000))

# Explore
# _local_disp()

# Fix
_local_fix()

# Check
# _local_check()

In [28]:
# BsmtFullBath and BsmtHalfBath

# same indices: id=2121 and id=2189

def _local_disp():
    """Explore BsmtFullBath, then BsmtHalfBath.

    For each column: NaN rows and their count, count plots and a
    scatter against SalePrice.
    """
    frames = (train_nonan_df, test_nonan_df)

    for col in ('BsmtFullBath', 'BsmtHalfBath'):
        for frame in frames:
            display(get_rows_with_nan(frame, col, 3000))

        for frame in frames:
            display(get_rows_with_nan(frame, col, 3000).shape)

        for frame in frames:
            display_col_categorical_sns_countplot(frame, col)

        # Every NaN of the train frame becomes the placeholder 'dbg' before plotting.
        display_colx_coly_scatter(train_nonan_df.fillna('dbg'), col, 'SalePrice')
    
def _local_fix():
    """Replace missing BsmtFullBath / BsmtHalfBath with 0 in the test frame."""
    # Assumption: there is no basement at all => there couldn't be any bath in the basement.
    for col in ('BsmtFullBath', 'BsmtHalfBath'):
        test_nonan_df[col] = test_nonan_df[col].fillna(0)

def _local_check():
    """Display the rows that still have NaN in BsmtFullBath, then BsmtHalfBath."""
    for col in ('BsmtFullBath', 'BsmtHalfBath'):
        for frame in (train_nonan_df, test_nonan_df):
            display(get_rows_with_nan(frame, col, 3000))

# Explore
# _local_disp()

# Fix
_local_fix()

# Check
# _local_check()

In [29]:
# PoolQC

def _local_disp():
    """Explore PoolQC: NaN row counts, distinct values, count plots and a price scatter."""
    col = 'PoolQC'
    frames = (train_nonan_df, test_nonan_df)

    for frame in frames:
        display(get_rows_with_nan(frame, col, 3000).shape)

    for frame in frames:
        display(frame[col].unique())

    for frame in frames:
        display_col_categorical_sns_countplot(frame, col)

    # Every NaN of the train frame becomes the placeholder 'dbg' before plotting.
    display_colx_coly_scatter(train_nonan_df.fillna('dbg'), col, 'SalePrice')
    
def _local_fix():
    """Fill missing PoolQC with the label 'NoPool' in both frames."""
    # Missing values get their own 'NoPool' category
    # (presumably NaN means the house has no pool -- confirm against the data description).
    for frame in (train_nonan_df, test_nonan_df):
        frame['PoolQC'] = frame['PoolQC'].fillna('NoPool')

def _local_check():
    """Display the rows that still have NaN in PoolQC (train first, then test)."""
    for frame in (train_nonan_df, test_nonan_df):
        display(get_rows_with_nan(frame, 'PoolQC', 3000))

# Explore
# _local_disp()

# Fix
_local_fix()

# Check
# _local_check()

In [30]:
# Fence

def _local_disp():
    """Explore Fence: NaN row counts, distinct values, count plots and price scatters.

    The last scatter (LotArea vs SalePrice) is coloured by the
    integer-encoded Fence column of a copy of the train frame.
    """
    display(get_rows_with_nan(train_nonan_df, 'Fence', 3000).shape)
    display(get_rows_with_nan(test_nonan_df, 'Fence', 3000).shape)

    display(train_nonan_df['Fence'].unique())
    display(test_nonan_df['Fence'].unique())

    display_col_categorical_sns_countplot(train_nonan_df, 'Fence')
    display_col_categorical_sns_countplot(test_nonan_df, 'Fence')

    # Every NaN of the train frame becomes the placeholder 'dbg' before plotting.
    display_colx_coly_scatter(train_nonan_df.fillna('dbg'), 'Fence', 'SalePrice')

    train_df_cpy = train_nonan_df.copy()
    lbl_encoder = LabelEncoder()
    # Fill NaN before encoding, as the MasVnrType exploration does: this
    # function is meant to run before the Fence fix, and a column mixing
    # strings with NaN may be rejected by LabelEncoder (version dependent).
    # The fill is a no-op once the fix has been applied.
    train_df_cpy['Fence'] = lbl_encoder.fit_transform(train_df_cpy['Fence'].fillna('NoFence'))
    display_colx_coly_scatter(train_df_cpy, 'LotArea', 'SalePrice', color='Fence')
    
def _local_fix():
    """Fill missing Fence with the label 'NoFence' in both frames."""
    # Missing values get their own 'NoFence' category
    # (presumably NaN means the house has no fence -- confirm against the data description).
    for frame in (train_nonan_df, test_nonan_df):
        frame['Fence'] = frame['Fence'].fillna('NoFence')

def _local_check():
    """Display the rows that still have NaN in Fence (train first, then test)."""
    for frame in (train_nonan_df, test_nonan_df):
        display(get_rows_with_nan(frame, 'Fence', 3000))

# Explore
# _local_disp()

# Fix
_local_fix()

# Check
# _local_check()

In [31]:
# MiscFeature

def _local_disp():
    """Explore MiscFeature: NaN row counts, distinct values, count plots and price scatters."""
    col = 'MiscFeature'
    frames = (train_nonan_df, test_nonan_df)

    for frame in frames:
        display(get_rows_with_nan(frame, col, 3000).shape)

    for frame in frames:
        display(frame[col].unique())

    for frame in frames:
        display_col_categorical_sns_countplot(frame, col)

    # Every NaN of the train frame becomes the placeholder 'dbg' before plotting.
    display_colx_coly_scatter(train_nonan_df.fillna('dbg'), col, 'SalePrice')

    # Same scatter restricted to the rows whose feature is 'Shed'.
    shed_rows = train_nonan_df[train_nonan_df[col] == 'Shed']
    display_colx_coly_scatter(shed_rows, col, 'SalePrice')

def _local_fix():
    """Fill missing MiscFeature with the string label 'None' in both frames."""
    # Missing values get the literal category 'None'
    # (presumably NaN means there is no miscellaneous feature -- confirm against the data description).
    for frame in (train_nonan_df, test_nonan_df):
        frame['MiscFeature'] = frame['MiscFeature'].fillna('None')

def _local_check():
    """Display the rows that still have NaN in MiscFeature (train first, then test)."""
    for frame in (train_nonan_df, test_nonan_df):
        display(get_rows_with_nan(frame, 'MiscFeature', 3000))

# Explore
# _local_disp()

# Fix
_local_fix()

# Check
# _local_check()

In [32]:
# SaleType 

def _local_disp():
    """Explore SaleType: NaN rows, distinct values, count plots and a price scatter.

    The scatter is coloured by the integer-encoded SaleCondition column.
    """
    col = 'SaleType'
    frames = (train_nonan_df, test_nonan_df)

    for frame in frames:
        display(get_rows_with_nan(frame, col, 3000))

    for frame in frames:
        display(frame[col].unique())

    for frame in frames:
        display_col_categorical_sns_countplot(frame, col)

    encoded_train_df = train_nonan_df.copy()
    encoded_train_df['SaleCondition'] = LabelEncoder().fit_transform(encoded_train_df['SaleCondition'])
    display_colx_coly_scatter(encoded_train_df, col, 'SalePrice', color='SaleCondition')

def _local_fix():
    """Fill missing SaleType entries with the 'Oth' category in both sets."""
    # Per the original notes: a single row is missing, and "Oth" already exists in train and test
    for frame in (train_nonan_df, test_nonan_df):
        frame['SaleType'] = frame['SaleType'].fillna('Oth')

def _local_check():
    """Display get_rows_with_nan() for SaleType in both sets (presumably empty after the fix)."""
    for frame in (train_nonan_df, test_nonan_df):
        display(get_rows_with_nan(frame, 'SaleType', 3000))

# Explore (uncomment to inspect the column before it is fixed)
# _local_disp()

# Fix
_local_fix()

# Check (uncomment to verify the result of the fix)
# _local_check()

In [33]:
# FireplaceQu

def _local_disp():
    """Explore FireplaceQu: NaN rows, distinct values, count plots and a price scatter.

    Also displays the NaN rows whose 'Fireplaces' count is non-zero; the original notes
    record that both of these frames are empty.
    """
    frames = (train_nonan_df, test_nonan_df)

    for frame in frames:
        display(get_rows_with_nan(frame, 'FireplaceQu', 3000).head(3))

    for frame in frames:
        display(get_rows_with_nan(frame, 'FireplaceQu', 3000).shape)

    for frame in frames:
        display(frame['FireplaceQu'].unique())

    for frame in frames:
        display_col_categorical_sns_countplot(frame, 'FireplaceQu')

    # NaN quality combined with a non-zero fireplace count would contradict "no fireplace"
    train_fireplaces_nan = get_rows_with_nan(train_nonan_df, 'FireplaceQu', 3000)
    display(train_fireplaces_nan[train_fireplaces_nan['Fireplaces'] != 0])  # empty df

    test_fireplaces_nan = get_rows_with_nan(test_nonan_df, 'FireplaceQu', 3000)
    display(test_fireplaces_nan[test_fireplaces_nan['Fireplaces'] != 0])  # empty df

    # Plot the column this cell is about (this line previously plotted 'SaleType').
    # NaN is replaced by the placeholder 'dbg' so missing entries are plotted as their own value.
    display_colx_coly_scatter(train_nonan_df.fillna('dbg'), 'FireplaceQu', 'SalePrice')

def _local_fix():
    """Fill missing FireplaceQu entries with the literal 'None' category in both sets."""
    # Rationale from the exploration step: every NaN row has Fireplaces == 0, i.e. no fireplace
    for frame in (train_nonan_df, test_nonan_df):
        frame['FireplaceQu'] = frame['FireplaceQu'].fillna('None')

def _local_check():
    """Display get_rows_with_nan() for FireplaceQu in both sets (presumably empty after the fix)."""
    for frame in (train_nonan_df, test_nonan_df):
        display(get_rows_with_nan(frame, 'FireplaceQu', 3000))

# Explore (uncomment to inspect the column before it is fixed)
# _local_disp()

# Fix
_local_fix()

# Check (uncomment to verify the result of the fix)
# _local_check()

In [34]:
# Other features with NaN values: Electrical, KitchenQual, Functional

# Electrical: train_nonan_df, row id=1380
# KitchenQual: test_nonan_df, row_id=1556
# Functional: test_nonan_df, row_indices=2217,2474.

def _local_disp():
    """Display get_rows_with_nan() for Electrical, KitchenQual and Functional (train, then test)."""
    for col_name in ('Electrical', 'KitchenQual', 'Functional'):
        for frame in (train_nonan_df, test_nonan_df):
            display(get_rows_with_nan(frame, col_name, 3000))
    
def _local_fix():
    """Fill the few missing Electrical (train), KitchenQual (test) and Functional (test) values."""
    # Electrical: most frequent value of the train column
    electrical_mode = train_nonan_df['Electrical'].mode()[0]
    train_nonan_df['Electrical'] = train_nonan_df['Electrical'].fillna(electrical_mode)

    # KitchenQual: most frequent value of the test column
    kitchenqual_mode = test_nonan_df['KitchenQual'].mode()[0]
    test_nonan_df['KitchenQual'] = test_nonan_df['KitchenQual'].fillna(kitchenqual_mode)

    # Functional: the data docs say "Assume typical unless deductions are warranted"
    test_nonan_df['Functional'] = test_nonan_df['Functional'].fillna('Typ')

def _local_check():
    """Display get_rows_with_nan() for the three fixed columns (presumably empty after the fix)."""
    for col_name in ('Electrical', 'KitchenQual', 'Functional'):
        for frame in (train_nonan_df, test_nonan_df):
            display(get_rows_with_nan(frame, col_name, 3000))

# Explore (uncomment to inspect the columns before they are fixed)
# _local_disp()

# Fix
_local_fix()

# Check (uncomment to verify the result of the fix)
# _local_check()

In [35]:
# GarageType

def _local_disp():
    """Explore GarageType: NaN rows and their shapes, distinct values, count plots, price scatter."""
    frames = (train_nonan_df, test_nonan_df)

    for frame in frames:
        display(get_rows_with_nan(frame, 'GarageType', 3000))

    for frame in frames:
        display(get_rows_with_nan(frame, 'GarageType', 3000).shape)

    for frame in frames:
        display(frame['GarageType'].unique())

    for frame in frames:
        display_col_categorical_sns_countplot(frame, 'GarageType')

    # NaN is replaced by the placeholder 'dbg' so missing entries are plotted as their own value
    train_dbg_df = train_nonan_df.fillna('dbg')
    display_colx_coly_scatter(train_dbg_df, 'GarageType', 'SalePrice')

def _local_fix():
    """Fill missing GarageType entries with the 'NoGarage' category in both sets."""
    # Rationale from the original notes: all of these rows have GarageArea == 0.0
    for frame in (train_nonan_df, test_nonan_df):
        frame['GarageType'] = frame['GarageType'].fillna('NoGarage')

def _local_check():
    """Display get_rows_with_nan() for GarageType in both sets (presumably empty after the fix)."""
    for frame in (train_nonan_df, test_nonan_df):
        display(get_rows_with_nan(frame, 'GarageType', 3000))

# Explore (uncomment to inspect the column before it is fixed)
# _local_disp()

# Fix
_local_fix()

# Check (uncomment to verify the result of the fix)
# _local_check()

In [36]:
# GarageYrBlt

def _local_disp():
    """Explore GarageYrBlt: NaN-row shapes, detached-garage NaN rows, value range, histograms, scatter."""
    frames = (train_nonan_df, test_nonan_df)

    for frame in frames:
        display(get_rows_with_nan(frame, 'GarageYrBlt', 3000).shape)

    # NaN rows that still record a detached garage.
    # Original notes: none in train; two in test, noted as indices 2127 and 2577.
    for frame in frames:
        nan_rows = get_rows_with_nan(frame, 'GarageYrBlt', 3000)
        display(nan_rows[nan_rows['GarageType'] == 'Detchd'])

    for frame in frames:
        display(frame['GarageYrBlt'].min(), frame['GarageYrBlt'].max())

    for frame in frames:
        display_hist(frame, 'GarageYrBlt', n_bins=100)

    display_colx_coly_scatter(train_nonan_df, 'GarageYrBlt', 'SalePrice')
    
def _local_fix():
    """Fill missing GarageYrBlt values in both sets.

    Two test rows (labels 666 and 1116) get the median GarageYrBlt of the test rows with
    GarageType == 'Detchd'. Every remaining NaN is replaced by the sentinel year 1500.
    """
    # The original notes call these rows 2127 and 2577 - presumably their Id values; verify.
    detached_rows = test_nonan_df.groupby('GarageType').get_group('Detchd')
    detached_median_year = detached_rows['GarageYrBlt'].median()
    for row_label in (666, 1116):
        test_nonan_df.loc[row_label, 'GarageYrBlt'] = detached_median_year

    # All other NaN rows are taken to mean "no garage". The column is numeric, so a
    # deliberately implausible "magic year" is used; the feature is cut into categories later.
    for frame in (train_nonan_df, test_nonan_df):
        frame['GarageYrBlt'] = frame['GarageYrBlt'].fillna(1500)

def _local_check():
    """Display get_rows_with_nan() for GarageYrBlt in both sets (presumably empty after the fix)."""
    for frame in (train_nonan_df, test_nonan_df):
        display(get_rows_with_nan(frame, 'GarageYrBlt', 3000))

# Explore (uncomment to inspect the column before it is fixed)
# _local_disp()

# Fix
_local_fix()

# Check (uncomment to verify the result of the fix)
# _local_check()

In [37]:
# GarageFinish

def _local_disp():
    """Explore GarageFinish: NaN rows, distinct values, detached-garage NaN rows, plots."""
    frames = (train_nonan_df, test_nonan_df)

    for frame in frames:
        display(get_rows_with_nan(frame, 'GarageFinish', 3000))

    for frame in frames:
        display(get_rows_with_nan(frame, 'GarageFinish', 3000).shape)

    for frame in frames:
        display(frame['GarageFinish'].unique())

    # NaN rows that still record a detached garage.
    # Original notes: none in train; two in test, noted as indices 2127 and 2577.
    for frame in frames:
        nan_rows = get_rows_with_nan(frame, 'GarageFinish', 3000)
        display(nan_rows[nan_rows['GarageType'] == 'Detchd'])

    for frame in frames:
        display_col_categorical_sns_countplot(frame, 'GarageFinish')

    # NaN is replaced by the placeholder 'dbg' so missing entries are plotted as their own value
    train_dbg_df = train_nonan_df.fillna('dbg')
    display_colx_coly_scatter(train_dbg_df, 'GarageFinish', 'SalePrice')

def _local_fix():
    """Fill missing GarageFinish values in both sets.

    Two test rows (labels 666 and 1116) get the most frequent GarageFinish among test rows
    with GarageType == 'Detchd'. Every remaining NaN becomes the 'NoGarage' category.
    """
    # The original notes call these rows 2127 and 2577 - presumably their Id values; verify.
    detached_rows = test_nonan_df.groupby('GarageType').get_group('Detchd')
    detached_finish_mode = detached_rows['GarageFinish'].mode()[0]
    for row_label in (666, 1116):
        test_nonan_df.loc[row_label, 'GarageFinish'] = detached_finish_mode

    # Remaining NaN rows are taken to mean "no garage"; the original notes add that
    # GarageArea == 0 in these rows.
    for frame in (train_nonan_df, test_nonan_df):
        frame['GarageFinish'] = frame['GarageFinish'].fillna('NoGarage')

def _local_check():
    """Display get_rows_with_nan() for GarageFinish in both sets (presumably empty after the fix)."""
    for frame in (train_nonan_df, test_nonan_df):
        display(get_rows_with_nan(frame, 'GarageFinish', 3000))

# Explore (uncomment to inspect the column before it is fixed)
# _local_disp()

# Fix
_local_fix()

# Check (uncomment to verify the result of the fix)
# _local_check()

In [38]:
# GarageCars and GarageArea

# 1 row: id=2577

def _local_disp():
    """Explore GarageCars and GarageArea: NaN rows, distributions and price scatters."""
    frames = (train_nonan_df, test_nonan_df)

    # --- GarageCars ---
    for frame in frames:
        display(get_rows_with_nan(frame, 'GarageCars', 3000))

    for frame in frames:
        display(get_rows_with_nan(frame, 'GarageCars', 3000).shape)

    for frame in frames:
        display(frame['GarageCars'].unique())

    for frame in frames:
        display_col_categorical_sns_countplot(frame, 'GarageCars')

    # NaN is replaced by the placeholder 'dbg' before plotting
    display_colx_coly_scatter(train_nonan_df.fillna('dbg'), 'GarageCars', 'SalePrice')

    # --- GarageArea ---
    for frame in frames:
        display(get_rows_with_nan(frame, 'GarageArea', 3000))

    for frame in frames:
        display(get_rows_with_nan(frame, 'GarageArea', 3000).shape)

    for frame in frames:
        display_hist(frame, 'GarageArea', n_bins=100)

    display_colx_coly_scatter(train_nonan_df.fillna('dbg'), 'GarageArea', 'SalePrice', color='GarageCars')

def _local_fix():
    """Fill GarageCars and GarageArea for test row 1116 from the detached-garage test rows."""
    # The original notes call this row 2577 - presumably its Id; verify.

    # GarageCars: median over test rows with GarageType == 'Detchd'
    detached_cars_median = test_nonan_df.groupby('GarageType').get_group('Detchd')['GarageCars'].median()
    test_nonan_df.loc[1116, 'GarageCars'] = detached_cars_median

    # GarageArea: mean over test rows with GarageType == 'Detchd'
    detached_area_mean = test_nonan_df.groupby('GarageType').get_group('Detchd')['GarageArea'].mean()
    test_nonan_df.loc[1116, 'GarageArea'] = detached_area_mean

def _local_check():
    """Display get_rows_with_nan() for GarageCars and GarageArea (presumably empty after the fix)."""
    for col_name in ('GarageCars', 'GarageArea'):
        for frame in (train_nonan_df, test_nonan_df):
            display(get_rows_with_nan(frame, col_name, 3000))

# Explore (uncomment to inspect the columns before they are fixed)
# _local_disp()

# Fix
_local_fix()

# Check (uncomment to verify the result of the fix)
# _local_check()

In [39]:
# GarageQual

def _local_disp():
    """Explore GarageQual: NaN-row shapes, distinct values, detached-garage NaN rows, plots."""
    frames = (train_nonan_df, test_nonan_df)

    for frame in frames:
        display(get_rows_with_nan(frame, 'GarageQual', 3000).shape)

    for frame in frames:
        display(frame['GarageQual'].unique())

    # NaN rows that still record a detached garage.
    # Original notes: none in train; two in test, noted as indices 2127 and 2577.
    for frame in frames:
        nan_rows = get_rows_with_nan(frame, 'GarageQual', 3000)
        display(nan_rows[nan_rows['GarageType'] == 'Detchd'])

    for frame in frames:
        display_col_categorical_sns_countplot(frame, 'GarageQual')

    # NaN is replaced by the placeholder 'dbg' so missing entries are plotted as their own value
    train_dbg_df = train_nonan_df.fillna('dbg')
    display_colx_coly_scatter(train_dbg_df, 'GarageQual', 'SalePrice')

def _local_fix():
    """Fill missing GarageQual values in both sets.

    Two test rows (labels 666 and 1116) get the most frequent GarageQual among test rows
    with GarageType == 'Detchd'. Every remaining NaN becomes the 'NoGarage' category.
    """
    # The original notes call these rows 2127 and 2577 - presumably their Id values; verify.
    detached_rows = test_nonan_df.groupby('GarageType').get_group('Detchd')
    detached_qual_mode = detached_rows['GarageQual'].mode()[0]
    for row_label in (666, 1116):
        test_nonan_df.loc[row_label, 'GarageQual'] = detached_qual_mode

    # Remaining NaN rows are taken to mean "no garage at all"
    for frame in (train_nonan_df, test_nonan_df):
        frame['GarageQual'] = frame['GarageQual'].fillna('NoGarage')

def _local_check():
    """Display get_rows_with_nan() for GarageQual in both sets (presumably empty after the fix)."""
    for frame in (train_nonan_df, test_nonan_df):
        display(get_rows_with_nan(frame, 'GarageQual', 3000))

# Explore (uncomment to inspect the column before it is fixed)
# _local_disp()

# Fix
_local_fix()

# Check (uncomment to verify the result of the fix)
# _local_check()

In [40]:
# GarageCond

def _local_disp():
    """Explore GarageCond: NaN-row shapes, distinct values, detached-garage NaN rows, plots."""
    frames = (train_nonan_df, test_nonan_df)

    for frame in frames:
        display(get_rows_with_nan(frame, 'GarageCond', 3000).shape)

    for frame in frames:
        display(frame['GarageCond'].unique())

    # NaN rows that still record a detached garage.
    # Original notes: none in train; two in test, noted as indices 2127 and 2577.
    for frame in frames:
        nan_rows = get_rows_with_nan(frame, 'GarageCond', 3000)
        display(nan_rows[nan_rows['GarageType'] == 'Detchd'])

    for frame in frames:
        display_col_categorical_sns_countplot(frame, 'GarageCond')

    # NaN is replaced by the placeholder 'dbg' so missing entries are plotted as their own value
    train_dbg_df = train_nonan_df.fillna('dbg')
    display_colx_coly_scatter(train_dbg_df, 'GarageCond', 'SalePrice')

def _local_fix():
    """Fill missing GarageCond values in both sets.

    Two test rows (labels 666 and 1116) get the most frequent GarageCond among test rows
    with GarageType == 'Detchd'. Every remaining NaN becomes the 'NoGarage' category.
    """
    # The original notes call these rows 2127 and 2577 - presumably their Id values; verify.
    detached_rows = test_nonan_df.groupby('GarageType').get_group('Detchd')
    detached_cond_mode = detached_rows['GarageCond'].mode()[0]
    for row_label in (666, 1116):
        test_nonan_df.loc[row_label, 'GarageCond'] = detached_cond_mode

    # Remaining NaN rows are taken to mean "no garage at all"
    for frame in (train_nonan_df, test_nonan_df):
        frame['GarageCond'] = frame['GarageCond'].fillna('NoGarage')

def _local_check():
    """Display get_rows_with_nan() for GarageCond in both sets (presumably empty after the fix)."""
    for frame in (train_nonan_df, test_nonan_df):
        display(get_rows_with_nan(frame, 'GarageCond', 3000))

# Explore (uncomment to inspect the column before it is fixed)
# _local_disp()

# Fix
_local_fix()

# Check (uncomment to verify the result of the fix)
# _local_check()

In [41]:
# Check again fixed NaN values

# Both sets are expected to report no remaining NaN values at this point
for nonan_frame in (train_nonan_df, test_nonan_df):
    display_nan_values(nonan_frame)

Series([], dtype: int64)

Series([], dtype: int64)

In [42]:
# Functions to fix column dtypes

# Divide continuous data into n_bins bins.
def continuous_to_bins_inplace(dataset_df, col_name, n_bins):
    """Replace a numeric column, in place, by string labels of quantile-based bins.

    Bin edges come from ``pd.qcut``. The outermost edges are widened to ``int(first) - 1``
    and ``int(last) + 2`` so that every value falls into a half-open ``[start, end)``
    interval. Each value is replaced by the label ``"<start>_<end>"`` of its interval.
    NaN values match no interval and are left as they are.

    Parameters
    ----------
    dataset_df : pandas.DataFrame
        Frame to modify; ``dataset_df[col_name]`` has object dtype afterwards.
    col_name : str
        Name of the numeric column to bin.
    n_bins : int
        Requested number of quantile bins. Fewer bins are produced when several
        quantiles coincide (e.g. a column dominated by zeros).
    """
    original_values = dataset_df[col_name].copy()

    # duplicates='drop': without it qcut raises "Bin edges must be unique" as soon as
    # two quantiles are equal, which happens for columns with many repeated values.
    bin_edges = pd.qcut(original_values, n_bins, retbins=True, duplicates='drop')[1]
    bin_edges[0] = int(bin_edges[0]) - 1
    bin_edges[-1] = int(bin_edges[-1]) + 2

    # Labels are collected in an object-dtype copy and written back in one assignment.
    # Writing strings into the numeric column piecewise relies on implicit upcasting,
    # which pandas has deprecated.
    labels = original_values.astype(object)
    for range_start, range_end in zip(bin_edges[:-1], bin_edges[1:]):
        in_range = (original_values >= range_start) & (original_values < range_end)
        labels[in_range] = "{0}_{1}".format(range_start, range_end)

    dataset_df[col_name] = labels

In [43]:
# Intermediate arrays

# Independent copies, so the dtype fixes below leave the NaN-free frames untouched
train_fixdtypes_df, test_fixdtypes_df = train_nonan_df.copy(), test_nonan_df.copy()

In [44]:
# display_dataset_col_dtypes(train_nonan_df)

In [45]:
# Fix several features from numerical to categorical dtype

# Numeric columns that are treated as categories rather than quantities
num2cat_col_names = [
    'MSSubClass',
    'OverallQual', 'OverallCond',
    'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
    'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
    'Fireplaces',
    'GarageCars',
    'MoSold', 'YrSold',
]

# NOTE(review): astype(str) renders an int column as '2' but a float column as '2.0'.
# If a column has different numeric dtypes in train and test, the same category gets
# different labels in the two sets - verify the dtypes match.
for col_name in num2cat_col_names:
    for fixdtypes_frame in (train_fixdtypes_df, test_fixdtypes_df):
        fixdtypes_frame[col_name] = fixdtypes_frame[col_name].astype(str)
    
# Cut the year features into 4 quantile bins each.
# NOTE(review): train and test are binned independently, so their bin edges (and therefore
# their labels) are not guaranteed to coincide - confirm this is intended.
for year_col_name in ('YearRemodAdd', 'YearBuilt'):
    for fixdtypes_frame in (train_fixdtypes_df, test_fixdtypes_df):
        continuous_to_bins_inplace(fixdtypes_frame, year_col_name, 4)

# GarageYrBlt
def _ugly_fix_garageyrblt_categories_inplace(dataset_df):
    garageyrblt_cpy = dataset_df['GarageYrBlt']
    dataset_df.loc[(garageyrblt_cpy == 1500), 'GarageYrBlt'] = "0"
    dataset_df.loc[(garageyrblt_cpy >= 1899.0 - 1) & (garageyrblt_cpy < 1961.0), 'GarageYrBlt'] = "1"
    dataset_df.loc[(garageyrblt_cpy >= 1961.0 - 1) & (garageyrblt_cpy < 1980.0), 'GarageYrBlt'] = "2"
    dataset_df.loc[(garageyrblt_cpy >= 1980.0 - 1) & (garageyrblt_cpy < 2002.0), 'GarageYrBlt'] = "3"
    dataset_df.loc[(garageyrblt_cpy >= 2002.0 - 1) & (garageyrblt_cpy < 2012.0 + 2), 'GarageYrBlt'] = "4"
# Apply the same hard-coded GarageYrBlt categories to both sets
for fixdtypes_frame in (train_fixdtypes_df, test_fixdtypes_df):
    _ugly_fix_garageyrblt_categories_inplace(fixdtypes_frame)

In [46]:
# display_dataset_col_dtypes(train_fixdtypes_df)

In [None]:
# Functions to try out models



In [None]:
# Try out models for data "without nan; fixed dtypes"

In [None]:
# Divide concat_train_test_df into train and test parts

# The first train_df.shape[0] rows of the concatenated frame are the train part, the rest is test.
# errors='ignore': 'Id' is already removed from concat_train_test_df right after it is built,
# so dropping it again unconditionally raises KeyError.
n_train_rows = train_df.shape[0]

X_train_full = concat_train_test_df[:n_train_rows].drop('Id', axis=1, errors='ignore')

X_train = X_train_full.drop('SalePrice', axis=1)
y_train = X_train_full['SalePrice']

X_test = concat_train_test_df[n_train_rows:].drop(['SalePrice', 'Id'], axis=1, errors='ignore')

In [None]:
# Encode categorical values with LabelEncoder

def encode_column_inplace(dataset_df, col_name):
    """Overwrite ``dataset_df[col_name]`` with integer codes from a freshly fitted LabelEncoder.

    The encoder is fitted on this column of this frame only, so codes produced for two
    different frames are not comparable with each other.
    """
    raw_values = dataset_df[col_name].values
    dataset_df[col_name] = LabelEncoder().fit_transform(raw_values)
    
categorical_columns = concat_train_test_df.select_dtypes(include='object').columns.values

# One encoder per column, fitted on the union of train and test values, so the same
# category gets the same integer code in both sets. Fitting separate encoders per set
# would number the categories independently and make the codes inconsistent.
for col_name in categorical_columns:
    shared_encoder = LabelEncoder()
    shared_encoder.fit(pd.concat([X_train[col_name], X_test[col_name]]).values)
    X_train[col_name] = shared_encoder.transform(X_train[col_name].values)
    X_test[col_name] = shared_encoder.transform(X_test[col_name].values)

In [None]:
# Try out different regressors for raw data (fixed np.nan and encoded labels)

import warnings
warnings.filterwarnings('ignore')

n_folds = KFold(n_splits=10)

models = [
    LassoCV(), RidgeCV(), LinearRegression(), ElasticNetCV(cv=n_folds), BayesianRidge(),
    KNeighborsRegressor(),
    RandomForestRegressor(), GradientBoostingRegressor(),
    XGBRegressor(objective='reg:squarederror'),
    SVR(kernel='rbf', gamma=0.1)
]

print("score by cross-validation, k=10, train:")
for model in models:
    print(cross_val_score(model, X_train, y_train, cv=n_folds).mean())

pre_X_train, pre_X_test, pre_y_train, pre_y_test = train_test_split(
    X_train, y_train, random_state=42
)

# Fit every model once on the hold-out split and keep its predictions. Both metrics
# below then describe the same fitted model (the tree ensembles are not seeded, so a
# second fit would predict differently), and training time is not doubled.
pre_y_preds = []
for model in models:
    model.fit(pre_X_train, pre_y_train)
    pre_y_preds.append(model.predict(pre_X_test))

print("r2:")
for pre_y_pred in pre_y_preds:
    print(r2_score(pre_y_test, pre_y_pred))

# NOTE(review): mean_squared_log_error rejects negative values; an unconstrained linear
# model could in principle predict a negative price - verify on this data.
print("rmsle:")
for pre_y_pred in pre_y_preds:
    print(np.sqrt(mean_squared_log_error(pre_y_test, pre_y_pred)))

In [None]:
# Create pairplot for continuous data

# Names of all numeric columns of the concatenated frame
continous_columns = concat_train_test_df.select_dtypes(include=np.number).columns.values
display(continous_columns)

# sns.pairplot(concat_train_test_df.loc[:, continous_columns])

In [None]:
# Removing outliers

# Use the saved sns.pairplot image to see how SalePrice depends on each continuous feature

# Work on a copy of the train features with the target attached as a column
X_train_full_nooutliers = X_train.assign(SalePrice=y_train)

display("Before removing outliers, shape", X_train_full_nooutliers.shape)


def _drop_matching_rows(frame, is_outlier):
    """Return `frame` without the rows selected by the boolean mask `is_outlier(frame)`."""
    return frame.drop(frame[is_outlier(frame)].index)


# Hand-picked thresholds; each rule is evaluated on the frame left by the previous rule
_rules_before_bsmtfinsf2 = [
    lambda df: (df['GrLivArea'] > 4000) & (df['SalePrice'] < 200000),
    lambda df: df['LotFrontage'] > 250,
    lambda df: df['LotArea'] > 50000,
    lambda df: df['MasVnrArea'] > 1200,
]
for _rule in _rules_before_bsmtfinsf2:
    X_train_full_nooutliers = _drop_matching_rows(X_train_full_nooutliers, _rule)

# BsmtFinSF2 is plotted before and after its own two rules
display_col_saleprice_scplot(X_train_full_nooutliers, 'BsmtFinSF2')
_rules_bsmtfinsf2 = [
    lambda df: df['BsmtFinSF2'] > 1400,
    lambda df: (df['BsmtFinSF2'] > 150) & (df['BsmtFinSF2'] < 600) & (df['SalePrice'] > 350000),
]
for _rule in _rules_bsmtfinsf2:
    X_train_full_nooutliers = _drop_matching_rows(X_train_full_nooutliers, _rule)
display_col_saleprice_scplot(X_train_full_nooutliers, 'BsmtFinSF2')

_rules_after_bsmtfinsf2 = [
    lambda df: df['TotalBsmtSF'] > 3000,
    lambda df: (df['GarageArea'] > 1200) & (df['SalePrice'] < 300000),
    lambda df: df['OpenPorchSF'] > 400,
    lambda df: df['EnclosedPorch'] > 400,
]
for _rule in _rules_after_bsmtfinsf2:
    X_train_full_nooutliers = _drop_matching_rows(X_train_full_nooutliers, _rule)

display("After removing outliers, shape", X_train_full_nooutliers.shape)

In [None]:
# Try same initial models with removed outliers

X_train_nooutliers = X_train_full_nooutliers.drop('SalePrice', axis=1)
y_train_nooutliers = X_train_full_nooutliers['SalePrice']

import warnings
warnings.filterwarnings('ignore')

n_folds = KFold(n_splits=10)

models = [
    LassoCV(), RidgeCV(), LinearRegression(), ElasticNetCV(cv=n_folds), BayesianRidge(),
    KNeighborsRegressor(),
    RandomForestRegressor(), GradientBoostingRegressor(),
    XGBRegressor(objective='reg:squarederror'),
    SVR(kernel='rbf', gamma=0.1)
]

print("score by cross-validation, k=10, train:")
for model in models:
    print(cross_val_score(model, X_train_nooutliers, y_train_nooutliers, cv=n_folds).mean())

outl_X_train, outl_X_test, outl_y_train, outl_y_test = train_test_split(
    X_train_nooutliers, y_train_nooutliers, random_state=42
)

# Fit every model once on the hold-out split and keep its predictions. Both metrics
# below then describe the same fitted model (the tree ensembles are not seeded, so a
# second fit would predict differently), and training time is not doubled.
outl_y_preds = []
for model in models:
    model.fit(outl_X_train, outl_y_train)
    outl_y_preds.append(model.predict(outl_X_test))

print("r2:")
for outl_y_pred in outl_y_preds:
    print(r2_score(outl_y_test, outl_y_pred))

# NOTE(review): mean_squared_log_error rejects negative values; an unconstrained linear
# model could in principle predict a negative price - verify on this data.
print("rmsle:")
for outl_y_pred in outl_y_preds:
    print(np.sqrt(mean_squared_log_error(outl_y_test, outl_y_pred)))

In [None]:
# Skew continuous data to normal distribution

def display_df_numerical_before_after_log(dataset_df, col_name):
    """Plot a column before and after np.log in one row of four panels.

    Panels, left to right: distribution of the raw values, distribution of the logged
    values, probability plot of the raw values, probability plot of the logged values.
    """
    raw_values = dataset_df[col_name]
    log_values = np.log(raw_values)

    fig, axes = plt.subplots(1, 4, figsize=(15, 5))
    sns.distplot(raw_values, ax=axes[0])
    sns.distplot(log_values, ax=axes[1])
    stats.probplot(raw_values, plot=axes[2])
    stats.probplot(log_values, plot=axes[3])
    plt.show()
    
# Target before/after log: distributions on the two left panels, probability plots on the right
fig, [ax_0, ax_1, ax_2, ax_3] = plt.subplots(1, 4, figsize=(15, 5))
for dist_ax, prob_ax, target_values in (
    (ax_0, ax_2, y_train_nooutliers),
    (ax_1, ax_3, np.log(y_train_nooutliers)),
):
    sns.distplot(target_values, ax=dist_ax)
    stats.probplot(target_values, plot=prob_ax)

plt.show()

# Features that get the same before/after inspection (and are log-transformed afterwards)
numerical_columns_to_skew = ['LotArea', '1stFlrSF', 'GrLivArea']

for col_name in numerical_columns_to_skew:
    display_df_numerical_before_after_log(X_train_full_nooutliers, col_name)

In [None]:
# Fix skewness

# Log-transform the selected features on copies of the train and test features
X_train_noskew = X_train_nooutliers.copy()
X_test_noskew = X_test.copy()
for noskew_frame in (X_train_noskew, X_test_noskew):
    for col_name in numerical_columns_to_skew:
        noskew_frame[col_name] = np.log(noskew_frame[col_name])

# The target is log-transformed as well, so later models predict log(SalePrice)
y_train_noskew = np.log(y_train_nooutliers)

In [None]:
# Explore skewed data

# Pairplots: before the log transform, then after
for skew_frame in (X_train_nooutliers, X_train_noskew):
    sns.pairplot(skew_frame.loc[:, numerical_columns_to_skew])
    plt.show()

# Pearson correlation heatmaps: before the log transform, then after
for skew_frame in (X_train_nooutliers, X_train_noskew):
    sns.heatmap(skew_frame.loc[:, numerical_columns_to_skew].corr(), annot=True)
    plt.show()

In [None]:
# Try initial models with fixed skewed data and removed outliers

import warnings
warnings.filterwarnings('ignore')

n_folds = KFold(n_splits=10)

models = [
    LassoCV(), RidgeCV(), LinearRegression(), ElasticNetCV(cv=n_folds), BayesianRidge(),
    KNeighborsRegressor(),
    RandomForestRegressor(), GradientBoostingRegressor(),
    XGBRegressor(objective='reg:squarederror'),
    SVR(kernel='rbf', gamma=0.1)
]

print("score by cross-validation, k=10, train:")
for model in models:
    print(cross_val_score(model, X_train_noskew, y_train_noskew, cv=n_folds).mean())

nosk_X_train, nosk_X_test, nosk_y_train, nosk_y_test = train_test_split(
    X_train_noskew, y_train_noskew, random_state=42
)

# Fit every model once on the hold-out split and keep its predictions. Both metrics
# below then describe the same fitted model (the tree ensembles are not seeded, so a
# second fit would predict differently), and training time is not doubled.
nosk_y_preds = []
for model in models:
    model.fit(nosk_X_train, nosk_y_train)
    nosk_y_preds.append(model.predict(nosk_X_test))

print("r2:")
for nosk_y_pred in nosk_y_preds:
    print(r2_score(nosk_y_test, nosk_y_pred))

# The target is already log(SalePrice), so the plain RMSE is the error on the log scale
print("rmsle:")
for nosk_y_pred in nosk_y_preds:
    print(np.sqrt(mean_squared_error(nosk_y_test, nosk_y_pred)))

In [None]:
# Feature engineering

# Train and test are stacked (train rows first, index renumbered) so that every engineered
# feature is derived identically for both parts.
concat_train_test_noskew_df = pd.concat([X_train_noskew, X_test_noskew], ignore_index=True, sort=False)

# Quantile-bin the area-like columns: (column, requested number of bins)
_binned_columns = [
    ('BsmtFinSF1', 10),
    ('BsmtUnfSF', 20),
    ('TotalBsmtSF', 20),
    ('1stFlrSF', 20),
    ('2ndFlrSF', 20),
    ('GrLivArea', 20),
    ('GarageArea', 10),
]
for _col_name, _n_bins in _binned_columns:
    continuous_to_bins_inplace(concat_train_test_noskew_df, _col_name, _n_bins)


def _swap_for_nonzero_flag(frame, source_col):
    """Return `frame` with `<source_col>_Flag` appended (0 where the value equals 0, else 1) and `source_col` dropped."""
    frame[source_col + '_Flag'] = frame[source_col].ne(0).astype('int64')
    return frame.drop(source_col, axis=1)


concat_train_test_noskew_df = _swap_for_nonzero_flag(concat_train_test_noskew_df, 'BsmtFinSF2')
concat_train_test_noskew_df = _swap_for_nonzero_flag(concat_train_test_noskew_df, 'LowQualFinSF')

# TotalBathrooms: plain sum of the four bathroom counters, which are then removed
bathroom_columns = ['HalfBath', 'FullBath', 'BsmtHalfBath', 'BsmtFullBath']
_total_bathrooms = concat_train_test_noskew_df[bathroom_columns[0]]
for _col_name in bathroom_columns[1:]:
    _total_bathrooms = _total_bathrooms + concat_train_test_noskew_df[_col_name]
concat_train_test_noskew_df['TotalBathrooms'] = _total_bathrooms
concat_train_test_noskew_df = concat_train_test_noskew_df.drop(bathroom_columns, axis=1)

concat_train_test_noskew_df = _swap_for_nonzero_flag(concat_train_test_noskew_df, 'MasVnrArea')

In [None]:
# Try initial models after feature engineering and fixing skewing steps

# Split the engineered frame back into its train and test parts.
# .copy(): the parts are modified below, and writing into a slice of the concatenated
# frame triggers SettingWithCopyWarning and can silently fail to stick.
n_feateng_train_rows = X_train_noskew.shape[0]
X_train_feateng = concat_train_test_noskew_df[:n_feateng_train_rows].copy()
y_train_feateng = y_train_noskew.copy()
X_test_feateng = concat_train_test_noskew_df[n_feateng_train_rows:].copy()

# Prepare test sets for noskew and featureeng concat dataset
noskew_new_categorical_columns = concat_train_test_noskew_df.select_dtypes(include='object').columns.values

# One encoder per column, fitted on the full (train + test) column, so a given bin label
# maps to the same integer in both parts. Separate encoders per part would number the
# labels independently and make the codes inconsistent.
for col_name in noskew_new_categorical_columns:
    shared_encoder = LabelEncoder()
    shared_encoder.fit(concat_train_test_noskew_df[col_name].values)
    X_train_feateng[col_name] = shared_encoder.transform(X_train_feateng[col_name].values)
    X_test_feateng[col_name] = shared_encoder.transform(X_test_feateng[col_name].values)

import warnings
warnings.filterwarnings('ignore')

n_folds = KFold(n_splits=10)

models = [
    LassoCV(), RidgeCV(), LinearRegression(), ElasticNetCV(cv=n_folds), BayesianRidge(),
    KNeighborsRegressor(),
    RandomForestRegressor(), GradientBoostingRegressor(),
    XGBRegressor(objective='reg:squarederror'),
    SVR(kernel='rbf', gamma=0.1)
]

print("score by cross-validation, k=10, train:")
for model in models:
    print(cross_val_score(model, X_train_feateng, y_train_feateng, cv=n_folds).mean())

feateng_X_train, feateng_X_test, feateng_y_train, feateng_y_test = train_test_split(
    X_train_feateng, y_train_feateng, random_state=42
)

# Fit every model once on the hold-out split and keep its predictions. Both metrics
# below then describe the same fitted model (the tree ensembles are not seeded, so a
# second fit would predict differently), and training time is not doubled.
feateng_y_preds = []
for model in models:
    model.fit(feateng_X_train, feateng_y_train)
    feateng_y_preds.append(model.predict(feateng_X_test))

print("r2:")
for feateng_y_pred in feateng_y_preds:
    print(r2_score(feateng_y_test, feateng_y_pred))

# The target is already log(SalePrice), so the plain RMSE is the error on the log scale
print("rmsle:")
for feateng_y_pred in feateng_y_preds:
    print(np.sqrt(mean_squared_error(feateng_y_test, feateng_y_pred)))

In [None]:
# Correlation analysis

X_train_feateng_full = X_train_feateng.copy()

# Attach the target POSITIONALLY (.values). X_train_feateng carries the renumbered index
# of the concatenated frame, while y_train_feateng still carries the original train index
# with gaps where outliers were dropped. Assigning the Series directly would align on
# index labels and pair rows with the wrong (or missing) prices.
assert len(y_train_feateng) == len(X_train_feateng_full)
X_train_feateng_full['SalePrice'] = y_train_feateng.values

# Pearson correlation for numerical data
display(X_train_feateng_full.corr().abs().loc[:, 'SalePrice'].sort_values())

# features_to_remove = X_train_feateng_full.corr().abs().loc[:, 'SalePrice'].sort_values()[:20].index
# display(features_to_remove)

In [None]:
# Feature importances: RandomForestRegressor

# Fit a default random forest on the engineered features and rank the features by importance
model_rfr = RandomForestRegressor()
model_rfr.fit(X_train_feateng, y_train_feateng)

display(model_rfr.feature_importances_)

# One row per feature, most important first, indexed by feature name
importances_df = (
    pd.DataFrame({
        'importance': model_rfr.feature_importances_,
        'feature': X_train_feateng.columns,
    })
    .sort_values(by='importance', ascending=False)
    .set_index('feature', drop=True)
)
importances_df.plot.barh()
plt.show()

# Same plot restricted to the 20 most important features
importances_df_top = importances_df.head(20)
importances_df_top.plot.barh()
plt.show()

In [None]:
# Feature importances: XGBRegressor

# Fit a default XGBoost regressor on the engineered features and rank the features by importance
model_xgbr = XGBRegressor()
model_xgbr.fit(X_train_feateng, y_train_feateng)

# One row per feature, most important first, indexed by feature name
importances_df = (
    pd.DataFrame({
        'importance': model_xgbr.feature_importances_,
        'feature': X_train_feateng.columns,
    })
    .sort_values(by='importance', ascending=False)
    .set_index('feature', drop=True)
)
importances_df.plot.barh()
plt.show()

# Same plot restricted to the 20 most important features
importances_df_top = importances_df.head(20)
importances_df_top.plot.barh()
plt.show()

display(importances_df_top.index.values)

In [None]:
# Try removing features with default models

# Keep the 50 top-ranked features of `importances_df` and drop the rest.
# NOTE(review): `importances_df` is whatever the most recently run importance cell
# produced - confirm which model's ranking is meant here.
features_to_drop = importances_df.iloc[50:, :].index.values

X_train_remfeat = X_train_feateng.drop(features_to_drop, axis=1)
X_test_remfeat = X_test_feateng.drop(features_to_drop, axis=1)
y_train_remfeat = y_train_feateng.copy()

import warnings
warnings.filterwarnings('ignore')

n_folds = KFold(n_splits=10)

models = [
    LassoCV(), RidgeCV(), LinearRegression(), ElasticNetCV(cv=n_folds), BayesianRidge(),
    KNeighborsRegressor(),
    RandomForestRegressor(), GradientBoostingRegressor(),
    XGBRegressor(objective='reg:squarederror'),
    SVR(kernel='rbf', gamma=0.1)
]

print("score by cross-validation, k=10, train:")
for model in models:
    print(cross_val_score(model, X_train_remfeat, y_train_remfeat, cv=n_folds).mean())

remfeat_X_train, remfeat_X_test, remfeat_y_train, remfeat_y_test = train_test_split(
    X_train_remfeat, y_train_remfeat, random_state=42
)

# Fit every model once on the hold-out split and keep its predictions. Both metrics
# below then describe the same fitted model (the tree ensembles are not seeded, so a
# second fit would predict differently), and training time is not doubled.
remfeat_y_preds = []
for model in models:
    model.fit(remfeat_X_train, remfeat_y_train)
    remfeat_y_preds.append(model.predict(remfeat_X_test))

print("r2:")
for remfeat_y_pred in remfeat_y_preds:
    print(r2_score(remfeat_y_test, remfeat_y_pred))

# The target is already log(SalePrice), so the plain RMSE is the error on the log scale
print("rmsle:")
for remfeat_y_pred in remfeat_y_preds:
    print(np.sqrt(mean_squared_error(remfeat_y_test, remfeat_y_pred)))

In [None]:
# Explore final datasets again

# Work on copies of BOTH frames. The helpers below are named *_inplace and
# presumably mutate the frame they receive (TODO confirm against their
# definitions); an exploratory cell must not alter X_train_feateng /
# X_test_feateng, which other cells read, and must give the same result when
# it is re-run.
X_train_feateng_cpy = X_train_feateng.copy()
X_test_feateng_cpy = X_test_feateng.copy()

# NOTE(review): only the train copy is binned before encoding; the test copy
# is encoded without binning - confirm this asymmetry is intended.
continuous_to_bins_inplace(X_train_feateng_cpy, 'LotArea', 20)

encode_column_inplace(X_train_feateng_cpy, 'LotArea', 10)
encode_column_inplace(X_test_feateng_cpy, 'LotArea', 10)

display(X_test_feateng_cpy.head(10))

# _tmp_X_train, _tmp_X_test, _tmp_y_train, _tmp_y_test = train_test_split(
#     X_train_feateng_cpy, y_train_feateng, random_state=42
# )

# model = GradientBoostingRegressor(
#     n_estimators=2000, max_features='sqrt', max_depth=100, min_samples_leaf = 4
# )
# _tmp_y_pred = model.predict(X_test_feateng)

# print("cross validation k=10")
# print(cross_val_score(model, X_train_feateng_cpy, y_train_feateng, cv=n_folds).mean())

# print("r2:")


In [None]:
# Hyperparameters search : without features removal

# Candidate estimators. Only RandomForestRegressor is actually searched below.
models = [
    LassoCV(), RidgeCV(), LinearRegression(), ElasticNetCV(cv=n_folds), BayesianRidge(),
    KNeighborsRegressor(), 
    RandomForestRegressor(), GradientBoostingRegressor(),
    XGBRegressor(objective='reg:squarederror'),
    SVR(kernel='rbf', gamma=0.1)
]

# Parameter grids. The list is NOT aligned index-by-index with `models`
# (four grids versus ten estimators); only the last grid, index 3, is used
# by the search in this cell.
models_params = [
    {
#         'alphas': [0.001, 0.01, 0.02, 0.025, 0.05, 0.1, 0.5, 1, 5],
        'n_alphas': [100, 1000, 10000],
        'eps': [1e-6, 1e-5, 1e-4, 1e-3, 1, 10],
        'max_iter': [1000, 10000, 50000]
    },  # {'eps': 1e-05, 'max_iter': 1000, 'n_alphas': 1000}
    {
        'alpha':[200, 230, 250,265, 270, 275, 290, 300, 500]
    },
    {
        'n_neighbors': [3, 5, 10, 20],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    },
    {
        # RandomForestRegressor grid: 2 * 7 * 11 * 2 * 3 * 4 = 3696 parameter
        # combinations, each fitted once per CV split in `n_folds`.
        'bootstrap': [True, False],
        'n_estimators': [10, 100, 500, 700, 1000, 1500, 2000],
        'max_depth': [None, 1, 3, 5, 10, 20, 30, 40, 50, 75, 100],
        # 1.0 (use all features) replaces the former 'auto', which meant the
        # same thing for regressors but was deprecated in scikit-learn 1.1 and
        # removed in 1.3, where it makes every affected fit fail.
        'max_features': [1.0, 'sqrt'],
        'min_samples_leaf': [1, 2, 4],
        'min_samples_split': [2, 3, 5, 10]
    }
]

# lasso_params
# ridge_params
# lr_params
# elasticnet_params
# b_ridge_params


# Exhaustive search over the RandomForestRegressor grid (models_params[3]),
# using all available cores.
gs = GridSearchCV(
    RandomForestRegressor(),
    models_params[3],
    verbose=2,
    cv=n_folds,
    n_jobs=-1
)
gs_results = gs.fit(X_train_feateng, y_train_feateng)

display(
    gs_results.best_score_,
    gs_results.best_estimator_,
    gs_results.best_params_
)

# knn: radial + display circles
# svc: rbf kernel, linear kernel

In [None]:
# Hyperparameters search : with features removal

In [None]:
# KFold and LeaveOneOut CV

In [None]:
# Fit a default gradient-boosting model on the reduced feature set and
# predict the reduced test frame.
model = GradientBoostingRegressor().fit(X_train_remfeat, y_train_remfeat)
y_pred_log = model.predict(X_test_remfeat)

# Exponentiate the raw predictions (the variable naming suggests the model
# predicts on a log scale).
y_pred = np.exp(y_pred_log)

display(y_pred)

In [None]:
# Predictions submission

# Two-column frame: the test-set Id next to the predicted SalePrice.
submissions_df = pd.DataFrame({
    'Id': test_df['Id'],
    'SalePrice': y_pred,
})

# Write without the index so the file holds only the two columns.
submissions_df.to_csv('submission_4.csv', index=False)