### Mohammed Furqan: 
##### March 2020

I would like to thank the authors of the following kernels for their amazing work. They did the critical parts of the feature engineering and parameter tuning, with clear explanations that beginners can learn from:

1. [A study on Regression applied to the Ames dataset](https://www.kaggle.com/juliencs/a-study-on-regression-applied-to-the-ames-dataset) by **Julien Cohen-Solal**
2. [Stacked Regressions](https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard) by **Serigne**
3. [XGBoost + Lasso](https://www.kaggle.com/humananalog/xgboost-lasso) by **Human Analog**
4. [Regularised Linear Models](https://www.kaggle.com/apapiu/regularized-linear-models) by **Alexandru Papiu**

And many others. These authors have done a great job of explaining and documenting their code!

## 1. Import libraries and data




In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy.special import boxcox1p
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error, make_scorer

%matplotlib inline
import matplotlib.pyplot as plt  # Matlab-style plotting
import seaborn as sns

# Plot styling: keep a handle on the palette returned by seaborn and switch
# every subsequent figure to the 'darkgrid' theme.
color = sns.color_palette()
sns.set_style('darkgrid')

# Silence warnings: anything routed through warnings.warn becomes a no-op
import warnings


def ignore_warn(*_args, **_kwargs):
    """Accept any arguments, do nothing and return None."""
    return None


warnings.warn = ignore_warn

# For some statistics
from scipy import stats
from scipy.stats import norm, skew 

# Show floats with exactly three digits after the decimal point
pd.set_option('display.float_format', '{:.3f}'.format)

# Read the competition's training and test sets
train = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')

# Take the Id column out of both frames; pop() returns the column and removes
# it from the frame in a single step, so the Ids stay available separately.
train_ID = train.pop('Id')
test_ID = test.pop('Id')

## 2. Data Processing



In [None]:
# Remove outliers: training rows whose GrLivArea is above 4000
oversized_rows = train.loc[train["GrLivArea"] > 4000].index
train.drop(index=oversized_rows, inplace=True)

# Mark GarageType as missing for test row 1116
test.loc[1116, "GarageType"] = np.nan

# The target has a skewed distribution, so work with log(1 + SalePrice)
train["SalePrice"] = np.log1p(train["SalePrice"])
y_train = train["SalePrice"].values

# Stack train and test into one frame (fresh 0..n-1 index) for the shared
# preprocessing steps, leaving the target column out
all_data = pd.concat([train, test], ignore_index=True).drop(columns=['SalePrice'])

### 2.1 Missing Values

After identifying the columns which have the highest percentage of missing values, we proceed with imputing them.

In [None]:
def fillNA(data, var, fill=None, custom=False):
    """Fill the missing values of one column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that holds the column. The column is overwritten with its
        filled version, so the caller's frame is modified.
    var : str
        Name of the column to impute.
    fill : object, optional
        * ``None`` (default): dtype-driven default -- ``0`` for numeric
          columns, the string ``"None"`` for everything else.
        * ``'mode'`` (with ``custom=False``): the column's most frequent value.
        * any value together with ``custom=True``: used verbatim.
    custom : bool, default False
        When true, ``fill`` is taken as the literal replacement value.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.
    """
    column = data[var]

    if custom and fill is not None:
        # Literal fill value. Tested before the truthiness check below so that
        # falsy values such as 0 or "" are honoured rather than silently
        # replaced by the dtype default.
        data[var] = column.fillna(fill)
    elif not fill:
        # Dtype-driven default; works for any numeric width (int32, float32,
        # ...) instead of only the exact dtypes float64/int64/object.
        default = 0 if pd.api.types.is_numeric_dtype(column) else "None"
        data[var] = column.fillna(default)
    elif fill == 'mode':
        modes = column.mode()
        # A column with no observed values has no mode; leave it untouched
        # instead of failing on the lookup of the first mode.
        if not modes.empty:
            data[var] = column.fillna(modes.iloc[0])
    # Any other `fill` without custom=True is not a known strategy: the column
    # is left unchanged.
    return data
    
    
# Columns filled with the dtype default ('0' for numbers, 'None' for text)
impute_vals = ["BedroomAbvGr", "GarageYrBlt", "GarageArea", "GarageCars", 'BsmtFinSF1', 'PoolArea', 'MSSubClass',
               'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'HalfBath', 'KitchenAbvGr', 'WoodDeckSF',
               'BsmtHalfBath', 'MasVnrType', 'MasVnrArea', 'MiscVal', 'OpenPorchSF', 'ScreenPorch', 'TotRmsAbvGrd', "EnclosedPorch"]

# Impute NAs with default '0' or 'None' depending on the feature's data type
for var in impute_vals:
    all_data = fillNA(all_data, var)

# Impute the NAs with respective mode values
impute_mode = ['MSZoning', 'Electrical', 'Exterior1st', 'Exterior2nd', 'SaleType']
for var in impute_mode:
    all_data = fillNA(all_data, var, fill='mode')

# Impute with custom default values.
# Each column appears exactly once (a repeated key in a dict literal silently
# overwrites the earlier entry). The second "Condition1" entry of the previous
# version is taken to have meant "Condition2".
# "Alley" lives here with the "No" sentinel because the ordinal encoding applied
# in the feature-engineering step looks for "No", not "None".
impute_custom = {
    "Alley": "No",
    "CentralAir": "N",
    "Condition1": "Norm",
    "Condition2": "Norm",
    "Functional": "Typ",  # As per the data description NA means 'typical'
    "HeatingQC": "TA",
    "LotShape": "Reg",
    "PavedDrive": "N",
    "PoolQC": "No",
    "BsmtQual": "No",
    "BsmtCond": "No",
    "BsmtExposure": "No",
    "BsmtFinType1": "No",
    "BsmtFinType2": "No",
    "ExterCond": "No",
    "ExterQual": "No",
    "Fence": "No",
    "FireplaceQu": "No",
    "GarageType": "No",
    "GarageFinish": "No",
    "GarageQual": "No",
    "GarageCond": "No",
    "KitchenQual": "TA",
    "MiscFeature": "No",
    "SaleCondition": "Normal",
}

for var, fill_value in impute_custom.items():
    all_data = fillNA(all_data, var, fill_value, custom=True)


# Some Unique cases
all_data = all_data.drop(['Utilities'], axis=1) # Since all values are same
# LotFrontage: use the median frontage of the house's neighborhood
all_data["LotFrontage"] = all_data.groupby("Neighborhood")["LotFrontage"].transform(lambda x: x.fillna(x.median()))

### 2.2 Feature Engineering

In [None]:
# Keep a numeric copy of OverallCond before the column is turned into a string
# category just below: the "Simpl*" and "*Grade" features further down do
# arithmetic on it, and multiplying by the string version would not give a
# numeric product.
overall_cond_num = all_data["OverallCond"].astype(int)

# Convert some of the numeric values to strings as they are in fact categories
for var in ['MSSubClass', 'YrSold', 'OverallCond', 'MoSold']:
    all_data[var] = all_data[var].astype(str)

# Encode some categorical features as ordered numbers when there is information in the order
quality_scale = {"No" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5}
finish_scale = {"No" : 0, "Unf" : 1, "LwQ": 2, "Rec" : 3, "BLQ" : 4, "ALQ" : 5, "GLQ" : 6}
ordinal_maps = {
    # Both spellings of "no alley" are accepted so the column always ends up numeric
    "Alley" : {"None" : 0, "No" : 0, "Grvl" : 1, "Pave" : 2},
    "BsmtCond" : quality_scale,
    "BsmtExposure" : {"No" : 0, "Mn" : 1, "Av": 2, "Gd" : 3},
    "BsmtFinType1" : finish_scale,
    "BsmtFinType2" : finish_scale,
    "BsmtQual" : quality_scale,
    "ExterCond" : quality_scale,
    "ExterQual" : quality_scale,
    "FireplaceQu" : quality_scale,
    "Functional" : {"No" : 0, "Sal" : 1, "Sev" : 2, "Maj2" : 3, "Maj1" : 4, "Mod": 5, "Min2" : 6, "Min1" : 7, "Typ" : 8},
    "GarageCond" : quality_scale,
    "GarageQual" : quality_scale,
    "HeatingQC" : quality_scale,
    "KitchenQual" : quality_scale,
    "LandSlope" : {"Sev" : 1, "Mod" : 2, "Gtl" : 3},
    "LotShape" : {"IR3" : 1, "IR2" : 2, "IR1" : 3, "Reg" : 4},
    "PavedDrive" : {"N" : 0, "P" : 1, "Y" : 2},
    "PoolQC" : {"No" : 0, "Fa" : 1, "TA" : 2, "Gd" : 3, "Ex" : 4},
    "Street" : {"Grvl" : 1, "Pave" : 2},
    "Utilities" : {"ELO" : 1, "NoSeWa" : 2, "NoSewr" : 3, "AllPub" : 4},
}
for col, mapping in ordinal_maps.items():
    # Columns that are no longer in the frame are skipped
    if col in all_data.columns:
        # infer_objects() makes the numeric dtype explicit instead of relying on
        # replace() to downcast the object column by itself
        all_data[col] = all_data[col].replace(mapping).infer_objects()


# Create new features
# 1* Simplifications of existing features
# Every level of the source scale is mapped explicitly so the simplified
# feature stays monotonic (with 2 and 3 left unmapped, "TA" = 3 would rank
# above "Ex" = 2).
ten_to_three = {1 : 1, 2 : 1, 3 : 1, 4 : 2, 5 : 2, 6 : 2, 7 : 3, 8 : 3, 9 : 3, 10 : 3}
quality_to_two = {1 : 1, 2 : 1, 3 : 1, 4 : 2, 5 : 2}
finish_to_two = {1 : 1, 2 : 1, 3 : 1, 4 : 2, 5 : 2, 6 : 2}
all_data["SimplOverallQual"] = all_data["OverallQual"].replace(ten_to_three)
all_data["SimplOverallCond"] = overall_cond_num.replace(ten_to_three)
all_data["SimplPoolQC"] = all_data["PoolQC"].replace({1 : 1, 2 : 1, 3 : 2, 4 : 2})
all_data["SimplGarageCond"] = all_data["GarageCond"].replace(quality_to_two)
all_data["SimplGarageQual"] = all_data["GarageQual"].replace(quality_to_two)
all_data["SimplFireplaceQu"] = all_data["FireplaceQu"].replace(quality_to_two)
all_data["SimplFunctional"] = all_data["Functional"].replace({1 : 1, 2 : 1, 3 : 2, 4 : 2, 5 : 3, 6 : 3, 7 : 3, 8 : 4})
all_data["SimplKitchenQual"] = all_data["KitchenQual"].replace(quality_to_two)
all_data["SimplHeatingQC"] = all_data["HeatingQC"].replace(quality_to_two)
all_data["SimplBsmtFinType1"] = all_data["BsmtFinType1"].replace(finish_to_two)
all_data["SimplBsmtFinType2"] = all_data["BsmtFinType2"].replace(finish_to_two)
all_data["SimplBsmtCond"] = all_data["BsmtCond"].replace(quality_to_two)
all_data["SimplBsmtQual"] = all_data["BsmtQual"].replace(quality_to_two)
all_data["SimplExterCond"] = all_data["ExterCond"].replace(quality_to_two)
all_data["SimplExterQual"] = all_data["ExterQual"].replace(quality_to_two)

# 2* Combinations of existing features
# OverallGrade uses the numeric copy of OverallCond taken at the top
all_data["OverallGrade"] = all_data["OverallQual"] * overall_cond_num
all_data["GarageGrade"] = all_data["GarageQual"] * all_data["GarageCond"]
all_data["ExterGrade"] = all_data["ExterQual"] * all_data["ExterCond"]
all_data["KitchenScore"] = all_data["KitchenAbvGr"] * all_data["KitchenQual"]
all_data["FireplaceScore"] = all_data["Fireplaces"] * all_data["FireplaceQu"]
all_data["GarageScore"] = all_data["GarageArea"] * all_data["GarageQual"]
all_data["PoolScore"] = all_data["PoolArea"] * all_data["PoolQC"]
all_data["SimplOverallGrade"] = all_data["SimplOverallQual"] * all_data["SimplOverallCond"]
all_data["SimplExterGrade"] = all_data["SimplExterQual"] * all_data["SimplExterCond"]
all_data["SimplPoolScore"] = all_data["PoolArea"] * all_data["SimplPoolQC"]
all_data["SimplGarageScore"] = all_data["GarageArea"] * all_data["SimplGarageQual"]
all_data["SimplFireplaceScore"] = all_data["Fireplaces"] * all_data["SimplFireplaceQu"]
all_data["SimplKitchenScore"] = all_data["KitchenAbvGr"] * all_data["SimplKitchenQual"]
# Half baths count for 0.5
all_data["TotalBath"] = (all_data["BsmtFullBath"] + (0.5 * all_data["BsmtHalfBath"])
                         + all_data["FullBath"] + (0.5 * all_data["HalfBath"]))
all_data["AllSF"] = all_data["GrLivArea"] + all_data["TotalBsmtSF"]
all_data["AllFlrsSF"] = all_data["1stFlrSF"] + all_data["2ndFlrSF"]
all_data["AllPorchSF"] = (all_data["OpenPorchSF"] + all_data["EnclosedPorch"]
                          + all_data["3SsnPorch"] + all_data["ScreenPorch"])
# Binary flags; infer_objects() turns the replaced text column into a numeric one
all_data["HasMasVnr"] = all_data["MasVnrType"].replace(
    {"BrkCmn" : 1, "BrkFace" : 1, "CBlock" : 1, "Stone" : 1, "None" : 0}).infer_objects()
all_data["BoughtOffPlan"] = all_data["SaleCondition"].replace(
    {"Abnorml" : 0, "Alloca" : 0, "AdjLand" : 0, "Family" : 0, "Normal" : 0, "Partial" : 1}).infer_objects()

# 3* Polynomials on the top 10 existing features
# For every source column add its square, its cube and its square root.
# The second item is the tag placed before the exponent in the new column
# name ("-s" for the two OverallQual variants, "-" for the rest); the square
# root column always ends in "-Sq".
polynomial_sources = [
    ("OverallQual", "-s"),
    ("AllSF", "-"),
    ("AllFlrsSF", "-"),
    ("GrLivArea", "-"),
    ("SimplOverallQual", "-s"),
    ("ExterQual", "-"),
    ("GarageCars", "-"),
    ("TotalBath", "-"),
    ("KitchenQual", "-"),
    ("GarageScore", "-"),
]
for feature, power_tag in polynomial_sources:
    all_data[feature + power_tag + "2"] = all_data[feature] ** 2
    all_data[feature + power_tag + "3"] = all_data[feature] ** 3
    all_data[feature + "-Sq"] = np.sqrt(all_data[feature])

# Total square footage: basement plus first and second floor
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']

# Normalise skewed numeric features
categorical_features = all_data.select_dtypes(include = ["object"]).columns
numerical_features = all_data.select_dtypes(exclude = ["object"]).columns
# .copy() so that the transformed values are written to an independent frame
# rather than to a selection of all_data
train_num = all_data[numerical_features].copy()
train_cat = all_data[categorical_features]

# Box-Cox (lambda = 0.15, applied to 1 + x) on every numeric column whose
# absolute skewness exceeds 0.75
skewness = train_num.apply(skew)
skewness = skewness[skewness.abs() > 0.75]
skewed_features = skewness.index
train_num[skewed_features] = boxcox1p(train_num[skewed_features], 0.15)

# Get one-hot representation for categorical variables.
# The integer dtype is requested explicitly so the indicator columns are
# numeric whatever the pandas default is (NOTE(review): recent pandas versions
# default to bool here -- confirm against the installed version).
train_cat = pd.get_dummies(train_cat, dtype=np.uint8)

all_data = pd.concat([train_num, train_cat], axis = 1)

# Split back into train and test. The row count is read once, while `train`
# still refers to the pre-merge training frame, and both parts are copied so
# the assignments and column drops below act on independent frames.
n_train = train.shape[0]
train = all_data.iloc[:n_train].copy()
test = all_data.iloc[n_train:].copy()

# Standardize numerical features (scaler fitted on the training rows only).
# Whole-column assignment replaces the columns, so integer columns can take
# the scaled float values without an in-place dtype conflict.
scale = StandardScaler()
train[numerical_features] = scale.fit_transform(train[numerical_features])
test[numerical_features] = scale.transform(test[numerical_features])

# Indicator columns removed from both sets
drop_cols = ["Exterior1st_ImStucc", "Exterior1st_Stone", "Exterior2nd_Other","HouseStyle_2.5Fin",
             "RoofMatl_Membran", "RoofMatl_Metal", "RoofMatl_Roll",
             "Condition2_RRAe", "Condition2_RRAn", "Condition2_RRNn",
             "Heating_Floor", "Heating_OthW", "MSSubClass_150",
             "Electrical_Mix", "MiscFeature_TenC", "Condition2_PosN", "MSZoning_C (all)", "MSSubClass_160"
            ]
train = train.drop(columns=drop_cols)
test = test.drop(columns=drop_cols)

print(train.shape, test.shape)

## 3. Modelling

### 3.1 Base Models

In [None]:
from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb

#Validation function
n_folds = 5

def rmsle_cv(model):
    """Cross-validate ``model`` on the training data and return per-fold RMSE.

    Uses the module-level ``train`` frame and ``y_train`` target with
    ``n_folds`` shuffled folds. ``y_train`` holds log1p(SalePrice), so the
    RMSE computed here is an error on the log scale.

    Parameters
    ----------
    model : estimator
        Any object accepted by ``cross_val_score``.

    Returns
    -------
    numpy.ndarray
        One RMSE value per fold.
    """
    # Hand the splitter object itself to cross_val_score. Calling
    # .get_n_splits() on it would only return the integer number of folds, and
    # the shuffle / random_state settings would never reach the split.
    kf = KFold(n_folds, shuffle=True, random_state=42)
    print(train.shape, y_train.shape)
    neg_mse = cross_val_score(model, train.values, y_train,
                              scoring="neg_mean_squared_error", cv=kf)
    # The scorer returns negated MSE; flip the sign before taking the root
    rmse = np.sqrt(-neg_mse)
    return rmse


# Linear models, each preceded by a RobustScaler inside a pipeline
lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))

# Kernel ridge regression with a degree-2 polynomial kernel
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)

# Gradient boosting with the huber loss
GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10,
                                   loss='huber', random_state =5)

# XGBoost. `verbosity=0` replaces the removed `silent=1` flag and
# `random_state` replaces the older `seed` spelling; the seed value is unchanged.
model_xgb = xgb.XGBRegressor(colsample_bytree=0.2, gamma=0,
                             learning_rate=0.01, max_depth=4,
                             min_child_weight=1.5, n_estimators=7200,
                             reg_alpha=0.9, reg_lambda=0.6,
                             subsample=0.2, verbosity=0,
                             random_state=25)

# LightGBM
model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)

### 3.2 Stacking Models

In [None]:
def rmsle(y, y_pred):
    """Return the root of the mean squared error between ``y`` and ``y_pred``."""
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Two-level stacking regressor.

    Every base model is cloned and fitted once per fold; the predictions made
    on the held-out folds become the features the meta-model is trained on.

    Parameters
    ----------
    base_models : sequence of estimators
        First-level models (cloned before fitting, never fitted themselves).
    meta_model : estimator
        Second-level model, cloned and fitted on the out-of-fold predictions.
    n_folds : int, default 5
        Number of folds used to produce the out-of-fold predictions.
    """

    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds

    def fit(self, X, y):
        """Fit fold-wise clones of the base models, then the meta-model.

        ``X`` and ``y`` are indexed with integer arrays (``X[rows]``), so they
        are presumably NumPy arrays -- the notebook passes ``train.values``.
        Returns ``self``.
        """
        # One list of fitted fold models per base model
        self.base_models_ = [[] for _ in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        splitter = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)

        # Column j collects base model j's predictions for rows it was not
        # trained on
        n_samples = X.shape[0]
        oof = np.zeros((n_samples, len(self.base_models)))
        for col, prototype in enumerate(self.base_models):
            fitted_folds = self.base_models_[col]
            for fit_rows, held_out_rows in splitter.split(X, y):
                fold_model = clone(prototype)
                fitted_folds.append(fold_model)
                fold_model.fit(X[fit_rows], y[fit_rows])
                oof[held_out_rows, col] = fold_model.predict(X[held_out_rows])

        # The meta-model learns from the out-of-fold predictions
        self.meta_model_.fit(oof, y)
        return self

    def predict(self, X):
        """Predict with the meta-model on averaged base-model predictions.

        For each base model the predictions of its fold clones are averaged;
        those averages are the meta-features given to the meta-model.
        """
        averaged = []
        for fitted_folds in self.base_models_:
            fold_predictions = np.column_stack(
                [fold_model.predict(X) for fold_model in fitted_folds])
            averaged.append(fold_predictions.mean(axis=1))
        meta_features = np.column_stack(averaged)
        return self.meta_model_.predict(meta_features)
    
# Stacked ensemble: five base learners with the lasso pipeline as meta-model
averaged_models = StackingAveragedModels(
    base_models=(ENet, GBoost, KRR, model_xgb, model_lgb),
    meta_model=lasso,
)

# Cross-validated score (mean and standard deviation over the folds)
score = rmsle_cv(averaged_models)
print(f" Averaged base models score: {score.mean():.4f} ({score.std():.4f})\n")

# Refit on the whole training set and predict the test set; expm1 undoes the
# log1p applied to the target
averaged_models.fit(train.values, y_train)
test_log_pred = averaged_models.predict(test.values)
stacked_pred = np.expm1(test_log_pred)

# Save the predictions
final_prediction = pd.DataFrame({"Id": test_ID, "SalePrice": stacked_pred})
final_prediction.to_csv('submission.csv', index=False)