In [2]:
import pandas as pd
import numpy as np
import gc
import sys
from sys import stdout
#sys.path.append(path+'/src/python/')
from time import time

from scipy import stats
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.gridspec as gridspec
import matplotlib as mpl
import seaborn as sns
from seaborn.linearmodels import *

import xgboost as xgb

from sklearn.cross_validation import KFold, StratifiedKFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor,ExtraTreesRegressor
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.metrics import mean_squared_error
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred)) 

path = './data/'
train_file = path + 'train.csv'
test_file = path + 'test.csv'



In [3]:
train_df = pd.read_csv(train_file)
test_df = pd.read_csv(test_file) 

In [4]:
train_df.drop(train_df[train_df["GrLivArea"] > 4000].index, inplace=True)

test_df.loc[666, "GarageQual"] = "TA"
test_df.loc[666, "GarageCond"] = "TA"
test_df.loc[666, "GarageFinish"] = "Unf"
test_df.loc[666, "GarageYrBlt"] = "1980"

test_df.loc[1116, "GarageType"] = np.nan

lot_frontage_by_neighborhood = train_df["LotFrontage"].groupby(train_df["Neighborhood"])

# Used to convert categorical features into ordinal numbers.
# (There's probably an easier way to do this, but it works.)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

def factorize(df, factor_df, column, fill_na=None):
    factor_df[column] = df[column]
    if fill_na is not None:
        factor_df[column].fillna(fill_na, inplace=True)
    le.fit(factor_df[column].unique())
    factor_df[column] = le.transform(factor_df[column])
    return factor_df

In [5]:
# Combine all the (numerical) features into one big DataFrame. We don't add 
# the one-hot encoded variables here yet, that happens later on.
def munge(df):
    all_df = pd.DataFrame(index = df.index)
   
    all_df["LotFrontage"] = df["LotFrontage"]   
    for key, group in lot_frontage_by_neighborhood:
        idx = (df["Neighborhood"] == key) & (df["LotFrontage"].isnull())
        all_df.loc[idx, "LotFrontage"] = group.median()    

    all_df["LotArea"] = df["LotArea"]

    all_df["MasVnrArea"] = df["MasVnrArea"]
    all_df["MasVnrArea"].fillna(0, inplace=True)
   
    all_df["BsmtFinSF1"] = df["BsmtFinSF1"]
    all_df["BsmtFinSF1"].fillna(0, inplace=True)

    all_df["BsmtFinSF2"] = df["BsmtFinSF2"]
    all_df["BsmtFinSF2"].fillna(0, inplace=True)

    all_df["BsmtUnfSF"] = df["BsmtUnfSF"]
    all_df["BsmtUnfSF"].fillna(0, inplace=True)

    all_df["TotalBsmtSF"] = df["TotalBsmtSF"]
    all_df["TotalBsmtSF"].fillna(0, inplace=True)

    all_df["1stFlrSF"] = df["1stFlrSF"]
    all_df["2ndFlrSF"] = df["2ndFlrSF"]
    all_df["GrLivArea"] = df["GrLivArea"]
    
    all_df["GarageArea"] = df["GarageArea"]
    all_df["GarageArea"].fillna(0, inplace=True)

    all_df["WoodDeckSF"] = df["WoodDeckSF"]
    all_df["OpenPorchSF"] = df["OpenPorchSF"]
    all_df["EnclosedPorch"] = df["EnclosedPorch"]
    all_df["3SsnPorch"] = df["3SsnPorch"]
    all_df["ScreenPorch"] = df["ScreenPorch"]
    
    all_df["BsmtFullBath"] = df["BsmtFullBath"]
    all_df["BsmtFullBath"].fillna(0, inplace=True)

    all_df["BsmtHalfBath"] = df["BsmtHalfBath"]
    all_df["BsmtHalfBath"].fillna(0, inplace=True)

    all_df["FullBath"] = df["FullBath"] 
    all_df["HalfBath"] = df["HalfBath"] 
    all_df["BedroomAbvGr"] = df["BedroomAbvGr"] 
    all_df["KitchenAbvGr"] = df["KitchenAbvGr"] 
    all_df["TotRmsAbvGrd"] = df["TotRmsAbvGrd"] 
    all_df["Fireplaces"] = df["Fireplaces"] 

    all_df["GarageCars"] = df["GarageCars"]
    all_df["GarageCars"].fillna(0, inplace=True)

    all_df["CentralAir"] = (df["CentralAir"] == "Y") * 1.0
   
    all_df["OverallQual"] = df["OverallQual"]
    all_df["OverallCond"] = df["OverallCond"]

    # Quality measurements are stored as text but we can convert them to 
    # numbers where a higher number means higher quality.

    qual_dict = {None: 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
    all_df["ExterQual"] = df["ExterQual"].map(qual_dict).astype(int)
    all_df["ExterCond"] = df["ExterCond"].map(qual_dict).astype(int)
    all_df["BsmtQual"] = df["BsmtQual"].map(qual_dict).astype(int)
    all_df["BsmtCond"] = df["BsmtCond"].map(qual_dict).astype(int)
    all_df["HeatingQC"] = df["HeatingQC"].map(qual_dict).astype(int)
    all_df["KitchenQual"] = df["KitchenQual"].map(qual_dict).astype(int)
    all_df["FireplaceQu"] = df["FireplaceQu"].map(qual_dict).astype(int)
    all_df["GarageQual"] = df["GarageQual"].map(qual_dict).astype(int)
    all_df["GarageCond"] = df["GarageCond"].map(qual_dict).astype(int)

    all_df["BsmtExposure"] = df["BsmtExposure"].map(
        {None: 0, "No": 1, "Mn": 2, "Av": 3, "Gd": 4}).astype(int)

    bsmt_fin_dict = {None: 0, "Unf": 1, "LwQ": 2, "Rec": 3, "BLQ": 4, "ALQ": 5, "GLQ": 6}
    all_df["BsmtFinType1"] = df["BsmtFinType1"].map(bsmt_fin_dict).astype(int)
    all_df["BsmtFinType2"] = df["BsmtFinType2"].map(bsmt_fin_dict).astype(int)

    all_df["Functional"] = df["Functional"].map(
        {None: 0, "Sal": 1, "Sev": 2, "Maj2": 3, "Maj1": 4, 
         "Mod": 5, "Min2": 6, "Min1": 7, "Typ": 8}).astype(int)

    all_df["GarageFinish"] = df["GarageFinish"].map(
        {None: 0, "Unf": 1, "RFn": 2, "Fin": 3}).astype(int)

    all_df["Fence"] = df["Fence"].map(
        {None: 0, "MnWw": 1, "GdWo": 2, "MnPrv": 3, "GdPrv": 4}).astype(int)

    all_df["YearBuilt"] = df["YearBuilt"]
    all_df["YearRemodAdd"] = df["YearRemodAdd"]

    all_df["GarageYrBlt"] = df["GarageYrBlt"]
    all_df["GarageYrBlt"].fillna(0.0, inplace=True)

    all_df["MoSold"] = df["MoSold"]
    all_df["YrSold"] = df["YrSold"]
    
    all_df["LowQualFinSF"] = df["LowQualFinSF"]
    all_df["MiscVal"] = df["MiscVal"]

    all_df["PoolQC"] = df["PoolQC"].map(qual_dict).astype(int)

    all_df["PoolArea"] = df["PoolArea"]
    all_df["PoolArea"].fillna(0, inplace=True)
    
    # Add categorical features as numbers too. It seems to help a bit.
    all_df = factorize(df, all_df, "MSSubClass")
    all_df = factorize(df, all_df, "MSZoning", "RL")
    all_df = factorize(df, all_df, "LotConfig")
    all_df = factorize(df, all_df, "Neighborhood")
    all_df = factorize(df, all_df, "Condition1")
    all_df = factorize(df, all_df, "BldgType")
    all_df = factorize(df, all_df, "HouseStyle")
    all_df = factorize(df, all_df, "RoofStyle")
    all_df = factorize(df, all_df, "Exterior1st", "Other")
    all_df = factorize(df, all_df, "Exterior2nd", "Other")
    all_df = factorize(df, all_df, "MasVnrType", "None")
    all_df = factorize(df, all_df, "Foundation")
    all_df = factorize(df, all_df, "SaleType", "Oth")
    all_df = factorize(df, all_df, "SaleCondition")

    # IR2 and IR3 don't appear that often, so just make a distinction
    # between regular and irregular.
    all_df["IsRegularLotShape"] = (df["LotShape"] == "Reg") * 1

    # Most properties are level; bin the other possibilities together
    # as "not level".
    all_df["IsLandLevel"] = (df["LandContour"] == "Lvl") * 1

    # Most land slopes are gentle; treat the others as "not gentle".
    all_df["IsLandSlopeGentle"] = (df["LandSlope"] == "Gtl") * 1

    # Most properties use standard circuit breakers.
    all_df["IsElectricalSBrkr"] = (df["Electrical"] == "SBrkr") * 1

    # About 2/3rd have an attached garage.
    all_df["IsGarageDetached"] = (df["GarageType"] == "Detchd") * 1

    # Most have a paved drive. Treat dirt/gravel and partial pavement
    # as "not paved".
    all_df["IsPavedDrive"] = (df["PavedDrive"] == "Y") * 1

    # The only interesting "misc. feature" is the presence of a shed.
    all_df["HasShed"] = (df["MiscFeature"] == "Shed") * 1.  

    # If YearRemodAdd != YearBuilt, then a remodeling took place at some point.
    all_df["Remodeled"] = (all_df["YearRemodAdd"] != all_df["YearBuilt"]) * 1
    
    # Did a remodeling happen in the year the house was sold?
    all_df["RecentRemodel"] = (all_df["YearRemodAdd"] == all_df["YrSold"]) * 1
    
    # Was this house sold in the year it was built?
    all_df["VeryNewHouse"] = (all_df["YearBuilt"] == all_df["YrSold"]) * 1

    all_df["Has2ndFloor"] = (all_df["2ndFlrSF"] == 0) * 1
    all_df["HasMasVnr"] = (all_df["MasVnrArea"] == 0) * 1
    all_df["HasWoodDeck"] = (all_df["WoodDeckSF"] == 0) * 1
    all_df["HasOpenPorch"] = (all_df["OpenPorchSF"] == 0) * 1
    all_df["HasEnclosedPorch"] = (all_df["EnclosedPorch"] == 0) * 1
    all_df["Has3SsnPorch"] = (all_df["3SsnPorch"] == 0) * 1
    all_df["HasScreenPorch"] = (all_df["ScreenPorch"] == 0) * 1

    # These features actually lower the score a little.
    # all_df["HasBasement"] = df["BsmtQual"].isnull() * 1
    # all_df["HasGarage"] = df["GarageQual"].isnull() * 1
    # all_df["HasFireplace"] = df["FireplaceQu"].isnull() * 1
    # all_df["HasFence"] = df["Fence"].isnull() * 1

    # Months with the largest number of deals may be significant.
    all_df["HighSeason"] = df["MoSold"].replace( 
        {1: 0, 2: 0, 3: 0, 4: 1, 5: 1, 6: 1, 7: 1, 8: 0, 9: 0, 10: 0, 11: 0, 12: 0})

    all_df["NewerDwelling"] = df["MSSubClass"].replace(
        {20: 1, 30: 0, 40: 0, 45: 0,50: 0, 60: 1, 70: 0, 75: 0, 80: 0, 85: 0,
         90: 0, 120: 1, 150: 0, 160: 0, 180: 0, 190: 0})   
    
    all_df.loc[df.Neighborhood == 'NridgHt', "Neighborhood_Good"] = 1
    all_df.loc[df.Neighborhood == 'Crawfor', "Neighborhood_Good"] = 1
    all_df.loc[df.Neighborhood == 'StoneBr', "Neighborhood_Good"] = 1
    all_df.loc[df.Neighborhood == 'Somerst', "Neighborhood_Good"] = 1
    all_df.loc[df.Neighborhood == 'NoRidge', "Neighborhood_Good"] = 1
    all_df["Neighborhood_Good"].fillna(0, inplace=True)

    all_df["SaleCondition_PriceDown"] = df.SaleCondition.replace(
        {'Abnorml': 1, 'Alloca': 1, 'AdjLand': 1, 'Family': 1, 'Normal': 0, 'Partial': 0})

    # House completed before sale or not
    all_df["BoughtOffPlan"] = df.SaleCondition.replace(
        {"Abnorml" : 0, "Alloca" : 0, "AdjLand" : 0, "Family" : 0, "Normal" : 0, "Partial" : 1})
    
    all_df["BadHeating"] = df.HeatingQC.replace(
        {'Ex': 0, 'Gd': 0, 'TA': 0, 'Fa': 1, 'Po': 1})

    area_cols = ['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
                 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'GarageArea', 'WoodDeckSF', 
                 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'LowQualFinSF', 'PoolArea' ]
    all_df["TotalArea"] = all_df[area_cols].sum(axis=1)

    all_df["TotalArea1st2nd"] = all_df["1stFlrSF"] + all_df["2ndFlrSF"]

    all_df["Age"] = 2010 - all_df["YearBuilt"]
    all_df["TimeSinceSold"] = 2010 - all_df["YrSold"]

    all_df["SeasonSold"] = all_df["MoSold"].map({12:0, 1:0, 2:0, 3:1, 4:1, 5:1, 
                                                  6:2, 7:2, 8:2, 9:3, 10:3, 11:3}).astype(int)
    
    all_df["YearsSinceRemodel"] = all_df["YrSold"] - all_df["YearRemodAdd"]
    
    # Simplifications of existing features into bad/average/good.
    all_df["SimplOverallQual"] = all_df.OverallQual.replace(
        {1 : 1, 2 : 1, 3 : 1, 4 : 2, 5 : 2, 6 : 2, 7 : 3, 8 : 3, 9 : 3, 10 : 3})
    all_df["SimplOverallCond"] = all_df.OverallCond.replace(
        {1 : 1, 2 : 1, 3 : 1, 4 : 2, 5 : 2, 6 : 2, 7 : 3, 8 : 3, 9 : 3, 10 : 3})
    all_df["SimplPoolQC"] = all_df.PoolQC.replace(
        {1 : 1, 2 : 1, 3 : 2, 4 : 2})
    all_df["SimplGarageCond"] = all_df.GarageCond.replace(
        {1 : 1, 2 : 1, 3 : 1, 4 : 2, 5 : 2})
    all_df["SimplGarageQual"] = all_df.GarageQual.replace(
        {1 : 1, 2 : 1, 3 : 1, 4 : 2, 5 : 2})
    all_df["SimplFireplaceQu"] = all_df.FireplaceQu.replace(
        {1 : 1, 2 : 1, 3 : 1, 4 : 2, 5 : 2})
    all_df["SimplFireplaceQu"] = all_df.FireplaceQu.replace(
        {1 : 1, 2 : 1, 3 : 1, 4 : 2, 5 : 2})
    all_df["SimplFunctional"] = all_df.Functional.replace(
        {1 : 1, 2 : 1, 3 : 2, 4 : 2, 5 : 3, 6 : 3, 7 : 3, 8 : 4})
    all_df["SimplKitchenQual"] = all_df.KitchenQual.replace(
        {1 : 1, 2 : 1, 3 : 1, 4 : 2, 5 : 2})
    all_df["SimplHeatingQC"] = all_df.HeatingQC.replace(
        {1 : 1, 2 : 1, 3 : 1, 4 : 2, 5 : 2})
    all_df["SimplBsmtFinType1"] = all_df.BsmtFinType1.replace(
        {1 : 1, 2 : 1, 3 : 1, 4 : 2, 5 : 2, 6 : 2})
    all_df["SimplBsmtFinType2"] = all_df.BsmtFinType2.replace(
        {1 : 1, 2 : 1, 3 : 1, 4 : 2, 5 : 2, 6 : 2})
    all_df["SimplBsmtCond"] = all_df.BsmtCond.replace(
        {1 : 1, 2 : 1, 3 : 1, 4 : 2, 5 : 2})
    all_df["SimplBsmtQual"] = all_df.BsmtQual.replace(
        {1 : 1, 2 : 1, 3 : 1, 4 : 2, 5 : 2})
    all_df["SimplExterCond"] = all_df.ExterCond.replace(
        {1 : 1, 2 : 1, 3 : 1, 4 : 2, 5 : 2})
    all_df["SimplExterQual"] = all_df.ExterQual.replace(
        {1 : 1, 2 : 1, 3 : 1, 4 : 2, 5 : 2})
            
    # Bin by neighborhood (a little arbitrarily). Values were computed by: 
    # train_df["SalePrice"].groupby(train_df["Neighborhood"]).median().sort_values()
    neighborhood_map = {
        "MeadowV" : 0,  #  88000
        "IDOTRR" : 1,   # 103000
        "BrDale" : 1,   # 106000
        "OldTown" : 1,  # 119000
        "Edwards" : 1,  # 119500
        "BrkSide" : 1,  # 124300
        "Sawyer" : 1,   # 135000
        "Blueste" : 1,  # 137500
        "SWISU" : 2,    # 139500
        "NAmes" : 2,    # 140000
        "NPkVill" : 2,  # 146000
        "Mitchel" : 2,  # 153500
        "SawyerW" : 2,  # 179900
        "Gilbert" : 2,  # 181000
        "NWAmes" : 2,   # 182900
        "Blmngtn" : 2,  # 191000
        "CollgCr" : 2,  # 197200
        "ClearCr" : 3,  # 200250
        "Crawfor" : 3,  # 200624
        "Veenker" : 3,  # 218000
        "Somerst" : 3,  # 225500
        "Timber" : 3,   # 228475
        "StoneBr" : 4,  # 278000
        "NoRidge" : 4,  # 290000
        "NridgHt" : 4,  # 315000
    }

    all_df["NeighborhoodBin"] = df["Neighborhood"].map(neighborhood_map)
    return all_df

train_df_munged = munge(train_df)
test_df_munged = munge(test_df)

print(train_df_munged.shape)
print(test_df_munged.shape)

(1456, 111)
(1459, 111)


In [6]:
# Copy NeighborhoodBin into a temporary DataFrame because we want to use the
# unscaled version later on (to one-hot encode it). 
neighborhood_bin_train = pd.DataFrame(index = train_df.index)
neighborhood_bin_train["NeighborhoodBin"] = train_df_munged["NeighborhoodBin"]
neighborhood_bin_test = pd.DataFrame(index = test_df.index)
neighborhood_bin_test["NeighborhoodBin"] = test_df_munged["NeighborhoodBin"]

################################################################################

numeric_features = train_df_munged.dtypes[train_df_munged.dtypes != "object"].index

################################################################################

In [7]:
# Transform the skewed features by taking log(feature + 1).
# This will make the features more normal.
from scipy.stats import skew

skewed = train_df_munged.apply(lambda x: skew(x.dropna().astype(float)))
skewed = skewed[skewed > 0.75]
skewed = skewed.index

train_df_munged[skewed] = np.log1p(train_df_munged[skewed])
test_df_munged[skewed] = np.log1p(test_df_munged[skewed])

################################################################################

# We take the log here because the error metric is between the log of the
# SalePrice and the log of the predicted price. That does mean we need to 
# exp() the prediction to get an actual sale price.
label_df = pd.DataFrame(index = train_df_munged.index, columns=["SalePrice"])
label_df["SalePrice"] = np.log(train_df["SalePrice"])

print("Training set size:", train_df_munged.shape)
print("Test set size:", test_df_munged.shape)

Training set size: (1456, 111)
Test set size: (1459, 111)


In [8]:
categorical_features = train_df.select_dtypes(include = ["object"]).columns
numerical_features = train_df.select_dtypes(exclude = ["object"]).columns
print("Numerical features : " + str(len(numerical_features)))
print("Categorical features : " + str(len(categorical_features)))

Numerical features : 38
Categorical features : 43


In [9]:
# Convert Categorical to Numerical using Ridge Regression
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import Ridge

def catridge(cat2num_train_df,cat2num_test_df,column):
    
    global train_df, test_df
    
    ridge_fold = Ridge(fit_intercept=True)

    state = 1
    skf = StratifiedKFold(n_splits=5, shuffle=False, random_state=state)
    #train_cat_dummies['CatRidge'] = pd.Series(np.nan())

    fold = 0
    
    X_train = pd.get_dummies(train_df[[column]])
    X_test = pd.get_dummies(test_df[[column]])
    feat_intersect = list(set.intersection(set(X_train.columns.values),set(X_test.columns.values)))
    X_train = X_train[feat_intersect]
    X_test  = X_test[feat_intersect]
    y_train = label_df['SalePrice']

    ## Methodology
    # For training set, the model fit for the other K-1 partitions will be used for prediction
    # For test set, the mean of the predictions for the K models
    CatRidge_train = np.zeros(y_train.shape[0])
    CatRidge_test = np.zeros((X_test.shape[0],skf.get_n_splits()))

    for train_index, test_index in skf.split(X_train,y_train):
        X_train_fold  = X_train.iloc[train_index,:]
        X_test_fold   = X_test.iloc[test_index,:]
        y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]

        ridge_fold.fit(X_train_fold, y_train_fold)
        CatRidge_train[test_index] = ridge_fold.predict(X_test_fold)
        CatRidge_test[:,fold] = ridge_fold.predict(X_test)
        fold = fold + 1
        #print("Fold = %d, Score = %f"% (fold, ridge_fold.score(X_test_fold,y_test_fold)))
    
    cat2num_train_df[column+'_ridge'] = pd.Series(CatRidge_train)
    cat2num_test_df[column+'_ridge'] = np.mean(CatRidge_test,axis=1)
    
    
    if skew(CatRidge_train) > 0.75:
        cat2num_train_df[column+'_ridge'] = np.log1p(cat2num_train_df[column+'_ridge'])
        
    if skew(cat2num_test_df[column+'_ridge']) > 0.75:
        cat2num_train_df[column+'_ridge'] = np.log1p(cat2num_train_df[column+'_ridge'])
    
    return cat2num_train_df,cat2num_test_df

In [10]:
# Convert categorical features using one-hot encoding.
def onehot(cat2num_train_df, cat2num_test_df, column):
    
    global train_df, test_df
    
    dummies_train = pd.get_dummies(train_df[column], prefix="_" + column)
    dummies_test = pd.get_dummies(test_df[column], prefix="_" + column)

    return cat2num_train_df.join(dummies_train), cat2num_test_df.join(dummies_test)

In [14]:
# Convert categorical variables using Average All but One Encoding
def AvgAllButOne(cat2num_train_df, cat2num_test_df, column):
    global train_df, test_df, label_df
    
    column_df = pd.concat([train_df[[column]], label_df],axis=1)

    #For Training Data    
    for index, row in column_df.iterrows():
        category = row[column]
        cat_df = column_df.loc[column_df[column]==category,[column,'SalePrice']]
        cat_len = cat_df.shape[0]

        if(cat_len == 1):
            cat2num_train_df.loc[index, column + '_aabo'] = row['SalePrice']
        elif(cat_len > 1):
            cat2num_train_df.loc[index, column + '_aabo'] = (np.sum(cat_df['SalePrice']) - row['SalePrice']) / (cat_len-1)    

    #For Test data
    column_grps = column_df[[column,'SalePrice']].groupby(column).agg('mean')
    cat2num_test_df[column] = test_df[column]
    #print(column_grps)
    for category in column_grps.index:
        cat2num_test_df.loc[cat2num_test_df[column]==category,column + '_aabo'] = column_grps.loc[category,'SalePrice']
    
    cat2num_test_df.drop(column,axis=1,inplace=True)
    
    return cat2num_train_df,cat2num_test_df

    #train_cat.loc[train_cat[main_col] == category, main_col + '_freq'] = np.sum(train_cat_dummies[column])/train_cat_dummies.shape[0]
    #test_cat.loc[test_cat[main_col] == category, main_col + '_freq'] = np.sum(train_cat_dummies[column])/train_cat_dummies.shape[0]

In [12]:
# Convert categorical features using one-hot encoding.
def cat2num_main(cat2num_train_df, cat2num_test_df, column, fill_na_train,fill_na_test):
    
    global train_df, test_df
    
    print("FEATURE: %s" %(column))
    
    if fill_na_train is not None:
        train_df[column].fillna(fill_na_train, inplace=True)
    
    if fill_na_test is not None:
        test_df[column].fillna(fill_na_test, inplace=True)
        
    # Ridge Regression
    cat2num_train_df, cat2num_test_df = catridge(cat2num_train_df,cat2num_test_df,column)
    
    # One Hot Encoding
    cat2num_train_df, cat2num_test_df = onehot(cat2num_train_df,cat2num_test_df,column)
    
    # Average All But One
    cat2num_train_df, cat2num_test_df = AvgAllButOne(cat2num_train_df,cat2num_test_df,column)

    return cat2num_train_df, cat2num_test_df

In [15]:
cat2num_train_df = pd.DataFrame(index=train_df.index)
cat2num_test_df  = pd.DataFrame(index=test_df.index)

cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df,cat2num_test_df, "MSSubClass", None,None)

FEATURE: MSSubClass




In [16]:
cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df,cat2num_test_df, "MSZoning", "RL", "RL")
cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df,cat2num_test_df,  "LotConfig", None, None)
cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df,cat2num_test_df, "Neighborhood", None, None)
cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df,cat2num_test_df,  "Condition1", None, None)
cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df,cat2num_test_df, "BldgType", None, None)
cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df,cat2num_test_df, "HouseStyle", None, None)
cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df,cat2num_test_df,  "RoofStyle", None, None)
cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df,cat2num_test_df, "Exterior1st", "VinylSd", "VinylSd")
cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df,cat2num_test_df, "Exterior2nd", "VinylSd", "VinylSd")
cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df,cat2num_test_df,  "Foundation", None, None)
cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df,cat2num_test_df, "SaleType", "WD", "WD")
cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df,cat2num_test_df, "SaleCondition", "Normal", "Normal")

FEATURE: MSZoning




FEATURE: LotConfig
FEATURE: Neighborhood
FEATURE: Condition1
FEATURE: BldgType
FEATURE: HouseStyle
FEATURE: RoofStyle
FEATURE: Exterior1st
FEATURE: Exterior2nd
FEATURE: Foundation
FEATURE: SaleType
FEATURE: SaleCondition


In [17]:
# Fill in missing MasVnrType for rows that do have a MasVnrArea.
idx = (train_df["MasVnrArea"] != 0) & ((train_df["MasVnrType"] == "None") | (train_df["MasVnrType"].isnull()))
train_df.loc[idx, "MasVnrType"] = "BrkFace"
idx = (test_df["MasVnrArea"] != 0) & ((test_df["MasVnrType"] == "None") | (test_df["MasVnrType"].isnull()))
test_df.loc[idx, "MasVnrType"] = "BrkFace"
cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df, cat2num_test_df, "MasVnrType", "None", "None")

# Also add the booleans from calc_df as dummy variables.
cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df,cat2num_test_df,  "LotShape", None, None)
cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df,cat2num_test_df, "LandContour", None, None)
cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df,cat2num_test_df, "LandSlope", None, None)
cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df,cat2num_test_df, "Electrical", "SBrkr", "SBrkr")
cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df,cat2num_test_df,  "GarageType", "None", "None")
cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df,cat2num_test_df, "PavedDrive", None, None)
cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df,cat2num_test_df, "MiscFeature", "None", "None")

# Features we can probably ignore (but want to include anyway to see
# if they make any positive difference).
# Definitely ignoring Utilities: all records are "AllPub", except for
# one "NoSeWa" in the train set and 2 NA in the test set.
cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df,cat2num_test_df,  "Street", None, None)
cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df,cat2num_test_df, "Alley", "None", "None")
cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df,cat2num_test_df, "Condition2", None, None)
cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df,cat2num_test_df,  "RoofMatl", None, None)
cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df,cat2num_test_df,  "Heating", None, None)

FEATURE: MasVnrType




FEATURE: LotShape
FEATURE: LandContour
FEATURE: LandSlope
FEATURE: Electrical
FEATURE: GarageType
FEATURE: PavedDrive
FEATURE: MiscFeature
FEATURE: Street
FEATURE: Alley
FEATURE: Condition2
FEATURE: RoofMatl
FEATURE: Heating


In [18]:
# I have these as numerical variables too.
cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df,cat2num_test_df, "ExterQual", "None", "None")
cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df,cat2num_test_df,  "ExterCond", "None", "None")
cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df,cat2num_test_df,  "BsmtQual", "None", "None")
cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df,cat2num_test_df,  "BsmtCond", "None", "None")
cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df,cat2num_test_df,  "HeatingQC", "None", "None")
cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df,cat2num_test_df, "KitchenQual", "TA", "TA")
cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df,cat2num_test_df,  "FireplaceQu", "None", "None")
cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df,cat2num_test_df, "GarageQual", "None", "None")
cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df,cat2num_test_df,  "GarageCond", "None", "None")
cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df,cat2num_test_df,  "PoolQC", "None", "None")
cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df,cat2num_test_df,  "BsmtExposure", "None", "None")
cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df,cat2num_test_df,  "BsmtFinType1", "None", "None")
cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df,cat2num_test_df,  "BsmtFinType2", "None", "None")
cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df,cat2num_test_df, "Functional", "Typ", "Typ")
cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df,cat2num_test_df, "GarageFinish", "None", "None")
cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df,cat2num_test_df, "Fence", "None", "None")
cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df,cat2num_test_df, "MoSold", None, None)

# Divide up the years between 1871 and 2010 in slices of 20 years.
year_map = pd.concat(pd.Series("YearBin" + str(i+1), index=range(1871+i*20,1891+i*20)) for i in range(0, 7))

train_df["GarageYrBltBin"] = train_df.GarageYrBlt.map(year_map)
train_df["GarageYrBltBin"].fillna("NoGarage", inplace=True)
train_df["YearBuiltBin"] = train_df.YearBuilt.map(year_map)
train_df["YearRemodAddBin"] = train_df.YearRemodAdd.map(year_map)

test_df["GarageYrBltBin"] = test_df.GarageYrBlt.map(year_map)
test_df["GarageYrBltBin"].fillna("NoGarage", inplace=True)
test_df["YearBuiltBin"] = test_df.YearBuilt.map(year_map)
test_df["YearRemodAddBin"] = test_df.YearRemodAdd.map(year_map)

cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df,cat2num_test_df, "GarageYrBltBin", None, None)
cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df,cat2num_test_df, "YearBuiltBin", None, None)
cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df,cat2num_test_df,  "YearRemodAddBin", None, None)

FEATURE: ExterQual




FEATURE: ExterCond
FEATURE: BsmtQual
FEATURE: BsmtCond
FEATURE: HeatingQC
FEATURE: KitchenQual
FEATURE: FireplaceQu
FEATURE: GarageQual
FEATURE: GarageCond
FEATURE: PoolQC
FEATURE: BsmtExposure
FEATURE: BsmtFinType1
FEATURE: BsmtFinType2
FEATURE: Functional
FEATURE: GarageFinish
FEATURE: Fence
FEATURE: MoSold
FEATURE: GarageYrBltBin
FEATURE: YearBuiltBin
FEATURE: YearRemodAddBin


In [19]:
# Encode Neighborhood Bin Trian

train_df["NeighborhoodBin"] = neighborhood_bin_train["NeighborhoodBin"]
test_df["NeighborhoodBin"] = neighborhood_bin_test["NeighborhoodBin"]

cat2num_train_df,cat2num_test_df = cat2num_main(cat2num_train_df,cat2num_test_df, "NeighborhoodBin", None, None)
train_df_munged = train_df_munged.join(cat2num_train_df)
test_df_munged = test_df_munged.join(cat2num_test_df)

# Retain only columns appearing in both training and test munged data
feat_intersect = list(set.intersection(set(train_df_munged.columns.values),set(test_df_munged.columns.values)))
train_df_munged = train_df_munged[feat_intersect]
test_df_munged  = test_df_munged[feat_intersect]

FEATURE: NeighborhoodBin




In [20]:
test_df_munged['GarageYrBlt'] = test_df_munged['GarageYrBlt'].map(float)

In [21]:
train_df_munged.select_dtypes(include=["object"]).columns, test_df_munged.select_dtypes(include=["object"]).columns

(Index([], dtype='object'), Index([], dtype='object'))

In [22]:
print("train_df_munged Null values:")
print(np.sum(train_df_munged.loc[:,train_df_munged.isnull().any(axis=0)==True].isnull()))

print("test_df_munged Null values:")
print(np.sum(test_df_munged.loc[:,test_df_munged.isnull().any(axis=0)==True].isnull()))

train_df_munged = train_df_munged.fillna(pd.concat([train_df_munged]).median())
test_df_munged = test_df_munged.fillna(pd.concat([train_df_munged]).median())

print("train_df_munged Null values:")
print(np.sum(train_df_munged.loc[:,train_df_munged.isnull().any(axis=0)==True].isnull()))

print("test_df_munged Null values:")
print(np.sum(test_df_munged.loc[:,test_df_munged.isnull().any(axis=0)==True].isnull()))

train_df_munged Null values:
BsmtQual_ridge           4
MasVnrType_ridge         4
Foundation_ridge         4
MSZoning_ridge           4
SaleCondition_ridge      4
ExterCond_ridge          4
NeighborhoodBin_ridge    4
GarageFinish_ridge       4
BsmtExposure_ridge       4
GarageYrBltBin_ridge     4
GarageType_ridge         4
LotShape_ridge           4
PoolQC_ridge             4
Fence_ridge              4
LandContour_ridge        4
BsmtFinType1_ridge       4
LandSlope_ridge          4
RoofMatl_ridge           4
Functional_ridge         4
YearRemodAddBin_ridge    4
Electrical_ridge         4
Street_ridge             4
Neighborhood_ridge       4
BsmtFinType2_ridge       4
BldgType_ridge           4
MiscFeature_ridge        4
BsmtCond_ridge           4
YearBuiltBin_ridge       4
Exterior1st_ridge        4
Condition1_ridge         4
HeatingQC_ridge          4
KitchenQual_ridge        4
Alley_ridge              4
Condition2_ridge         4
HouseStyle_ridge         4
FireplaceQu_ridge        4

In [23]:
# Additional processing: scale the data.   
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
train_df_munged = scaler.fit_transform(train_df_munged)
test_df_munged = scaler.transform(test_df_munged)

In [None]:
train_df_munged.shape, test_df_munged.shape,label_df.shape

In [None]:
## Xgboost gridsearch
import sklearn
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import r2_score

xtreme_forest = xgb.XGBRegressor()

parameter_grid = {
                 'max_depth' : [4],
                 'n_estimators':[1000],
                 'min_child_weight': [1,2,3],
                 'learning_rate':[0.005,0.01,0.05],
                 'colsample_bytree':[0.1,0.3,0.5],
                 'subsample': [0.1,0.2,0.5],
                 'silent':[1],
                 }

cross_validation = sklearn.cross_validation.StratifiedKFold(np.array(label_df['SalePrice']), n_folds=5)

grid_search_xgboost = GridSearchCV(xtreme_forest,
                           param_grid=parameter_grid,
                           scoring= "neg_mean_squared_error",
                           cv=cross_validation)

grid_search_xgboost.fit(train_df_munged, label_df)

print('Best score: {}'.format(grid_search_xgboost.best_score_))
print('Best parameters: {}'.format(grid_search_xgboost.best_params_))



In [None]:
# XGBoost -- I did some "manual" cross-validation here but should really find
# these hyperparameters using CV. ;-)

xgb_regr = xgb.XGBRegressor(
                 colsample_bytree=0.2,
                 gamma=0.0,
                 learning_rate=0.01,
                 max_depth=4,
                 min_child_weight=1.5,
                 n_estimators=7200,                                                                  
                 reg_alpha=0.9,
                 reg_lambda=0.6,
                 subsample=0.2,
                 seed=42,
                 silent=1)

xgb_regr.fit(train_df_munged, label_df)

# Run prediction on training set to get a rough idea of how well it does.
y_train_pred_xgb = xgb_regr.predict(train_df_munged)
y_test = label_df
print("XGBoost score on training set: ", rmse(y_test, y_train_pred_xgb))

# Run prediction on the Kaggle test set.
y_test_pred_xgb = xgb_regr.predict(test_df_munged)

In [None]:
grid_search_xgboost.best_score_

In [None]:
# Lasso Grid Search
from sklearn.linear_model import Lasso

lasso_grid = Lasso()

parameter_grid = {
                 'alpha': [0.001,0.01,0.1,0.5,1],
                 'fit_intercept': [True, False],
                 'normalize': [True,False]
                 }

cross_validation = sklearn.cross_validation.StratifiedKFold(np.array(label_df['SalePrice']), n_folds=10)

grid_search_lasso = GridSearchCV(lasso_grid,
                           param_grid=parameter_grid,
                           scoring= "neg_mean_squared_error",
                           cv=cross_validation)

grid_search_lasso.fit(train_df_munged, label_df)

print('Best score: {}'.format(grid_search_lasso.best_score_))
print('Best parameters: {}'.format(grid_search_lasso.best_params_))

In [None]:
grid_search_lasso.best_params_

In [None]:
y_train_pred_lasso = grid_search_lasso.best_estimator_.predict(train_df_munged)
print("Elastic Net score on training set: ", rmse(label_df,y_train_pred_elastic))
y_test_pred_lasso = grid_search_lasso.best_estimator_.predict(test_df_munged)

In [None]:
from sklearn.linear_model import Lasso

# I found this best alpha through cross-validation.
best_alpha = 0.00099

lasso_regr = Lasso(alpha=best_alpha, max_iter=50000)
lasso_regr.fit(train_df_munged, label_df)

# Run prediction on training set to get a rough idea of how well it does.
y_train_pred_lasso = lasso_regr.predict(train_df_munged)
y_test = label_df
print("Lasso score on training set: ", rmse(y_test, y_train_pred_lasso))

# Run prediction on the Kaggle test set.
y_test_pred_lasso = lasso_regr.predict(test_df_munged)

In [None]:
y_test_pred_lasso

Random Forests don't perform well with this dataset

In [None]:
# Extra Trees Regressor

et_regr = ExtraTreesRegressor(n_estimators=5000, max_depth=8, max_features='sqrt',n_jobs=-1)
et_regr.fit(train_df_munged, label_df)

# Run prediction on training set to get a rough idea of how well it does.
y_pred = et_regr.predict(train_df_munged)
y_test = label_df
print("Extra Trees Regressor score on training set: ", rmse(y_test, y_pred))

# Run prediction on the Kaggle test set.
y_pred_et = et_regr.predict(test_df_munged)

Extra Trees Regressor also no where in the range of performance of Lasso XGBoost

In [None]:
import sklearn
from sklearn.grid_search import GridSearchCV

# Kernel Ridge GridSearch
from sklearn.kernel_ridge import KernelRidge

kridge_grid = KernelRidge()

parameter_grid = {
                 'alpha': [0.0001,0.001,0.01,0.1,1,10,30,60],
                 'degree': [1,2,3,4],
                 'kernel': ['polynomial']
                 #'n_estimators': [200,210,240,250],
                 #'min_child_weight': [1,2,3,4]
                 }

cross_validation = sklearn.cross_validation.StratifiedKFold(np.array(label_df['SalePrice']), n_folds=10)

grid_search_kridge = GridSearchCV(kridge_grid,
                           param_grid=parameter_grid,
                           scoring= "neg_mean_squared_error",
                           cv=cross_validation)

grid_search_kridge.fit(train_df_munged, label_df)

print('Best score: {}'.format(grid_search_kridge.best_score_))
print('Best parameters: {}'.format(grid_search_kridge.best_params_))

In [None]:
y_train_pred_kridge = grid_search_kridge.best_estimator_.predict(train_df_munged)
print("Kernel Ridge score on training set: ", rmse(label_df,y_train_pred_kridge))
y_test_pred_kridge = grid_search_kridge.best_estimator_.predict(test_df_munged)

In [None]:
y_test_pred_kridge

Something wrong with test predictions using Kernel Ridge.

In [None]:
gbm_grid = GradientBoostingRegressor(max_features='sqrt')

parameter_grid = {
                 'max_depth' : [4,6,8],
                 'n_estimators': [5000],
                 'min_samples_split':[5,10,15,20]
                 }

cross_validation = sklearn.cross_validation.StratifiedKFold(label_df['SalePrice'], n_folds=5)

grid_search_gbm = GridSearchCV(gbm_grid,
                           param_grid=parameter_grid,
                           cv=cross_validation)

grid_search_gbm.fit(train_df_munged, label_df)

print('Best score: {}'.format(grid_search_gbm.best_score_))
print('Best parameters: {}'.format(grid_search_gbm.best_params_))

In [None]:
y_train_pred_gbm = grid_search_gbm.best_estimator_.predict(train_df_munged)
print("GBM score on training set: ", rmse(label_df,y_train_pred_gbm))
y_test_pred_gbm = grid_search_gbm.best_estimator_.predict(test_df_munged)

In [None]:
y_test_pred_kridge

## Blending using Weighted Average

In [None]:
## Analyzing the Weighting performances
## XGboost + Lasso + Kernel Ridge + GBM
rmse_arr = np.empty([5,5,5,5])

for wt_xgb in np.arange(0,5):
    for wt_lasso in np.arange(0,5):
        for wt_kridge in np.arange(0,5):
            for wt_gbm in np.arange(0,5):
                if(wt_lasso + wt_kridge + wt_xgb + wt_gbm == 0):
                    rmse_arr[wt_xgb,wt_lasso,wt_kridge,wt_gbm] = 0.1
                    continue
                #for wt_gbm in np.arange(0,4):
                y_pred_avg = (wt_xgb*np.ravel(y_train_pred_xgb) + wt_lasso*np.ravel(y_train_pred_lasso) + wt_kridge*np.ravel(y_train_pred_kridge) + wt_gbm*np.ravel(y_train_pred_gbm))
                y_pred_avg = y_pred_avg / (wt_xgb + wt_lasso + wt_kridge+wt_gbm)
                if rmse(label_df,y_pred_avg) < 0.0005:
                    rmse_arr[wt_xgb,wt_lasso,wt_kridge,wt_gbm] = 0.1
                else:    
                    rmse_arr[wt_xgb,wt_lasso,wt_kridge,wt_gbm] = rmse(label_df,y_pred_avg)
                print("RMSE Training Error for ::wt_xgb=%s, wt_lasso=%s, wt_kridge=%s, wt_gbm=%s:: = %s\n" %(str(wt_xgb), str(wt_lasso), str(wt_kridge), str(wt_gbm),str(1000*rmse(label_df,y_pred_avg))))

In [None]:
## Lasso + Kernel Ridge Weight Analysis
rmse_arr = np.zeros([5,5])

for wt_lasso in np.arange(0,5):
    for wt_kridge in np.arange(0,5):
        if(wt_lasso + wt_kridge == 0):
            rmse_arr[wt_lasso,wt_kridge] = 0.1
            continue
        #for wt_gbm in np.arange(0,4):
        y_pred_avg = (wt_lasso*np.ravel(y_train_pred_lasso) + wt_kridge*np.ravel(y_train_pred_kridge))
        y_pred_avg = y_pred_avg / (wt_lasso + wt_kridge)
        rmse_arr[wt_lasso,wt_kridge] = rmse(label_df,y_pred_avg)
        print("RMSE Training Error for ::wt_lasso=%s, wt_kridge=%s:: = %s\n" %(str(wt_lasso), str(wt_kridge),  str(1000*rmse(label_df,y_pred_avg))))

In [None]:
## Analyzing the error
ravel_rmse = 1000*np.ravel(rmse_arr)
ravel_rmse = ravel_rmse[ravel_rmse>1]
#sorted_rmse = sorted_rmse[sorted_rmse<100000]
print(np.max(ravel_rmse))
print(np.min(ravel_rmse))
fig = plt.figure(figsize=(15,15))
plt.plot(ravel_rmse)#, np.linspace(0,1,sorted_rmse.size))
#plt.xlim([0,50])

### Observations:

Training error:

1) decreases with weight increase of XGBoost

2) increases with weight increase of Lasso

3) decreases with weight increase of Kernel Ridge

4) increases with weight increase of Elastic Net

In [None]:
print(rmse_arr.shape)

print("Min = ", np.min(rmse_arr))
print(np.where(rmse_arr == np.min(rmse_arr)))

print("Max = ", np.max(rmse_arr))
print(np.where(rmse_arr == np.max(rmse_arr)))

In [None]:
y_final = (1*np.ravel(y_test_pred_xgb) + 0*np.ravel(y_test_pred_kridge) + 1*np.ravel(y_test_pred_gbm))/2
y_final.shape

## Blending using Kernel Ridge Regression

In [None]:
def logloss(attempt, actual, epsilon=1.0e-15):
    """Logloss, i.e. the score of the bioresponse competition.
    """
    attempt = np.clip(attempt, epsilon, 1.0-epsilon)
    return - np.mean(actual * np.log(attempt) +
                     (1.0 - actual) * np.log(1.0 - attempt))

In [None]:
np.random.seed(0)  # seed to shuffle the train set

n_folds = 10
verbose = True
shuffle = False

X_ens = np.array(train_df_munged)
y_ens = np.array(label_df['SalePrice'])
X_submission = np.array(test_df_munged)

if shuffle:
    idx = np.random.permutation(y.size)
    X_ens = train_df_munged[idx]
    y_ens = targets_munged[idx]

skf = list(StratifiedKFold(y_ens, n_folds))

clfs = [lasso_regr,
        xgb_regr,
        #RandomForestRegressor(n_estimators=5000, max_depth=4, max_features='sqrt',n_jobs=-1),
        #ExtraTreesRegressor(n_estimators=5000, max_depth=4,n_jobs=-1),
        GradientBoostingRegressor(learning_rate=0.05, subsample=0.5, max_features='sqrt',max_depth=4, n_estimators=5000),
        ]

In [None]:
## Creating train and test sets for blending

dataset_blend_train = np.zeros((X_ens.shape[0], len(clfs)))
dataset_blend_test = np.zeros((X_submission.shape[0], len(clfs)))

In [None]:
for j, clf in enumerate(clfs):
    print(j, clf)
    dataset_blend_test_j = np.zeros((X_submission.shape[0], len(skf)))
    for i, (train_curr, test_curr) in enumerate(skf):
        print("Fold", i)
        X_train = X_ens[train_curr]
        y_train = y_ens[train_curr]
        X_test = X_ens[test_curr]
        y_test = y_ens[test_curr]
        clf.fit(X_train, y_train)
        y_submission = clf.predict(X_test)
        dataset_blend_train[test_curr, j] = y_submission
        dataset_blend_test_j[:, i] = clf.predict(X_submission)
        dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)

print ("Blending over.")

In [None]:
#y_final = np.mean(dataset_blend_test,axis=1)
from sklearn.kernel_ridge import KernelRidge
dd = KernelRidge()
dd.get_params()


In [None]:
from sklearn.linear_model import Ridge, LinearRegression

linear_grid = LinearRegression()

parameter_grid = {
                 'fit_intercept': [True,False],
                 'normalize':[True,False]
                 #'n_estimators': [200,210,240,250],
                 #'min_child_weight': [1,2,3,4]
                 }

cross_validation = StratifiedKFold(np.array(label_df['SalePrice']), n_folds=5)

grid_search_linear = GridSearchCV(linear_grid,
                           param_grid=parameter_grid,
                           scoring= "r2",
                           cv=cross_validation)

grid_search_linear.fit(dataset_blend_train, y_ens)

print('Best score: {}'.format(grid_search_linear.best_score_))
print('Best parameters: {}'.format(grid_search_linear.best_params_))

#clf.fit(dataset_blend_train, y_ens)
#y_final = clf.predict(dataset_blend_test)

In [None]:
from sklearn.kernel_ridge import KernelRidge

kridge_grid = KernelRidge()

parameter_grid = {
                 'alpha': [0.0001,0.001,0.01,0.1],
                 'degree': [1,2,3,4],
                 'kernel': ['polynomial']
                 #'n_estimators': [200,210,240,250],
                 #'min_child_weight': [1,2,3,4]
                 }

cross_validation = StratifiedKFold(np.array(label_df['SalePrice']), n_folds=10)

grid_search_kridge = GridSearchCV(kridge_grid,
                           param_grid=parameter_grid,
                           scoring= "neg_mean_squared_error",
                           cv=cross_validation)

grid_search_kridge.fit(dataset_blend_train, y_ens)

print('Best score: {}'.format(grid_search_kridge.best_score_))
print('Best parameters: {}'.format(grid_search_kridge.best_params_))

In [None]:
grid_search_kridge.grid_scores_

In [None]:
import pickle

ensemble_clfs_0106 = clfs

with open('0106_objs.pckl', 'wb') as f:  # Python 3: open(..., 'wb')
    pickle.dump([train_df_munged,test_df_munged,ensemble_clfs_0106,dataset_blend_test,dataset_blend_train], f)

In [None]:
y_final

In [None]:
y_pred = np.exp(y_final)

# Final Conversion.
output_file = 'new_feat_xgboost_gbm_1_1'
final_file = '0112_'+ output_file +'.csv'

pred_df = pd.DataFrame(y_pred, index=test_df["Id"], columns=["SalePrice"])
pred_df.shape
pred_df.to_csv(path+final_file, header=True, index_label='Id')

In [None]:
### We can improve it by CV and stacking   