# Load libraries

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import linear_model
%matplotlib inline
pd.set_option("display.max_columns", 500)

# Load data

In [22]:
raw_training_df = pd.read_csv('train.csv')
print("Num examples = {}".format(raw_training_df.shape[0]))

raw_test_df = pd.read_csv('test.csv')
print("Num test examples = {}".format(raw_test_df.shape[0]))

all_X = raw_training_df.loc[:, 'MSSubClass':'SaleCondition']
Y = raw_training_df['SalePrice']

all_X = all_X.append(raw_test_df)

# drop id column
all_X.drop('Id', axis = 1, inplace = True)

print ("Num features: {}".format(all_X.shape[1]))

Num examples = 1460
Num test examples = 1459
Num features: 79


# Clean up messed up data

In [23]:
def handle_nans(df, column, val):
    return df[column].fillna(value=val)

all_X['LotFrontage'] = handle_nans(all_X, 'LotFrontage', 0.0)
all_X['MasVnrArea'] = handle_nans(all_X, 'MasVnrArea', 0.0)
all_X['GarageYrBlt'] = handle_nans(all_X, 'GarageYrBlt', 0.0)
all_X['BsmtFullBath'] = handle_nans(all_X, 'BsmtFullBath', 0.0)
all_X['BsmtHalfBath'] = handle_nans(all_X, 'BsmtHalfBath', 0.0)
all_X['GarageCars'] = handle_nans(all_X, 'GarageCars', 0.0)

# sample data
all_X.head()

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,BsmtFinType1,BsmtFinType2,BsmtFullBath,BsmtHalfBath,BsmtQual,BsmtUnfSF,CentralAir,Condition1,Condition2,Electrical,EnclosedPorch,ExterCond,ExterQual,Exterior1st,Exterior2nd,Fence,FireplaceQu,Fireplaces,Foundation,FullBath,Functional,GarageArea,GarageCars,GarageCond,GarageFinish,GarageQual,GarageType,GarageYrBlt,GrLivArea,HalfBath,Heating,HeatingQC,HouseStyle,KitchenAbvGr,KitchenQual,LandContour,LandSlope,LotArea,LotConfig,LotFrontage,LotShape,LowQualFinSF,MSSubClass,MSZoning,MasVnrArea,MasVnrType,MiscFeature,MiscVal,MoSold,Neighborhood,OpenPorchSF,OverallCond,OverallQual,PavedDrive,PoolArea,PoolQC,RoofMatl,RoofStyle,SaleCondition,SaleType,ScreenPorch,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold
0,856,854,0,,3,1Fam,TA,No,706.0,0.0,GLQ,Unf,1.0,0.0,Gd,150.0,Y,Norm,Norm,SBrkr,0,TA,Gd,VinylSd,VinylSd,,,0,PConc,2,Typ,548.0,2.0,TA,RFn,TA,Attchd,2003.0,1710,1,GasA,Ex,2Story,1,Gd,Lvl,Gtl,8450,Inside,65.0,Reg,0,60,RL,196.0,BrkFace,,0,2,CollgCr,61,5,7,Y,0,,CompShg,Gable,Normal,WD,0,Pave,8,856.0,AllPub,0,2003,2003,2008
1,1262,0,0,,3,1Fam,TA,Gd,978.0,0.0,ALQ,Unf,0.0,1.0,Gd,284.0,Y,Feedr,Norm,SBrkr,0,TA,TA,MetalSd,MetalSd,,TA,1,CBlock,2,Typ,460.0,2.0,TA,RFn,TA,Attchd,1976.0,1262,0,GasA,Ex,1Story,1,TA,Lvl,Gtl,9600,FR2,80.0,Reg,0,20,RL,0.0,,,0,5,Veenker,0,8,6,Y,0,,CompShg,Gable,Normal,WD,0,Pave,6,1262.0,AllPub,298,1976,1976,2007
2,920,866,0,,3,1Fam,TA,Mn,486.0,0.0,GLQ,Unf,1.0,0.0,Gd,434.0,Y,Norm,Norm,SBrkr,0,TA,Gd,VinylSd,VinylSd,,TA,1,PConc,2,Typ,608.0,2.0,TA,RFn,TA,Attchd,2001.0,1786,1,GasA,Ex,2Story,1,Gd,Lvl,Gtl,11250,Inside,68.0,IR1,0,60,RL,162.0,BrkFace,,0,9,CollgCr,42,5,7,Y,0,,CompShg,Gable,Normal,WD,0,Pave,6,920.0,AllPub,0,2001,2002,2008
3,961,756,0,,3,1Fam,Gd,No,216.0,0.0,ALQ,Unf,1.0,0.0,TA,540.0,Y,Norm,Norm,SBrkr,272,TA,TA,Wd Sdng,Wd Shng,,Gd,1,BrkTil,1,Typ,642.0,3.0,TA,Unf,TA,Detchd,1998.0,1717,0,GasA,Gd,2Story,1,Gd,Lvl,Gtl,9550,Corner,60.0,IR1,0,70,RL,0.0,,,0,2,Crawfor,35,5,7,Y,0,,CompShg,Gable,Abnorml,WD,0,Pave,7,756.0,AllPub,0,1915,1970,2006
4,1145,1053,0,,4,1Fam,TA,Av,655.0,0.0,GLQ,Unf,1.0,0.0,Gd,490.0,Y,Norm,Norm,SBrkr,0,TA,Gd,VinylSd,VinylSd,,TA,1,PConc,2,Typ,836.0,3.0,TA,RFn,TA,Attchd,2000.0,2198,1,GasA,Ex,2Story,1,Gd,Lvl,Gtl,14260,FR2,84.0,IR1,0,60,RL,350.0,BrkFace,,0,12,NoRidge,84,5,8,Y,0,,CompShg,Gable,Normal,WD,0,Pave,9,1145.0,AllPub,192,2000,2000,2008


# Explore data

In [29]:
sns.set(color_codes=True)

# sns.distplot(all_X['MiscVal'])
# all_X['MiscVal'].describe()

# raw_training_df.plot.scatter(x='MiscVal', y='SalePrice')
# (all_X["1stFlrSF"] + all_X["2ndFlrSF"]).describe()

print("Find most important features relative to target. These are candidates for polynomial features")
corr = raw_training_df.corr()
corr.sort_values(["SalePrice"], ascending = False, inplace = True)
print(corr.SalePrice)

Find most important features relative to target. These are candidates for polynomial features
SalePrice        1.000000
OverallQual      0.790982
GrLivArea        0.708624
GarageCars       0.640409
GarageArea       0.623431
TotalBsmtSF      0.613581
1stFlrSF         0.605852
FullBath         0.560664
TotRmsAbvGrd     0.533723
YearBuilt        0.522897
YearRemodAdd     0.507101
GarageYrBlt      0.486362
MasVnrArea       0.477493
Fireplaces       0.466929
BsmtFinSF1       0.386420
LotFrontage      0.351799
WoodDeckSF       0.324413
2ndFlrSF         0.319334
OpenPorchSF      0.315856
HalfBath         0.284108
LotArea          0.263843
BsmtFullBath     0.227122
BsmtUnfSF        0.214479
BedroomAbvGr     0.168213
ScreenPorch      0.111447
PoolArea         0.092404
MoSold           0.046432
3SsnPorch        0.044584
BsmtFinSF2      -0.011378
BsmtHalfBath    -0.016844
MiscVal         -0.021190
Id              -0.021917
LowQualFinSF    -0.025606
YrSold          -0.028923
OverallCond     -0.077

# Feature transformation

In [24]:
def feature_categories(df):
        df = df.replace({"ExterCond" : {"Po" : 1, "Fa" : 2, "TA": 3, "Gd": 4, "Ex" : 5},
                       "ExterQual" : {"Po" : 1, "Fa" : 2, "TA": 3, "Gd": 4, "Ex" : 5},
                       "FireplaceQu" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                       "GarageCond" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                       "GarageQual" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                       "KitchenQual" : {"Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                       "PoolQC" : {"No" : 0, "Fa" : 1, "TA" : 2, "Gd" : 3, "Ex" : 4},
                       "Utilities" : {"ELO" : 1, "NoSeWa" : 2, "NoSewr" : 3, "AllPub" : 4}}
                     )
        df['FireplaceQu'] = handle_nans(df, 'FireplaceQu', 0.0)
        df['PoolQC'] = handle_nans(df, 'PoolQC', 0.0)
        df['GarageCond'] = handle_nans(df, 'GarageCond', 0.0)
        df['GarageQual'] = handle_nans(df, 'GarageQual', 0.0)
        df['KitchenQual'] = handle_nans(df, 'KitchenQual', 0.0)
        df['Utilities'] = handle_nans(df, 'Utilities', 0.0)
        return df

def feature_combinations(df):
    # Overall quality of the house
    df["OverallGrade"] = df["OverallQual"] * df["OverallCond"]
    # Overall quality of the garage
    df["GarageGrade"] = df["GarageQual"] * df["GarageCond"]
    # Overall quality of the exterior
    df["ExterGrade"] = df["ExterQual"] * df["ExterCond"]
    # Overall kitchen score
    df["KitchenScore"] = df["KitchenAbvGr"] * df["KitchenQual"]
    # Overall fireplace score
    df["FireplaceScore"] = df["Fireplaces"] * df["FireplaceQu"]
    # Overall garage score
    df["GarageScore"] = df["GarageArea"] * df["GarageQual"]
    # Overall pool score
    df["PoolScore"] = df["PoolArea"] * df["PoolQC"]
    # Total number of bathrooms
    df["TotalBath"] = df["BsmtFullBath"] + (0.5 * df["BsmtHalfBath"]) + \
    df["FullBath"] + (0.5 * df["HalfBath"])
    # Total SF for house (incl. basement)
    df["AllSF"] = df["GrLivArea"] + df["TotalBsmtSF"]
    # Total SF for 1st + 2nd floors
    df["AllFlrsSF"] = df["1stFlrSF"] + df["2ndFlrSF"]
    # Total SF for porch
    df["AllPorchSF"] = df["OpenPorchSF"] + df["EnclosedPorch"] + \
    df["3SsnPorch"] + df["ScreenPorch"]
    return df

def polynomial_features(df):
    df["OverallQual-s2"] = df["OverallQual"] ** 2
    df["OverallQual-s3"] = df["OverallQual"] ** 3
    df["OverallQual-Sq"] = np.sqrt(df["OverallQual"])
    df["AllSF-2"] = df["AllSF"] ** 2
    df["AllSF-3"] = df["AllSF"] ** 3
    df["AllSF-Sq"] = np.sqrt(df["AllSF"])
    df["AllFlrsSF-2"] = df["AllFlrsSF"] ** 2
    df["AllFlrsSF-3"] = df["AllFlrsSF"] ** 3
    df["AllFlrsSF-Sq"] = np.sqrt(df["AllFlrsSF"])
    df["GrLivArea-2"] = df["GrLivArea"] ** 2
    df["GrLivArea-3"] = df["GrLivArea"] ** 3
    df["GrLivArea-Sq"] = np.sqrt(df["GrLivArea"])
    df["ExterQual-2"] = df["ExterQual"] ** 2
    df["ExterQual-3"] = df["ExterQual"] ** 3
    df["ExterQual-Sq"] = np.sqrt(df["ExterQual"])
    df["GarageCars-2"] = df["GarageCars"] ** 2
    df["GarageCars-3"] = df["GarageCars"] ** 3
    df["GarageCars-Sq"] = np.sqrt(df["GarageCars"])
    df["TotalBath-2"] = df["TotalBath"] ** 2
    df["TotalBath-3"] = df["TotalBath"] ** 3
    df["TotalBath-Sq"] = np.sqrt(df["TotalBath"])
    df["KitchenQual-2"] = df["KitchenQual"] ** 2
    df["KitchenQual-3"] = df["KitchenQual"] ** 3
    df["KitchenQual-Sq"] = np.sqrt(df["KitchenQual"])
    df["GarageScore-2"] = df["GarageScore"] ** 2
    df["GarageScore-3"] = df["GarageScore"] ** 3
    df["GarageScore-Sq"] = np.sqrt(df["GarageScore"])
    return df


In [25]:
def one_hot(df, column_name):
    # use pd.concat to join the new columns with your original dataframe
    df = pd.concat([df,pd.get_dummies(df[column_name], prefix=column_name)],axis=1)
    df.drop(column_name, axis = 1, inplace = True)
    return df

def bucketize(df, column_name, buckets=10):
    categories = pd.cut(df[column_name], buckets, labels=['{}'.format(i) for i in range(buckets)])
    df = pd.concat([df,pd.get_dummies(categories, prefix=column_name)],axis=1)
    df.drop(column_name, axis = 1, inplace = True)
    return df

def custom_buckets(df, column_name, bucket_range):
    categories = pd.cut(df[column_name], bucket_range, labels=['{}'.format(i) for i in range(bucket_range.size - 1)])
    df = pd.concat([df,pd.get_dummies(categories, prefix=column_name)],axis=1)
    df.drop(column_name, axis = 1, inplace = True)
    return df

def handle_categories(X):
    # handle categories
    X = one_hot(X, 'MSSubClass')
    X = one_hot(X, 'MSZoning')
    X = one_hot(X, 'Street')
    X = one_hot(X, 'Alley')
    X = one_hot(X, 'LotShape')
    X = one_hot(X, 'LandContour')
    X = one_hot(X, 'LotConfig')
    X = one_hot(X, 'LandSlope')
    X = one_hot(X, 'Neighborhood')
    # maybe combine these 2
    X = one_hot(X, 'Condition1')
    X = one_hot(X, 'Condition2')
    X = one_hot(X, 'BldgType')
    X = one_hot(X, 'HouseStyle')
    X = one_hot(X, 'RoofStyle')
    X = one_hot(X, 'RoofMatl')
    # combine?
    X = one_hot(X, 'Exterior1st')
    X = one_hot(X, 'Exterior2nd')
    X = one_hot(X, 'MasVnrType')
    X = one_hot(X, 'Foundation')
    X = one_hot(X, 'BsmtQual')
    X = one_hot(X, 'BsmtCond')
    X = one_hot(X, 'BsmtExposure')
    X = one_hot(X, 'BsmtFinType1')
    X = one_hot(X, 'BsmtFinType2')
    X = one_hot(X, 'Heating')
    X = one_hot(X, 'HeatingQC')
    X = one_hot(X, 'CentralAir')
    X = one_hot(X, 'Electrical')
    X = one_hot(X, 'GarageType')
    X = one_hot(X, 'GarageFinish')
    X = one_hot(X, 'PavedDrive')
    X = one_hot(X, 'Fence')
    X = one_hot(X, 'MiscFeature')
    X = one_hot(X, 'SaleType')
    X = one_hot(X, 'SaleCondition')
    X = one_hot(X, 'Functional')
    return X

def handle_numerics(X):
    # handle numerics
    X = bucketize(X, 'LotFrontage')
    X['LotArea'] = np.log(X['LotArea']) # maybe uneven splits is better
    X = bucketize(X, 'YearBuilt')
    X = bucketize(X, 'YrSold')
    X = bucketize(X, 'YearRemodAdd')
    X = bucketize(X, 'MasVnrArea') # maybe uneven splits is better
    X = custom_buckets(X, 'BsmtFinSF1', np.append(np.linspace(-1, 2000, 10), 6000))
    X = bucketize(X, 'BsmtFinSF2')
    X = bucketize(X, 'BsmtUnfSF')
    X = custom_buckets(X, 'TotalBsmtSF', np.append(np.linspace(-1, 3500, 10), 7000))
    X = custom_buckets(X, '1stFlrSF', np.append(np.linspace(-1, 3000, 10), 5000))
    X = bucketize(X, '2ndFlrSF')
    X = bucketize(X, 'LowQualFinSF')
    X = custom_buckets(X, 'GrLivArea', np.append(np.linspace(300, 4000, 10), 6000))
    X = custom_buckets(X, 'GarageYrBlt', np.append([-1], np.linspace(1850, 2011, 10)))
    X = bucketize(X, 'GarageArea')
    X = bucketize(X, 'WoodDeckSF')
    X = bucketize(X, 'OpenPorchSF')
    X = bucketize(X, 'EnclosedPorch')
    X = bucketize(X, '3SsnPorch')
    X = bucketize(X, 'ScreenPorch')
    X = bucketize(X, 'PoolArea')
    X = bucketize(X, 'MiscVal')
    X = bucketize(X, 'GarageScore')
    X = bucketize(X, 'PoolScore')
    X = bucketize(X, 'AllPorchSF')
    X = custom_buckets(X, 'AllSF', np.append(np.linspace(300, 400, 10), 12000))
    X = custom_buckets(X, 'AllFlrsSF', np.append(np.linspace(300, 3000, 10), 6000))
    X = bucketize(X, "AllSF-2")
    X = bucketize(X, "AllSF-3")
    X = bucketize(X, "AllSF-Sq")
    X = bucketize(X, 'AllFlrsSF-2')
    X = bucketize(X, 'AllFlrsSF-3')
    X = bucketize(X, 'AllFlrsSF-Sq')
    X = bucketize(X, 'GrLivArea-2')
    X = bucketize(X, 'GrLivArea-3')
    X = bucketize(X, 'GrLivArea-Sq')
    X = bucketize(X, 'GarageScore-2')
    X = bucketize(X, 'GarageScore-3')
    X = bucketize(X, 'GarageScore-Sq')
    return X

all_X = feature_categories(all_X)
all_X = feature_combinations(all_X)
all_X = polynomial_features(all_X)

print ("Shape before feature transformation: {}".format(all_X.shape))
all_X = handle_categories(all_X)
print ("Shape after category transformation: {}".format(all_X.shape))
all_X = handle_numerics(all_X)
print ("Shape at the end of feature transformation: {}".format(all_X.shape))

Shape before feature transformation: (2919, 117)
Shape after category transformation: (2919, 316)
Shape at the end of feature transformation: (2919, 667)


# Break up data into training, cv and test

In [7]:
X_train = all_X.iloc[0:1460]
Y_train = Y.iloc[0:1460]

X_test = all_X.iloc[1460:]

print ("X training data shape: {}".format(X_train.shape))
print ("Y training data shape: {}".format(Y_train.shape))
print ("X test data shape: {}".format(X_test.shape))

X training data shape: (1460, 667)
Y training data shape: (1460,)
X test data shape: (1459, 667)


# Train Ridge model

In [8]:
ridge = linear_model.RidgeCV(alphas = [0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1, 3, 6, 10, 30, 60], cv = 10)
ridge = ridge.fit(X_train, Y_train)

alpha = ridge.alpha_
print ("Best alpha : {}".format(alpha))

print ("Training score({}): {}".format(alpha, ridge.score(X_train, Y_train)))

# explore alpha values around best
alphas = [alpha*0.65, alpha*0.70, alpha*0.75, alpha*.8, alpha*.85, alpha*0.9, alpha*.95, alpha, alpha*1.05, alpha*1.1, alpha*1.15, alpha*1.2, alpha*1.25, alpha*1.3, alpha*1.35]
ridge = linear_model.RidgeCV(alphas = alphas, cv = 10)
ridge = ridge.fit(X_train, Y_train)

alpha = ridge.alpha_
print ("Refined Best alpha : {}".format(alpha))

print ("Training score({}): {}".format(alpha, ridge.score(X_train, Y_train)))

Best alpha : 10
Training score(10): 0.948462901443
Refined Best alpha : 8.0
Training score(8.0): 0.950939338142


# Run on test csv

In [484]:
y_pred = ridge.predict(X_test)
d = {'Id': [i for i in range(1461, 2920)], 'SalePrice': y_pred}
y_df = pd.DataFrame(data=d)
y_df.to_csv('submission.csv', index=False)