# Kaggle House Price competition

We first need to do a fair amount of data cleaning and feature engineering. Then we will one-hot encode the categorical variables. Finally, we will fit some regressions. Easy as pie.

In [77]:
import pandas as pd
import numpy as np
# Read the competition's train and test splits from the local data folder.
train_csv = 'house-prices-advanced-regression-techniques/train.csv'
test_csv = 'house-prices-advanced-regression-techniques/test.csv'
data_train, data_test = pd.read_csv(train_csv), pd.read_csv(test_csv)
# Last expression of the cell: render the training frame.
data_train

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


81 columns is a lot. Let's get rid of the ones that contain NaN values.

In [78]:
# A column is removed from BOTH splits as soon as it holds a NaN in either
# one, so the two frames keep the same feature columns.
train_nan_columns = data_train.columns[data_train.isna().any()]
test_nan_columns = data_test.columns[data_test.isna().any()]
columns_to_delete = set(train_nan_columns) | set(test_nan_columns)
new_data_train, new_data_test = (
    frame.drop(columns=columns_to_delete) for frame in (data_train, data_test)
)

Nice, got rid of those NaN infested columns - note that we did this for the training and test sets!

Numeric columns whose 25th percentile equals their 75th percentile (or their median) barely vary across houses, so they are probably not very good columns either. Let's get rid of them.

In [79]:
# Drop numeric columns that barely vary: a column is flagged when its 25th
# percentile equals either its 75th percentile or its median.
# describe() only summarises numeric columns, so categorical columns are
# never flagged here.
description_df = new_data_train.describe()
lower_quartile = description_df.loc['25%']
is_low_variation = (
    (lower_quartile == description_df.loc['75%'])
    | (lower_quartile == description_df.loc['50%'])
)
# A plain list of labels, built in one pass instead of np.append in a loop.
columns_to_delete = is_low_variation[is_low_variation].index.tolist()
new_data_train = new_data_train.drop(columns=columns_to_delete)
new_data_test = new_data_test.drop(columns=columns_to_delete)
new_data_train.shape, new_data_test.shape

((1460, 36), (1459, 35))

This is much more manageable. Now I will keep the `Id` column to the side in case I need it later (it will be needed to build the submission file).


In [80]:
# Keep the identifiers and the target aside before they are removed from
# the feature frames.
train_ids, test_ids = new_data_train["Id"], new_data_test["Id"]
train_prices = new_data_train["SalePrice"]

# The test split has no SalePrice column, so only Id is dropped there.
new_data_train = new_data_train.drop(columns=["Id", "SalePrice"])
new_data_test = new_data_test.drop(columns=["Id"])

train_df, test_df = new_data_train, new_data_test
train_df.head()

Unnamed: 0,MSSubClass,LotArea,Street,LotShape,LandContour,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,GrLivArea,FullBath,BedroomAbvGr,TotRmsAbvGrd,Fireplaces,PavedDrive,OpenPorchSF,MoSold,YrSold,SaleCondition
0,60,8450,Pave,Reg,Lvl,Inside,Gtl,CollgCr,Norm,Norm,...,1710,2,3,8,0,Y,61,2,2008,Normal
1,20,9600,Pave,Reg,Lvl,FR2,Gtl,Veenker,Feedr,Norm,...,1262,2,3,6,1,Y,0,5,2007,Normal
2,60,11250,Pave,IR1,Lvl,Inside,Gtl,CollgCr,Norm,Norm,...,1786,2,3,6,1,Y,42,9,2008,Normal
3,70,9550,Pave,IR1,Lvl,Corner,Gtl,Crawfor,Norm,Norm,...,1717,1,3,7,1,Y,35,2,2006,Abnorml
4,60,14260,Pave,IR1,Lvl,FR2,Gtl,NoRidge,Norm,Norm,...,2198,2,4,9,1,Y,84,12,2008,Normal


Currently we have both categorical data (strings) and numerical data that is either continuous or discrete. I am going to assume here that the discrete numerical data can still be useful in a regression. Consider the number of bedrooms: you would expect to pay more for a house with more bedrooms. Even though it is discrete (you can't get 3.75 bedrooms), it should still work in a regression as long as the relationship between the price and the number of bedrooms is roughly linear.

Now we have to split up the categorical data and the non categorical data and treat them differently.

In [81]:
# Columns stored with the 'object' dtype are treated as categorical;
# every other column is treated as numerical.
train_categorical_columns = [name for name, dtype in train_df.dtypes.items() if dtype == 'object']
train_numerical_columns = [name for name, dtype in train_df.dtypes.items() if dtype != 'object']

# Same split for the test frame.
test_categorical_columns = [name for name, dtype in test_df.dtypes.items() if dtype == 'object']
test_numerical_columns = [name for name, dtype in test_df.dtypes.items() if dtype != 'object']

In [82]:
from sklearn.preprocessing import OneHotEncoder

def one_hot_encoder(train_data, test_data, col):
    """One-hot encode ``col`` consistently across the train and test frames.

    Indicator columns are named ``<col>_<category>``. Both returned frames
    carry the same indicator columns in the same order: the categories seen
    in train first, followed by any category seen only in test. A category
    missing from one of the frames gets an all-zero column there.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Frames that both contain the column ``col``. They are not modified.
    col : str
        Name of the categorical column to encode.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_data, test_data)`` with the indicator columns joined on the
        index. The original ``col`` column is left in place; the caller is
        expected to drop it.
    """
    # Let pandas build the prefixed names so every indicator is labelled by
    # its own category rather than by its position in the frame.
    train_dummies = pd.get_dummies(train_data[col], prefix=col)
    test_dummies = pd.get_dummies(test_data[col], prefix=col)

    # Shared layout: train's columns, then the categories only test has.
    test_only = [name for name in test_dummies.columns
                 if name not in train_dummies.columns]
    dummy_columns = list(train_dummies.columns) + test_only

    # reindex selects by label, so existing indicators keep their values and
    # only the categories a frame never saw are filled with zeros.
    train_dummies = train_dummies.reindex(columns=dummy_columns, fill_value=0)
    test_dummies = test_dummies.reindex(columns=dummy_columns, fill_value=0)

    return train_data.join(train_dummies), test_data.join(test_dummies)

In [83]:
# Append the one-hot indicator columns for each categorical column in turn.
for current_column in train_categorical_columns:
    train_df, test_df = one_hot_encoder(train_df, test_df, current_column)

# The raw string columns are no longer needed once they are encoded.
train_df = train_df.drop(columns=train_categorical_columns)
test_df = test_df.drop(columns=test_categorical_columns)

Now the train and test dataframes are all either numerical or one hot encoded. We are ready for the regression! Let's start off with the linear regression.

In [85]:
from sklearn import linear_model 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score


# Hold out 20% of the labelled data to score the models; the fixed
# random_state keeps the split reproducible across runs.
X, y = train_df, train_prices
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Baseline: ordinary least squares on all features.
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)

# RMSE and R^2 on the held-out rows.
test_set_r2 = r2_score(y_test, y_pred)
test_set_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
test_set_rmse, test_set_r2

(31824.96694535253, 0.867954961273517)

In [86]:
from sklearn.ensemble import RandomForestRegressor
# Random forest with 100 trees, scored on the same held-out rows.
forest = RandomForestRegressor(
    n_estimators=100,
    random_state=0,
    max_depth=None,
    min_samples_split=2,
)
forest.fit(X_train, y_train)
y_pred = forest.predict(X_test)

test_set_r2 = r2_score(y_test, y_pred)
test_set_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
test_set_rmse, test_set_r2

(30988.453161669873, 0.8748052946622769)

In [87]:
from sklearn.ensemble import GradientBoostingRegressor
# Gradient boosting with depth-1 trees (stumps), scored on the same
# held-out rows.
# The loss argument is left at its default on purpose: the default is least
# squares, and the old spelling loss='ls' is rejected by recent scikit-learn
# releases (it was renamed to 'squared_error').
gradreg = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=1,
    random_state=0,
)
gradreg.fit(X_train, y_train)
y_pred = gradreg.predict(X_test)
test_set_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
test_set_r2 = r2_score(y_test, y_pred)
test_set_rmse, test_set_r2

(37256.40313637306, 0.8190377387914285)

In [88]:
# Predict the competition test set with the random forest and write the
# submission file.
predictions = forest.predict(test_df)
submission_df = pd.DataFrame({'Id': test_ids, 'SalePrice': predictions})
# index=False keeps the file to exactly two columns, Id and SalePrice;
# otherwise the row index is written as an extra unnamed first column.
submission_df.to_csv('submission.csv', index=False)