## Column Definitions

Data comes from: https://www.kaggle.com/c/house-prices-advanced-regression-techniques

## Import Libraries

In [56]:
import pandas as pd
import numpy as np
import seaborn as sns
import eli5
import matplotlib.pyplot as plt
%matplotlib inline

from eli5.sklearn import PermutationImportance
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor

# Read data and drop "Id" column
df = pd.read_csv(r'C:/Users/mzhang40/data/housing_price/train.csv').drop(['Id'], axis=1)
df_evaluation = pd.read_csv(r'C:/Users/mzhang40/data/housing_price/test.csv')

df_evaluation_id = df_evaluation.Id
df_evaluation = df_evaluation.drop(['Id'], axis=1)

In [57]:
def featureEngineering(dataSet):
    # Re-map categorical variable
    dataSet.MSSubClass = dataSet.MSSubClass.map({20: '1-STORY 1946 & NEWER ALL STYLES', 
                                               30: '1-STORY 1945 & OLDER',
                                               40: '1-STORY W/FINISHED ATTIC ALL AGES',
                                               45: '1-1/2 STORY - UNFINISHED ALL AGES',
                                               50: '1-1/2 STORY FINISHED ALL AGES',
                                               60: '2-STORY 1946 & NEWER',
                                               70: '2-STORY 1945 & OLDER',
                                               75: '2-1/2 STORY ALL AGES',
                                               80: 'SPLIT OR MULTI-LEVEL',
                                               85: 'SPLIT FOYER',
                                               90: 'DUPLEX - ALL STYLES AND AGES',
                                               120: '1-STORY PUD (Planned Unit Development) - 1946 & NEWER',
                                               150: '1-1/2 STORY PUD - ALL AGES',
                                               160: '2-STORY PUD - 1946 & NEWER',
                                               180: 'PUD - MULTILEVEL - INCL SPLIT LEV/FOYER',
                                               190: '2 FAMILY CONVERSION - ALL STYLES AND AGES'})
    
    # Fill NaN with NA
    dataSet = pd.concat([dataSet.select_dtypes(exclude=['object']), 
                         dataSet.select_dtypes(include=['object']).replace({np.nan: 'NA'})], axis=1)
    
    return dataSet

In [21]:
df = featureEngineering(df)
df_evaluation = featureEngineering(df_evaluation)

# Create dummy variables
df_dummy = pd.concat([pd.get_dummies(df.drop(['SalePrice'], axis=1)), df.SalePrice], axis=1)
df_dummy_evaluation = pd.get_dummies(df_evaluation)

# Transform y
df.SalePrice = np.log(df.SalePrice)

## Hyper-Parameter Searching

In [3]:
pipe_impute_gbm = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('gbm', GradientBoostingRegressor())
])

param_grid = {
    'gbm__loss': ['ls'],
    'gbm__learning_rate': [0.01],
    'gbm__n_estimators': [7000, 8000, 9000],
    'gbm__max_depth': [2,3,4]
}

gbm_RSCV = RandomizedSearchCV(pipe_impute_gbm, 
                              param_grid, 
                              cv=5, 
                              n_iter=60, 
                              n_jobs=-1, 
                              scoring='neg_root_mean_squared_error',
                              verbose=2).fit(df_dummy.drop(['SalePrice'], axis=1), df_dummy.SalePrice)

In [6]:
gbm_RSCV.best_params_

{'gbm__n_estimators': 7000,
 'gbm__max_depth': 3,
 'gbm__loss': 'ls',
 'gbm__learning_rate': 0.01}

## Fit Model for Submission

In [35]:
y = df_dummy.SalePrice
df_dummy, df_dummy_evaluation = df_dummy.align(df_dummy_evaluation, join='inner', axis=1)

imputer_median = SimpleImputer(strategy='median')
df_impute= imputer_median.fit_transform(df_dummy)
df_evaluation_impute = imputer_median.transform(df_dummy_evaluation)

gb_model = GradientBoostingRegressor(n_estimators=7000,
                                     max_depth=3,
                                     loss='ls',
                                     learning_rate=0.01).fit(df_impute, y)
y_pred = gb_model.predict(df_evaluation_impute)

## Create Submission File

In [54]:
submit = pd.DataFrame({'Id': df_evaluation_id, 'SalePrice': y_pred})
submit.to_csv(r'C:/Users/mzhang40/data/housing_price/submission.csv', index=False)