In [188]:
import numpy as np
import pandas as pd


from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression 
from math import sqrt
from scipy import stats
from scipy.stats import norm
# import labelencoder
from sklearn.preprocessing import LabelEncoder# instantiate labelencoder object

import matplotlib.pyplot as plt
import matplotlib as matplotlib
import seaborn as sns
%matplotlib inline



def data_process(df_raw,remove_outlier = False,remove_hard_to_fit = False,
                 linear_model = False,dummy=False, label_encode=False, Process_raw_data=False):

	# Make a copy so the original dataframe will not be altered.
    df_processed = df_raw.copy()
    
    
	# Remove outliers.
    outlier_list_scatter = [524, 1299]
    outlier_list_hard_to_fit = [463, 31, 534, 1433, 739, 1159, 108, 1231, 971, 1424 ]
    outlier_list = outlier_list_scatter + outlier_list_hard_to_fit
    df_processed = df_processed.drop(outlier_list)


    ## Missing values
    
    # 259 LotFrontage  - replace missing value with 0 
    df_processed.LotFrontage = df_processed.LotFrontage.fillna(0)

    # 1369 Alley - replace with None
    df_processed.Alley = df_processed.Alley.fillna('None')

    # 8 MasVnrType and MasVnrArea - replace MasVnrType with None and MasVnrArea with 0
    df_processed.MasVnrType = df_processed.MasVnrType.fillna('None')
    df_processed.MasVnrArea = df_processed.MasVnrArea.fillna(0)

    # 37 basement: BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2- replace with None
    df_processed.BsmtQual = df_processed.BsmtQual.fillna('None')
    df_processed.BsmtCond = df_processed.BsmtCond.fillna('None')
    df_processed.BsmtExposure = df_processed.BsmtExposure.fillna('None')
    df_processed.BsmtFinType1 = df_processed.BsmtFinType1.fillna('None')
    df_processed.BsmtFinType2 = df_processed.BsmtFinType2.fillna('None')
    df_processed.TotalBsmtSF = df_processed.TotalBsmtSF.fillna(0)
    

    # 690 FireplaceQu - replace with None
    df_processed.FireplaceQu = df_processed.FireplaceQu.fillna('None')

    # 81 Garage: GarageType, GarageYrBlt, GarageFinish, GarageQual, GarageCond - replace with None and year with 0 
    df_processed.GarageType = df_processed.GarageType.fillna('None')
    df_processed.GarageFinish = df_processed.GarageFinish.fillna('None')
    df_processed.GarageQual = df_processed.GarageQual.fillna('None')
    df_processed.GarageCond = df_processed.GarageCond.fillna('None')
    df_processed.GarageYrBlt = df_processed.GarageYrBlt.fillna(0)

    # 1453 PoolQC - replace with None
    df_processed.PoolQC = df_processed.PoolQC.fillna('None')

    # 1179 Fence - replace with None
    df_processed.Fence = df_processed.Fence.fillna('None')

    # 1406 MiscFeature - replace with None    
    df_processed.MiscFeature = df_processed.MiscFeature.fillna('None')

    # 1 Electrical
    df_processed = df_processed[pd.notnull(df_processed.Electrical)]

    ## Combine columns and drop multicollinear columns 
    
    # combine bathroom quanlitity 
    df_processed['BsmtBath'] = df_processed.BsmtFullBath + df_processed.BsmtHalfBath * 0.5
    df_processed['Bath'] = df_processed.FullBath + df_processed.HalfBath * 0.5
    df_processed = df_processed.drop(['BsmtFullBath', 'BsmtHalfBath','FullBath','HalfBath'], axis=1)

    # drop TotalBsmtSF - multicollinearaty
    #df_processed = df_processed.drop(['TotalBsmtSF'], axis=1)

    # drop GrLivArea - multicollinearaty
    #df_processed = df_processed.drop(['GrLivArea'], axis=1)

    # drop GarageArea - higher correlation than GarageACars, results are better as well
    df_processed = df_processed.drop(['GarageArea'], axis=1) 
    
    
	# Feature Transformation - take the logarithm of the features.
    #Linear_Num_Cols = ['TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'LotArea', 'GarageArea', 'TotRmsAbvGrd', 'TotalSF', 'BsmtFinSF1']
    if Process_raw_data:
        df_processed.SalePrice = np.log(df_processed.SalePrice)
        
    df_processed.GrLivArea = np.log(df_processed.GrLivArea)
    df_processed.TotalBsmtSF = np.log(df_processed.TotalBsmtSF+1)
#     df_processed.LotArea = np.log(df_processed.LotArea) -- performance decreases
#     df_processed.GarageArea = np.log(df_processed.GarageArea) -- will drop column 



	# Categorical Features Processsing

	# MSSubClass processing - MSSubClass 20-90 contains only duplicate information with HouseStyle and YearBuilt.
    df_processed['MSSubClass'] = df_processed['MSSubClass'].replace(['20','30','40','45','50','60','70','75','80','85'], '0')

    # Convert numerical to categorical. 
#     df_processed[['MSSubClass','OverallQual','OverallCond']] = df_processed[['MSSubClass','OverallQual','OverallCond']].astype(str)
    df_processed['MSSubClass'] = df_processed['MSSubClass'].astype(str)

    #Encode some categorical features as ordered numbers when there is information in the order.
    df_processed = df_processed.replace({"Alley" : {"None":0,"Grvl" : 1, "Pave" : 2},
                       "BsmtCond" : {"None" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                       "BsmtExposure" : {"None" : 0,"No":1, "Mn" : 2, "Av": 3, "Gd" : 4},
                       "BsmtFinType1" : {"None" : 0, "Unf" : 1, "LwQ": 2, "Rec" : 3, "BLQ" : 4,
                                         "ALQ" : 5, "GLQ" : 6},
                       "BsmtFinType2" : {"None" : 0, "Unf" : 1, "LwQ": 2, "Rec" : 3, "BLQ" : 4,
                                         "ALQ" : 5, "GLQ" : 6},
                       "BsmtQual" : {"None" : 0, "Po" : 1, "Fa" : 2, "TA": 3, "Gd" : 4, "Ex" : 5},
                       "ExterCond" : {"Po" : 1, "Fa" : 2, "TA": 3, "Gd": 4, "Ex" : 5},
                       "ExterQual" : {"Po" : 1, "Fa" : 2, "TA": 3, "Gd": 4, "Ex" : 5},
                       "FireplaceQu" : {"None" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                       "Functional" : {"Sal" : 1, "Sev" : 2, "Maj2" : 3, "Maj1" : 4, "Mod": 5,
                                       "Min2" : 6, "Min1" : 7, "Typ" : 8},
                       "GarageCond" : {"None" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                       "GarageQual" : {"None" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                       "HeatingQC" : {"Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                       "KitchenQual" : {"Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                       "LandSlope" : {"Sev" : 1, "Mod" : 2, "Gtl" : 3},
                       "LotShape" : {"IR3" : 1, "IR2" : 2, "IR1" : 3, "Reg" : 4},
                       "PavedDrive" : {"N" : 0, "P" : 1, "Y" : 2},
                       "PoolQC" : {"None" : 0, "Fa" : 1, "TA" : 2, "Gd" : 3, "Ex" : 4},
                       "Street" : {"Grvl" : 1, "Pave" : 2}})
    
#     df_processed["add_OverallGrade"] = df_processed["OverallQual"] * df_processed["OverallCond"]
#     df_processed["add_GarageGrade"] = df_processed["GarageQual"] * df_processed["GarageCond"]
#     df_processed["add_ExterGrade"] = df_processed["ExterQual"] * df_processed["ExterCond"]
#     df_processed["add_KitchenScore"] = df_processed["KitchenAbvGr"] * df_processed["KitchenQual"]
#     df_processed["add_FireplaceScore"] = df_processed["Fireplaces"] * df_processed["FireplaceQu"]
#     df_processed["add_PoolScore"] = df_processed["PoolArea"] * df_processed["PoolQC"]
#     df_processed['add_GrLivArea*OvQual'] = df_processed['GrLivArea'] * df_processed['OverallQual']
    
    #Get Dummies 
    if dummy:
        df_processed = pd.get_dummies(df_processed, columns=df_processed.select_dtypes(include=['object']).columns, drop_first=True)
    
    #get label encoder. categorical data change to numerical values
    if label_encode:
        le = LabelEncoder()
        categorical_col=df_processed.select_dtypes(include=['object']).columns.to_list()
        
        for label in categorical_col:
            df_processed[label] = le.fit_transform(df_processed[label])
        
    return df_processed

In [189]:

df_raw = pd.read_csv('train.csv')

df_raw_processed=data_process(df_raw,Process_raw_data=True,label_encode=True)


In [122]:
df_test=pd.read_csv('test.csv')
df_test_processed=data_process(df_test, label_encode=False) #It should be TRUE********

col with missing val: Index(['MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'KitchenQual', 'Functional', 'GarageCars',
       'SaleType', 'BsmtBath'],
      dtype='object')


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,BsmtBath,Bath
0,1461,20,RH,80.0,11622,2,0,4,Lvl,AllPub,...,0,MnPrv,,0,6,2010,WD,Normal,0.0,1.0
1,1462,20,RL,81.0,14267,2,0,3,Lvl,AllPub,...,0,,Gar2,12500,6,2010,WD,Normal,0.0,1.5
2,1463,60,RL,74.0,13830,2,0,3,Lvl,AllPub,...,0,MnPrv,,0,3,2010,WD,Normal,0.0,2.5
3,1464,60,RL,78.0,9978,2,0,3,Lvl,AllPub,...,0,,,0,6,2010,WD,Normal,0.0,2.5
4,1465,120,RL,43.0,5005,2,0,3,HLS,AllPub,...,0,,,0,1,2010,WD,Normal,0.0,2.0
5,1466,60,RL,75.0,10000,2,0,3,Lvl,AllPub,...,0,,,0,4,2010,WD,Normal,0.0,2.5
6,1467,20,RL,0.0,7980,2,0,3,Lvl,AllPub,...,0,GdPrv,Shed,500,3,2010,WD,Normal,1.0,2.0
7,1468,60,RL,63.0,8402,2,0,3,Lvl,AllPub,...,0,,,0,5,2010,WD,Normal,0.0,2.5
8,1469,20,RL,85.0,10176,2,0,4,Lvl,AllPub,...,0,,,0,2,2010,WD,Normal,1.0,1.5
9,1470,20,RL,70.0,8400,2,0,4,Lvl,AllPub,...,0,MnPrv,,0,4,2010,WD,Normal,1.0,1.0


In [46]:
import numpy as np
import pandas as pd
import sklearn.ensemble
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor


In [192]:
from sklearn.ensemble import RandomForestRegressor
import math
def train_random_forest(n_estimators,
                        max_features,
                        min_samples_split,
                        min_samples_leaf,
                        max_depth,
                        oob_score=True, version_no=0):
    
    #read and process train.csv
    df_raw = pd.read_csv('train.csv')
    df_raw_processed=data_process(df_raw,Process_raw_data=True,label_encode=True,)

    ## separate the predictors and response in the training data set
    x_train = df_raw_processed.drop('SalePrice',axis=1)
    y_train = df_raw_processed.iloc[:, -1]
    
    x_train.drop('Id',axis=1,inplace=True)
    

    #Train random forest
    rf = RandomForestRegressor( n_estimators=n_estimators,
                max_features=max_features,
               max_depth=max_depth,
               min_samples_split=min_samples_split,
               min_samples_leaf=min_samples_leaf,
               oob_score =oob_score)
    
    # Train the model on train.csv data
    rf.fit(x_train, y_train)
    y_pred_train = rf.predict(x_train)
    
    #calculate rsme for train data
    rmse = math.sqrt(sum((y_train-y_pred_train)**2)/len(x_train))
    print('Rmse: ',rmse)
    
    if oob_score:
        oob = rf.oob_score_
        print('Oob score: ',oob)
        oob_pred = rf.oob_prediction_
        rmse_oob = math.sqrt(sum((y_train-oob_pred)**2)/len(x_train))
        print('Rmse using oob prediction: ', rmse_oob)

        
        
    #read and process test.csv
    df_test=pd.read_csv('test.csv')
    df_test_processed=data_process(df_test, label_encode=False) #It should be TRUE********
    df_test_processed.drop('Id', axis=1, inplace=True)
    
    #use the model to predict test.csv data
    y_test_predicted=rf.predict(df_test_processed)
         
    #Save Submisssion-------------------------------------------------------------------------------------------------------------------------
    file_name = './submissions/ver_'+str(version_no)+'_rf_'+str(n_estimators)+'_'+str(max_features)+'_'+str(max_depth)+'_'+str(rmse).csv'
    submission = pd.DataFrame({'Id':list(range(1461,2920)),'SalePrice':y_test_predicted})
    submission.to_csv(file_name,index=False)
        
        
    return rmse, rmse_oob
    

In [193]:
train_random_forest(200,2,2, 1, 20, False)

Rmse:  0.1019779131342308
Oob score:  0.8201825964223514
Rmse using oob prediction:  0.269718157942685


In [174]:
    #read and process train.csv
    df_raw = pd.read_csv('train.csv')
    df_raw_processed=data_process(df_raw,Process_raw_data=True,label_encode=True)

    ## separate the predictors and response in the training data set
    x_train = df_raw_processed.drop('SalePrice',axis=1)
    y_train = df_raw_processed.iloc[:, -1]
    x_train

['MSSubClass', 'MSZoning', 'LandContour', 'Utilities', 'LotConfig', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation', 'BsmtCond', 'Heating', 'CentralAir', 'Electrical', 'GarageType', 'GarageFinish', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']
col with missing val: Index([], dtype='object')


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,BsmtBath,Bath
0,1,9,3,65.0,8450,2,0,4,3,0,...,0,4,1,0,2,2008,8,4,1.0,2.5
1,2,4,3,80.0,9600,2,0,4,3,0,...,0,4,1,0,5,2007,8,4,0.5,2.0
2,3,9,3,68.0,11250,2,0,3,3,0,...,0,4,1,0,9,2008,8,4,1.0,2.5
3,4,10,3,60.0,9550,2,0,3,3,0,...,0,4,1,0,2,2006,8,0,1.0,1.0
4,5,9,3,84.0,14260,2,0,3,3,0,...,0,4,1,0,12,2008,8,4,1.0,2.5
5,6,8,3,85.0,14115,2,0,3,3,0,...,0,2,3,700,10,2009,8,4,1.0,1.5
6,7,4,3,75.0,10084,2,0,4,3,0,...,0,4,1,0,8,2007,8,4,1.0,2.0
7,8,9,3,0.0,10382,2,0,3,3,0,...,0,4,3,350,11,2009,8,4,1.0,2.5
8,9,8,4,51.0,6120,2,0,4,3,0,...,0,4,1,0,4,2008,8,0,0.0,2.0
9,10,3,3,50.0,7420,2,0,4,3,0,...,0,4,1,0,1,2008,8,4,1.0,1.0
