In [1]:
import numpy as np
import pandas as pd

df_raw = pd.read_csv('train.csv')
df_raw.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [64]:
import numpy as np
import pandas as pd


from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression 
from math import sqrt
from scipy import stats
from scipy.stats import norm
from sklearn.preprocessing import LabelEncoder


import matplotlib.pyplot as plt
import matplotlib as matplotlib
import seaborn as sns
get_ipython().run_line_magic('matplotlib', 'inline')

def data_process(df_raw,remove_outlier = False,remove_hard_to_fit = False,linear_model = False, get_dummies=False, label_encode=False ):

	# Make a copy so the original dataframe will not be altered.
    df_processed = df_raw.copy()
    
    
	# Remove outliers.
    outlier_list = [524, 1299, 463, 31, 534, 1433, 739, 1159, 108, 1231, 971, 1424]
    df_processed = df_processed.drop(outlier_list)

    
    ## Missing values
    
    # 259 LotFrontage  - replace missing value with 0 
    df_processed.LotFrontage = df_processed.LotFrontage.fillna(0)

    # 1369 Alley - replace with None
    df_processed.Alley = df_processed.Alley.fillna('None')

    # 8 MasVnrType and MasVnrArea - replace MasVnrType with None and MasVnrArea with 0
    df_processed.MasVnrType = df_processed.MasVnrType.fillna('None')
    df_processed.MasVnrArea = df_processed.MasVnrArea.fillna(0)

    # 37 basement: BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2- replace with Nb
    df_processed.BsmtQual = df_processed.BsmtQual.fillna('Nb')
    df_processed.BsmtCond = df_processed.BsmtCond.fillna('Nb')
    df_processed.BsmtExposure = df_processed.BsmtExposure.fillna('Nb')
    df_processed.BsmtFinType1 = df_processed.BsmtFinType1.fillna('Nb')
    df_processed.BsmtFinType2 = df_processed.BsmtFinType2.fillna('Nb')
    df_processed.TotalBsmtSF = df_processed.TotalBsmtSF.fillna(0)
    

    # 690 FireplaceQu - replace with Nf
    df_processed.FireplaceQu = df_processed.FireplaceQu.fillna('Nf')

    # 81 Garage: GarageType, GarageYrBlt, GarageFinish, GarageQual, GarageCond - replace with Ng and year with 0 
    df_processed.GarageType = df_processed.GarageType.fillna('Ng')
    df_processed.GarageFinish = df_processed.GarageFinish.fillna('Ng')
    df_processed.GarageQual = df_processed.GarageQual.fillna('Ng')
    df_processed.GarageCond = df_processed.GarageCond.fillna('Ng')
    df_processed.GarageYrBlt = df_processed.GarageYrBlt.fillna(0)

    # 1453 PoolQC - replace with Np
    df_processed.PoolQC = df_processed.PoolQC.fillna('Np')

    # 1179 Fence - replace with Nf
    df_processed.Fence = df_processed.Fence.fillna('Nf')

    # 1406 MiscFeature - replace with None    
    df_processed.MiscFeature = df_processed.MiscFeature.fillna('None')

    # 1 Electrical
    df_processed = df_processed[pd.notnull(df_processed.Electrical)]

    ## Combine columns and drop multicollinear columns 
    
    # combine bathroom quanlitity 
    df_processed['BsmtBath'] = df_processed.BsmtFullBath + df_processed.BsmtHalfBath * 0.5
    df_processed['Bath'] = df_processed.FullBath + df_processed.HalfBath * 0.5
    df_processed = df_processed.drop(['BsmtFullBath', 'BsmtHalfBath','FullBath','HalfBath'], axis=1)

    # drop TotalBsmtSF - multicollinearaty
    #df_processed = df_processed.drop(['TotalBsmtSF'], axis=1)

    # drop GrLivArea - multicollinearaty
    #df_processed = df_processed.drop(['GrLivArea'], axis=1)

    # drop GarageArea - higher correlation than GarageACars, results are better as well
    df_processed = df_processed.drop(['GarageArea'], axis=1) 
    
    
	# Feature Transformation - take the logarithm of the features.
    #Linear_Num_Cols = ['TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'LotArea', 'GarageArea', 'TotRmsAbvGrd', 'TotalSF', 'BsmtFinSF1']
    df_processed.SalePrice = np.log(df_processed.SalePrice)
    df_processed.GrLivArea = np.log(df_processed.GrLivArea)
    df_processed.TotalBsmtSF = np.log(df_processed.TotalBsmtSF+1)
#     df_processed.LotArea = np.log(df_processed.LotArea) -- performance decreases
#     df_processed.GarageArea = np.log(df_processed.GarageArea)



	# Categorical Features Processsing

	# MSSubClass processing - MSSubClass 20-90 contains only duplicate information with HouseStyle and YearBuilt.
    df_processed['MSSubClass'] = df_processed['MSSubClass'].replace(['20','30','40','45','50','60','70','75','80','85'], '0')

    # Convert numerical to categorical. 
    df_processed[['MSSubClass','OverallQual','OverallCond']] = df_processed[['MSSubClass','OverallQual','OverallCond']].astype(str)

    #Get Dummies 
    
    if get_dummies:
        df_processed = pd.get_dummies(df_processed, columns=df_processed.select_dtypes(include=['object']).columns, drop_first=True)
    
    
    #get label encoder. categorical data change to numerical values
    if label_encode:
        le = LabelEncoder()
        categorical_ordinal_col=df_processed.select_dtypes(include=['object']).columns.to_list()
        df_processed[categorical_ordinal_col]=df_processed[categorical_ordinal_col].apply(lambda col: le.fit_transform(col))

   #---Multiply features: 
#     df_processed["add_OverallGrade"] = df_processed["OverallQual"] * df_processed["OverallCond"]
#     df_processed["add_GarageGrade"] = df_processed["GarageQual"] * df_processed["GarageCond"]
#     df_processed["add_ExterGrade"] = df_processed["ExterQual"] * df_processed["ExterCond"]
#     df_processed["add_KitchenScore"] = df_processed["KitchenAbvGr"] * df_processed["KitchenQual"]
#     df_processed["add_FireplaceScore"] = df_processed["Fireplaces"] * df_processed["FireplaceQu"]
#     df_processed["add_GarageScore"] = df_processed["GarageArea"] * df_processed["GarageQual"]
#     df_processed["add_PoolScore"] = df_processed["PoolArea"] * df_processed["PoolQC"]
#     df_processed['add_GrLivArea*OvQual'] = df_processed['GrLivArea'] * df_processed['OverallQual']
#     df_processed['add_QualOverall*Exter*Kitch*Bsmt*Garage'] = df_processed['OverallQual'] * df_processed['ExterQual'] * df_processed['KitchenQual'] * df_processed['BsmtQual'] * df_processed['GarageQual']



    return df_processed



In [65]:
# categorical_ordinal_col=['Alley',
#  'BldgType',
#  'BsmtCond',
#  'BsmtExposure',
#  'BsmtFinType1',
#  'BsmtFinType2',
#  'BsmtQual',
#  'CentralAir',
#  'Condition1',
#  'Condition2',
#  'Electrical',
#  'ExterCond',
#  'ExterQual',
#  'Exterior1st',
#  'Exterior2nd',
#  'Fence',
#  'FireplaceQu',
#  'Foundation',
#  'Functional',
#  'GarageCond',
#  'GarageFinish',
#  'GarageQual',
#  'GarageType',
#  'Heating',
#  'HeatingQC',
#  'HouseStyle',
#  'KitchenQual',
#  'LandContour',
#  'LandSlope',
#  'LotConfig',
#  'LotShape',
#  'MSSubClass',
#  'MSZoning',
#  'MasVnrType',
#  'MiscFeature',
#  'Neighborhood',
#  'OverallCond',
#  'OverallQual',
#  'PavedDrive',
#  'PoolQC',
#  'RoofMatl',
#  'RoofStyle',
#  'SaleCondition',
#  'SaleType',
#  'Street',
#  'Utilities']

In [66]:
# #List of numerical columns
# numerical_col=['LotFrontage',
#  'LotArea',
#  'YearBuilt',
#  'YearRemodAdd',
#  'MasVnrArea',
#  'BsmtFinSF1',
#  'BsmtFinSF2',
#  'BsmtUnfSF',
#  '1stFlrSF',
#  '2ndFlrSF',
#  'LowQualFinSF',
#  'BedroomAbvGr',
#  'KitchenAbvGr',
#  'TotRmsAbvGrd',
#  'Fireplaces',
#  'GarageYrBlt',
#  'GarageCars',
#  'WoodDeckSF',
#  'OpenPorchSF',
#  'EnclosedPorch',
#  '3SsnPorch',
#  'ScreenPorch',
#  'PoolArea',
#  'MiscVal',
#  'MoSold',
#  'YrSold',
#  'SalePrice',
#  'BsmtBath',
#  'Bath']

In [106]:
#data prep
data_processed_label_encode=data_process(df_raw,label_encode=True)
data_processed_label_encode

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,BsmtBath,Bath
0,1,9,3,65.0,8450,1,1,3,3,0,...,4,1,0,2,2008,8,4,12.247694,1.0,2.5
1,2,4,3,80.0,9600,1,1,3,3,0,...,4,1,0,5,2007,8,4,12.109011,0.5,2.0
2,3,9,3,68.0,11250,1,1,0,3,0,...,4,1,0,9,2008,8,4,12.317167,1.0,2.5
3,4,10,3,60.0,9550,1,1,0,3,0,...,4,1,0,2,2006,8,0,11.849398,1.0,1.0
4,5,9,3,84.0,14260,1,1,0,3,0,...,4,1,0,12,2008,8,4,12.429216,1.0,2.5
5,6,8,3,85.0,14115,1,1,0,3,0,...,2,3,700,10,2009,8,4,11.870600,1.0,1.5
6,7,4,3,75.0,10084,1,1,3,3,0,...,4,1,0,8,2007,8,4,12.634603,1.0,2.0
7,8,9,3,0.0,10382,1,1,0,3,0,...,4,3,350,11,2009,8,4,12.206073,1.0,2.5
8,9,8,4,51.0,6120,1,1,3,3,0,...,4,1,0,4,2008,8,0,11.774520,0.0,2.0
9,10,3,3,50.0,7420,1,1,3,3,0,...,4,1,0,1,2008,8,4,11.678440,1.0,1.0


In [100]:
##Add MLFLOW
import mlflow
import mlflow.sklearn
import math
import logging
logging.basicConfig(level=logging.WARN)
logger = logging.getLogger(__name__)


#Randomized search CV with Random Forrest

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import os
import warnings
import sys
from sklearn.model_selection import RandomizedSearchCV

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


# Split the data into training and test sets. (0.75, 0.25) split.
train, test = train_test_split(data_processed_label_encode)

# The predicted column is "SalePrice" .
train_x = train.drop(["SalePrice"], axis=1)
test_x = test.drop(["SalePrice"], axis=1)
train_y = train[["SalePrice"]]
test_y = test[["SalePrice"]]

def random_search_rf(n_estimators,
                     max_features, max_depth,
                     min_samples_split,min_samples_leaf):
    
    
    rf = RandomForestRegressor(oob_score=True)


    # Create the random grid
    random_grid = {'n_estimators': n_estimators,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf,
                  }
    print(random_grid)

    # Use the random grid to search for best hyperparameters
    # First create the base model to tune
    rf = RandomForestRegressor()
    # Random search of parameters, using 3 fold cross validation, 
    # search across 100 different combinations, and use all available cores
    rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)# Fit the random search model
    rf_random.fit(train_x, train_y.values.ravel())

    print('random forrest best_params per random search')
    print (rf_random.best_params_)


    best_n_estimator=rf_random.best_params_.get('n_estimators')
    best_max_features=rf_random.best_params_.get("max_features")
    best_max_depth=rf_random.best_params_.get('max_depth')
    best_min_samples_split=rf_random.best_params_.get('min_samples_split')
    best_min_samples_leaf=rf_random.best_params_.get("best_min_samples_leaf")
    
    #Train random forest based on the best parameters found
    rf = RandomForestRegressor( n_estimators=best_n_estimator,
                max_features=best_max_features,
                max_depth=best_max_depth,
                min_samples_split=best_min_samples_split,
                min_samples_leaf=best_min_samples_split,
                oob_score =True)

    # Train the model on train.csv data
    rf.fit(train_x, train_y.values.ravel())
    y_pred_train = rf.predict(train_x)

    #calculate rsme for train data
    rmse = math.sqrt(sum((train_y['SalePrice'].to_numpy()-y_pred_train)**2)/len(train_x))
    print('best model parameter Rmse: ',rmse)

    mae = mean_absolute_error(train_y['SalePrice'].to_numpy(), y_pred_train)
    print ('best model parameter mae:', mae)

    r2 = r2_score(train_y['SalePrice'].to_numpy(), y_pred_train)
    print ('best model r2 score:', r2)

    oob = rf.oob_score_
    print('best model parameter Oob score: ',oob)
    oob_pred = rf.oob_prediction_
    rmse_oob = math.sqrt(sum((train_y['SalePrice'].to_numpy()-oob_pred)**2)/len(train_x))
    print('Rmse using oob prediction: ', rmse_oob)

    # #read and process test.csv
    # df_test=pd.read_csv('test.csv')
    # df_test_processed=data_process(df_test, label_encode=False) #It should be TRUE********
    # df_test_processed.drop('Id', axis=1, inplace=True)

    return (rmse, mae, r2, oob, rmse_oob,best_n_estimator,best_max_features, best_max_depth,best_min_samples_split,best_min_samples_leaf)


In [112]:
#run random_search_rf function with start parameters 



# Number of trees in random forest
#n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
n_estimators=[100]
# Number of features to consider at every split
max_features = [45]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4, 10, 15]


random_search_rf(n_estimators,max_features, max_depth,min_samples_split,min_samples_leaf)

{'n_estimators': [100], 'max_features': [45], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4, 10, 15]}
Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    8.9s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   25.6s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   42.9s finished


random forrest best_params per random search
{'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 45, 'max_depth': 20}
best model parameter Rmse:  0.06553619182080966
best model parameter mae: 0.041003928990732956
best model r2 score: 0.9732460383396584
best model parameter Oob score:  0.8691967731274746
Rmse using oob prediction:  0.14490934072663894


(0.06553619182080966,
 0.041003928990732956,
 0.9732460383396584,
 0.8691967731274746,
 0.14490934072663894,
 100,
 45,
 20,
 2,
 None)

In [114]:
##Add MLFLOW, grid search with random forrest
import mlflow
import mlflow.sklearn
import math
import logging
logging.basicConfig(level=logging.WARN)
logger = logging.getLogger(__name__)


#Grid Search with Random Forrest

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import os
import warnings
import sys
from sklearn.model_selection import RandomizedSearchCV

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


# Split the data into training and test sets. (0.75, 0.25) split.
train, test = train_test_split(data_processed_label_encode)

# The predicted column is "SalePrice" .
train_x = train.drop(["SalePrice"], axis=1)
test_x = test.drop(["SalePrice"], axis=1)
train_y = train[["SalePrice"]]
test_y = test[["SalePrice"]]


def grid_search_rf(n_estimators,max_features, max_depth,min_samples_split,min_samples_leaf):

    rf = RandomForestRegressor(oob_score=True)

    
    # Create the random grid
    random_grid = {'n_estimators': n_estimators,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf,
                  }
    
    print(random_grid)

    # Use the random grid to search for best hyperparameters
    # First create the base model to tune
    rf = RandomForestRegressor()
    # Random search of parameters, using 3 fold cross validation, 
    # search across 100 different combinations, and use all available cores
    rf_random = GridSearchCV(estimator = rf, param_grid = random_grid, verbose=2, n_jobs = -1)# Fit the random search model
    rf_random.fit(train_x, train_y.values.ravel())

    print('random forrest best_params per random search')
    print (rf_random.best_params_)

    best_n_estimator=rf_random.best_params_.get('n_estimators')
    best_max_features=rf_random.best_params_.get("max_features")
    best_max_depth=rf_random.best_params_.get('max_depth')
    best_min_samples_split=rf_random.best_params_.get('min_samples_split')
    best_min_samples_leaf=rf_random.best_params_.get("min_samples_leaf")
    
    #Train random forest based on the best parameters found
    rf = RandomForestRegressor( n_estimators=best_n_estimator,
                max_features=best_max_features,
               max_depth=best_max_depth,
               min_samples_split=best_min_samples_split,
               min_samples_leaf=best_min_samples_leaf,
               oob_score =True)

    # Train the model on train.csv data
    rf.fit(train_x, train_y.values.ravel())
    y_pred_train = rf.predict(train_x)

    #calculate rsme for train data
    rmse = math.sqrt(sum((train_y['SalePrice'].to_numpy()-y_pred_train)**2)/len(train_x))
    print('best model parameter Rmse: ',rmse)

    mae = mean_absolute_error(train_y['SalePrice'].to_numpy(), y_pred_train)
    print ('best model parameter mae:', mae)

    r2 = r2_score(train_y['SalePrice'].to_numpy(), y_pred_train)
    print ('best model r2 score:', r2)

    oob = rf.oob_score_
    print('best model parameter Oob score: ',oob)
    oob_pred = rf.oob_prediction_
    rmse_oob = math.sqrt(sum((train_y['SalePrice'].to_numpy()-oob_pred)**2)/len(train_x))
    print('Rmse using oob prediction: ', rmse_oob)

    # #read and process test.csv
    # df_test=pd.read_csv('test.csv')
    # df_test_processed=data_process(df_test, label_encode=False) #It should be TRUE********
    # df_test_processed.drop('Id', axis=1, inplace=True)

    return (rmse, mae, r2, oob, rmse_oob,best_n_estimator,best_max_features, best_max_depth,best_min_samples_split,best_min_samples_leaf)




In [115]:
# #Run grid search for random forest

# # Number of trees in random forest
#n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
n_estimators=[100]
# Number of features to consider at every split
max_features = [45]
# Maximum number of levels in tree
max_depth = [90]
# Minimum number of samples required to split a node
min_samples_split = [2]
# Minimum number of samples required at each leaf node
min_samples_leaf  = [3]

grid_search_rf(n_estimators,max_features, max_depth,min_samples_split,min_samples_leaf)

{'n_estimators': [100], 'max_features': [45], 'max_depth': [90], 'min_samples_split': [2], 'min_samples_leaf': [3]}
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.4s finished


random forrest best_params per random search
{'max_depth': 90, 'max_features': 45, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 100}
best model parameter Rmse:  0.07830183604277392
best model parameter mae: 0.04806335344162513
best model r2 score: 0.9609287133176794
best model parameter Oob score:  0.8637117947583423
Rmse using oob prediction:  0.1462420361201077


(0.07830183604277392,
 0.04806335344162513,
 0.9609287133176794,
 0.8637117947583423,
 0.1462420361201077,
 100,
 45,
 90,
 2,
 3)