# House Price - Pipeline, GridSearch, CrossValidation

## 1. Import data and libraries

In [52]:
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.tree import DecisionTreeClassifier

from sklearn.pipeline import make_pipeline

In [53]:
url = "https://drive.google.com/file/d/1EiiFJ7-Zb8-lyDcd2c_Kncnhasw6EbRX/view?usp=drive_linkg"
path = 'https://drive.google.com/uc?export=download&id='+url.split('/')[-2]

data = pd.read_csv(path)
data.head()

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,Expensive
0,8450,65.0,856,3,0,0,2,0,0,0
1,9600,80.0,1262,3,1,0,2,298,0,0
2,11250,68.0,920,3,1,0,2,0,0,0
3,9550,60.0,756,3,1,0,3,0,0,0
4,14260,84.0,1145,4,1,0,3,192,0,0


## 2. Split X and y

In [54]:
y = data.pop("Expensive")

In [55]:
X = data

In [56]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   LotArea       1460 non-null   int64  
 1   LotFrontage   1201 non-null   float64
 2   TotalBsmtSF   1460 non-null   int64  
 3   BedroomAbvGr  1460 non-null   int64  
 4   Fireplaces    1460 non-null   int64  
 5   PoolArea      1460 non-null   int64  
 6   GarageCars    1460 non-null   int64  
 7   WoodDeckSF    1460 non-null   int64  
 8   ScreenPorch   1460 non-null   int64  
dtypes: float64(1), int64(8)
memory usage: 102.8 KB


In [58]:
X.isna().sum()

LotArea           0
LotFrontage     259
TotalBsmtSF       0
BedroomAbvGr      0
Fireplaces        0
PoolArea          0
GarageCars        0
WoodDeckSF        0
ScreenPorch       0
dtype: int64

### 2.1. Convert or drop strings

In [None]:
# another option: X_num = X.drop(columns=["column_name_1", "column_name_2"])

X_num = X.select_dtypes(include="number") # selects datatypes that are float or integer

## 3. Split Train and Test


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_num, y, test_size=0.2, random_state=31416)

In [None]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1168 entries, 772 to 1391
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   LotArea       1168 non-null   int64  
 1   LotFrontage   966 non-null    float64
 2   TotalBsmtSF   1168 non-null   int64  
 3   BedroomAbvGr  1168 non-null   int64  
 4   Fireplaces    1168 non-null   int64  
 5   PoolArea      1168 non-null   int64  
 6   GarageCars    1168 non-null   int64  
 7   WoodDeckSF    1168 non-null   int64  
 8   ScreenPorch   1168 non-null   int64  
dtypes: float64(1), int64(8)
memory usage: 91.2 KB


## EDA - Exploratory Data Analyis  - could be done here

In [None]:
# check and explore the train data

## 4. Build the pipeline

In [None]:
# initialize transformers & model
imputer = SimpleImputer()
scaler = StandardScaler()
dtree = DecisionTreeClassifier()


In [None]:
# create the pipeline
pipe = make_pipeline(imputer,
                     scaler,
                     dtree)

In [None]:
pipe

## 5. Define the SearchGrid

In [62]:
# create parameter grid
param_grid = {
    "simpleimputer__strategy":["constant", "mean", "median"], #"mean", "median"
    "standardscaler__with_mean":[True, False],
    "standardscaler__with_std":[True, False],
    "decisiontreeclassifier__max_depth": range(2, 14),
    "decisiontreeclassifier__min_samples_leaf": range(2, 10),
    "decisiontreeclassifier__criterion":["gini", "entropy"],
    "decisiontreeclassifier__max_leaf_nodes": range(2, 10) # int so it needs numbers -- 2 is included and 10 is not included
}

In [None]:
# # create parameter grid
# param_grid = {
#     "simpleimputer__strategy":["constant", "mean", "median"], #"mean", "median"
#     "standardscaler__with_mean":[True, False],
#     "standardscaler__with_std":[True, False],
#     "decisiontreeclassifier__max_depth": range(2, 50),
#     "decisiontreeclassifier__min_samples_leaf": range(2, 30),
#     "decisiontreeclassifier__criterion":["gini", "entropy"]
# }

In [60]:
# for x in range(2, 10):
#   print(x)

2
3
4
5
6
7
8
9


## 6. Define the Cross-Validation

In [61]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [63]:
# define cross validation
rand_search = RandomizedSearchCV(pipe,
                      param_grid,
                      cv=10,       # 10 folds
                      verbose=1,   # not so much text
                      n_jobs=-2,
                      n_iter=10,
                      random_state=123)  # 100 combinations from our parametergrid

## 7. Build the model

In [64]:
# fit
rand_search.fit(X_train, y_train)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


In [65]:
# cross validation average accuracy
rand_search.best_score_

0.9203801945181256

In [66]:
# best parameters
rand_search.best_params_

{'standardscaler__with_std': False,
 'standardscaler__with_mean': True,
 'simpleimputer__strategy': 'constant',
 'decisiontreeclassifier__min_samples_leaf': 7,
 'decisiontreeclassifier__max_leaf_nodes': 9,
 'decisiontreeclassifier__max_depth': 6,
 'decisiontreeclassifier__criterion': 'gini'}

## 8. Accuracy

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# training accuracy ON the ENTIRE TRAIN-DATA
y_train_pred = rand_search.predict(X_train)

accuracy_score(y_train, y_train_pred)

0.9366438356164384

In [None]:
# testing accuracy
y_test_pred = rand_search.predict(X_test)

accuracy_score(y_test, y_test_pred)

0.9315068493150684

In [96]:
# again we can put it in a dataframe
pd.DataFrame(rand_search.best_estimator_.named_steps['decisiontreeclassifier'].feature_importances_, index = X_num.columns)[0].sort_values()

LotArea         0.000000
PoolArea        0.000000
ScreenPorch     0.000000
LotFrontage     0.035517
WoodDeckSF      0.036661
BedroomAbvGr    0.071784
TotalBsmtSF     0.112779
Fireplaces      0.141375
GarageCars      0.601884
Name: 0, dtype: float64

In [88]:
search.best_estimator_.named_steps['decisiontreeclassifier'].feature_importances_

array([0.        , 0.03339894, 0.13879229, 0.07290807, 0.1435895 ,
       0.        , 0.61131121, 0.        , 0.        ])

In [95]:
# again we can put it in a dataframe
pd.DataFrame(search.best_estimator_.named_steps['decisiontreeclassifier'].feature_importances_, index = X_num.columns)[0].sort_values()

LotArea         0.000000
PoolArea        0.000000
WoodDeckSF      0.000000
ScreenPorch     0.000000
LotFrontage     0.033399
BedroomAbvGr    0.072908
TotalBsmtSF     0.138792
Fireplaces      0.143589
GarageCars      0.611311
Name: 0, dtype: float64

## ADDITION: Now let's use the GridSearchCV to narrow down the parameter tuning

Best results from before:

- 'standardscaler__with_std': False,
- 'standardscaler__with_mean': True,
- 'simpleimputer__strategy': 'constant',
- 'decisiontreeclassifier__min_samples_leaf': 7,
- 'decisiontreeclassifier__max_leaf_nodes': 9,
- 'decisiontreeclassifier__max_depth': 6,
- 'decisiontreeclassifier__criterion': 'gini'}

In [70]:
# new param_grid
narrow_param_grid = {
    "simpleimputer__strategy":["constant", "median"], #"mean", "median"
    "standardscaler__with_mean":[True, False],
    "standardscaler__with_std":[True, False],
    "decisiontreeclassifier__max_depth": range(4, 8),
    "decisiontreeclassifier__min_samples_leaf": range(5, 10),
    "decisiontreeclassifier__criterion":["gini", "entropy"], # you could also just pass one value ["gini"]
    "decisiontreeclassifier__max_leaf_nodes": range(7, 10)
}

In [71]:
# define cross validation
search = GridSearchCV(pipe,
                      narrow_param_grid,
                      cv=10,      # number of folds
                      verbose=1,
                      n_jobs=-2)
#                      n_iter=100 # this is not a parameter of gridsearch

In [72]:
# fit
search.fit(X_train, y_train)

Fitting 10 folds for each of 960 candidates, totalling 9600 fits


In [73]:
# check best_params_ and maybe check best_score_
search.best_params_

{'decisiontreeclassifier__criterion': 'gini',
 'decisiontreeclassifier__max_depth': 5,
 'decisiontreeclassifier__max_leaf_nodes': 9,
 'decisiontreeclassifier__min_samples_leaf': 8,
 'simpleimputer__strategy': 'median',
 'standardscaler__with_mean': True,
 'standardscaler__with_std': True}

In [74]:
search.best_score_

0.9246831712348955

In [75]:
# then we could evaluate on ENTIRE train-data (with.predict())
y_pred_train = search.predict(X_train)

accuracy_score(y_true=y_train, y_pred=y_pred_train)

0.9323630136986302

In [76]:
# then we can evaluate on Test data
y_pred_test = search.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred_test)

0.934931506849315

So we used Randomized Search to find the area in which we expect the optimum.

Then we used Grid Search to find the optimal parameters (in this local optimum).

In this case it did not make so much of a difference.

In [77]:
# from the randomized search
rand_search.best_score_


0.9203801945181256

In [83]:
# Grid search
search.best_score_

0.9246831712348955

## comparing results between Randomized Search and GridSearch

In [82]:
# training accuracy ON the ENTIRE TRAIN-DATA
y_train_pred = rand_search.predict(X_train)

print(f"entire_train_data_score: {accuracy_score(y_train, y_train_pred)}")

# testing accuracy
y_test_pred = rand_search.predict(X_test)

print(f"entire_test_data_score: {accuracy_score(y_test, y_test_pred)}")

entire_train_data_score: 0.9357876712328768
entire_test_data_score: 0.928082191780822


In [84]:
# then we could evaluate on ENTIRE train-data (with.predict())
y_pred_train = search.predict(X_train)
accuracy_score(y_true=y_train, y_pred=y_pred_train)

0.9323630136986302

In [85]:
# then we can evaluate on Test data
y_pred_test = search.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred_test)

0.934931506849315

In [86]:
# Cintias way of storing the scores in a table
Best_Score = [0.92, 0.93]
Train = [0.92, 0.91]
Test = [0.93, 0.93]

results = pd.DataFrame([Best_Score, Train, Test],
                                  columns = ["Rand", "Grid"],
                                  index = ["Best", "Train","Test"])

In [87]:
results

Unnamed: 0,Rand,Grid
Best,0.92,0.93
Train,0.92,0.91
Test,0.93,0.93


In [None]:
# the "best" refers to the best_score_ that is available after CV