In [1]:
import pandas as pd

url = "https://drive.google.com/file/d/1tVTh77_j7aettNGZlk3gT1yZXsZLTZyp/view?usp=sharing"
path = 'https://drive.google.com/uc?export=download&id='+url.split('/')[-2]
house_df = pd.read_csv(path)

In [2]:
house_df.head()

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,Expensive
0,8450,65.0,856,3,0,0,2,0,0,0
1,9600,80.0,1262,3,1,0,2,298,0,0
2,11250,68.0,920,3,1,0,2,0,0,0
3,9550,60.0,756,3,1,0,3,0,0,0
4,14260,84.0,1145,4,1,0,3,192,0,0


In [3]:
X = house_df.copy()

In [4]:
y= X.pop("Expensive")

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=31416)

In [6]:
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# 1. initialize transformers & model
imputer = SimpleImputer()
dtree = DecisionTreeClassifier()
s_scaler = StandardScaler()

# 2. add parameters
param_grid = {
    'standardscaler__with_mean': [True, False],
    'simpleimputer__strategy': ['mean', 'constant'],
    'decisiontreeclassifier__max_depth': range(2, 12),
    'decisiontreeclassifier__min_samples_leaf': range(3, 10, 2),
    'decisiontreeclassifier__min_samples_split': range(3, 40, 5),
    'decisiontreeclassifier__criterion':['gini', 'entropy']
    }

# 3. Create a pipeline*
pipe = make_pipeline(imputer, s_scaler, dtree)

# 4. Bring it all together for a gridsearch
search = GridSearchCV(pipe, # you have defined this beforehand
                      param_grid, # your parameter grid
                      cv=7, # the value for K in K-fold Cross Validation
                      scoring='accuracy', # the performance metric to use, 
                      verbose=1) # we want informative outputs during the training process
                      # n_iter = 50) # number of iterations for randomized search

# 5. Fit the pipeline to the training data
search.fit(X_train, y_train)

Fitting 7 folds for each of 2560 candidates, totalling 17920 fits


GridSearchCV(cv=7,
             estimator=Pipeline(steps=[('simpleimputer', SimpleImputer()),
                                       ('standardscaler', StandardScaler()),
                                       ('decisiontreeclassifier',
                                        DecisionTreeClassifier())]),
             param_grid={'decisiontreeclassifier__criterion': ['gini',
                                                               'entropy'],
                         'decisiontreeclassifier__max_depth': range(2, 12),
                         'decisiontreeclassifier__min_samples_leaf': range(3, 10, 2),
                         'decisiontreeclassifier__min_samples_split': range(3, 40, 5),
                         'simpleimputer__strategy': ['mean', 'constant'],
                         'standardscaler__with_mean': [True, False]},
             scoring='accuracy', verbose=1)

In [7]:
search.best_score_

0.9289476125202264

In [8]:
search.best_params_

{'decisiontreeclassifier__criterion': 'entropy',
 'decisiontreeclassifier__max_depth': 5,
 'decisiontreeclassifier__min_samples_leaf': 9,
 'decisiontreeclassifier__min_samples_split': 28,
 'simpleimputer__strategy': 'mean',
 'standardscaler__with_mean': True}

In [9]:
y_train_pred = search.predict(X_train)
y_test_pred = search.predict(X_test)

In [10]:
from sklearn.metrics import accuracy_score

train_accuracy = accuracy_score(y_true = y_train, y_pred = y_train_pred)
test_accuracy = accuracy_score(y_true = y_test, y_pred = y_test_pred)

print(f"The training accuracy is {train_accuracy} and the test accuracy is {test_accuracy}.")

The training accuracy is 0.9340753424657534 and the test accuracy is 0.9383561643835616.
