# Importing libraries

In [2]:
from sklearn.datasets import make_moons

In [3]:
import pandas as pd

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
from sklearn.model_selection  import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Loading dataset

In [6]:
X,y=make_moons(n_samples=10000,noise=0.4,random_state=42)

In [7]:
X

array([[ 0.9402914 ,  0.12230559],
       [ 0.12454026, -0.42477546],
       [ 0.26198823,  0.50841438],
       ...,
       [-0.24177973,  0.20957199],
       [ 0.90679645,  0.54958215],
       [ 2.08837082, -0.05050728]])

In [8]:
X.shape

(10000, 2)

In [9]:
y.shape

(10000,)

# Splitting data into training and testing datasets

In [10]:
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [11]:
X_test.shape

(2000, 2)

# making dictionary of hyperparameters and doing hyperparameter tuning


In [12]:
params = {'max_leaf_nodes': list(range(2, 100)), 'min_samples_split': [2, 3, 4]}
grid_search_cv=GridSearchCV(DecisionTreeClassifier(random_state=42),params,verbose=1,cv=3)

# fitting data into model

In [13]:
grid_search_cv.fit(X_train,y_train)

Fitting 3 folds for each of 294 candidates, totalling 882 fits


GridSearchCV(cv=3, estimator=DecisionTreeClassifier(random_state=42),
             param_grid={'max_leaf_nodes': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                            13, 14, 15, 16, 17, 18, 19, 20, 21,
                                            22, 23, 24, 25, 26, 27, 28, 29, 30,
                                            31, ...],
                         'min_samples_split': [2, 3, 4]},
             verbose=1)

 # some callable functions of gridsearchcv object

In [14]:
dir(grid_search_cv)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_is_fitted',
 '_check_n_features',
 '_check_refit_for_multimetric',
 '_estimator_type',
 '_format_results',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_pairwise',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_required_parameters',
 '_run_search',
 '_validate_data',
 'best_estimator_',
 'best_index_',
 'best_params_',
 'best_score_',
 'classes_',
 'cv',
 'cv_results_',
 'decision_function',
 'error_score',
 'estimator',
 'fit',
 'get_params',
 'inverse_transform',
 'multimetric_',
 'n_features_in_',
 'n_jobs',
 'n_splits

# watching out for all the results

In [15]:
grid_search_cv.cv_results_

{'mean_fit_time': array([0.0047853 , 0.00365639, 0.00397158, 0.00465417, 0.00631618,
        0.00434454, 0.00462357, 0.00500361, 0.00563629, 0.00566673,
        0.00561539, 0.0049967 , 0.00565521, 0.00598558, 0.00533446,
        0.00598399, 0.0063165 , 0.0066479 , 0.00731198, 0.00696548,
        0.00629854, 0.00698129, 0.01547194, 0.00632215, 0.00701356,
        0.00662978, 0.00698137, 0.00598248, 0.00664822, 0.0069805 ,
        0.00598248, 0.00665013, 0.00698113, 0.00732994, 0.00832589,
        0.00731365, 0.00728281, 0.00730189, 0.00776927, 0.00698113,
        0.00698113, 0.00664783, 0.00681774, 0.00799378, 0.00733169,
        0.00729593, 0.00713738, 0.00730069, 0.00763154, 0.00949645,
        0.00733105, 0.00766293, 0.00731937, 0.00705322, 0.00733725,
        0.00727916, 0.00734282, 0.00912786, 0.00796088, 0.00797757,
        0.00797709, 0.00766333, 0.00731548, 0.00829752, 0.00763313,
        0.00732629, 0.00764457, 0.00764656, 0.00930937, 0.00830921,
        0.00766198, 0.00867263,

In [16]:
results_df=pd.DataFrame(grid_search_cv.cv_results_)

In [17]:
results_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_leaf_nodes,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.004785,0.001128,0.000999,0.000816,2,2,"{'max_leaf_nodes': 2, 'min_samples_split': 2}",0.753656,0.782527,0.777194,0.771126,0.012544,292
1,0.003656,0.000469,0.000666,0.000471,2,3,"{'max_leaf_nodes': 2, 'min_samples_split': 3}",0.753656,0.782527,0.777194,0.771126,0.012544,292
2,0.003972,0.000026,0.000000,0.000000,2,4,"{'max_leaf_nodes': 2, 'min_samples_split': 4}",0.753656,0.782527,0.777194,0.771126,0.012544,292
3,0.004654,0.000472,0.000997,0.000002,3,2,"{'max_leaf_nodes': 3, 'min_samples_split': 2}",0.799775,0.826022,0.822206,0.816001,0.011579,289
4,0.006316,0.001879,0.000668,0.000473,3,3,"{'max_leaf_nodes': 3, 'min_samples_split': 3}",0.799775,0.826022,0.822206,0.816001,0.011579,289
...,...,...,...,...,...,...,...,...,...,...,...,...,...
289,0.010970,0.000815,0.001010,0.000016,98,3,"{'max_leaf_nodes': 98, 'min_samples_split': 3}",0.840270,0.847769,0.851463,0.846501,0.004657,281
290,0.010637,0.000459,0.000987,0.000017,98,4,"{'max_leaf_nodes': 98, 'min_samples_split': 4}",0.839895,0.847769,0.851463,0.846376,0.004824,285
291,0.009959,0.000829,0.000666,0.000471,99,2,"{'max_leaf_nodes': 99, 'min_samples_split': 2}",0.839895,0.847769,0.850713,0.846126,0.004567,286
292,0.009658,0.000481,0.000997,0.000814,99,3,"{'max_leaf_nodes': 99, 'min_samples_split': 3}",0.839895,0.847769,0.850713,0.846126,0.004567,286


# best estimator coressponding to best score

In [18]:
grid_search_cv.best_estimator_

DecisionTreeClassifier(max_leaf_nodes=17, random_state=42)

In [19]:
grid_search_cv.best_score_

0.8555001986342105

# HOLA! we got 85.5% accuracy with best estimator 

In [20]:
results_df[results_df["param_max_leaf_nodes"]==17]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_leaf_nodes,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
45,0.007296,0.000447,0.000665,0.00047,17,2,"{'max_leaf_nodes': 17, 'min_samples_split': 2}",0.851894,0.857518,0.857089,0.8555,0.002556,1
46,0.007137,0.000182,0.000646,0.000457,17,3,"{'max_leaf_nodes': 17, 'min_samples_split': 3}",0.851894,0.857518,0.857089,0.8555,0.002556,1
47,0.007301,0.000479,0.001339,0.000465,17,4,"{'max_leaf_nodes': 17, 'min_samples_split': 4}",0.851894,0.857518,0.857089,0.8555,0.002556,1


In [21]:
from sklearn.metrics import accuracy_score
y_pred=grid_search_cv.predict(X_train)
accuracy_score(y_pred,y_train)

0.862875

# making a different model using details we got from best estimator

In [22]:
best_model=DecisionTreeClassifier(max_leaf_nodes=17, random_state=42)

In [23]:
from sklearn.model_selection import cross_val_score,cross_val_predict

In [24]:
cross_val_score(best_model,X_train,y_train,cv=3)

array([0.85189351, 0.85751781, 0.85708927])

In [25]:
cross_val_predict(best_model,X_test,y_test,cv=3)

array([1, 1, 0, ..., 0, 0, 1], dtype=int64)

In [26]:
cross_val_score(best_model,X_test,y_test,cv=3)

array([0.85157421, 0.86656672, 0.85435435])