In [85]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier

# Load the iris dataset and assemble one DataFrame holding the four feature
# columns plus the class label in a trailing "target" column.
raw_data = load_iris()
np_data = np.array(raw_data.data)
columns = raw_data.feature_names
pd_data = pd.DataFrame(data=np_data, columns=columns).assign(
    target=raw_data.target
)
# Bare last expression -> rich (truncated) display of the frame.
pd_data

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [89]:
# Plain (unstratified) 3-fold cross-validation of a decision tree on iris.
#
# accuracy_score is imported in this cell because it is used below; without
# the import the cell raises NameError on a fresh kernel (Restart & Run All).
from sklearn.metrics import accuracy_score

model = DecisionTreeClassifier()
# NOTE: KFold is used without shuffling, so folds are consecutive row ranges.
# The rows of pd_data are ordered by class (target 0 first, target 2 last in
# the table above), so each test fold contains a class that is absent from
# the training folds -- this is why every fold scores 0.0 here.
kfold = KFold(n_splits=3)

# Columns 0-3 are the measurements, column 4 is the "target" label.
features = pd_data.iloc[:, :4]
labels = pd_data.iloc[:, 4]

# One accuracy value (rounded to 4 decimals) per fold.
accuracy_kfold = []

for train_index, test_index in kfold.split(features):
    X_train, X_test = features.iloc[train_index], features.iloc[test_index]
    Y_train, Y_test = labels.iloc[train_index], labels.iloc[test_index]

    # fit() retrains the tree from scratch on this fold's training rows.
    model.fit(X_train, Y_train)
    pred = model.predict(X_test)
    accuracy_kfold.append(np.round(accuracy_score(Y_test, pred), 4))

print(accuracy_kfold)
print(np.array(accuracy_kfold).mean())

    

[0.0, 0.0, 0.0]
0.0


In [94]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# Stratified 3-fold cross-validation: the splitter receives the labels so it
# can take the class of each row into account when forming the folds.
strkfold = StratifiedKFold(n_splits=3)
model = DecisionTreeClassifier()

# Per-fold accuracy, unrounded.
accuracy_strkfold = []

for train_index, test_index in strkfold.split(features, labels):
    X_train = features.iloc[train_index]
    Y_train = labels.iloc[train_index]
    X_test = features.iloc[test_index]
    Y_test = labels.iloc[test_index]

    # fit() returns the estimator itself, so training and prediction chain.
    pred = model.fit(X_train, Y_train).predict(X_test)
    accuracy_strkfold.append(accuracy_score(Y_test, pred))

# Mean accuracy across the three folds.
print(np.mean(accuracy_strkfold))

0.9733333333333333


In [108]:
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris

# ...

# Cross-validate a fresh decision tree with the cross_validate helper
# instead of a hand-written fold loop.
model = DecisionTreeClassifier()
# cv=3 is the number of folds. cross_validate returns a dict; each value is
# an array with one entry per fold (fit_time, score_time, test_score below).
scores = cross_validate(model, features, labels, scoring='accuracy', cv=3)

for k, v in scores.items():
    print(f"{k}: {v}")

# Single quotes are used for the dict key inside the f-string: reusing the
# enclosing double quotes is a SyntaxError on Python versions before 3.12.
# scores['test_score'] is already an array, so .mean() is called directly.
print(f"Average Score: {np.round(scores['test_score'].mean(), 4)}")

fit_time: [0.01067424 0.00817251 0.01255178]
score_time: [0.00871754 0.01278663 0.01072049]
test_score: [0.98 0.92 0.98]
Average Score: 0.96


In [110]:
# Display the current hyperparameter settings of `model` as a dict.
model.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'random_state': None,
 'splitter': 'best'}

In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris

# Rebuild the iris DataFrame: feature columns first, "target" label last.
data = load_iris()
pd_data = pd.DataFrame(
    np.array(data.data),
    columns=data.feature_names,
).assign(target=np.array(data.target))

# Everything except the last column is a feature; the last column is the label.
X_data, Y_data = pd_data.iloc[:, :-1], pd_data.iloc[:, -1]

model = DecisionTreeClassifier()

# Print the model's hyperparameters to see what can be tuned.
print(model.get_params())

{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'random_state': None, 'splitter': 'best'}


In [15]:
# Candidate hyperparameter values for the selected model:
# 3 values of max_depth x 2 values of min_samples_split = 6 combinations.
parameters = dict(
    max_depth=[1, 2, 3],
    min_samples_split=[2, 3],
)

# Wrap the model in a grid search with 3-fold cross-validation; refit=True is
# passed explicitly, as in the original call.
grid = GridSearchCV(estimator=model, param_grid=parameters, refit=True, cv=3)

In [16]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_data,
    Y_data,
    test_size=0.2,
    random_state=21,
)

# Run the grid search on the training portion only.
grid.fit(X_train, Y_train)

In [26]:
# Tabulate the grid-search results and show, best rank first, the parameter
# combination together with its score on each of the three CV splits.
score_df = pd.DataFrame(data=grid.cv_results_)
score_df.loc[
    :,
    [
        "params",
        "rank_test_score",
        "split0_test_score",
        "split1_test_score",
        "split2_test_score",
    ],
].sort_values(by="rank_test_score")


Unnamed: 0,params,rank_test_score,split0_test_score,split1_test_score,split2_test_score
4,"{'max_depth': 3, 'min_samples_split': 2}",1,0.975,1.0,1.0
5,"{'max_depth': 3, 'min_samples_split': 3}",1,0.975,1.0,1.0
2,"{'max_depth': 2, 'min_samples_split': 2}",3,0.95,0.975,1.0
3,"{'max_depth': 2, 'min_samples_split': 3}",3,0.95,0.975,1.0
0,"{'max_depth': 1, 'min_samples_split': 2}",5,0.7,0.675,0.675
1,"{'max_depth': 1, 'min_samples_split': 3}",5,0.7,0.675,0.675
