In [1]:
from sklearn.datasets import load_wine

dataset = load_wine()
data = dataset['data']
labels = dataset['target']
attributes = dataset['feature_names']

In [10]:
import numpy as np
print(f"There are {data.shape[0]} records")

mask = np.isnan(data)
mask = np.sum(mask, axis=0)
print(f"Rows with missing values: {[i for i, v in enumerate(mask) if v is True]}")

distinct_labels = set(labels)
for lab in distinct_labels:
    print(f"Label {lab} contains {len(labels[labels == lab])} elements")
    

There are 178 records
Rows with missing values: []
Label 0 contains 59 elements
Label 1 contains 71 elements
Label 2 contains 48 elements


TRAINING with full dataset

In [None]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz

classifier = DecisionTreeClassifier()
classifier.fit(data, labels)
dot_code = export_graphviz(classifier, feature_names=attributes)
# Visualize the graph obtained by paste the string in the dot_code variable on http://www.webgraphviz.com/
print(dot_code)

TEST with full dataset

In [16]:
from sklearn.metrics import accuracy_score
print(labels)
labels_predicted = classifier.predict(data)
print(labels_predicted)

# ACCURACY WILL BE = 1.0 cause we trained and tested the same dataset (training_set = test_set)
accuracy_score(labels, labels_predicted)


[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]


1.0

TRAINING AND TEST splitting initial dataset

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
X_train, X_test, y_train, y_test = train_test_split(data, labels)

clf = DecisionTreeClassifier()
clf.fit(X_train,y_train)
y_predicted = clf.predict(X_test)
print(accuracy_score(y_test, y_predicted))
print(classification_report(y_test, y_predicted))

0.9333333333333333
              precision    recall  f1-score   support

           0       1.00      0.86      0.92        14
           1       0.87      1.00      0.93        20
           2       1.00      0.91      0.95        11

    accuracy                           0.93        45
   macro avg       0.96      0.92      0.94        45
weighted avg       0.94      0.93      0.93        45



NON DEFAULT CLASSIFIER

In [39]:
from sklearn.model_selection import ParameterGrid
params = {
"max_depth": [None, 2, 4, 8],
"splitter": ["best", "random"],
"min_impurity_decrease": [0.0, 0.01, 0.05]
}

accuracies = []
for config in ParameterGrid(params):
    print(config)
    clf = DecisionTreeClassifier(**config)
    clf.fit(X_train,y_train)
    y_predicted = clf.predict(X_test)
    acc = accuracy_score(y_test, y_predicted)
    accuracies.append(acc)
    print(acc, '\n')

max(accuracies)

{'max_depth': None, 'min_impurity_decrease': 0.0, 'splitter': 'best'}
0.9555555555555556 

{'max_depth': None, 'min_impurity_decrease': 0.0, 'splitter': 'random'}
0.9555555555555556 

{'max_depth': None, 'min_impurity_decrease': 0.01, 'splitter': 'best'}
0.9111111111111111 

{'max_depth': None, 'min_impurity_decrease': 0.01, 'splitter': 'random'}
0.9333333333333333 

{'max_depth': None, 'min_impurity_decrease': 0.05, 'splitter': 'best'}
0.9111111111111111 

{'max_depth': None, 'min_impurity_decrease': 0.05, 'splitter': 'random'}
0.7555555555555555 

{'max_depth': 2, 'min_impurity_decrease': 0.0, 'splitter': 'best'}
0.9111111111111111 

{'max_depth': 2, 'min_impurity_decrease': 0.0, 'splitter': 'random'}
0.8444444444444444 

{'max_depth': 2, 'min_impurity_decrease': 0.01, 'splitter': 'best'}
0.9111111111111111 

{'max_depth': 2, 'min_impurity_decrease': 0.01, 'splitter': 'random'}
0.7333333333333333 

{'max_depth': 2, 'min_impurity_decrease': 0.05, 'splitter': 'best'}
0.9111111111111111

0.9777777777777777

CROSS VALIDATION (K-FOLD)

In [52]:
from sklearn.model_selection import KFold
# Split the datasets into two:
# - X_train_valid: the dataset used for the k-fold cross-validation
# - X_test: the dataset used for the final testing (this will NOT
# be seen by the classifier during the training/validation phases)
X_train_valid, X_test, y_train_valid, y_test = train_test_split(data, labels)
kf = KFold(5) # 5-fold cross-validation
# X and y are the arrays to be split
accuracies = []
for config in ParameterGrid(params):
    clf_acc = []
    count = []
    for train_indices, validation_indices in kf.split(X_train_valid):
        X_train = X_train_valid[train_indices]
        X_valid = X_train_valid[validation_indices]
        y_train = y_train_valid[train_indices]
        y_valid = y_train_valid[validation_indices]
        
        #I keep tracks of how many elements have been used in each split
        count.append(len(train_indices))
        
        clf = DecisionTreeClassifier(**config)
        clf.fit(X_train, y_train)
        acc = accuracy_score(y_valid, clf.predict(X_valid))
        clf_acc.append(acc)
    accuracies.append(np.average(clf_acc, weights=count))
    
best_config = list(ParameterGrid(params))[int(np.argmax(accuracies))]
clf = DecisionTreeClassifier(**best_config)
clf.fit(X_train_valid, y_train_valid)
accuracy_score(y_test, clf.predict(X_test))



0.9555555555555556