In [16]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

In [17]:
data = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data", header=None)

In [18]:
data.head() #Using chemical analysis determine the origin of wines

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [19]:
y = data[0]
features = [1,2,3,4,5,6,7,8,9,10,11,12,13]
X = data[features]

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
results = []

In [22]:
skf = StratifiedKFold(n_splits = 5, random_state = 42)

for depth in range (1,11):
    classifier = DecisionTreeClassifier(criterion = "entropy", max_depth = depth)
    scores = cross_val_score(classifier, X_train, y_train, cv = skf)
    results.append(scores.mean())
    print("Accuracy: {} (std {})".format(scores.mean(), scores.std()))     

Accuracy: 0.6125615763546798 (std 0.023542469251918295)
Accuracy: 0.9007389162561577 (std 0.042373123198084534)
Accuracy: 0.8731527093596061 (std 0.056284964974201636)
Accuracy: 0.8938423645320197 (std 0.03959622948592828)
Accuracy: 0.8455665024630543 (std 0.06300606229982669)
Accuracy: 0.8869458128078819 (std 0.041481810651347185)
Accuracy: 0.8938423645320197 (std 0.03959622948592828)
Accuracy: 0.9007389162561577 (std 0.042373123198084534)
Accuracy: 0.9078817733990148 (std 0.043439226965613045)
Accuracy: 0.9078817733990148 (std 0.043439226965613045)


In [23]:
results = pd.Series(results, index = range(1,11)) 

In [24]:
results.max()  

0.9078817733990148

In [25]:
results[results == results.max()] 

9     0.907882
10    0.907882
dtype: float64

In [26]:
best_depth = results[results == results.max()].head(1).index.item()

In [27]:
classifier = DecisionTreeClassifier(criterion = "entropy", max_depth = best_depth)

In [28]:
classifier.fit(X = X_train, y = y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=9,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [29]:
predicted = classifier.predict(X_test)

In [30]:
wyniki = pd.DataFrame({"y_test":y_test,"y_pred":predicted}, index = y_test.index) 

In [31]:
wyniki

Unnamed: 0,y_pred,y_test
19,1,1
45,1,1
140,2,3
30,1,1
67,2,2
16,1,1
119,2,2
174,3,3
109,2,2
141,1,3


In [32]:
accuracy_score(y_true = wyniki["y_test"], y_pred = wyniki["y_pred"])      #accuracy

0.9166666666666666