use classification on wine dataset

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score

In [2]:
data = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data", header=None)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [3]:
data.columns = ["Type", "Alcohol", "Malic acid", "Ash", "Alcalinity of ash", "Magnesium", "Total phenols", "Flavanoids","Nonflavanoid phenols", "Proanthocyanins", "Color intensity", "Hue", "OD280/OD315 of diluted wines", "Proline"] 
data.head()

Unnamed: 0,Type,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [4]:
X = data[["Alcohol", "Malic acid", "Ash", "Alcalinity of ash", "Magnesium", "Total phenols", "Flavanoids","Nonflavanoid phenols", "Proanthocyanins", "Color intensity", "Hue", "OD280/OD315 of diluted wines", "Proline"]] 
y = data[["Type"]]

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.33, random_state=42)

In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

In [7]:
best_score = 0
parameters = {}

In [8]:
for depth in range (1,11):
    for min_split in range(2,21):
        for min_leaf in range(1,21):
            for criterion in ["gini", "entropy"]:
                clf = DecisionTreeClassifier(max_depth = depth, min_samples_split = min_split, 
                                             min_samples_leaf = min_leaf, criterion = criterion)
                scores = cross_val_score(clf, X_train, y_train, cv = 5)
                if scores.mean() > best_score:
                    parameters["best_depth"] = depth
                    parameters["best_min_split"] = min_split
                    parameters["best_min_leaf"] = min_leaf
                    parameters["best_criterion"] = criterion
                    best_score = scores.mean()
                    print("Accuracy: {}".format(scores.mean()))
                    print("max_depth: {}, min_samples_split: {}, min_samples_leaf: {}, criterion: {}".format(depth, 
                                                                                                             min_split, 
                                                                                                             min_leaf, 
                                                                                                             criterion))

Accuracy: 0.6564650856389986
max_depth: 1, min_samples_split: 2, min_samples_leaf: 1, criterion: gini
Accuracy: 0.8979117259552043
max_depth: 2, min_samples_split: 2, min_samples_leaf: 1, criterion: gini
Accuracy: 0.9004888010540185
max_depth: 2, min_samples_split: 2, min_samples_leaf: 1, criterion: entropy
Accuracy: 0.9070026350461132
max_depth: 2, min_samples_split: 2, min_samples_leaf: 3, criterion: gini
Accuracy: 0.9091844532279316
max_depth: 2, min_samples_split: 4, min_samples_leaf: 2, criterion: entropy
Accuracy: 0.9236982872200263
max_depth: 3, min_samples_split: 2, min_samples_leaf: 1, criterion: gini
Accuracy: 0.9327891963109355
max_depth: 3, min_samples_split: 2, min_samples_leaf: 3, criterion: gini
Accuracy: 0.9342753623188406
max_depth: 4, min_samples_split: 2, min_samples_leaf: 1, criterion: entropy


In [9]:
parameters

{'best_criterion': 'entropy',
 'best_depth': 4,
 'best_min_leaf': 1,
 'best_min_split': 2}

In [10]:
clf_dtc = DecisionTreeClassifier(criterion = parameters["best_criterion"], max_depth = parameters['best_depth'], 
                                 min_samples_split = parameters["best_min_split"], 
                                 min_samples_leaf = parameters["best_min_leaf"])

In [11]:
clf_dtc.fit(X = X_train, y = y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [12]:
prediction_dtc = clf_dtc.predict(X_test)

In [13]:
results = pd.DataFrame(y_test)
results["y_dtc"] = prediction_dtc

In [14]:
results

Unnamed: 0,Type,y_dtc
19,1,1
45,1,1
140,3,2
30,1,1
67,2,2
16,1,1
119,2,2
174,3,3
109,2,2
141,3,1


In [24]:
accuracy_score(y_true = results["Type"], y_pred = results["y_dtc"])

0.8813559322033898