In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
dataset = pd.read_csv("dataset/wine-quality.csv")
dataset.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [6]:
# check if any null value is present

dataset.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [7]:
# check if any NaN values is present
dataset.isna().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

##### Seperate the predictors and target value

In [11]:
X = dataset.drop('quality', axis=1)
y = dataset['quality']

#### Split the dataset

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)

In [14]:
clf.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'random_state': None,
 'splitter': 'best'}

In [15]:
X_test.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
4656,6.0,0.29,0.41,10.8,0.048,55.0,149.0,0.9937,3.09,0.59,10.966667
3659,5.4,0.53,0.16,2.7,0.036,34.0,128.0,0.98856,3.2,0.53,13.2
907,7.1,0.25,0.39,2.1,0.036,30.0,124.0,0.9908,3.28,0.43,12.2
4352,7.3,0.28,0.35,1.6,0.054,31.0,148.0,0.99178,3.18,0.47,10.7
3271,6.5,0.32,0.34,5.7,0.044,27.0,91.0,0.99184,3.28,0.6,12.0


In [16]:
predictions = clf.predict(X_test)
predictions

array([7, 8, 9, 5, 7, 6, 5, 6, 6, 5, 7, 5, 7, 5, 8, 5, 6, 6, 7, 6, 5, 6,
       5, 5, 6, 5, 5, 6, 7, 5, 5, 5, 6, 6, 5, 7, 6, 6, 5, 6, 6, 6, 6, 4,
       6, 6, 5, 4, 5, 5, 5, 6, 5, 6, 6, 6, 6, 5, 6, 7, 6, 7, 6, 7, 5, 7,
       6, 4, 6, 6, 6, 6, 6, 5, 6, 6, 6, 7, 8, 7, 6, 5, 6, 6, 6, 6, 5, 7,
       5, 6, 7, 5, 5, 6, 6, 8, 6, 7, 6, 6, 6, 6, 7, 5, 6, 5, 7, 7, 6, 6,
       6, 7, 6, 5, 5, 7, 6, 6, 5, 8, 6, 7, 7, 6, 7, 4, 7, 6, 6, 5, 6, 6,
       6, 8, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 6, 6, 5, 7, 6, 6, 6,
       7, 5, 6, 6, 7, 5, 5, 5, 7, 5, 7, 6, 5, 6, 5, 7, 6, 6, 6, 5, 6, 7,
       6, 6, 7, 6, 6, 5, 7, 6, 5, 6, 7, 6, 6, 6, 6, 5, 7, 6, 5, 5, 7, 6,
       6, 5, 7, 4, 6, 7, 5, 6, 7, 7, 6, 7, 6, 6, 5, 7, 6, 6, 6, 5, 7, 6,
       5, 6, 6, 6, 6, 6, 6, 5, 7, 5, 7, 5, 6, 5, 5, 7, 6, 6, 6, 7, 7, 7,
       6, 5, 7, 5, 4, 5, 4, 6, 6, 4, 5, 5, 7, 6, 7, 5, 6, 5, 6, 5, 5, 5,
       7, 6, 6, 6, 7, 4, 5, 6, 6, 7, 6, 6, 5, 5, 7, 7, 6, 6, 6, 5, 6, 7,
       6, 5, 6, 5, 6, 6, 6, 6, 8, 5, 7, 5, 5, 5, 5,

In [17]:
clf.predict_proba(X_test)

array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [18]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, predictions)

0.6193877551020408

In [20]:
from sklearn.metrics import recall_score
recall_score(y_test, predictions)

ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].