In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score

In [None]:
# Read the wine dataset into a DataFrame and preview its first rows.
# NOTE(review): the path is relative to the notebook's working directory and
# points outside the project tree — confirm it resolves on other machines.
WINE_CSV = '../../Desktop/wine.csv'
wine = pd.read_csv(WINE_CSV)
wine.head()

In [5]:
# Dimensions of the loaded frame as (rows, columns).
wine.shape

(178, 14)

In [7]:
# Column labels of the frame; the first column ('Grade') is later used as the
# target and the remaining ones (P1..P13) as features.
wine.columns

Index(['Grade', 'P1', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8', 'P9', 'P10',
       'P11', 'P12', 'P13'],
      dtype='object')

In [12]:
# Number of rows that are exact duplicates of an earlier row.
wine.duplicated().sum()

0

In [13]:
# Total count of missing values: per-column null counts summed over all columns.
wine.isnull().sum().sum()

0

In [None]:
# Number of samples per value of the target column, to check class balance.
wine['Grade'].value_counts()

In [None]:
# Materialise the frame as one NumPy array, then slice it by column position:
# column 0 ('Grade') is the target, columns 1..13 (P1..P13) are the features.
wine_values = wine.values
y = wine_values[:, :1]      # kept 2-D: shape (n_samples, 1)
x = wine_values[:, 1:14]

In [20]:
# Hold out 30% of the samples for testing; the fixed random_state makes the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=17,
)

In [None]:
# Baseline KNN classifier trained on the features as-is (no scaling step),
# then scored on the held-out test split.
from sklearn.neighbors import KNeighborsClassifier

# fit() returns the estimator itself, so construction and training are chained.
# ravel() flattens the (n_samples, 1) label column into a 1-D array.
knn = KNeighborsClassifier(n_neighbors=10).fit(x_train, y_train.ravel())

knn_pred = knn.predict(x_test)
# Fraction of test samples whose predicted label matches the true one.
accuracy_score(y_true=y_test, y_pred=knn_pred)

In [None]:
# Pipeline that standardises the features before KNN, plus the grid of
# neighbour counts (1..9) to search over.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

knn_pipe = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('knn', KNeighborsClassifier(n_jobs=-1)),
    ]
)
# The '<step>__<param>' key routes n_neighbors to the 'knn' step of the pipeline.
knn_params = dict(knn__n_neighbors=range(1, 10))

In [31]:
from sklearn.model_selection import GridSearchCV, cross_val_score

# Tune n_neighbors for the scaler+KNN pipeline with 5-fold cross-validation,
# using the training split only so the test split stays untouched.
knn_grid = GridSearchCV(knn_pipe, knn_params, cv=5, n_jobs=-1, verbose=True)

# y_train is an (n_samples, 1) column vector (it was sliced with 0:1). Flatten it
# to 1-D, as the baseline KNN fit does: otherwise KNeighborsClassifier emits a
# DataConversionWarning ("column-vector y was passed") on every fit.
knn_grid.fit(x_train, y_train.ravel())
# Best neighbour count and its mean cross-validated accuracy.
knn_grid.best_params_, knn_grid.best_score_

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    7.1s finished
  self._final_estimator.fit(Xt, y, **fit_params)


({'knn__n_neighbors': 3}, 0.95161290322580649)

In [35]:
# Test-split accuracy of the tuned pipeline; predict() on the fitted grid search
# delegates to the estimator refit with the best parameters.
knn_grid_pred = knn_grid.predict(x_test)
accuracy_score(y_test, knn_grid_pred)

0.98148148148148151

In [34]:
# Baseline decision tree with a fixed depth cap and seed, scored on the
# held-out test split.
from sklearn.tree import DecisionTreeClassifier

# fit() returns the estimator itself, so no separate reassignment is needed.
tree_cl = DecisionTreeClassifier(max_depth=5, random_state=12).fit(x_train, y_train)

tree_pred = tree_cl.predict(x_test)
accuracy_score(y_true=y_test, y_pred=tree_pred)

0.88888888888888884

In [40]:
# Grid-search the decision tree's hyper-parameters: maximum depth (1..10) and
# the number of features considered at each split (4..13), with 5-fold CV.
from sklearn.model_selection import GridSearchCV, cross_val_score

tree_params = dict(
    max_depth=range(1, 11),
    max_features=range(4, 14),
)
tree_grid = GridSearchCV(
    estimator=tree_cl,
    param_grid=tree_params,
    cv=5,
    n_jobs=-1,
    verbose=True,
)

# Kept as the last expression so the fitted search object is displayed.
tree_grid.fit(x_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    6.4s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=12,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_depth': range(1, 11), 'max_features': range(4, 14)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=True)

In [42]:
# Parameter combination that achieved the best cross-validation score in the search.
tree_grid.best_params_

{'max_depth': 3, 'max_features': 8}

In [47]:
# Mean cross-validated accuracy of the best parameter combination. It is
# computed from the 5 folds of the training split, not from the held-out test
# set, and it is not the accuracy of a model scored on its own training data.
tree_grid.best_score_ 

0.94354838709677424

In [46]:
# Test-split accuracy of the tuned tree; predict() on the fitted grid search
# delegates to the estimator refit with the best parameters.
tree_grid_pred = tree_grid.predict(x_test)
accuracy_score(y_test, tree_grid_pred)

0.90740740740740744