##Data Preprocessing

In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [27]:
data = pd.read_csv('/content/datasets_4458_8204_winequality-red.csv')

In [28]:
data

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [51]:
data['quality'].unique()

array([5, 6, 7, 4, 8, 3])

##Separate Features and Target
* Here we are considering 'quality' as target
* and rest of the columns as features

In [29]:
x = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

##Splitting Data into Train and Test sets

In [30]:
from sklearn.model_selection import train_test_split

In [31]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

##Standardization

In [32]:
from sklearn.preprocessing import StandardScaler

In [33]:
sc = StandardScaler()

In [34]:
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

##Train on different algorithms
* we have trained on 5 algorithms
* for 1.LogisticRegression class we have used 1.linear_model library
* for 2.KNeighborsClassifier class we have used 2.neighbors library
* for 3.DecisionTreeClassifier class we have used 3.tree library
* for 4.RandomForestClassifier class we have used 4.ensemble library
* for 5.SVC class we have used 5.svm library

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [36]:
l_cla = LogisticRegression()
k_cla = KNeighborsClassifier()
d_cla = DecisionTreeClassifier()
r_cla = RandomForestClassifier()
s_cla = SVC(kernel='linear')
ks_cla = SVC(kernel='rbf')

In [37]:
l_cla.fit(x_train, y_train)
k_cla.fit(x_train, y_train)
d_cla.fit(x_train, y_train)
r_cla.fit(x_train, y_train)
s_cla.fit(x_train, y_train)
ks_cla.fit(x_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [38]:
l_pred = l_cla.predict(x_test)
k_pred = k_cla.predict(x_test)
d_pred = d_cla.predict(x_test)
r_pred = r_cla.predict(x_test)
s_pred = s_cla.predict(x_test)
ks_pred = ks_cla.predict(x_test)

In [39]:
from sklearn.metrics import confusion_matrix

In [40]:
l_c = confusion_matrix(y_test, l_pred)
k_c = confusion_matrix(y_test, k_pred)
d_c = confusion_matrix(y_test, d_pred)
r_c = confusion_matrix(y_test, r_pred)
s_c = confusion_matrix(y_test, s_pred)
ks_c = confusion_matrix(y_test, ks_pred)

In [41]:
l_c

array([[  0,   3,   6,   1,   0],
       [  0, 104,  32,   0,   0],
       [  0,  52,  71,   6,   0],
       [  0,   3,  26,  10,   0],
       [  0,   0,   3,   3,   0]])

In [42]:
k_c

array([[ 0,  0,  0,  0,  0,  0],
       [ 0,  0,  3,  7,  0,  0],
       [ 1,  2, 88, 42,  3,  0],
       [ 0,  0, 47, 66, 16,  0],
       [ 0,  0, 11, 17, 11,  0],
       [ 0,  0,  0,  3,  3,  0]])

In [43]:
d_c

array([[ 0,  0,  0,  0,  0,  0],
       [ 1,  0,  4,  4,  1,  0],
       [ 1,  5, 98, 30,  2,  0],
       [ 0,  5, 29, 80, 13,  2],
       [ 0,  0,  5, 10, 21,  3],
       [ 0,  0,  1,  3,  1,  1]])

In [44]:
r_c

array([[  0,   0,   0,   0,   0,   0],
       [  1,   0,   4,   5,   0,   0],
       [  0,   0, 111,  24,   1,   0],
       [  0,   0,  36,  85,   8,   0],
       [  0,   0,   1,  20,  18,   0],
       [  0,   0,   0,   3,   2,   1]])

In [45]:
s_c

array([[  0,   4,   6,   0,   0],
       [  0, 107,  29,   0,   0],
       [  0,  55,  74,   0,   0],
       [  0,   3,  36,   0,   0],
       [  0,   0,   6,   0,   0]])

In [46]:
ks_c

array([[  0,   5,   5,   0,   0],
       [  0, 105,  31,   0,   0],
       [  0,  46,  79,   4,   0],
       [  0,   2,  24,  13,   0],
       [  0,   0,   4,   2,   0]])

##Result

In [47]:
from sklearn.metrics import accuracy_score

In [48]:
l_a = accuracy_score(y_test, l_pred)
k_a = accuracy_score(y_test, k_pred)
d_a = accuracy_score(y_test, d_pred)
r_a = accuracy_score(y_test, r_pred)
s_a = accuracy_score(y_test, s_pred)
ks_a = accuracy_score(y_test, ks_pred)

In [49]:
print('Logistic Regression: ' + str(l_a) + '\nKNN: ' + str(k_a) + '\nDecision Tree: ' + str(d_a) + '\nRandom Forest: ' + str(r_a) + '\nLinear SVC: ' + str(s_a) + '\nKernel SVC: ' + str(ks_a))

Logistic Regression: 0.578125
KNN: 0.515625
Decision Tree: 0.625
Random Forest: 0.671875
Linear SVC: 0.565625
Kernel SVC: 0.615625


##Conclusion
Since random forest is giving a better efficiency than compared to others, we are considering the best algorithm for this dataset(using classification) to be random forest.