In [134]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import metrics, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.decomposition import PCA, KernelPCA
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

%matplotlib inline

# 1- Dividing wines into three classes

## 1.1- Exploratory data analysis

### 1.1.1- Red wine

In [3]:
# Read data with read_csv method of the pandas library.
bankdata_red = pd.read_csv("data/new_format/winequality-red-classes.csv", sep = ',')
# see the rows and columns of the data
bankdata_red.shape

(1599, 12)

In [4]:
# get a feel of how our dataset actually looks
bankdata_red.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,class
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,medium
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,medium
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,medium
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,medium
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,medium


In [5]:
# view some basic statistical details like percentile, mean, std etc.
bankdata_red.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9


### 1.1.2- White wine

In [6]:
# Read data with read_csv method of the pandas library.
bankdata_white = pd.read_csv("data/new_format/winequality-white-classes.csv", sep = ',')
# see the rows and columns of the data
bankdata_white.shape

(4898, 12)

In [7]:
# get a feel of how our dataset actually looks
bankdata_white.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,class
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,medium
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,medium
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,medium
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,medium
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,medium


In [8]:
# view some basic statistical details like mean, std etc.
bankdata_white.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
count,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0
mean,6.854788,0.278241,0.334192,6.391415,0.045772,35.308085,138.360657,0.994027,3.188267,0.489847,10.514267
std,0.843868,0.100795,0.12102,5.072058,0.021848,17.007137,42.498065,0.002991,0.151001,0.114126,1.230621
min,3.8,0.08,0.0,0.6,0.009,2.0,9.0,0.98711,2.72,0.22,8.0
25%,6.3,0.21,0.27,1.7,0.036,23.0,108.0,0.991723,3.09,0.41,9.5
50%,6.8,0.26,0.32,5.2,0.043,34.0,134.0,0.99374,3.18,0.47,10.4
75%,7.3,0.32,0.39,9.9,0.05,46.0,167.0,0.9961,3.28,0.55,11.4
max,14.2,1.1,1.66,65.8,0.346,289.0,440.0,1.03898,3.82,1.08,14.2


## 1.2- Data preprocessing

Data preprocessing involves (1) Dividing the data into attributes and labels and (2) dividing the data into training and testing sets

### 1..2.1- Split dataset in train and test

#### 1.2.1.1- Red wine

In [9]:
# In the first line of the script above, all the columns of the bankdata dataframe 
#are being stored in the X variable except the "quality" column, which is the label column. 
#The drop() method drops this column.
X_red = bankdata_red.drop('class', axis=1)
y_red = bankdata_red['class']
y_red.head()

0    medium
1    medium
2    medium
3    medium
4    medium
Name: class, dtype: object

Once the data is divided into attributes and labels, the final preprocessing step is to divide data into training and test sets. 

In [10]:
# divide data into training and test sets.
X_train_red, X_test_red, y_train_red, y_test_red = train_test_split(X_red, y_red, test_size = 0.20)

X_train_red.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
1568,7.0,0.56,0.13,1.6,0.077,25.0,42.0,0.99629,3.34,0.59,9.2
163,7.4,0.6,0.26,7.3,0.07,36.0,121.0,0.9982,3.37,0.49,9.4
1256,7.5,0.59,0.22,1.8,0.082,43.0,60.0,0.99499,3.1,0.42,9.2
117,7.8,0.56,0.12,2.0,0.082,7.0,28.0,0.997,3.37,0.5,9.4
1266,7.2,0.57,0.05,2.3,0.081,16.0,36.0,0.99564,3.38,0.6,10.3


In [11]:
y_train_red.head()

1568    medium
163     medium
1256    medium
117     medium
1266    medium
Name: class, dtype: object

#### 1.2.1.2- White wine

In [12]:
# In the first line of the script above, all the columns of the bankdata dataframe 
#are being stored in the X variable except the "quality" column, which is the label column. 
#The drop() method drops this column.
X_white = bankdata_white.drop('class', axis=1)
y_white = bankdata_white['class']

In [13]:
# divide data into training and test sets.
X_train_white, X_test_white, y_train_white, y_test_white = train_test_split(X_white, y_white, test_size = 0.20)

In [14]:
X_train_white.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
2532,6.7,0.54,0.27,7.1,0.049,8.0,178.0,0.99502,3.16,0.38,9.4
4285,6.0,0.2,0.26,6.8,0.049,22.0,93.0,0.9928,3.15,0.42,11.0
4264,5.8,0.2,0.16,1.4,0.042,44.0,99.0,0.98912,3.23,0.37,12.2
4470,4.7,0.145,0.29,1.0,0.042,35.0,90.0,0.9908,3.76,0.49,11.3
3314,6.6,0.435,0.38,9.2,0.058,66.0,243.0,0.99833,3.23,0.54,9.1


In [81]:
y_train_white.head()

2532       low
4285    medium
4264    medium
4470    medium
3314    medium
Name: class, dtype: object

### 1.2.2- Data standarization

Standarization of datasets is a common requirement for many machine learning estimators. They might behave badly if the individual features do not more or less look like standard normally distributed data: Gaussian with zero mean and unit variance.

Here we are going to use StandardScaler, that standardize features by removing the mean and scaling to unit variance. The standard score of a sample $x$ is calculated as:

$x_{scaled} = \dfrac{x - \mu}{\sigma}$

where $\mu$ is the mean of the training samples and $\sigma$ is the standard deviation of the training samples.

#### 1.2.2.1- Red wine

In [82]:
X_train_red_scaled = pd.DataFrame(preprocessing.StandardScaler().fit_transform(X_train_red), columns=X_train_red.columns.values)
X_train_red_scaled.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,-0.76065,0.193844,-0.752391,-0.690005,-0.220919,0.862306,-0.143972,-0.238748,0.176682,-0.399161,-1.135697
1,-0.530589,0.416275,-0.079593,3.621041,-0.359835,1.909142,2.256528,0.771182,0.367595,-0.975832,-0.949581
2,-0.473074,0.360668,-0.286608,-0.53874,-0.121693,2.575311,0.402978,-0.926134,-1.350616,-1.379502,-1.135697
3,-0.300528,0.193844,-0.804145,-0.387476,-0.121693,-0.850699,-0.569376,0.136671,0.367595,-0.918165,-0.949581
4,-0.645619,0.249452,-1.166421,-0.160578,-0.141538,0.005804,-0.326288,-0.582441,0.431232,-0.341494,-0.11206


In [86]:
X_train_red_scaled.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
count,1279.0,1279.0,1279.0,1279.0,1279.0,1279.0,1279.0,1279.0,1279.0,1279.0,1279.0
mean,-2.451345e-16,1.916632e-16,1.111091e-16,3.402716e-17,7.777637e-17,-7.916524000000001e-17,-8.333183e-18,6.00753e-15,-6.416551e-16,7.222092e-17,-7.645695e-16
std,1.000391,1.000391,1.000391,1.000391,1.000391,1.000391,1.000391,1.000391,1.000391,1.000391,1.000391
min,-2.141018,-2.252896,-1.425189,-1.219432,-1.510855,-1.4217,-1.23787,-3.527628,-3.641564,-1.667838,-1.88016
25%,-0.7031348,-0.7514869,-0.9076521,-0.4631079,-0.3598354,-0.7555316,-0.7516929,-0.6035913,-0.6506045,-0.6298296,-0.8565229
50%,-0.2430123,-0.02858646,-0.07959305,-0.2362107,-0.1812288,-0.1845301,-0.2655158,0.009769017,-0.01423011,-0.2261597,-0.2981757
75%,0.5046867,0.6109024,0.748466,0.06631878,0.03706812,0.4816384,0.4941359,0.5728972,0.5585068,0.4081786,0.6324031
max,4.358213,5.865833,3.75018,9.747263,10.3764,5.335151,7.027141,3.674068,4.44039,7.731903,4.168602


In [87]:
X_test_red_scaled = pd.DataFrame(preprocessing.StandardScaler().fit_transform(X_test_red), columns=X_test_red.columns.values)
X_test_red_scaled.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,-0.805556,0.835775,-1.268633,-0.084588,0.671518,0.622765,-0.256001,-0.089522,1.076303,-0.549816,0.162323
1,-0.633933,0.350751,-1.068355,-0.084588,-0.292841,0.232393,1.300607,0.365364,1.567525,-0.680919,-0.908339
2,0.395806,-0.505174,1.184767,-0.026125,-0.026811,0.525172,0.903825,1.034314,0.304382,-0.549816,-0.713673
3,0.681845,-0.448112,-0.066968,0.032337,0.239219,-0.157979,-0.042349,0.659702,-1.309634,0.564565,-0.421674
4,-0.061856,0.921368,-0.117037,-0.3769,0.471996,-0.157979,1.086955,0.28509,-0.818411,0.302358,-1.005671


In [88]:
X_test_red_scaled.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
count,320.0,320.0,320.0,320.0,320.0,320.0,320.0,320.0,320.0,320.0,320.0
mean,-2.720046e-16,-1.026956e-16,-7.494005000000001e-17,-1.387779e-16,4.1633360000000003e-17,-1.665335e-17,-7.771561000000001e-17,1.062095e-13,3.633205e-15,1.137979e-16,-2.720046e-16
std,1.001566,1.001566,1.001566,1.001566,1.001566,1.001566,1.001566,1.001566,1.001566,1.001566,1.001566
min,-1.835295,-2.159962,-1.268633,-0.7861364,-1.52323,-1.231501,-1.202175,-2.979388,-2.572776,-2.123058,-1.395003
25%,-0.6911408,-0.847544,-0.8680777,-0.4353621,-0.4591097,-0.8411291,-0.7138269,-0.6286963,-0.7482367,-0.6153673,-0.9083386
50%,-0.2906867,0.008380931,-0.1420718,-0.2599749,-0.1930796,-0.2555715,-0.2407401,-0.02530305,0.02368393,-0.2220566,-0.2270086
75%,0.5102214,0.6360592,0.8843503,-0.02612538,0.2142789,0.6227649,0.3849553,0.6061862,0.6552553,0.4334611,0.6489871
max,3.256192,3.375019,2.636778,7.515522,9.151227,5.11204,7.435475,3.12144,2.690319,6.136466,3.47164


#### 1.2.2.2- White wine

In [89]:
X_train_white_scaled = pd.DataFrame(preprocessing.StandardScaler().fit_transform(X_train_white), columns=X_train_white.columns.values)
X_train_white_scaled.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,-0.181563,2.567784,-0.520425,0.125102,0.131076,-1.609709,0.927951,0.312904,-0.184966,-0.958293,-0.893041
1,-1.01966,-0.759337,-0.603302,0.066489,0.131076,-0.788518,-1.087502,-0.421648,-0.251709,-0.604462,0.410855
2,-1.259116,-0.759337,-1.432073,-0.988543,-0.178855,0.501927,-0.945235,-1.639284,0.282236,-1.046751,1.388777
3,-2.576124,-1.297548,-0.354671,-1.066694,-0.178855,-0.025982,-1.158636,-1.083406,3.819622,0.014743,0.655335
4,-0.301291,1.540291,0.391223,0.535393,0.52956,1.792371,2.46918,1.408115,0.282236,0.457032,-1.137521


In [90]:
X_train_white_scaled.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
count,3918.0,3918.0,3918.0,3918.0,3918.0,3918.0,3918.0,3918.0,3918.0,3918.0,3918.0
mean,2.298655e-16,-1.994888e-16,-2.956061e-16,5.0325580000000003e-17,-3.527324e-16,-6.6194e-17,-2.597888e-16,3.431751e-14,-9.879228e-16,-4.4884970000000007e-17,2.466407e-16
std,1.000128,1.000128,1.000128,1.000128,1.000128,1.000128,1.000128,1.000128,1.000128,1.000128,1.000128
min,-3.533949,-1.933615,-2.758106,-1.144844,-1.507134,-1.961649,-3.055533,-2.304351,-3.121664,-2.373619,-2.033949
25%,-0.6604754,-0.6614804,-0.520425,-0.9299302,-0.4445111,-0.6712044,-0.7081229,-0.769071,-0.652168,-0.6929199,-0.8930407
50%,-0.06183505,-0.1721979,-0.1889166,-0.2265752,-0.1345794,-0.08463881,-0.1153425,-0.09076843,-0.05147978,-0.1621727,-0.1595994
75%,0.5368053,0.414941,0.391223,0.6916937,0.1753522,0.6192399,0.6908387,0.7033419,0.6159516,0.5454901,0.7368289
max,8.798042,8.047747,10.99949,11.59369,13.28103,14.87278,7.140289,14.85836,4.153338,5.233756,3.018646


In [91]:
X_test_white_scaled = pd.DataFrame(preprocessing.StandardScaler().fit_transform(X_test_white), columns=X_test_white.columns.values)
X_test_white_scaled.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,-1.216984,-0.219325,-0.407245,-1.050921,-0.145714,0.01374,-0.809026,-1.191671,1.090126,-0.13463,0.654225
1,0.493309,-0.324687,0.165469,1.433444,0.285379,0.01374,1.456933,1.66861,-0.967681,-0.303997,-1.359829
2,1.063406,-0.113963,0.001837,-0.92773,0.069833,-0.461839,0.404063,-0.399485,-0.903375,-0.388681,-0.070834
3,0.835367,4.100521,-1.634491,-0.948262,1.578659,-0.580733,0.724502,-0.048961,-0.903375,0.119421,-1.03758
4,1.063406,0.412847,-0.07998,-0.763474,-0.415147,-0.996864,-0.809026,-0.925272,0.125529,1.304994,0.895911


In [92]:
X_test_white_scaled.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
count,980.0,980.0,980.0,980.0,980.0,980.0,980.0,980.0,980.0,980.0,980.0
mean,2.537653e-17,-3.588966e-16,-6.525392000000001e-17,-1.522592e-16,-3.443957e-16,-7.975480000000001e-17,2.356392e-16,1.770556e-14,7.757967e-16,7.069175e-17,-1.700227e-15
std,1.000511,1.000511,1.000511,1.000511,1.000511,1.000511,1.000511,1.000511,1.000511,1.000511,1.000511
min,-3.497374,-1.852438,-2.77992,-1.133049,-1.923973,-1.888573,-2.914766,-2.32036,-2.703956,-2.251724,-2.084888
25%,-0.6468862,-0.6407736,-0.570878,-0.9071978,-0.4825055,-0.7590749,-0.7174721,-0.7640309,-0.6461488,-0.7274162,-0.8764559
50%,-0.07678865,-0.1139631,-0.1617961,-0.2296435,-0.1457139,-0.04570767,-0.07659466,-0.1523655,-0.06739056,-0.1346299,-0.07083441
75%,0.6073284,0.4128474,0.492735,0.6121663,0.2853793,0.6676596,0.6329482,0.7143065,0.5756741,0.5428402,0.6542249
max,3.913894,4.363926,4.66537,3.260787,8.422263,4.977587,5.267865,2.509869,4.048224,4.353609,2.668279


## 1.4- SVM

In the case of the simple SVM we used "linear" as the value for the kernel parameter. However, for kernel SVM you can use Gaussian, polynomial, sigmoid, or computable kernel. We will implement linear, polynomial, Gaussian, and sigmoid kernels to see which one works better for our problem.

### 1.4.1- Linear kernel
#### 1.4.1.1- Training the algorithm

We have divided the data into training and testing sets. Now is the time to train our SVM on the training data. 

#### 1.4.1.1.1- Red wine

In [96]:
#  SVC class is called to train the algorithm on the training data
red_linear_svclassifier = SVC(kernel='linear')
red_linear_svclassifier.fit(X_train_red_scaled, y_train_red)

SVC(kernel='linear')

In [97]:
accuracy_score(y_test_red, red_linear_svclassifier.predict(X_test_red_scaled))

0.953125

##### 1.4.1.1.2- White wine

In [98]:
#  SVC class is called to train the algorithm on the training data
white_linear_svclassifier = SVC(kernel='linear')
white_linear_svclassifier.fit(X_train_white_scaled, y_train_white)

SVC(kernel='linear')

In [99]:
accuracy_score(y_test_white, white_linear_svclassifier.predict(X_test_white_scaled))

0.9275510204081633

#### 1.4.1.2- Making predictions

##### 1.4.1.2.1- Red wine

In [100]:
y_pred_red_linear = red_linear_svclassifier.predict(X_test_red_scaled)

##### 1.4.1.2.2- White wine

In [101]:
y_pred_white_linear = white_linear_svclassifier.predict(X_test_white_scaled)

#### 1.4.1.3- Evaluating the algorithm

##### 1.4.1.3.1- Red wine

In [103]:
print(confusion_matrix(y_test_red, y_pred_red_linear))
print(classification_report(y_test_red, y_pred_red_linear))
print(accuracy_score(y_test_red, y_pred_red_linear))

[[  0   0   2]
 [  0   0  13]
 [  0   0 305]]
              precision    recall  f1-score   support

        high       0.00      0.00      0.00         2
         low       0.00      0.00      0.00        13
      medium       0.95      1.00      0.98       305

    accuracy                           0.95       320
   macro avg       0.32      0.33      0.33       320
weighted avg       0.91      0.95      0.93       320

0.953125


  _warn_prf(average, modifier, msg_start, len(result))


##### 1.4.1.3.2- White wine

In [104]:
print(confusion_matrix(y_test_white, y_pred_white_linear))
print(classification_report(y_test_white, y_pred_white_linear))
print(accuracy_score(y_test_white, y_pred_white_linear))

[[  0   0  30]
 [  0   0  41]
 [  0   0 909]]
              precision    recall  f1-score   support

        high       0.00      0.00      0.00        30
         low       0.00      0.00      0.00        41
      medium       0.93      1.00      0.96       909

    accuracy                           0.93       980
   macro avg       0.31      0.33      0.32       980
weighted avg       0.86      0.93      0.89       980

0.9275510204081633


### 1.4.2- Polynomial kernel

#### 1.4.2.1- Training the algorithm

##### 1.4.2.1.1- Red wine

In [105]:
red_poly_svclassifier = SVC(kernel='poly', degree=8)
red_poly_svclassifier.fit(X_train_red_scaled, y_train_red)

SVC(degree=8, kernel='poly')

##### 1.4.2.1.2- White wine

In [106]:
white_poly_svclassifier = SVC(kernel='poly', degree=8)
white_poly_svclassifier.fit(X_train_white_scaled, y_train_white)

SVC(degree=8, kernel='poly')

#### 1.4.2.2- Making predictions

##### 1.4.2.2.1- Red wine

In [107]:
y_pred_red_poly = red_poly_svclassifier.predict(X_test_red_scaled)

##### 1.4.2.2.2- White wine

In [108]:
y_pred_white_poly = white_poly_svclassifier.predict(X_test_white_scaled)

#### 1.4.2.3- Evaluating the algorithm

##### 1.4.2.3.1- Red wine

In [109]:
print(confusion_matrix(y_test_red, y_pred_red_poly))
print(classification_report(y_test_red, y_pred_red_poly))
print(accuracy_score(y_test_red, y_pred_red_poly))

[[  0   0   2]
 [  0   3  10]
 [  2   6 297]]
              precision    recall  f1-score   support

        high       0.00      0.00      0.00         2
         low       0.33      0.23      0.27        13
      medium       0.96      0.97      0.97       305

    accuracy                           0.94       320
   macro avg       0.43      0.40      0.41       320
weighted avg       0.93      0.94      0.93       320

0.9375


##### 1.4.2.3.2- White wine

In [110]:
print(confusion_matrix(y_test_white, y_pred_white_poly))
print(classification_report(y_test_white, y_pred_white_poly))
print(accuracy_score(y_test_white, y_pred_white_poly))

[[  7   0  23]
 [  1   5  35]
 [  8   9 892]]
              precision    recall  f1-score   support

        high       0.44      0.23      0.30        30
         low       0.36      0.12      0.18        41
      medium       0.94      0.98      0.96       909

    accuracy                           0.92       980
   macro avg       0.58      0.45      0.48       980
weighted avg       0.90      0.92      0.91       980

0.9224489795918367


### 1.4.3- Gaussian kernel

#### 1.4.3.1- Training the algorithm

##### 1.4.3.1.1- Red wine

In [111]:
red_gaussian_svclassifier = SVC(kernel='rbf')
red_gaussian_svclassifier.fit(X_train_red_scaled, y_train_red)

SVC()

##### 1.4.3.1.2- White wine

In [112]:
white_gaussian_svclassifier = SVC(kernel='rbf')
white_gaussian_svclassifier.fit(X_train_white_scaled, y_train_white)

SVC()

#### 1.4.3.2- Making prediction

##### 1.4.3.2.1- Red wine

In [113]:
y_pred_red_gaussian= red_gaussian_svclassifier.predict(X_test_red_scaled)

##### 1.4.3.2.2- White wine

In [114]:
y_pred_white_gaussian= white_gaussian_svclassifier.predict(X_test_white_scaled)

#### 1.4.3.3- Evaluating the algorithm

##### 1.4.3.3.1- Red wine

In [115]:
print(confusion_matrix(y_test_red, y_pred_red_gaussian))
print(classification_report(y_test_red, y_pred_red_gaussian))
print(accuracy_score(y_test_red, y_pred_red_gaussian))

[[  0   0   2]
 [  0   0  13]
 [  0   0 305]]
              precision    recall  f1-score   support

        high       0.00      0.00      0.00         2
         low       0.00      0.00      0.00        13
      medium       0.95      1.00      0.98       305

    accuracy                           0.95       320
   macro avg       0.32      0.33      0.33       320
weighted avg       0.91      0.95      0.93       320

0.953125


  _warn_prf(average, modifier, msg_start, len(result))


##### 1.4.3.3.2- White wine

In [117]:
print(confusion_matrix(y_test_white, y_pred_white_gaussian))
print(classification_report(y_test_white, y_pred_white_gaussian))
print(accuracy_score(y_test_white, y_pred_white_gaussian))

[[  0   0  30]
 [  0   0  41]
 [  0   1 908]]
              precision    recall  f1-score   support

        high       0.00      0.00      0.00        30
         low       0.00      0.00      0.00        41
      medium       0.93      1.00      0.96       909

    accuracy                           0.93       980
   macro avg       0.31      0.33      0.32       980
weighted avg       0.86      0.93      0.89       980

0.926530612244898


### 1.4.4- Sigmoid kernel

#### 1.4.4.1- Training the algorithm

##### 1.4.4.1.1- Red wine

In [118]:
red_sigmoid_svclassifier = SVC(kernel='sigmoid')
red_sigmoid_svclassifier.fit(X_train_red_scaled, y_train_red)

SVC(kernel='sigmoid')

##### 1.4.4.1.2- White wine

In [119]:
white_sigmoid_svclassifier = SVC(kernel='sigmoid')
white_sigmoid_svclassifier.fit(X_train_white_scaled, y_train_white)

SVC(kernel='sigmoid')

#### 1.4.4.2- Making prediction

##### 1.4.4.2.1- Red wine

In [120]:
y_pred_red_sigmoid= red_sigmoid_svclassifier.predict(X_test_red_scaled)

##### 1.4.4.2.2- White wine

In [121]:
y_pred_white_sigmoid= red_sigmoid_svclassifier.predict(X_test_white_scaled)

#### 1.4.4.3- Evaluating the algorithm

##### 1.4.4.3.1- Red wine

In [122]:
print(confusion_matrix(y_test_red, y_pred_red_sigmoid))
print(classification_report(y_test_red, y_pred_red_sigmoid))
print(accuracy_score(y_test_red, y_pred_red_sigmoid))

[[  0   0   2]
 [  0   1  12]
 [  0   1 304]]
              precision    recall  f1-score   support

        high       0.00      0.00      0.00         2
         low       0.50      0.08      0.13        13
      medium       0.96      1.00      0.98       305

    accuracy                           0.95       320
   macro avg       0.49      0.36      0.37       320
weighted avg       0.93      0.95      0.94       320

0.953125


  _warn_prf(average, modifier, msg_start, len(result))


##### 1.4.4.3.2- White wine

In [123]:
print(confusion_matrix(y_test_white, y_pred_white_sigmoid))
print(classification_report(y_test_white, y_pred_white_sigmoid))
print(accuracy_score(y_test_white, y_pred_white_sigmoid))

[[  0   0  30]
 [  0   1  40]
 [  4   3 902]]
              precision    recall  f1-score   support

        high       0.00      0.00      0.00        30
         low       0.25      0.02      0.04        41
      medium       0.93      0.99      0.96       909

    accuracy                           0.92       980
   macro avg       0.39      0.34      0.33       980
weighted avg       0.87      0.92      0.89       980

0.9214285714285714


## 1.5- Decision Tree

A decision tree is one of most frequently and widely used supervised machine learning algorithms that can perform both regression and classification tasks. The intuition behind the decision tree algorithm is simple, yet also very powerful.

For each attribute in the dataset, the decision tree algorithm forms a node, where the most important attribute is placed at the root node. For evaluation we start at the root node and work our way down the tree by following the corresponding node that meets our condition or "decision". This process continues until a leaf node is reached, which contains the prediction or the outcome of the decision tree.

There are several advantages of using decision treess for predictive analysis:

1- Decision trees can be used to predict both continuous and discrete values i.e. they work well for both regression and classification tasks.

2- They require relatively less effort for training the algorithm.

3- They can be used to classify non-linearly separable data.

4- They're very fast and efficient compared to KNN and other classification algorithms.

### 1.5.1- Training the algorithm

#### 1.5.1.1- Red wine

In [124]:
red_tree_classifier = DecisionTreeClassifier()
red_tree_classifier.fit(X_train_red_scaled, y_train_red)

DecisionTreeClassifier()

#### 1.5.1.2- White wine

In [125]:
white_tree_classifier = DecisionTreeClassifier()
white_tree_classifier.fit(X_train_white_scaled, y_train_white)

DecisionTreeClassifier()

### 1.5.2- Making prediction

#### 1.5.2.1- Red wine

In [126]:
y_pred_red_tree = red_tree_classifier.predict(X_test_red_scaled)

In [127]:
df=pd.DataFrame({'Actual':y_test_red, 'Predicted':y_pred_red_tree})
df

Unnamed: 0,Actual,Predicted
31,medium,medium
883,medium,medium
592,medium,medium
552,medium,medium
360,medium,medium
...,...,...
734,medium,medium
1224,medium,medium
1187,medium,medium
1183,medium,medium


#### 1.5.2.2- White wine

In [128]:
y_pred_white_tree = white_tree_classifier.predict(X_test_white_scaled)

In [129]:
df=pd.DataFrame({'Actual':y_test_white, 'Predicted':y_pred_white_tree})
df

Unnamed: 0,Actual,Predicted
4647,medium,medium
1397,medium,medium
995,medium,medium
23,medium,low
616,medium,medium
...,...,...
1080,medium,medium
2670,medium,medium
2044,medium,medium
1724,medium,medium


### 1.5.3-  Evaluating the algorithm

#### 1.5.3.1- Red wine

In [130]:
print(confusion_matrix(y_test_red, y_pred_red_tree))
print(classification_report(y_test_red, y_pred_red_tree))
print(accuracy_score(y_test_red, y_pred_red_tree))

[[  0   0   2]
 [  0   3  10]
 [  8  12 285]]
              precision    recall  f1-score   support

        high       0.00      0.00      0.00         2
         low       0.20      0.23      0.21        13
      medium       0.96      0.93      0.95       305

    accuracy                           0.90       320
   macro avg       0.39      0.39      0.39       320
weighted avg       0.92      0.90      0.91       320

0.9


#### 1.5.3.2- White wine

In [131]:
print(confusion_matrix(y_test_white, y_pred_white_tree))
print(classification_report(y_test_white, y_pred_white_tree))
print(accuracy_score(y_test_white, y_pred_white_tree))

[[  9   1  20]
 [  2  13  26]
 [ 32  30 847]]
              precision    recall  f1-score   support

        high       0.21      0.30      0.25        30
         low       0.30      0.32      0.31        41
      medium       0.95      0.93      0.94       909

    accuracy                           0.89       980
   macro avg       0.48      0.52      0.50       980
weighted avg       0.90      0.89      0.89       980

0.886734693877551


## 1.6- Random Forest

### 1.6.1- Red wine


In [132]:
rf_clf = RandomForestClassifier(random_state=0)
rf_clf.fit(X_train_red_scaled, y_train_red)
print(accuracy_score(y_test_red, rf_clf.predict(X_test_red_scaled)))

0.95625


### 1.6.1- White wine

In [133]:
rf_clf = RandomForestClassifier(random_state=0)
rf_clf.fit(X_train_white_scaled, y_train_white)
print(accuracy_score(y_test_white, rf_clf.predict(X_test_white_scaled)))

0.9346938775510204


## 1.7- K-Means

### 1.7.1- Red wine

In [80]:
X_kmeans = X_train_red[['PCA_1', 'PCA_2', 'PCA_3']]

kmeans = KMeans(n_clusters= 3, random_state=0, init='k-means++').fit(X_kmeans)
y_kmeans= kmeans.fit_predict(X_kmeans)

print('K-Means cluster center 1: ' + str(kmeans.cluster_centers_[:,0]))
print('K-Means cluster center 2: ' + str(kmeans.cluster_centers_[:,1]))
print('K-Means cluster center 3: ' + str(kmeans.cluster_centers_[:,2]))

KeyError: "None of [Index(['PCA_1', 'PCA_2', 'PCA_3'], dtype='object')] are in the [columns]"

### 1.7.2- White wine

# 2- Low alcohol vs high alcohol content

## 2.1- Exploratory data analysis

### 2.1.1- Red wine

In [135]:
# Read data with read_csv method of the pandas library.
alcohol_bankdata_red = pd.read_csv("data/new_format/winequality-red-alcohol-content.csv", sep = ',')
# see the rows and columns of the data
alcohol_bankdata_red.shape

(1599, 11)

In [137]:
# get a feel of how our dataset actually looks
alcohol_bankdata_red.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol content
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,low
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,low
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,low
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,low
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,low


In [138]:
# view some basic statistical details like percentile, mean, std etc.
alcohol_bankdata_red.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0


### 2.1.2- White wine

In [139]:
# Read data with read_csv method of the pandas library.
alcohol_bankdata_white = pd.read_csv("data/new_format/winequality-white-alcohol-content.csv", sep = ',')
# see the rows and columns of the data
alcohol_bankdata_white.shape

(4898, 11)

In [140]:
# get a feel of how our dataset actually looks
alcohol_bankdata_white.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol content
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,low
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,low
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,low
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,low
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,low


In [141]:
# view some basic statistical details like mean, std etc.
alcohol_bankdata_white.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates
count,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0
mean,6.854788,0.278241,0.334192,6.391415,0.045772,35.308085,138.360657,0.994027,3.188267,0.489847
std,0.843868,0.100795,0.12102,5.072058,0.021848,17.007137,42.498065,0.002991,0.151001,0.114126
min,3.8,0.08,0.0,0.6,0.009,2.0,9.0,0.98711,2.72,0.22
25%,6.3,0.21,0.27,1.7,0.036,23.0,108.0,0.991723,3.09,0.41
50%,6.8,0.26,0.32,5.2,0.043,34.0,134.0,0.99374,3.18,0.47
75%,7.3,0.32,0.39,9.9,0.05,46.0,167.0,0.9961,3.28,0.55
max,14.2,1.1,1.66,65.8,0.346,289.0,440.0,1.03898,3.82,1.08


## 2.2- Data preprocessing

### 2.2.1- Split dataset in train and test

#### 2.2.1.1- Red wine

In [148]:
# In the first line of the script above, all the columns of the bankdata dataframe 
#are being stored in the X variable except the "quality" column, which is the label column. 
#The drop() method drops this column.
X_alcohol_red = alcohol_bankdata_red.drop('alcohol content', axis=1)
y_alcohol_red = alcohol_bankdata_red['alcohol content']

In [143]:
# divide data into training and test sets.
X_train_alcohol_red, X_test_alcohol_red, y_train_alcohol_red, y_test_alcohol_red = train_test_split(X_alcohol_red, y_alcohol_red, test_size = 0.20)
X_train_alcohol_red.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates
885,8.9,0.75,0.14,2.5,0.086,9.0,30.0,0.99824,3.34,0.64
558,10.9,0.53,0.49,4.6,0.118,10.0,17.0,1.0002,3.07,0.56
423,10.5,0.24,0.47,2.1,0.066,6.0,24.0,0.9978,3.15,0.9
171,8.0,0.42,0.17,2.0,0.073,6.0,18.0,0.9972,3.29,0.61
1570,6.4,0.36,0.53,2.2,0.23,19.0,35.0,0.9934,3.37,0.93


In [144]:
y_train_alcohol_red.head()

885     high
558     high
423     high
171      low
1570    high
Name: alcohol content, dtype: object

#### 2.2.1.2- White wine

In [149]:
# In the first line of the script above, all the columns of the bankdata dataframe 
#are being stored in the X variable except the "quality" column, which is the label column. 
#The drop() method drops this column.
X_alcohol_white = alcohol_bankdata_white.drop('alcohol content', axis=1)
y_alcohol_white = alcohol_bankdata_white['alcohol content']

In [146]:
# divide data into training and test sets.
X_train_alcohol_white, X_test_alcohol_white, y_train_alcohol_white, y_test_alcohol_white = train_test_split(X_alcohol_white, y_alcohol_white, test_size = 0.20)
X_train_alcohol_white.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates
3451,6.6,0.21,0.29,5.35,0.029,43.0,106.0,0.99112,2.93,0.43
1089,7.0,0.17,0.33,4.0,0.034,17.0,127.0,0.9934,3.19,0.39
693,5.9,0.37,0.14,6.3,0.036,34.0,185.0,0.9944,3.17,0.63
1738,7.5,0.15,0.38,1.8,0.054,19.0,101.0,0.9946,3.24,0.44
4239,5.7,0.28,0.36,1.8,0.041,38.0,90.0,0.99002,3.27,0.98


In [147]:
y_train_alcohol_white.head()

3451    high
1089    high
693      low
1738     low
4239    high
Name: alcohol content, dtype: object

### 2.2.2- Data standarization

Standarization of datasets is a common requirement for many machine learning estimators. They might behave badly if the individual features do not more or less look like standard normally distributed data: Gaussian with zero mean and unit variance.

Here we are going to use StandardScaler, that standardize features by removing the mean and scaling to unit variance. The standard score of a sample $x$ is calculated as:

$x_{scaled} = \dfrac{x - \mu}{\sigma}$

where $\mu$ is the mean of the training samples and $\sigma$ is the standard deviation of the training samples.

#### 2.2.2.1- Red wine

In [150]:
X_train_alcohol_red_scaled = pd.DataFrame(preprocessing.StandardScaler().fit_transform(X_train_alcohol_red), columns=X_train_alcohol_red.columns.values)
X_train_alcohol_red_scaled.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates
0,0.345752,1.232888,-0.665878,-0.026318,-0.037008,-0.665879,-0.505725,0.79658,0.180564,-0.090055
1,1.499613,0.010836,1.142643,1.422223,0.627612,-0.569292,-0.909698,1.836225,-1.547657,-0.577847
2,1.268841,-1.600051,1.039299,-0.302231,-0.452395,-0.955642,-0.692174,0.56319,-1.035591,1.495271
3,-0.173485,-0.60019,-0.510862,-0.371209,-0.30701,-0.955642,-0.878623,0.244931,-0.139477,-0.272977
4,-1.096573,-0.933477,1.349331,-0.233253,2.95378,0.299995,-0.350351,-1.770707,0.372589,1.678194


In [151]:
X_train_alcohol_red_scaled.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates
count,1279.0,1279.0,1279.0,1279.0,1279.0,1279.0,1279.0,1279.0,1279.0,1279.0
mean,1.111091e-15,1.930521e-16,-4.0277050000000006e-17,2.722173e-16,1.88191e-16,6.249887000000001e-17,-7.777637e-17,-3.859097e-14,-3.083278e-15,-1.777746e-16
std,1.000391,1.000391,1.000391,1.000391,1.000391,1.000391,1.000391,1.000391,1.000391,1.000391
min,-2.135048,-2.266624,-1.389287,-1.129969,-1.573941,-1.438579,-1.251521,-3.537043,-3.659927,-1.980251
25%,-0.6927222,-0.7668334,-0.9242384,-0.4401873,-0.3693179,-0.8107606,-0.7543236,-0.6037584,-0.6515423,-0.6388215
50%,-0.2311781,-0.04471186,-0.097486,-0.2332529,-0.1823937,-0.1829422,-0.2571265,0.006237439,-0.01146042,-0.2120029
75%,0.5188312,0.6079749,0.7809384,0.04265967,0.04606934,0.5897573,0.519744,0.5631901,0.5646133,0.3977379
max,4.384264,5.843356,3.777916,8.94084,10.86691,5.41913,7.542653,3.68743,4.469113,8.20242


In [152]:
X_test_alcohol_red_scaled = pd.DataFrame(preprocessing.StandardScaler().fit_transform(X_test_alcohol_red), columns=X_test_alcohol_red.columns.values)
X_test_alcohol_red_scaled.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates
0,-0.280286,-0.153874,0.253873,-0.438174,-0.052085,-0.073516,2.693521,0.010355,-0.739965,-0.643422
1,-0.110523,1.447418,-1.402025,-0.276309,0.065954,-0.257667,-0.597457,0.538814,0.42043,-1.119419
2,-1.355452,-0.153874,-1.402025,-0.923769,-0.689496,-0.073516,-0.597457,-1.215672,0.352171,-1.172307
3,-0.959338,0.989906,-1.000595,0.978146,-0.453418,-0.718045,-0.878737,-1.818117,0.829981,-0.802088
4,-0.053935,1.847741,-0.699523,-0.033511,0.160386,-0.257667,0.190128,0.242877,0.352171,-0.696311


In [153]:
X_test_alcohol_red_scaled.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates
count,320.0,320.0,320.0,320.0,320.0,320.0,320.0,320.0,320.0,320.0
mean,3.941292e-16,5.218048e-16,1.554312e-16,-5.828671000000001e-17,8.326673e-17,-1.665335e-17,-2.0816680000000002e-17,-4.584111e-14,-5.967449000000001e-17,-3.302913e-16
std,1.001566,1.001566,1.001566,1.001566,1.001566,1.001566,1.001566,1.001566,1.001566,1.001566
min,-1.864742,-1.926734,-1.402025,-1.004702,-1.751849,-1.178422,-1.160017,-3.244959,-2.651205,-1.489639
25%,-0.7329876,-0.7829534,-0.9504162,-0.5191064,-0.3825944,-0.8101201,-0.7662247,-0.5986955,-0.6717068,-0.5905336
50%,-0.2236983,-0.09668518,-0.02211,-0.2763088,-0.1701239,-0.2576674,-0.2739845,-0.01342618,0.01087871,-0.2732024
75%,0.5119419,0.646772,0.8058388,0.04742141,0.11317,0.4789362,0.3870239,0.5916604,0.5569471,0.4143486
max,4.076967,3.763573,2.411558,9.111867,8.918891,4.806482,6.490803,3.392498,2.604704,6.760973


#### 2.2.2.2- White wine

In [154]:
X_train_alcohol_white_scaled = pd.DataFrame(preprocessing.StandardScaler().fit_transform(X_train_alcohol_white), columns=X_train_alcohol_white.columns.values)
X_train_alcohol_white_scaled.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates
0,-0.316513,-0.670103,-0.369193,-0.203457,-0.777965,0.449297,-0.77048,-0.972769,-1.687436,-0.52286
1,0.161344,-1.066325,-0.040686,-0.469891,-0.54588,-1.079801,-0.279599,-0.211232,0.018844,-0.874379
2,-1.152763,0.914784,-1.601092,-0.015966,-0.453046,-0.080006,1.076167,0.122776,-0.112409,1.234736
3,0.758665,-1.264436,0.369947,-0.904081,0.382461,-0.962178,-0.887357,0.189578,0.346974,-0.43498
4,-1.391691,0.023285,0.205694,-0.904081,-0.220961,0.155239,-1.144485,-1.340178,0.543853,4.310529


In [155]:
X_train_alcohol_white_scaled.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates
count,3918.0,3918.0,3918.0,3918.0,3918.0,3918.0,3918.0,3918.0,3918.0,3918.0
mean,8.78204e-16,4.134858e-16,-4.814934e-16,-3.627069e-18,-3.300632e-16,1.809e-16,-1.659384e-16,-4.389025e-14,2.024811e-15,-6.438047e-17
std,1.000128,1.000128,1.000128,1.000128,1.000128,1.000128,1.000128,1.000128,1.000128,1.000128
min,-3.661512,-1.957825,-2.750865,-1.140912,-1.706305,-1.961972,-2.827505,-2.31214,-3.065585,-2.368336
25%,-0.6749058,-0.6701035,-0.5334458,-0.9238168,-0.4530457,-0.7269322,-0.7237298,-0.7656846,-0.703044,-0.6986197
50%,-0.07758467,-0.1748261,-0.1228126,-0.2478623,-0.1281266,-0.08000637,-0.09259725,-0.09432905,-0.0467825,-0.1713409
75%,0.6093347,0.4195068,0.4520738,0.6846588,0.1967925,0.6257309,0.678787,0.6972693,0.609479,0.5316974
max,4.58152,8.145834,10.88216,11.72689,13.93623,14.91691,7.036863,15.01284,4.087665,5.189327


In [156]:
X_test_alcohol_white_scaled = pd.DataFrame(preprocessing.StandardScaler().fit_transform(X_test_alcohol_white), columns=X_test_alcohol_white.columns.values)
X_test_alcohol_white_scaled.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates
0,-0.016323,-1.604953,-0.179346,-0.242342,-0.035649,-0.358523,-0.386973,0.064751,1.495826,-0.270792
1,0.328936,-0.505719,0.499262,-0.94964,-0.600543,-1.357852,-1.720593,-1.413187,0.393846,-0.790797
2,-0.937015,0.593515,-0.348998,-0.065517,0.007804,-0.358523,-0.047506,0.205826,1.977942,0.682551
3,1.709973,1.093167,3.128867,0.818605,-0.035649,-0.123387,0.801162,1.206793,-0.708135,0.682551
4,-0.476669,-1.105301,-0.09452,-0.792462,0.094711,0.346886,1.552839,-0.069607,2.11569,0.075878


In [157]:
X_test_alcohol_white_scaled.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates
count,980.0,980.0,980.0,980.0,980.0,980.0,980.0,980.0,980.0,980.0
mean,6.706653000000001e-17,-2.782355e-16,1.522592e-16,1.232574e-16,-1.812609e-16,-1.740105e-16,-2.247635e-16,-1.247075e-15,-1.196322e-15,-1.178196e-16
std,1.000511,1.000511,1.000511,1.000511,1.000511,1.000511,1.000511,1.000511,1.000511,1.000511
min,-2.778398,-2.004674,-2.808952,-1.126464,-1.426157,-1.828124,-3.078461,-2.279796,-2.361106,-2.09081
25%,-0.5917558,-0.7055797,-0.5186504,-0.9496395,-0.4267297,-0.6524431,-0.6779443,-0.7951403,-0.7081349,-0.7041294
50%,-0.1314099,-0.1059976,-0.09452039,-0.2325181,-0.1660096,-0.1233866,-0.09600088,-0.1031969,-0.08827088,-0.09745674
75%,0.5591088,0.4935846,0.4144356,0.7203692,0.1816173,0.5820222,0.6799237,0.6693614,0.6004669,0.4442152
max,8.500074,5.340207,5.673647,3.854091,9.784809,6.078332,3.298669,3.003831,4.319651,4.409254


## 2.3- SVM

In the case of the simple SVM we used "linear" as the value for the kernel parameter. However, for kernel SVM you can use Gaussian, polynomial, sigmoid, or computable kernel. We will implement linear, polynomial, Gaussian, and sigmoid kernels to see which one works better for our problem.

### 2.3.1- Linear kernel
#### 2.3.1.1- Training the algorithm

We have divided the data into training and testing sets. Now is the time to train our SVM on the training data. 

#### 2.3.1.1.1- Red wine

In [158]:
#  SVC class is called to train the algorithm on the training data
alcohol_red_linear_svclassifier = SVC(kernel='linear')
alcohol_red_linear_svclassifier.fit(X_train_alcohol_red_scaled, y_train_alcohol_red)

SVC(kernel='linear')

In [159]:
accuracy_score(y_test_alcohol_red, alcohol_red_linear_svclassifier.predict(X_test_alcohol_red_scaled))

0.828125

#### 2.3.1.1.2- White wine

In [160]:
#  SVC class is called to train the algorithm on the training data
alcohol_white_linear_svclassifier = SVC(kernel='linear')
alcohol_white_linear_svclassifier.fit(X_train_alcohol_white_scaled, y_train_alcohol_white)

SVC(kernel='linear')

In [161]:
accuracy_score(y_test_alcohol_white, alcohol_white_linear_svclassifier.predict(X_test_alcohol_white_scaled))

0.9377551020408164

#### 2.3.1.2- Making predictions

##### 2.3.1.2.1- Red wine

In [165]:
y_pred_alcohol_red_linear = alcohol_red_linear_svclassifier.predict(X_test_alcohol_red_scaled)

##### 2.3.1.2.2- White wine

In [164]:
y_pred_alcohol_white_linear = alcohol_white_linear_svclassifier.predict(X_test_alcohol_white_scaled)

#### 2.3.1.3- Evaluating the algorithm

##### 2.3.1.3.1- Red wine

In [166]:
print(confusion_matrix(y_test_alcohol_red, y_pred_alcohol_red_linear))
print(classification_report(y_test_alcohol_red, y_pred_alcohol_red_linear))
print(accuracy_score(y_test_alcohol_red, y_pred_alcohol_red_linear))

[[106  38]
 [ 17 159]]
              precision    recall  f1-score   support

        high       0.86      0.74      0.79       144
         low       0.81      0.90      0.85       176

    accuracy                           0.83       320
   macro avg       0.83      0.82      0.82       320
weighted avg       0.83      0.83      0.83       320

0.828125


##### 2.3.1.3.2- White wine

In [167]:
print(confusion_matrix(y_test_alcohol_white, y_pred_alcohol_white_linear))
print(classification_report(y_test_alcohol_white, y_pred_alcohol_white_linear))
print(accuracy_score(y_test_alcohol_white, y_pred_alcohol_white_linear))

[[392  37]
 [ 24 527]]
              precision    recall  f1-score   support

        high       0.94      0.91      0.93       429
         low       0.93      0.96      0.95       551

    accuracy                           0.94       980
   macro avg       0.94      0.94      0.94       980
weighted avg       0.94      0.94      0.94       980

0.9377551020408164


### 2.3.2- Polynomial kernel

#### 2.3.2.1- Training the algorithm

##### 2.3.2.1.1- Red wine

In [168]:
red_alcohol_poly_svclassifier = SVC(kernel='poly', degree=8)
red_alcohol_poly_svclassifier.fit(X_train_alcohol_red_scaled, y_train_alcohol_red)

SVC(degree=8, kernel='poly')

##### 2.3.2.1.2- White wine

In [169]:
white_alcohol_poly_svclassifier = SVC(kernel='poly', degree=8)
white_alcohol_poly_svclassifier.fit(X_train_alcohol_white_scaled, y_train_alcohol_white)

SVC(degree=8, kernel='poly')

#### 2.3.2.2- Making predictions

##### 2.3.2.2.1- Red wine

In [170]:
y_pred_alcohol_red_poly = red_alcohol_poly_svclassifier.predict(X_test_alcohol_red_scaled)

##### 2.3.2.2.2- White wine

In [171]:
y_pred_alcohol_white_poly = white_alcohol_poly_svclassifier.predict(X_test_alcohol_white_scaled)

#### 2.3.2.3- Evaluating the algorithm

##### 2.3.2.3.1- Red wine

In [172]:
print(confusion_matrix(y_test_alcohol_red, y_pred_alcohol_red_poly))
print(classification_report(y_test_alcohol_red, y_pred_alcohol_red_poly))
print(accuracy_score(y_test_alcohol_red, y_pred_alcohol_red_poly))

[[ 41 103]
 [ 14 162]]
              precision    recall  f1-score   support

        high       0.75      0.28      0.41       144
         low       0.61      0.92      0.73       176

    accuracy                           0.63       320
   macro avg       0.68      0.60      0.57       320
weighted avg       0.67      0.63      0.59       320

0.634375


##### 2..2.3.2- White wine

In [174]:
print(confusion_matrix(y_test_alcohol_white, y_pred_alcohol_white_poly))
print(classification_report(y_test_alcohol_white, y_pred_alcohol_white_poly))
print(accuracy_score(y_test_alcohol_white, y_pred_alcohol_white_poly))

[[370  59]
 [151 400]]
              precision    recall  f1-score   support

        high       0.71      0.86      0.78       429
         low       0.87      0.73      0.79       551

    accuracy                           0.79       980
   macro avg       0.79      0.79      0.79       980
weighted avg       0.80      0.79      0.79       980

0.7857142857142857


### 2.3.3- Gaussian kernel

#### 2.3.3.1- Training the algorithm

##### 2.3.3.1.1- Red wine

In [175]:
red_alcohol_gaussian_svclassifier = SVC(kernel='rbf')
red_alcohol_gaussian_svclassifier.fit(X_train_alcohol_red_scaled, y_train_alcohol_red)

SVC()

##### 2.3.3.1.2- White wine

In [177]:
white_alcohol_gaussian_svclassifier = SVC(kernel='rbf')
white_alcohol_gaussian_svclassifier.fit(X_train_alcohol_white_scaled, y_train_alcohol_white)

SVC()

#### 2.3.3.2- Making prediction

##### 2.3.3.2.1- Red wine

In [178]:
y_pred_alcohol_red_gaussian= red_alcohol_gaussian_svclassifier.predict(X_test_alcohol_red_scaled)

##### 2.3.3.2.2- White wine

In [179]:
y_pred_alcohol_white_gaussian= red_alcohol_gaussian_svclassifier.predict(X_test_alcohol_white_scaled)

#### 2.3.3.3- Evaluating the algorithm

##### 2.3.3.3.1- Red wine

In [180]:
print(confusion_matrix(y_test_alcohol_red, y_pred_alcohol_red_gaussian))
print(classification_report(y_test_alcohol_red, y_pred_alcohol_red_gaussian))
print(accuracy_score(y_test_alcohol_red, y_pred_alcohol_red_gaussian))

[[111  33]
 [ 13 163]]
              precision    recall  f1-score   support

        high       0.90      0.77      0.83       144
         low       0.83      0.93      0.88       176

    accuracy                           0.86       320
   macro avg       0.86      0.85      0.85       320
weighted avg       0.86      0.86      0.85       320

0.85625


##### 2.3.3.3.2- White wine

In [181]:
print(confusion_matrix(y_test_alcohol_white, y_pred_alcohol_white_gaussian))
print(classification_report(y_test_alcohol_white, y_pred_alcohol_white_gaussian))
print(accuracy_score(y_test_alcohol_white, y_pred_alcohol_white_gaussian))

[[291 138]
 [ 95 456]]
              precision    recall  f1-score   support

        high       0.75      0.68      0.71       429
         low       0.77      0.83      0.80       551

    accuracy                           0.76       980
   macro avg       0.76      0.75      0.76       980
weighted avg       0.76      0.76      0.76       980

0.7622448979591837


### 2.3.4- Sigmoid kernel

#### 2.3.4.1- Training the algorithm

##### 2.3.4.1.1- Red wine

In [182]:
red_alcohol_sigmoid_svclassifier = SVC(kernel='sigmoid')
red_alcohol_sigmoid_svclassifier.fit(X_train_alcohol_red_scaled, y_train_alcohol_red)

SVC(kernel='sigmoid')

##### 2.3.4.1.2- White wine

In [185]:
white_alcohol_sigmoid_svclassifier = SVC(kernel='sigmoid')
white_alcohol_sigmoid_svclassifier.fit(X_train_alcohol_white_scaled, y_train_alcohol_white)

SVC(kernel='sigmoid')

#### 2.3.4.2- Making prediction

##### 2.3.4.2.1- Red wine

In [184]:
y_pred_alcohol_red_sigmoid= red_alcohol_sigmoid_svclassifier.predict(X_test_alcohol_red_scaled)

##### 2.3.4.2.2- White wine

In [186]:
y_pred_alcohol_white_sigmoid= white_alcohol_sigmoid_svclassifier.predict(X_test_alcohol_white_scaled)

#### 2.3.4.3- Evaluating the algorithm

##### 2.3.4.3.1- Red wine

In [188]:
print(confusion_matrix(y_test_alcohol_red, y_pred_alcohol_red_sigmoid))
print(classification_report(y_test_alcohol_red, y_pred_alcohol_red_sigmoid))
print(accuracy_score(y_test_alcohol_red, y_pred_alcohol_red_sigmoid))

[[ 94  50]
 [ 44 132]]
              precision    recall  f1-score   support

        high       0.68      0.65      0.67       144
         low       0.73      0.75      0.74       176

    accuracy                           0.71       320
   macro avg       0.70      0.70      0.70       320
weighted avg       0.71      0.71      0.71       320

0.70625


##### 2.3.4.3.2- White wine

In [189]:
print(confusion_matrix(y_test_alcohol_white, y_pred_alcohol_white_sigmoid))
print(classification_report(y_test_alcohol_white, y_pred_alcohol_white_sigmoid))
print(accuracy_score(y_test_alcohol_white, y_pred_alcohol_white_sigmoid))

[[344  85]
 [ 88 463]]
              precision    recall  f1-score   support

        high       0.80      0.80      0.80       429
         low       0.84      0.84      0.84       551

    accuracy                           0.82       980
   macro avg       0.82      0.82      0.82       980
weighted avg       0.82      0.82      0.82       980

0.823469387755102


## 2.4- Decision Tree

A decision tree is one of most frequently and widely used supervised machine learning algorithms that can perform both regression and classification tasks. The intuition behind the decision tree algorithm is simple, yet also very powerful.

For each attribute in the dataset, the decision tree algorithm forms a node, where the most important attribute is placed at the root node. For evaluation we start at the root node and work our way down the tree by following the corresponding node that meets our condition or "decision". This process continues until a leaf node is reached, which contains the prediction or the outcome of the decision tree.

There are several advantages of using decision treess for predictive analysis:

1- Decision trees can be used to predict both continuous and discrete values i.e. they work well for both regression and classification tasks.

2- They require relatively less effort for training the algorithm.

3- They can be used to classify non-linearly separable data.

4- They're very fast and efficient compared to KNN and other classification algorithms.

### 2.4.1- Training the algorithm

#### 2.4.1.1- Red wine

In [192]:
red_alcohol_tree_classifier = DecisionTreeClassifier()
red_alcohol_tree_classifier.fit(X_train_alcohol_red_scaled, y_train_alcohol_red)

DecisionTreeClassifier()

#### 2.4.2.2- White wine

In [193]:
white_alcohol_tree_classifier = DecisionTreeClassifier()
white_alcohol_tree_classifier.fit(X_train_alcohol_white_scaled, y_train_alcohol_white)

DecisionTreeClassifier()

### 2.3.2- Making prediction

#### 2.3.2.1- Red wine

In [194]:
y_pred_alcohol_red_tree = red_alcohol_tree_classifier.predict(X_test_alcohol_red_scaled)
df_alcohol = pd.DataFrame({'Actual':y_test_alcohol_red, 'Predicted':y_pred_alcohol_red_tree})
df_alcohol

Unnamed: 0,Actual,Predicted
188,low,low
1263,low,low
1336,low,low
1471,high,high
1361,low,low
...,...,...
1423,high,low
144,high,high
797,high,low
1346,high,high


#### 2.4.2.2- White wine

In [195]:
y_pred_alcohol_white_tree = white_alcohol_tree_classifier.predict(X_test_alcohol_white_scaled)
df_alcohol = pd.DataFrame({'Actual':y_test_alcohol_white, 'Predicted':y_pred_alcohol_white_tree})
df_alcohol

Unnamed: 0,Actual,Predicted
770,low,low
491,high,high
4826,high,low
296,low,low
226,low,low
...,...,...
3450,high,high
3212,high,high
4488,high,high
3307,high,high


### 2.4.3-  Evaluating the algorithm

#### 2.4.3.1- Red wine

In [196]:
print(confusion_matrix(y_test_alcohol_red, y_pred_alcohol_red_tree))
print(classification_report(y_test_alcohol_red, y_pred_alcohol_red_tree))
print(accuracy_score(y_test_alcohol_red, y_pred_alcohol_red_tree))

[[107  37]
 [ 28 148]]
              precision    recall  f1-score   support

        high       0.79      0.74      0.77       144
         low       0.80      0.84      0.82       176

    accuracy                           0.80       320
   macro avg       0.80      0.79      0.79       320
weighted avg       0.80      0.80      0.80       320

0.796875


#### 2.4.3.2- White wine

In [197]:
print(confusion_matrix(y_test_alcohol_white, y_pred_alcohol_white_tree))
print(classification_report(y_test_alcohol_white, y_pred_alcohol_white_tree))
print(accuracy_score(y_test_alcohol_white, y_pred_alcohol_white_tree))

[[387  42]
 [ 42 509]]
              precision    recall  f1-score   support

        high       0.90      0.90      0.90       429
         low       0.92      0.92      0.92       551

    accuracy                           0.91       980
   macro avg       0.91      0.91      0.91       980
weighted avg       0.91      0.91      0.91       980

0.9142857142857143


## 2.5- Random Forest

### 2.5.1- Red wine


In [198]:
rf_alcohol_clf = RandomForestClassifier(random_state=0)
rf_alcohol_clf.fit(X_train_alcohol_red_scaled, y_train_alcohol_red)
print(accuracy_score(y_test_alcohol_red, rf_alcohol_clf.predict(X_test_alcohol_red_scaled)))

0.884375


In [None]:
rf_alcohol_clf = RandomForestClassifier(random_state=0)
rf_alcohol_clf.fit(X_train_alcohol_white_scaled, y_train_alcohol_white)
print(accuracy_score(y_test_alcohol_white, rf_alcohol_clf.predict(X_test_alcohol_white_scaled)))