### Import relevant libraries

In [1]:
# import libraries and modules
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split 
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.preprocessing import StandardScaler

from sklearn.metrics import classification_report

import seaborn as sns
sns.set()

In [2]:
# load the preprocessed CSV data
redwine_preprocessed = pd.read_csv('redwine_preprocessed.csv')
redwine_preprocessed.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,0.9978,3.51,0.56,9.4,5


### Classification
We would be using the classification analysis models. A classification model takes input values given for training and tries to use them to predict categories for the new data. In this case, to take in levels of physiochemical levels and determine which of the 3 classified quality groups it belongs to.

In order to do so, we must first determine the best model to use. The models we will be using are:
- Decision Tree Classifier
- Logistic Regression
- Random Forest Classifier
- Naive Bayes Classifier

#### Declare the inputs and the targets
We will proceed to model our data for predictive analysis, for this assignment I have decided to simplify the models in predicting by classifying the quality into 2 groups. The classified groups are:
- For quality levels 0 to 6 , it will be assigned to the number 0
- For quality levels 7 to 10 , it will be assigned to the number 1

In [3]:
targets = np.where(redwine_preprocessed['quality'] >= 7, 1, 0)

In [4]:
redwine_preprocessed['Review'] = targets

In [5]:
redwine_preprocessed['Review'].value_counts()

0    1385
1     217
Name: Review, dtype: int64

In [6]:
redwine_with_targets = redwine_preprocessed.drop(['quality'], axis=1)

In [7]:
inputs = redwine_with_targets.iloc[:,:-1]

#### Standardize the Data

In [8]:
scaler = StandardScaler()
scaler.fit_transform(inputs)

array([[-0.52744496,  0.96141534, -1.38765536, ...,  1.28896035,
        -0.57876992, -0.95704535],
       [-0.29750637,  1.96761705, -1.38765536, ..., -0.72122521,
         0.12990989, -0.58185055],
       [-0.29750637,  1.29681591, -1.18243264, ..., -0.33215704,
        -0.04726007, -0.58185055],
       ...,
       [-0.64241426,  0.57011467, -1.38765536, ...,  0.38113462,
        -0.46065662, -1.33224014],
       [-0.64241426,  0.57011467, -1.38765536, ...,  0.38113462,
        -0.46065662, -1.33224014],
       [-0.64241426,  0.57011467, -1.38765536, ...,  0.38113462,
        -0.46065662, -1.33224014]])

In [9]:
inputs_scaled = scaler.transform(inputs)

#### Train Test Split

In [10]:
x_train, x_test, y_train, y_test = train_test_split(inputs_scaled, targets, test_size=0.3, random_state=0)

#### Create the Classification
##### Decision Tree Classifier

In [11]:
dtclass = DecisionTreeClassifier(random_state=1)

In [12]:
dtclass.fit(x_train,y_train)

DecisionTreeClassifier(random_state=1)

In [13]:
dt_y_hat = dtclass.predict(x_test)

##### Logistic Regression

In [14]:
logreg = LogisticRegression(random_state=1)

In [15]:
logreg.fit(x_train,y_train)

LogisticRegression(random_state=1)

In [16]:
log_y_hat = dtclass.predict(x_test)

##### Random Forest Classifier 

In [17]:
rfclass = RandomForestClassifier(random_state=1)

In [18]:
rfclass.fit(x_train,y_train)

RandomForestClassifier(random_state=1)

In [19]:
rf_y_hat = dtclass.predict(x_test)

##### Naive Bayes Classifier

In [20]:
nbclass = GaussianNB()

In [21]:
nbclass.fit(x_train,y_train)

GaussianNB()

In [22]:
nb_y_hat = dtclass.predict(x_test)

##### Gradient Boosted Classifier

In [23]:
gbclass = GradientBoostingClassifier(random_state=1)

In [24]:
gbclass.fit(x_train,y_train)

GradientBoostingClassifier(random_state=1)

In [25]:
gb_y_hat = gbclass.predict(x_test)

### Results

In [26]:
print("Decision Tree: \n",classification_report(y_test, dt_y_hat))
print("\nLogistic Regression: \n",classification_report(y_test, log_y_hat))
print("\nRandom Forest: \n",classification_report(y_test, rf_y_hat))
print("\nGaussian Naive Bayes: \n",classification_report(y_test, nb_y_hat))
print("\nGradient Boosted: \n",classification_report(y_test, gb_y_hat))

Decision Tree: 
               precision    recall  f1-score   support

           0       0.93      0.91      0.92       425
           1       0.43      0.52      0.47        56

    accuracy                           0.86       481
   macro avg       0.68      0.71      0.70       481
weighted avg       0.88      0.86      0.87       481


Logistic Regression: 
               precision    recall  f1-score   support

           0       0.93      0.91      0.92       425
           1       0.43      0.52      0.47        56

    accuracy                           0.86       481
   macro avg       0.68      0.71      0.70       481
weighted avg       0.88      0.86      0.87       481


Random Forest: 
               precision    recall  f1-score   support

           0       0.93      0.91      0.92       425
           1       0.43      0.52      0.47        56

    accuracy                           0.86       481
   macro avg       0.68      0.71      0.70       481
weighted avg   

#### Comparison for features

In [30]:
#only higher quality
rw_temp = redwine_preprocessed[redwine_preprocessed['Review']==1]
rw_temp.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,density,pH,sulphates,alcohol,quality,Review
count,217.0,217.0,217.0,217.0,217.0,217.0,217.0,217.0,217.0,217.0,217.0
mean,8.847005,0.40553,0.376498,2.708756,0.075912,0.99603,3.288802,0.743456,11.518049,7.082949,1.0
std,1.999977,0.144963,0.194438,1.363026,0.02848,0.002201,0.154478,0.134038,0.998153,0.276443,0.0
min,4.9,0.12,0.0,1.2,0.012,0.99064,2.88,0.39,9.2,7.0,1.0
25%,7.4,0.3,0.3,2.0,0.062,0.9947,3.2,0.65,10.8,7.0,1.0
50%,8.7,0.37,0.4,2.3,0.073,0.99572,3.27,0.74,11.6,7.0,1.0
75%,10.1,0.49,0.49,2.7,0.085,0.99735,3.38,0.82,12.2,7.0,1.0
max,15.6,0.915,0.76,8.9,0.358,1.0032,3.78,1.36,14.0,8.0,1.0


In [29]:
#only lower quality
rw_temp2 = redwine_preprocessed[redwine_preprocessed['Review']==0]
rw_temp2.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,density,pH,sulphates,alcohol,quality,Review
count,1385.0,1385.0,1385.0,1385.0,1385.0,1385.0,1385.0,1385.0,1385.0,1385.0,1385.0
mean,8.234585,0.547202,0.253856,2.510794,0.089297,0.996859,3.314736,0.644614,10.248327,5.410108,0.0
std,1.681593,0.176188,0.189828,1.414529,0.049061,0.001807,0.153989,0.170471,0.970358,0.601695,0.0
min,4.6,0.16,0.0,0.9,0.034,0.99007,2.74,0.33,8.4,3.0,0.0
25%,7.1,0.42,0.08,1.9,0.071,0.9958,3.21,0.54,9.5,5.0,0.0
50%,7.8,0.54,0.24,2.2,0.08,0.9968,3.31,0.6,10.0,5.0,0.0
75%,9.1,0.65,0.4,2.6,0.091,0.9979,3.41,0.7,10.9,6.0,0.0
max,15.9,1.58,1.0,15.5,0.611,1.00369,4.01,2.0,14.9,6.0,0.0


By looking into the details, we can see that good quality wines have higher levels of alcohol on average, have a lower volatile acidity on average, higher levels of sulphates on average, and higher levels of residual sugar on average.