In [116]:
# packages
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import pickle

In [72]:

#import seaborn as sn
#import xgboost as xgb
#import matplotlib.pyplot as plt
#%matplotlib inline
#from sklearn.cluster import KMeans
#from sklearn.neighbors import KNeighborsClassifier
#from sklearn.metrics import classification_report, confusion_matrix

In [48]:
# data
data = pd.read_csv('winequality-white.csv', sep = ";")

### Inspection

In [49]:
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [50]:
data.shape
# 12 columns, 4898 rows

(4898, 12)

In [51]:
data.info()
# float represent characteristics, quality (int) represents the outcome - or what we will be trying to predict

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4898 non-null   float64
 1   volatile acidity      4898 non-null   float64
 2   citric acid           4898 non-null   float64
 3   residual sugar        4898 non-null   float64
 4   chlorides             4898 non-null   float64
 5   free sulfur dioxide   4898 non-null   float64
 6   total sulfur dioxide  4898 non-null   float64
 7   density               4898 non-null   float64
 8   pH                    4898 non-null   float64
 9   sulphates             4898 non-null   float64
 10  alcohol               4898 non-null   float64
 11  quality               4898 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 459.3 KB


In [7]:
# copy for targets
df = data.copy()

In [8]:
# view all columns & rows
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [9]:
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
5,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
6,6.2,0.32,0.16,7.0,0.045,30.0,136.0,0.9949,3.18,0.47,9.6,6
7,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
8,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
9,8.1,0.22,0.43,1.5,0.044,28.0,129.0,0.9938,3.22,0.45,11.0,6


In [10]:
# revert to reasonable viewing length
pd.options.display.max_columns = None
pd.options.display.max_rows = 20

# task is to predict "Better Quality" wine

In [11]:
list(df.columns)

['fixed acidity',
 'volatile acidity',
 'citric acid',
 'residual sugar',
 'chlorides',
 'free sulfur dioxide',
 'total sulfur dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol',
 'quality']

In [12]:
df['quality']

0       6
1       6
2       6
3       6
4       6
       ..
4893    6
4894    5
4895    6
4896    7
4897    6
Name: quality, Length: 4898, dtype: int64

In [13]:
# minimum wine quality score per data
df['quality'].min()

3

In [14]:
# maximum wine quality score per data
df['quality'].max()

9

In [15]:
# distince wine quality values
df['quality'].unique()

array([6, 5, 7, 8, 4, 3, 9])

In [16]:
# number of distinct qine quality ratings
len(df['quality'].unique())
# 7 different distinct wine qualities per data

7

In [17]:
# create columns to represent when specific wine quality in achieved 
quality_columns = pd.get_dummies(df['quality'])

In [18]:
quality_columns

Unnamed: 0,3,4,5,6,7,8,9
0,0,0,0,1,0,0,0
1,0,0,0,1,0,0,0
2,0,0,0,1,0,0,0
3,0,0,0,1,0,0,0
4,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...
4893,0,0,0,1,0,0,0
4894,0,0,1,0,0,0,0
4895,0,0,0,1,0,0,0
4896,0,0,0,0,1,0,0


In [19]:
# verify index outputs
quality_columns.iloc[:, 3]

0       1
1       1
2       1
3       1
4       1
       ..
4893    1
4894    0
4895    1
4896    0
4897    1
Name: 6, Length: 4898, dtype: uint8

In [20]:
# group the qualities according to order to facilitate further analysis
quality_columns.columns.values

array([3, 4, 5, 6, 7, 8, 9])

In [21]:
# drop the original quality column from data
df = df.drop(['quality'], axis = 1)

In [22]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9


In [23]:
# group the quality_columns outputs according to number order for more succinct targeting 
quality_lowest = quality_columns.iloc[:, 0:1].max(axis = 1) # wines of quality 3 & 4
quality_low = quality_columns.iloc[:, 2:3].max(axis = 1) # wines of quality 5 & 6
quality_mid = quality_columns.iloc[:, 4:5].max(axis = 1) # wines of quality 7 & 8
quality_high = quality_columns.iloc[:, 6] # wines of quality 9

In [24]:
quality_high

0       0
1       0
2       0
3       0
4       0
       ..
4893    0
4894    0
4895    0
4896    0
4897    0
Name: 9, Length: 4898, dtype: uint8

In [25]:
quality_high.sum()

5

In [26]:
quality_mid

0       0
1       0
2       0
3       0
4       0
       ..
4893    0
4894    0
4895    0
4896    1
4897    0
Length: 4898, dtype: uint8

In [27]:
quality_mid.sum()

880

In [28]:
quality_low

0       0
1       0
2       0
3       0
4       0
       ..
4893    0
4894    1
4895    0
4896    0
4897    0
Length: 4898, dtype: uint8

In [29]:
quality_low.sum()

1457

In [30]:
quality_lowest

0       0
1       0
2       0
3       0
4       0
       ..
4893    0
4894    0
4895    0
4896    0
4897    0
Length: 4898, dtype: uint8

In [31]:
quality_lowest.sum()

20

In [32]:
df = pd.concat([df, quality_lowest, quality_low, quality_mid, quality_high], axis = 1)
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,0,1,2,9
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,0,0,0,0
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,0,0,0,0
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,0,0,0,0
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,0,0,0,0
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,0,0,0,0
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,0,1,0,0
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,0,0,0,0
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,0,0,1,0


In [33]:
df.columns.values
# time to assign more meaningful names to the columns

array(['fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 0,
       1, 2, 9], dtype=object)

In [34]:
column_names = ['Fixed Acidity', 'Volatile Acidity', 'Citric Acid', 'Residual Sugar', 'Chlorides', 'Free Sulfur Dioxide', 'Total Sulfur Dioxide',
                'Density', 'pH', 'Sulphates', 'Alcohol', 'quality_lowest', 'quality_low', 'quality_mid', 'quality_high'] 

In [35]:
df.columns = column_names

In [36]:
df.head()

Unnamed: 0,Fixed Acidity,Volatile Acidity,Citric Acid,Residual Sugar,Chlorides,Free Sulfur Dioxide,Total Sulfur Dioxide,Density,pH,Sulphates,Alcohol,quality_lowest,quality_low,quality_mid,quality_high
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,0,0,0,0
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,0,0,0,0
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,0,0,0,0
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,0,0,0,0
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,0,0,0,0


In [37]:
# reorder columns to have qualities at front
column_names_reordered = ['quality_lowest', 'quality_low', 'quality_mid', 'quality_high', 'Fixed Acidity', 'Volatile Acidity', 'Citric Acid', 'Residual Sugar', 'Chlorides', 'Free Sulfur Dioxide', 'Total Sulfur Dioxide',
                'Density', 'pH', 'Sulphates', 'Alcohol']

In [38]:
df = df[column_names_reordered]
df.head()

Unnamed: 0,quality_lowest,quality_low,quality_mid,quality_high,Fixed Acidity,Volatile Acidity,Citric Acid,Residual Sugar,Chlorides,Free Sulfur Dioxide,Total Sulfur Dioxide,Density,pH,Sulphates,Alcohol
0,0,0,0,0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8
1,0,0,0,0,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5
2,0,0,0,0,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1
3,0,0,0,0,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9
4,0,0,0,0,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9


In [39]:
# store preprocessed data in csv
df_preprocessed = df.copy()

In [40]:
# for any potential down stream analysis
df_preprocessed.to_csv('WhiteWineDF.csv', index = False)

In [41]:
df_preprocessed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   quality_lowest        4898 non-null   uint8  
 1   quality_low           4898 non-null   uint8  
 2   quality_mid           4898 non-null   uint8  
 3   quality_high          4898 non-null   uint8  
 4   Fixed Acidity         4898 non-null   float64
 5   Volatile Acidity      4898 non-null   float64
 6   Citric Acid           4898 non-null   float64
 7   Residual Sugar        4898 non-null   float64
 8   Chlorides             4898 non-null   float64
 9   Free Sulfur Dioxide   4898 non-null   float64
 10  Total Sulfur Dioxide  4898 non-null   float64
 11  Density               4898 non-null   float64
 12  pH                    4898 non-null   float64
 13  Sulphates             4898 non-null   float64
 14  Alcohol               4898 non-null   float64
dtypes: float64(11), uint8

## Logistic Regression to predict quality

In [43]:
DF = pd.read_csv('WhiteWineDF.csv')

In [44]:
# highly reasonable that maybe half of the predictors will not have any merit or only casual effect of the target

In [53]:
# logistic regression to a classification ML algorithm. Classify target into 2 classes:
# Better Quality, Moderate Quality
# anything above the median will be Better, otherwise Moderate

In [54]:
targets = np.where(data['quality'] >
                   data['quality'].median(), 1,0)

In [55]:
DF['Better Quality'] = targets

In [56]:
DF.head()

Unnamed: 0,quality_lowest,quality_low,quality_mid,quality_high,Fixed Acidity,Volatile Acidity,Citric Acid,Residual Sugar,Chlorides,Free Sulfur Dioxide,Total Sulfur Dioxide,Density,pH,Sulphates,Alcohol,Better Quality
0,0,0,0,0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,0
1,0,0,0,0,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,0
2,0,0,0,0,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,0
3,0,0,0,0,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,0
4,0,0,0,0,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,0


In [58]:
targets.sum() / targets.shape[0]

0.21641486320947326

In [59]:
data_with_targets = DF

In [60]:
data_with_targets

Unnamed: 0,quality_lowest,quality_low,quality_mid,quality_high,Fixed Acidity,Volatile Acidity,Citric Acid,Residual Sugar,Chlorides,Free Sulfur Dioxide,Total Sulfur Dioxide,Density,pH,Sulphates,Alcohol,Better Quality
0,0,0,0,0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,0
1,0,0,0,0,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,0
2,0,0,0,0,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,0
3,0,0,0,0,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,0
4,0,0,0,0,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4893,0,0,0,0,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,0
4894,0,1,0,0,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,0
4895,0,0,0,0,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,0
4896,0,0,1,0,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,1


In [61]:
data_with_targets.shape

(4898, 16)

In [62]:
# select inputs for regression
data_with_targets.iloc[:, :-1]

Unnamed: 0,quality_lowest,quality_low,quality_mid,quality_high,Fixed Acidity,Volatile Acidity,Citric Acid,Residual Sugar,Chlorides,Free Sulfur Dioxide,Total Sulfur Dioxide,Density,pH,Sulphates,Alcohol
0,0,0,0,0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8
1,0,0,0,0,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5
2,0,0,0,0,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1
3,0,0,0,0,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9
4,0,0,0,0,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4893,0,0,0,0,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2
4894,0,1,0,0,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6
4895,0,0,0,0,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4
4896,0,0,1,0,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8


In [63]:
unscaled_inputs = data_with_targets.iloc[:, :-1]

In [65]:
unscaled_inputs.columns.values

array(['quality_lowest', 'quality_low', 'quality_mid', 'quality_high',
       'Fixed Acidity', 'Volatile Acidity', 'Citric Acid',
       'Residual Sugar', 'Chlorides', 'Free Sulfur Dioxide',
       'Total Sulfur Dioxide', 'Density', 'pH', 'Sulphates', 'Alcohol'],
      dtype=object)

## Standardize the data

In [67]:
quality_scaler = StandardScaler()

In [68]:
quality_scaler.fit(unscaled_inputs)

StandardScaler()

In [69]:
scaled_inputs = quality_scaler.transform(unscaled_inputs)

In [70]:
scaled_inputs

array([[-6.40315625e-02, -6.50709938e-01, -4.67989781e-01, ...,
        -1.24692128e+00, -3.49184257e-01, -1.39315246e+00],
       [-6.40315625e-02, -6.50709938e-01, -4.67989781e-01, ...,
         7.40028640e-01,  1.34184656e-03, -8.24275678e-01],
       [-6.40315625e-02, -6.50709938e-01, -4.67989781e-01, ...,
         4.75101984e-01, -4.36815783e-01, -3.36667007e-01],
       ...,
       [-6.40315625e-02, -6.50709938e-01, -4.67989781e-01, ...,
        -1.31315295e+00, -2.61552731e-01, -9.05543789e-01],
       [-6.40315625e-02, -6.50709938e-01,  2.13679880e+00, ...,
         1.00495530e+00, -9.62604939e-01,  1.85757201e+00],
       [-6.40315625e-02, -6.50709938e-01, -4.67989781e-01, ...,
         4.75101984e-01, -1.48839409e+00,  1.04489089e+00]])

## Split data into train & test, then shuffle

In [75]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size = 0.7, random_state = 20)
# random state to shuffle observations in same random way

In [76]:
print(x_train.shape, y_train.shape)

(3428, 15) (3428,)


In [77]:
print(x_test.shape, y_test.shape)

(1470, 15) (1470,)


## Logistic Regression with SkLearn

In [79]:
# train the model
reg = LogisticRegression()

In [80]:
reg.fit(x_train, y_train)

LogisticRegression()

In [81]:
# evaluate model accuracy 
reg.score(x_train, y_train)

0.9658693115519253

In [83]:
# In conclusion: utilizing logistic regression to predict wine quality has an accuracy of close to 97%
# i.e.: the model learned to classify ~97% of the observations correctly

## Manually inspect accuracy of the model

In [84]:
model_outputs = reg.predict(x_train)

array([1, 1, 0, ..., 1, 0, 0])

In [95]:
# to view entire array
np.set_printoptions(threshold=np.inf)

In [96]:
model_outputs

array([1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,

In [97]:
y_train

array([1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,

In [98]:
model_outputs == y_train

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True, False,
        True,  True,  True,  True,  True,  True,  True,  True, False,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [99]:
## finding the intercept and coefficients

In [100]:
reg.intercept_

array([-2.74741753])

In [101]:
reg.coef_

array([[-0.25395772, -1.67945449,  3.53599292,  0.26230848,  0.17977595,
        -0.1254916 , -0.060154  ,  1.14470448, -0.02788741,  0.37688939,
        -0.02803902, -1.14136681,  0.33344053,  0.03752277,  0.68884137]])

In [102]:
unscaled_inputs.columns.values

array(['quality_lowest', 'quality_low', 'quality_mid', 'quality_high',
       'Fixed Acidity', 'Volatile Acidity', 'Citric Acid',
       'Residual Sugar', 'Chlorides', 'Free Sulfur Dioxide',
       'Total Sulfur Dioxide', 'Density', 'pH', 'Sulphates', 'Alcohol'],
      dtype=object)

In [103]:
feature_name = unscaled_inputs.columns.values

In [104]:
summary_table = pd.DataFrame(columns = ['Feature Name'], data = feature_name)
summary_table['Coefficient'] = np.transpose(reg.coef_)
summary_table

Unnamed: 0,Feature Name,Coefficient
0,quality_lowest,-0.253958
1,quality_low,-1.679454
2,quality_mid,3.535993
3,quality_high,0.262308
4,Fixed Acidity,0.179776
5,Volatile Acidity,-0.125492
6,Citric Acid,-0.060154
7,Residual Sugar,1.144704
8,Chlorides,-0.027887
9,Free Sulfur Dioxide,0.376889


In [105]:
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature Name,Coefficient
0,Intercept,-2.747418
1,quality_lowest,-0.253958
2,quality_low,-1.679454
3,quality_mid,3.535993
4,quality_high,0.262308
5,Fixed Acidity,0.179776
6,Volatile Acidity,-0.125492
7,Citric Acid,-0.060154
8,Residual Sugar,1.144704
9,Chlorides,-0.027887


In [106]:
# displaying and interpreting the coefficients

In [108]:
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)
summary_table

Unnamed: 0,Feature Name,Coefficient,Odds_ratio
0,Intercept,-2.747418,0.064093
1,quality_lowest,-0.253958,0.775725
2,quality_low,-1.679454,0.186476
3,quality_mid,3.535993,34.329084
4,quality_high,0.262308,1.299927
5,Fixed Acidity,0.179776,1.196949
6,Volatile Acidity,-0.125492,0.882063
7,Citric Acid,-0.060154,0.94162
8,Residual Sugar,1.144704,3.141513
9,Chlorides,-0.027887,0.972498


## Testing the model

In [110]:
# based on data that the model has never seen before
reg.score(x_test, y_test)
# in 96% of the cases, the model will correctly predict if wine is of Better Quality

0.9605442176870749

In [111]:
# examine the probabilty of an output being 0 or 1

In [114]:
predicted_proba = reg.predict_proba(x_test)
predicted_proba # first column shows the probability of being 0, second column: the probability of being 1

array([[1.35264841e-03, 9.98647352e-01],
       [9.98537657e-01, 1.46234317e-03],
       [9.98474075e-01, 1.52592457e-03],
       [9.88364471e-01, 1.16355295e-02],
       [9.70204352e-01, 2.97956476e-02],
       [1.14792808e-03, 9.98852072e-01],
       [9.58306941e-01, 4.16930585e-02],
       [9.99390301e-01, 6.09699215e-04],
       [9.96712498e-01, 3.28750151e-03],
       [9.99351750e-01, 6.48249910e-04],
       [9.94208209e-01, 5.79179118e-03],
       [3.15110205e-03, 9.96848898e-01],
       [9.13433126e-01, 8.65668745e-02],
       [9.99675407e-01, 3.24593049e-04],
       [9.99274212e-01, 7.25787975e-04],
       [9.99345711e-01, 6.54289075e-04],
       [9.32282800e-01, 6.77171997e-02],
       [9.43482145e-01, 5.65178547e-02],
       [9.02847949e-01, 9.71520512e-02],
       [9.09747697e-01, 9.02523035e-02],
       [9.99886232e-01, 1.13767792e-04],
       [9.58971154e-01, 4.10288456e-02],
       [9.99912284e-01, 8.77160056e-05],
       [8.21928492e-01, 1.78071508e-01],
       [6.016415

In [115]:
### Next steps
## Save the model
## Create a module
## Get new data, classify it, pass it through SQL, & analyze it in Tableau

## Saving the model & storing it for deployment

In [117]:
with open('LogRegModel', 'wb') as file:
    pickle.dump(reg, file)

In [118]:
with open('WineQualityScaler', 'wb') as file:
    pickle.dump(quality_scaler, file)