In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# import red wine csv
reds = pd.read_csv("winequality-red.csv", delimiter=';')
reds.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [3]:
reds['quality'].value_counts()

5    681
6    638
7    199
4     53
8     18
3     10
Name: quality, dtype: int64

In [4]:
whites = pd.read_csv("winequality-white.csv", delimiter=';')
whites.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [5]:
whites['quality'].value_counts()

6    2198
5    1457
7     880
8     175
4     163
3      20
9       5
Name: quality, dtype: int64

In [6]:
display(whites.info())

whites = whites.astype({'quality':str})

whites.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4898 non-null   float64
 1   volatile acidity      4898 non-null   float64
 2   citric acid           4898 non-null   float64
 3   residual sugar        4898 non-null   float64
 4   chlorides             4898 non-null   float64
 5   free sulfur dioxide   4898 non-null   float64
 6   total sulfur dioxide  4898 non-null   float64
 7   density               4898 non-null   float64
 8   pH                    4898 non-null   float64
 9   sulphates             4898 non-null   float64
 10  alcohol               4898 non-null   float64
 11  quality               4898 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 459.3 KB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4898 non-null   float64
 1   volatile acidity      4898 non-null   float64
 2   citric acid           4898 non-null   float64
 3   residual sugar        4898 non-null   float64
 4   chlorides             4898 non-null   float64
 5   free sulfur dioxide   4898 non-null   float64
 6   total sulfur dioxide  4898 non-null   float64
 7   density               4898 non-null   float64
 8   pH                    4898 non-null   float64
 9   sulphates             4898 non-null   float64
 10  alcohol               4898 non-null   float64
 11  quality               4898 non-null   object 
dtypes: float64(11), object(1)
memory usage: 459.3+ KB


In [7]:
# unbalanced outcomes, combine 3+4 and 8+9

# test if it works
#whites['quality'].replace(to_replace=['3', '4', '8', '9'], value=['3-4', '3-4', '8-9', '8-9']).value_counts()

# replace
whites['quality'] = whites['quality'].replace(to_replace=['3', '4', '8', '9'], value=['3-4', '3-4', '8-9', '8-9'])

In [8]:
# test it worked
whites['quality'].value_counts()
# it does, good

6      2198
5      1457
7       880
3-4     183
8-9     180
Name: quality, dtype: int64

In [9]:
# split features and targets

Xwhites_unscaled = whites.drop(columns=['quality']).values
ywhites = whites['quality'].values

display(Xwhites_unscaled)
display(ywhites)

array([[ 7.  ,  0.27,  0.36, ...,  3.  ,  0.45,  8.8 ],
       [ 6.3 ,  0.3 ,  0.34, ...,  3.3 ,  0.49,  9.5 ],
       [ 8.1 ,  0.28,  0.4 , ...,  3.26,  0.44, 10.1 ],
       ...,
       [ 6.5 ,  0.24,  0.19, ...,  2.99,  0.46,  9.4 ],
       [ 5.5 ,  0.29,  0.3 , ...,  3.34,  0.38, 12.8 ],
       [ 6.  ,  0.21,  0.38, ...,  3.26,  0.32, 11.8 ]])

array(['6', '6', '6', ..., '6', '7', '6'], dtype=object)

In [10]:
# split into training and testing data
# default 75%-25%
# stratify the ywhites column so we don't end up with all '8-9' in the testing data, for example

Xwhites_tr_unscaled, Xwhites_tes_unscaled, ywhites_tr, ywhites_tes = \
                train_test_split(Xwhites_unscaled, ywhites, stratify=ywhites, random_state=23)

In [11]:
display(pd.DataFrame(data=Xwhites_tr_unscaled).info())
display(pd.DataFrame(data=Xwhites_tes_unscaled).info())
display(pd.DataFrame(data=ywhites_tr).info())
display(pd.DataFrame(data=ywhites_tes).info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3673 entries, 0 to 3672
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       3673 non-null   float64
 1   1       3673 non-null   float64
 2   2       3673 non-null   float64
 3   3       3673 non-null   float64
 4   4       3673 non-null   float64
 5   5       3673 non-null   float64
 6   6       3673 non-null   float64
 7   7       3673 non-null   float64
 8   8       3673 non-null   float64
 9   9       3673 non-null   float64
 10  10      3673 non-null   float64
dtypes: float64(11)
memory usage: 315.8 KB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1225 entries, 0 to 1224
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       1225 non-null   float64
 1   1       1225 non-null   float64
 2   2       1225 non-null   float64
 3   3       1225 non-null   float64
 4   4       1225 non-null   float64
 5   5       1225 non-null   float64
 6   6       1225 non-null   float64
 7   7       1225 non-null   float64
 8   8       1225 non-null   float64
 9   9       1225 non-null   float64
 10  10      1225 non-null   float64
dtypes: float64(11)
memory usage: 105.4 KB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3673 entries, 0 to 3672
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       3673 non-null   object
dtypes: object(1)
memory usage: 28.8+ KB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1225 entries, 0 to 1224
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       1225 non-null   object
dtypes: object(1)
memory usage: 9.7+ KB


None

In [12]:
# math for above
3673/(3673+1225)
# basically 75%

0.749897917517354

In [13]:
display(pd.DataFrame(data=ywhites_tr).value_counts())
display(pd.DataFrame(data=ywhites_tes).value_counts())

6      1648
5      1093
7       660
3-4     137
8-9     135
dtype: int64

6      550
5      364
7      220
3-4     46
8-9     45
dtype: int64

In [14]:
# math for above
550/(550+1648), 364/(364+1093), 46/(46+137), 45/(45+135)
# stratified on 4 of 5 columns, assume 5th is fine

(0.2502274795268426, 0.24982841455044613, 0.25136612021857924, 0.25)

In [15]:
# now scale using StandardScaler (lesson learned from unsupervised clustering)

whitesScaler = StandardScaler()

# fit to training data
whitesScaler.fit(Xwhites_tr_unscaled)

# transform training and testing data
Xwhites_tr_scaled = whitesScaler.transform(Xwhites_tr_unscaled)
Xwhites_tes_scaled = whitesScaler.transform(Xwhites_tes_unscaled)

In [16]:
# check scaling worked
display(pd.DataFrame(data=Xwhites_tr_scaled).describe())
display(pd.DataFrame(data=Xwhites_tes_scaled).describe())

# means 0, stds 1. worked
# didn't check the test data scaled well. investigate below

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
count,3673.0,3673.0,3673.0,3673.0,3673.0,3673.0,3673.0,3673.0,3673.0,3673.0,3673.0
mean,1.999272e-14,-4.923777e-15,2.705535e-14,2.072124e-15,-1.04065e-14,-1.864981e-16,-1.874824e-16,1.132526e-12,4.671344e-14,-4.120369e-14,-3.339042e-14
std,1.000136,1.000136,1.000136,1.000136,1.000136,1.000136,1.000136,1.000136,1.000136,1.000136,1.000136
min,-3.158749,-1.950526,-2.735752,-1.143271,-1.630478,-2.007653,-3.065887,-2.2964,-2.801268,-2.340446,-2.037223
25%,-0.6571956,-0.6773545,-0.5224262,-0.9278436,-0.4360557,-0.7445202,-0.7208638,-0.773402,-0.7169793,-0.6864942,-0.8240425
50%,-0.06158777,-0.1876733,-0.1945261,-0.2423935,-0.1263905,-0.08287912,-0.1049991,-0.08324192,-0.04462816,-0.1641936,-0.09613403
75%,0.5340201,0.3999441,0.3792991,0.6976522,0.1832747,0.6389111,0.6766754,0.7064605,0.627723,0.5322072,0.7126531
max,8.753409,8.03897,10.8721,11.62568,13.27769,6.683904,5.402253,14.91447,4.123949,5.145863,2.977257


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
count,1225.0,1225.0,1225.0,1225.0,1225.0,1225.0,1225.0,1225.0,1225.0,1225.0,1225.0
mean,0.014699,-0.036092,0.015129,-0.036239,-0.014983,-0.016789,-0.006828,-0.00464,0.043793,0.034281,-0.014859
std,1.02068,0.947002,0.967755,0.972519,0.858095,1.08888,1.026354,0.969253,1.059033,0.973145,0.981018
min,-3.635235,-1.950526,-2.735752,-1.104102,-1.409289,-1.947504,-2.852703,-2.289763,-3.137444,-1.905196,-1.713708
25%,-0.657196,-0.677355,-0.604401,-0.927844,-0.436056,-0.74452,-0.720864,-0.740221,-0.649744,-0.599444,-0.824042
50%,-0.061588,-0.187673,-0.112551,-0.281562,-0.12639,-0.082879,-0.128686,-0.109787,-0.044628,-0.077144,-0.177013
75%,0.53402,0.399944,0.461274,0.560562,0.183275,0.578762,0.700362,0.626827,0.694958,0.532207,0.712653
max,5.894491,4.953979,5.461751,4.927858,8.588472,15.255163,7.143255,5.398222,4.258419,4.275362,2.8155


In [17]:
# logistic regression
# instantiate model
# use random_state=23
# multi_class = 'multinomial' for multiple target classes
# solver='lbfgs' (default anyway)

whitesLogit = LogisticRegression(solver='lbfgs', multi_class='multinomial', random_state=23)

# fit the logit model to training data
whitesLogit.fit(Xwhites_tr_scaled, ywhites_tr)

LogisticRegression(multi_class='multinomial', random_state=23)

In [18]:
# should be able to score without predicting
display(whitesLogit.score(Xwhites_tr_scaled, ywhites_tr))
display(whitesLogit.score(Xwhites_tes_scaled, ywhites_tes))

0.5505036754696433

0.5306122448979592

In [19]:
whitesLogit.predict(Xwhites_tes_scaled)

array(['6', '6', '6', ..., '6', '6', '6'], dtype=object)

In [20]:
print(classification_report(ywhites_tes, whitesLogit.predict(Xwhites_tes_scaled)))

              precision    recall  f1-score   support

         3-4       0.50      0.02      0.04        46
           5       0.58      0.53      0.55       364
           6       0.52      0.74      0.61       550
           7       0.49      0.21      0.30       220
         8-9       0.00      0.00      0.00        45

    accuracy                           0.53      1225
   macro avg       0.42      0.30      0.30      1225
weighted avg       0.51      0.53      0.49      1225



In [21]:
confusion_matrix(ywhites_tes, whitesLogit.predict(Xwhites_tes_scaled))

array([[  1,  22,  21,   1,   1],
       [  0, 194, 169,   1,   0],
       [  1, 109, 408,  32,   0],
       [  0,  11, 162,  47,   0],
       [  0,   1,  29,  15,   0]], dtype=int64)

In [22]:
whites.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
count,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0
mean,6.854788,0.278241,0.334192,6.391415,0.045772,35.308085,138.360657,0.994027,3.188267,0.489847,10.514267
std,0.843868,0.100795,0.12102,5.072058,0.021848,17.007137,42.498065,0.002991,0.151001,0.114126,1.230621
min,3.8,0.08,0.0,0.6,0.009,2.0,9.0,0.98711,2.72,0.22,8.0
25%,6.3,0.21,0.27,1.7,0.036,23.0,108.0,0.991723,3.09,0.41,9.5
50%,6.8,0.26,0.32,5.2,0.043,34.0,134.0,0.99374,3.18,0.47,10.4
75%,7.3,0.32,0.39,9.9,0.05,46.0,167.0,0.9961,3.28,0.55,11.4
max,14.2,1.1,1.66,65.8,0.346,289.0,440.0,1.03898,3.82,1.08,14.2


In [23]:
# method from Ken
# explore removing outliers (White Wine)

# IQR Method (note uses muliplier of 3 to remove extreme outliers, 1.5 removed too many samples from the dataset)

Q1w=whites.quantile(0.25)
Q3w=whites.quantile(0.75)
IQRw=Q3w-Q1w
lower_boundw=Q1w - 3 * IQRw
upper_boundw=Q3w + 3 * IQRw

lower_boundw, upper_boundw

(fixed acidity            3.30000
 volatile acidity        -0.12000
 citric acid             -0.09000
 residual sugar         -22.90000
 chlorides               -0.00600
 free sulfur dioxide    -46.00000
 total sulfur dioxide   -69.00000
 density                  0.97859
 pH                       2.52000
 sulphates               -0.01000
 alcohol                  3.80000
 dtype: float64,
 fixed acidity            10.300000
 volatile acidity          0.650000
 citric acid               0.750000
 residual sugar           34.500000
 chlorides                 0.092000
 free sulfur dioxide     115.000000
 total sulfur dioxide    344.000000
 density                   1.009232
 pH                        3.850000
 sulphates                 0.970000
 alcohol                  17.100000
 dtype: float64)

In [24]:
# whites[~((whites < lower_boundw) | (whites > upper_boundw)).any(axis=1)]
whitesInliers = whites[~(whites > upper_boundw)].copy()

  


In [25]:
# confirm none are lower than Q1 - 3*IQR
# (had forgotten to at first)
whites[~(whites < lower_boundw)].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4898 non-null   float64
 1   volatile acidity      4898 non-null   float64
 2   citric acid           4898 non-null   float64
 3   residual sugar        4898 non-null   float64
 4   chlorides             4898 non-null   float64
 5   free sulfur dioxide   4898 non-null   float64
 6   total sulfur dioxide  4898 non-null   float64
 7   density               4898 non-null   float64
 8   pH                    4898 non-null   float64
 9   sulphates             4898 non-null   float64
 10  alcohol               4898 non-null   float64
 11  quality               4898 non-null   object 
dtypes: float64(11), object(1)
memory usage: 459.3+ KB


  This is separate from the ipykernel package so we can avoid doing imports until


In [26]:
whitesInliers.dropna(inplace=True)
whitesInliers.reset_index(inplace=True, drop=True)

whitesInliers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4690 entries, 0 to 4689
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4690 non-null   float64
 1   volatile acidity      4690 non-null   float64
 2   citric acid           4690 non-null   float64
 3   residual sugar        4690 non-null   float64
 4   chlorides             4690 non-null   float64
 5   free sulfur dioxide   4690 non-null   float64
 6   total sulfur dioxide  4690 non-null   float64
 7   density               4690 non-null   float64
 8   pH                    4690 non-null   float64
 9   sulphates             4690 non-null   float64
 10  alcohol               4690 non-null   float64
 11  quality               4690 non-null   object 
dtypes: float64(11), object(1)
memory usage: 439.8+ KB


In [27]:
# repeat steps above on new DF

# set target to strs for replacement
whitesInliers = whitesInliers.astype({'quality':str})
# replace as above
whitesInliers['quality'] = whitesInliers['quality'].replace(to_replace=['3', '4', '8', '9'], value=['3-4', '3-4', '8-9', '8-9'])

# split into features and targets
XwhitesInliers_unscaled = whitesInliers.drop(columns=['quality']).values
ywhitesInliers = whitesInliers['quality'].values

# split into training and testing
XwhitesInliers_tr_unscaled, XwhitesInliers_tes_unscaled, ywhitesInliers_tr, ywhitesInliers_tes = \
                train_test_split(XwhitesInliers_unscaled, ywhitesInliers, stratify=ywhitesInliers, random_state=23)

# scale the data
# now scale using StandardScaler (lesson learned from unsupervised clustering)

whitesInliersScaler = StandardScaler()

# fit to training data
whitesInliersScaler.fit(XwhitesInliers_tr_unscaled)

# transform training and testing data
XwhitesInliers_tr_scaled = whitesInliersScaler.transform(XwhitesInliers_tr_unscaled)
XwhitesInliers_tes_scaled = whitesInliersScaler.transform(XwhitesInliers_tes_unscaled)

# check scaling worked
display(pd.DataFrame(data=XwhitesInliers_tr_scaled).describe())
display(pd.DataFrame(data=XwhitesInliers_tes_scaled).describe())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
count,3517.0,3517.0,3517.0,3517.0,3517.0,3517.0,3517.0,3517.0,3517.0,3517.0,3517.0
mean,-5.712519e-15,8.933618e-15,1.76229e-14,-3.968519e-15,-3.511853e-14,9.110332000000001e-17,6.376601e-18,1.128983e-12,1.961231e-14,-2.658335e-14,-4.539414e-15
std,1.000142,1.000142,1.000142,1.000142,1.000142,1.000142,1.000142,1.000142,1.000142,1.000142,1.000142
min,-3.565852,-2.123023,-2.994258,-1.161227,-3.128459,-2.061865,-3.060781,-2.353366,-3.144256,-2.430675,-1.755177
25%,-0.6536202,-0.6901449,-0.5449381,-0.9403268,-0.6430346,-0.7501816,-0.7321038,-0.7841958,-0.6885267,-0.718884,-0.8532734
50%,-0.04690526,-0.1390381,-0.09136038,-0.2575439,-0.09071792,-0.06310935,-0.08391522,-0.09746902,-0.0911871,-0.08822423,-0.115352
75%,0.5598097,0.52229,0.4529329,0.706385,0.6457043,0.686424,0.6843082,0.7266031,0.5725236,0.5424356,0.7045608
max,4.2001,4.159595,3.718693,3.949604,4.511921,4.808858,3.973265,3.078642,4.156561,4.326394,3.000316


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
count,1173.0,1173.0,1173.0,1173.0,1173.0,1173.0,1173.0,1173.0,1173.0,1173.0,1173.0
mean,0.054783,0.016897,0.022788,0.012465,-0.040964,0.020252,0.033428,0.004646,-0.077211,-0.031695,-0.009784
std,1.024361,0.995125,1.020211,1.00882,1.023501,1.037062,1.005088,1.006624,0.998689,1.031539,1.029545
min,-3.687195,-2.123023,-2.994258,-1.141145,-2.576143,-1.999404,-2.868725,-2.360234,-3.011514,-2.160392,-2.083143
25%,-0.65362,-0.690145,-0.544938,-0.940327,-0.735087,-0.750182,-0.660083,-0.784196,-0.754898,-0.808978,-0.853273
50%,-0.046905,-0.139038,-0.182076,-0.207339,-0.090718,-0.063109,-0.083915,-0.097469,-0.157558,-0.178318,-0.115352
75%,0.681153,0.52229,0.452933,0.746549,0.645704,0.623963,0.732322,0.760939,0.572524,0.452341,0.745556
max,3.593385,3.939152,3.718693,2.895307,4.14371,4.559013,4.957551,2.738712,3.691964,4.146206,2.754343


In [28]:
# logistic regression
# instantiate model
# use random_state=23
# multi_class = 'multinomial' for multiple target classes
# solver='lbfgs' (default anyway)

whitesInliersLogit = LogisticRegression(solver='lbfgs', multi_class='multinomial', random_state=23)

# fit the logit model to training data
whitesInliersLogit.fit(XwhitesInliers_tr_scaled, ywhitesInliers_tr)

LogisticRegression(multi_class='multinomial', random_state=23)

In [29]:
print("Initial classification report:\n",classification_report(ywhites_tes, whitesLogit.predict(Xwhites_tes_scaled)))
print("Inliers classification report:\n",classification_report(ywhitesInliers_tes, whitesInliersLogit.predict(XwhitesInliers_tes_scaled)))

Initial classification report:
               precision    recall  f1-score   support

         3-4       0.50      0.02      0.04        46
           5       0.58      0.53      0.55       364
           6       0.52      0.74      0.61       550
           7       0.49      0.21      0.30       220
         8-9       0.00      0.00      0.00        45

    accuracy                           0.53      1225
   macro avg       0.42      0.30      0.30      1225
weighted avg       0.51      0.53      0.49      1225

Inliers classification report:
               precision    recall  f1-score   support

         3-4       0.50      0.05      0.09        39
           5       0.59      0.55      0.57       343
           6       0.54      0.75      0.63       529
           7       0.53      0.28      0.37       218
         8-9       0.00      0.00      0.00        44

    accuracy                           0.55      1173
   macro avg       0.43      0.33      0.33      1173
weighted avg 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [30]:
print("Initial confusion matrix:,\n",confusion_matrix(ywhites_tes, whitesLogit.predict(Xwhites_tes_scaled)))
print("Inliers confusion matrix:,\n",confusion_matrix(ywhitesInliers_tes, whitesInliersLogit.predict(XwhitesInliers_tes_scaled)))

Initial confusion matrix:,
 [[  1  22  21   1   1]
 [  0 194 169   1   0]
 [  1 109 408  32   0]
 [  0  11 162  47   0]
 [  0   1  29  15   0]]
Inliers confusion matrix:,
 [[  2  22  15   0   0]
 [  2 188 151   2   0]
 [  0  95 396  38   0]
 [  0  13 144  61   0]
 [  0   2  27  15   0]]


In [31]:
# raw logit score
display(whitesInliersLogit.score(XwhitesInliers_tr_scaled, ywhitesInliers_tr))
display(whitesInliersLogit.score(XwhitesInliers_tes_scaled, ywhitesInliers_tes))

0.5408018197327268

0.5515771526001705

In [32]:
whitesInliers['quality'].value_counts()

6      2114
5      1371
7       874
8-9     176
3-4     155
Name: quality, dtype: int64

In [33]:
# looking back, testing data is scaled funny
# try scaling test and train together, then splitting. probably weird to do, but let's see if it improves results


# scale using StandardScaler (lesson learned from unsupervised clustering)
whitesfullScaler = StandardScaler()

# fit to training data
whitesfullScaler.fit(Xwhites_unscaled)

# transform training and testing data
Xwhitesfull_scaled = whitesfullScaler.transform(Xwhites_unscaled)


# then split the scaled data into train and test sets
Xwhitesfull_tr_scaled, Xwhitesfull_tes_scaled, ywhitesfull_tr, ywhitesfull_tes = \
                train_test_split(Xwhitesfull_scaled, ywhites, stratify=ywhites, random_state=23)

In [34]:
# check new scaling worked
display(pd.DataFrame(data=Xwhitesfull_tr_scaled).describe())
display(pd.DataFrame(data=Xwhitesfull_tes_scaled).describe())
# dragged train set down to unscaledness of test data above

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
count,3673.0,3673.0,3673.0,3673.0,3673.0,3673.0,3673.0,3673.0,3673.0,3673.0,3673.0
mean,-0.003657,0.009145,-0.003815,0.009125,0.003878,0.004105,0.001697,0.001169,-0.010789,-0.008631,0.003734
std,0.995035,1.013265,1.008243,1.006958,1.034899,0.977783,0.993625,1.007892,0.98521,1.006816,1.004951
min,-3.146295,-1.966985,-2.761743,-1.141944,-1.683274,-1.958677,-3.04423,-2.313038,-2.77025,-2.364709,-2.043297
25%,-0.657501,-0.677101,-0.530476,-0.925047,-0.447335,-0.723775,-0.714474,-0.77823,-0.717068,-0.69971,-0.824276
50%,-0.064931,-0.180992,-0.199917,-0.234922,-0.126906,-0.076922,-0.102619,-0.082718,-0.054751,-0.173921,-0.092863
75%,0.527639,0.414339,0.378559,0.711536,0.193523,0.628736,0.673966,0.713109,0.607565,0.527131,0.719818
max,8.705106,8.153643,10.95642,11.714112,13.743076,6.538623,5.368777,15.031298,4.051612,5.171602,2.995326


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
count,1225.0,1225.0,1225.0,1225.0,1225.0,1225.0,1225.0,1225.0,1225.0,1225.0,1225.0
mean,0.010966,-0.027421,0.011437,-0.027361,-0.011626,-0.012308,-0.005087,-0.003507,0.03235,0.025879,-0.011197
std,1.015475,0.959434,0.9756,0.979153,0.887921,1.064543,1.019672,0.976769,1.043228,0.979644,0.985741
min,-3.620351,-1.966985,-2.761743,-1.102508,-1.454396,-1.899872,-2.832434,-2.306351,-3.101408,-1.926552,-1.718225
25%,-0.657501,-0.677101,-0.613115,-0.925047,-0.447335,-0.723775,-0.714474,-0.744792,-0.650836,-0.612079,-0.824276
50%,-0.064931,-0.180992,-0.117278,-0.274357,-0.126906,-0.076922,-0.126152,-0.109468,-0.054751,-0.08629,-0.174131
75%,0.527639,0.414339,0.461199,0.573511,0.193523,0.569932,0.697499,0.632857,0.673797,0.527131,0.719818
max,5.86077,5.028155,5.502211,4.970598,8.890871,14.918314,7.098444,5.441254,4.184075,4.295287,2.832789


In [35]:
# try the logit now anyway
# logistic regression
# instantiate model
# use random_state=23
# multi_class = 'multinomial' for multiple target classes
# solver='lbfgs' (default anyway)

whitesfullLogit = LogisticRegression(solver='lbfgs', multi_class='multinomial', random_state=23)

# fit the logit model to training data
whitesfullLogit.fit(Xwhitesfull_tr_scaled, ywhitesfull_tr)

LogisticRegression(multi_class='multinomial', random_state=23)

In [36]:
# raw accuracy scores
display(whitesfullLogit.score(Xwhitesfull_tr_scaled, ywhitesfull_tr))
display(whitesfullLogit.score(Xwhitesfull_tes_scaled, ywhitesfull_tes))

0.5505036754696433

0.5306122448979592

In [37]:
print("initial confusion:\n", confusion_matrix(ywhites_tes, whitesLogit.predict(Xwhites_tes_scaled)))
print("\"full\" confusion:\n", confusion_matrix(ywhitesfull_tes, whitesfullLogit.predict(Xwhitesfull_tes_scaled)))

initial confusion:
 [[  1  22  21   1   1]
 [  0 194 169   1   0]
 [  1 109 408  32   0]
 [  0  11 162  47   0]
 [  0   1  29  15   0]]
"full" confusion:
 [[  1  22  21   1   1]
 [  0 194 169   1   0]
 [  1 109 408  32   0]
 [  0  11 162  47   0]
 [  0   1  29  15   0]]


In [38]:
print("initial class report:\n", classification_report(ywhites_tes, whitesLogit.predict(Xwhites_tes_scaled)))
print("\"full\" class report:\n", classification_report(ywhitesfull_tes, whitesfullLogit.predict(Xwhitesfull_tes_scaled)))

initial class report:
               precision    recall  f1-score   support

         3-4       0.50      0.02      0.04        46
           5       0.58      0.53      0.55       364
           6       0.52      0.74      0.61       550
           7       0.49      0.21      0.30       220
         8-9       0.00      0.00      0.00        45

    accuracy                           0.53      1225
   macro avg       0.42      0.30      0.30      1225
weighted avg       0.51      0.53      0.49      1225

"full" class report:
               precision    recall  f1-score   support

         3-4       0.50      0.02      0.04        46
           5       0.58      0.53      0.55       364
           6       0.52      0.74      0.61       550
           7       0.49      0.21      0.30       220
         8-9       0.00      0.00      0.00        45

    accuracy                           0.53      1225
   macro avg       0.42      0.30      0.30      1225
weighted avg       0.51      0.5

In [39]:
# scores are the same, so the scaling wasn't the issue

In [40]:
# from brief discussion during class, try removing density column
whites_nodensity = whites.drop(columns=['density'])

In [41]:
# since removing outliers didn't do much, don't bother getting rid of them for this tr
# follow same process as initial

# split features and targets
# ywhites would be the same, but keep naming consistent
Xwhites_nodensity_unscaled = whites_nodensity.drop(columns=['quality']).values
ywhites_nodensity = whites_nodensity['quality'].values

# split into training and testing data
# default 75%-25%
# stratify the ywhites column so we don't end up with all '8-9' in the testing data, for example
Xwhites_nodens_tr_unscaled, Xwhites_nodens_tes_unscaled, ywhites_nodens_tr, ywhites_nodens_tes = \
                train_test_split(Xwhites_nodensity_unscaled, ywhites_nodensity, stratify=ywhites_nodensity, random_state=23)


# scale using StandardScaler
whitesnodensScaler = StandardScaler()

# fit to training data
whitesnodensScaler.fit(Xwhites_nodens_tr_unscaled)

# transform training and testing data
Xwhites_nodens_tr_scaled = whitesnodensScaler.transform(Xwhites_nodens_tr_unscaled)
Xwhites_nodens_tes_scaled = whitesnodensScaler.transform(Xwhites_nodens_tes_unscaled)

# logit
whitesnodensLogit = LogisticRegression(solver='lbfgs', multi_class='multinomial', random_state=23)

# fit the logit model to training data
whitesnodensLogit.fit(Xwhites_nodens_tr_scaled, ywhites_nodens_tr)

LogisticRegression(multi_class='multinomial', random_state=23)

In [42]:
# raw accuracy scores
display(whitesnodensLogit.score(Xwhites_nodens_tr_scaled, ywhites_nodens_tr))
display(whitesnodensLogit.score(Xwhites_nodens_tes_scaled, ywhites_nodens_tes))

0.5507759324802614

0.5322448979591837

In [43]:
print("Initial confusion matrix:,\n", confusion_matrix(ywhites_tes, whitesLogit.predict(Xwhites_tes_scaled)))
print("nodens confusion matrix:,\n", confusion_matrix(ywhites_nodens_tes, whitesnodensLogit.predict(Xwhites_nodens_tes_scaled)))

Initial confusion matrix:,
 [[  1  22  21   1   1]
 [  0 194 169   1   0]
 [  1 109 408  32   0]
 [  0  11 162  47   0]
 [  0   1  29  15   0]]
nodens confusion matrix:,
 [[  1  21  22   1   1]
 [  0 195 166   3   0]
 [  1 108 412  29   0]
 [  0  11 165  44   0]
 [  0   1  28  16   0]]


In [44]:
print("initial class report:\n", classification_report(ywhites_tes, whitesLogit.predict(Xwhites_tes_scaled)))
print("nodens class report:\n", classification_report(ywhites_nodens_tes, whitesnodensLogit.predict(Xwhites_nodens_tes_scaled)))

initial class report:
               precision    recall  f1-score   support

         3-4       0.50      0.02      0.04        46
           5       0.58      0.53      0.55       364
           6       0.52      0.74      0.61       550
           7       0.49      0.21      0.30       220
         8-9       0.00      0.00      0.00        45

    accuracy                           0.53      1225
   macro avg       0.42      0.30      0.30      1225
weighted avg       0.51      0.53      0.49      1225

nodens class report:
               precision    recall  f1-score   support

         3-4       0.50      0.02      0.04        46
           5       0.58      0.54      0.56       364
           6       0.52      0.75      0.61       550
           7       0.47      0.20      0.28       220
         8-9       0.00      0.00      0.00        45

    accuracy                           0.53      1225
   macro avg       0.41      0.30      0.30      1225
weighted avg       0.51      0.5

In [45]:
# class '8-9' still not being guessed *at all* and class '3-4' being guessed only 1.
# will need to try group 3-5 together and 7-9

In [46]:
whites2 = whites.copy()

In [47]:
whites2['quality'].value_counts()
#whites.info()

6      2198
5      1457
7       880
3-4     183
8-9     180
Name: quality, dtype: int64

In [48]:
whites2['quality'] = whites2['quality'].replace(to_replace=['5', '3-4', '7', '8-9'],
                                              value=['3-5', '3-5', '7-9', '7-9'])

In [49]:
whites2['quality'].value_counts()
# more balanced now

6      2198
3-5    1640
7-9    1060
Name: quality, dtype: int64

In [58]:
# split features and targets

Xwhites2_unscaled = whites2.drop(columns=['quality']).values
ywhites2 = whites2['quality'].values


# split into training and testing data
# default 75%-25%
# stratify the ywhites column so we don't end up with all '8-9' in the testing data, for example

Xwhites2_tr_unscaled, Xwhites2_tes_unscaled, ywhites2_tr, ywhites2_tes = \
                train_test_split(Xwhites2_unscaled, ywhites2, stratify=ywhites2, random_state=23)
# now scale using StandardScaler (lesson learned from unsupervised clustering)

whites2Scaler = StandardScaler()

# fit to training data
whites2Scaler.fit(Xwhites2_tr_unscaled)

# transform training and testing data
Xwhites2_tr_scaled = whites2Scaler.transform(Xwhites2_tr_unscaled)
Xwhites2_tes_scaled = whites2Scaler.transform(Xwhites2_tes_unscaled)


# logit
whites2Logit = LogisticRegression(solver='lbfgs', multi_class='multinomial', random_state=23)

# fit the logit model to training data
whites2Logit.fit(Xwhites2_tr_scaled, ywhites2_tr)


LogisticRegression(multi_class='multinomial', random_state=23)

In [59]:
# raw accuracy scores
display(whites2Logit.score(Xwhites2_tr_scaled, ywhites2_tr))
display(whites2Logit.score(Xwhites2_tes_scaled, ywhites2_tes))

0.5755513204465015

0.5885714285714285

In [60]:
confusion_matrix(ywhites2_tes, whites2Logit.predict(Xwhites2_tes_scaled))
# at least now each class is being predicted

array([[256, 146,   8],
       [130, 364,  56],
       [ 16, 148, 101]], dtype=int64)

In [61]:
print(classification_report(ywhites2_tes, whites2Logit.predict(Xwhites2_tes_scaled)))

              precision    recall  f1-score   support

         3-5       0.64      0.62      0.63       410
           6       0.55      0.66      0.60       550
         7-9       0.61      0.38      0.47       265

    accuracy                           0.59      1225
   macro avg       0.60      0.56      0.57      1225
weighted avg       0.59      0.59      0.58      1225



In [57]:
# try binary
whites3 = whites.copy()
whites3['quality'] = whites3['quality'].replace(to_replace=['5', '3-4', '6', '7', '8-9'],
                                              value=['3-6', '3-6', '3-6', '7-9', '7-9'])
whites3['quality'].value_counts()

3-6    3838
7-9    1060
Name: quality, dtype: int64

In [62]:
# split features and targets

Xwhites3_unscaled = whites3.drop(columns=['quality']).values
ywhites3 = whites3['quality'].values


# split into training and testing data
# default 75%-25%
# stratify the ywhites column so we don't end up with all '8-9' in the testing data, for example

Xwhites3_tr_unscaled, Xwhites3_tes_unscaled, ywhites3_tr, ywhites3_tes = \
                train_test_split(Xwhites3_unscaled, ywhites3, stratify=ywhites3, random_state=23)
# now scale using StandardScaler (lesson learned from unsupervised clustering)

whites3Scaler = StandardScaler()

# fit to training data
whites3Scaler.fit(Xwhites3_tr_unscaled)

# transform training and testing data
Xwhites3_tr_scaled = whites3Scaler.transform(Xwhites3_tr_unscaled)
Xwhites3_tes_scaled = whites3Scaler.transform(Xwhites3_tes_unscaled)


# logit
# leave off multi_class argument this time for binary
whites3Logit = LogisticRegression(solver='lbfgs', random_state=23)

# fit the logit model to training data
whites3Logit.fit(Xwhites3_tr_scaled, ywhites3_tr)


LogisticRegression(random_state=23)

In [63]:
# raw accuracy scores
display(whites3Logit.score(Xwhites3_tr_scaled, ywhites3_tr))
display(whites3Logit.score(Xwhites3_tes_scaled, ywhites3_tes))

0.7977130411108087

0.8114285714285714

In [64]:
confusion_matrix(ywhites3_tes, whites3Logit.predict(Xwhites3_tes_scaled))

array([[915,  45],
       [186,  79]], dtype=int64)

In [65]:
print(classification_report(ywhites3_tes, whites3Logit.predict(Xwhites3_tes_scaled)))

              precision    recall  f1-score   support

         3-6       0.83      0.95      0.89       960
         7-9       0.64      0.30      0.41       265

    accuracy                           0.81      1225
   macro avg       0.73      0.63      0.65      1225
weighted avg       0.79      0.81      0.78      1225



In [75]:
# didn't label encode the qualities and that might cause problems


whites4 = whites.copy()
whites4['quality'].value_counts()

6      2198
5      1457
7       880
3-4     183
8-9     180
Name: quality, dtype: int64

In [77]:
whites4['quality'] = whites4['quality'].replace(to_replace=['3-4', '5', '6', '7', '8-9'],
                                               value=[0, 1, 2, 3, 4]).astype(int)
whites4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4898 non-null   float64
 1   volatile acidity      4898 non-null   float64
 2   citric acid           4898 non-null   float64
 3   residual sugar        4898 non-null   float64
 4   chlorides             4898 non-null   float64
 5   free sulfur dioxide   4898 non-null   float64
 6   total sulfur dioxide  4898 non-null   float64
 7   density               4898 non-null   float64
 8   pH                    4898 non-null   float64
 9   sulphates             4898 non-null   float64
 10  alcohol               4898 non-null   float64
 11  quality               4898 non-null   int32  
dtypes: float64(11), int32(1)
memory usage: 440.2 KB


In [79]:
whites4['quality'].value_counts()

2    2198
1    1457
3     880
0     183
4     180
Name: quality, dtype: int64

In [80]:
# split features and targets

Xwhites4_unscaled = whites4.drop(columns=['quality']).values
ywhites4 = whites4['quality'].values


# split into training and testing data
# default 75%-25%
# stratify the ywhites column so we don't end up with all '8-9' in the testing data, for example

Xwhites4_tr_unscaled, Xwhites4_tes_unscaled, ywhites4_tr, ywhites4_tes = \
                train_test_split(Xwhites4_unscaled, ywhites4, stratify=ywhites4, random_state=23)
# now scale using StandardScaler (lesson learned from unsupervised clustering)

whites4Scaler = StandardScaler()

# fit to training data
whites4Scaler.fit(Xwhites4_tr_unscaled)

# transform training and testing data
Xwhites4_tr_scaled = whites4Scaler.transform(Xwhites4_tr_unscaled)
Xwhites4_tes_scaled = whites4Scaler.transform(Xwhites4_tes_unscaled)


# logit, multinomial again
whites4Logit = LogisticRegression(solver='lbfgs', random_state=23, multi_class='multinomial')

# fit the logit model to training data
whites4Logit.fit(Xwhites4_tr_scaled, ywhites4_tr)


LogisticRegression(multi_class='multinomial', random_state=23)

In [81]:
# raw accuracy scores
display(whites4Logit.score(Xwhites4_tr_scaled, ywhites4_tr))
display(whites4Logit.score(Xwhites4_tes_scaled, ywhites4_tes))

# scores didn't change, so the model must be encoding itself under the hood, if it'd matter

0.5505036754696433

0.5306122448979592

In [82]:
confusion_matrix(ywhites4_tes, whites4Logit.predict(Xwhites4_tes_scaled))

array([[  1,  22,  21,   1,   1],
       [  0, 194, 169,   1,   0],
       [  1, 109, 408,  32,   0],
       [  0,  11, 162,  47,   0],
       [  0,   1,  29,  15,   0]], dtype=int64)

In [87]:
# try new models on whites4 with diff c values

# logit, multinomial again, C=0.01 (default l2 penalty)
whites4LogitC001 = LogisticRegression(solver='lbfgs', random_state=23, multi_class='multinomial', C=0.01)
# fit the logit model to training data
whites4LogitC001.fit(Xwhites4_tr_scaled, ywhites4_tr)
print("c=0.01 train score: ", whites4LogitC001.score(Xwhites4_tr_scaled, ywhites4_tr))
print("c=0.01 test score: ", whites4LogitC001.score(Xwhites4_tes_scaled, ywhites4_tes))
print("c=0.01 confusion:\n", confusion_matrix(ywhites4_tes, whites4LogitC001.predict(Xwhites4_tes_scaled)))

# logit, multinomial again, C=0.1
whites4LogitC01 = LogisticRegression(solver='lbfgs', random_state=23, multi_class='multinomial', C=0.1)
# fit the logit model to training data
whites4LogitC01.fit(Xwhites4_tr_scaled, ywhites4_tr)
print("c=0.1 train score: ", whites4LogitC01.score(Xwhites4_tr_scaled, ywhites4_tr))
print("c=0.1 test score: ", whites4LogitC01.score(Xwhites4_tes_scaled, ywhites4_tes))
print("c=0.1 confusion:\n", confusion_matrix(ywhites4_tes, whites4LogitC01.predict(Xwhites4_tes_scaled)))


# logit, multinomial again, C=0.001
whites4LogitC0001 = LogisticRegression(solver='lbfgs', random_state=23, multi_class='multinomial', C=0.001)
# fit the logit model to training data
whites4LogitC0001.fit(Xwhites4_tr_scaled, ywhites4_tr)
print("c=0.001 train score: ", whites4LogitC0001.score(Xwhites4_tr_scaled, ywhites4_tr))
print("c=0.001 test score: ", whites4LogitC0001.score(Xwhites4_tes_scaled, ywhites4_tes))
print("c=0.001 confusion:\n", confusion_matrix(ywhites4_tes, whites4LogitC0001.predict(Xwhites4_tes_scaled)))

# logit, multinomial again, C=0.5
whites4LogitC05 = LogisticRegression(solver='lbfgs', random_state=23, multi_class='multinomial', C=0.5)
# fit the logit model to training data
whites4LogitC05.fit(Xwhites4_tr_scaled, ywhites4_tr)
print("c=0.5 train score: ", whites4LogitC05.score(Xwhites4_tr_scaled, ywhites4_tr))
print("c=0.5 test score: ", whites4LogitC05.score(Xwhites4_tes_scaled, ywhites4_tes))
print("c=0.5 confusion:\n", confusion_matrix(ywhites4_tes, whites4LogitC05.predict(Xwhites4_tes_scaled)))


# logit, multinomial again, no penalty
whites4Logitnopen = LogisticRegression(solver='lbfgs', random_state=23, multi_class='multinomial', penalty='none')
# fit the logit model to training data
whites4Logitnopen.fit(Xwhites4_tr_scaled, ywhites4_tr)
print("no pen train score: ", whites4Logitnopen.score(Xwhites4_tr_scaled, ywhites4_tr))
print("no pen test score: ", whites4Logitnopen.score(Xwhites4_tes_scaled, ywhites4_tes))
print("no pen confusion:\n", confusion_matrix(ywhites4_tes, whites4Logitnopen.predict(Xwhites4_tes_scaled)))

# logit, multinomial again, C=2.0
whites4LogitC20 = LogisticRegression(solver='lbfgs', random_state=23, multi_class='multinomial', C=2.0)
# fit the logit model to training data
whites4LogitC20.fit(Xwhites4_tr_scaled, ywhites4_tr)
print("c=2.0 train score: ", whites4LogitC20.score(Xwhites4_tr_scaled, ywhites4_tr))
print("c=2.0 test score: ", whites4LogitC20.score(Xwhites4_tes_scaled, ywhites4_tes))
print("c=2.0 confusion:\n", confusion_matrix(ywhites4_tes, whites4LogitC20.predict(Xwhites4_tes_scaled)))


c=0.01 train score:  0.5415191941192485
c=0.01 test score:  0.5248979591836734
c=0.01 confusion:
 [[  0  23  20   3   0]
 [  0 181 183   0   0]
 [  0 104 426  20   0]
 [  0  11 173  36   0]
 [  0   2  28  15   0]]
c=0.1 train score:  0.5505036754696433
c=0.1 test score:  0.5265306122448979
c=0.1 confusion:
 [[  1  22  21   1   1]
 [  0 189 174   1   0]
 [  1 107 410  32   0]
 [  0  11 164  45   0]
 [  0   2  28  15   0]]
c=0.001 train score:  0.5085760958344677
c=0.001 test score:  0.4840816326530612
c=0.001 confusion:
 [[  0  17  28   1   0]
 [  0 118 246   0   0]
 [  0  75 474   1   0]
 [  0   9 210   1   0]
 [  0   1  42   2   0]]
c=0.5 train score:  0.5491423904165532
c=0.5 test score:  0.5314285714285715
c=0.5 confusion:
 [[  1  22  21   1   1]
 [  0 194 169   1   0]
 [  1 109 410  30   0]
 [  0  11 163  46   0]
 [  0   1  29  15   0]]
no pen train score:  0.5496869044377892
no pen test score:  0.5289795918367347
no pen confusion:
 [[  1  22  21   1   1]
 [  1 193 169   1   0]
 [ 