In [21]:
import pandas as pd
import numpy as np


In [22]:
# Load the red-wine quality dataset from a local copy of the UCI file.
# URL: https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv
# NOTE(review): read_csv is called with its default "," separator, while the
# file published at the URL above is ";"-delimited, so the local copy is
# presumably a re-saved comma-separated version -- confirm before re-downloading.
df = pd.read_csv("./winequality-red.csv")
df.head()


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [23]:
# Column dtypes and non-null counts; a quick check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [24]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [25]:
# Per-column means.
df.mean()

fixed acidity            8.319637
volatile acidity         0.527821
citric acid              0.270976
residual sugar           2.538806
chlorides                0.087467
free sulfur dioxide     15.874922
total sulfur dioxide    46.467792
density                  0.996747
pH                       3.311113
sulphates                0.658149
alcohol                 10.422983
quality                  5.636023
dtype: float64

In [26]:
# Per-column sample variance (pandas uses ddof=1 by default).
df.var()

fixed acidity              3.031416
volatile acidity           0.032062
citric acid                0.037947
residual sugar             1.987897
chlorides                  0.002215
free sulfur dioxide      109.414884
total sulfur dioxide    1082.102373
density                    0.000004
pH                         0.023835
sulphates                  0.028733
alcohol                    1.135647
quality                    0.652168
dtype: float64

In [27]:
# Mean quality score; used below as the threshold that separates good from bad wines.
df["quality"].mean()

5.6360225140712945

In [28]:
# Features: every column except the last; target source: the last column ("quality").
X = df.iloc[:, :-1]
quality = df.iloc[:, -1]
# Wine quality above mean is taken as Good quality wine and below mean as Bad
# quality wine; True is for good and False is for bad.
# The mean is computed once and the comparison is vectorized, so `y` is a 1-D
# boolean array whose length follows the data rather than a hard-coded row count.
quality_threshold = df["quality"].mean()
y = (quality > quality_threshold).to_numpy()
print(y)

[False False False ...  True False  True]


In [29]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows as a test set; random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,random_state=0)

In [30]:
from sklearn.preprocessing import Normalizer, StandardScaler

# Learn the per-feature mean and standard deviation from the training rows
# only, then apply that same transformation to both splits so that no
# statistics from the test rows are used.
X_sca = StandardScaler().fit(X_train)
X_train = X_sca.transform(X_train)
X_test = X_sca.transform(X_test)


In [31]:
# Train with logistic regression
from sklearn.linear_model import LogisticRegression
# random_state is fixed for reproducibility; every other hyperparameter is left at its default.
clf = LogisticRegression(random_state=0)
# Kept as the last expression so the notebook displays the fitted estimator.
clf.fit(X_train, y_train)

LogisticRegression(random_state=0)

In [32]:
# Predicted class (True = good, False = bad) for every held-out wine.
y_pred = clf.predict(X_test)

In [33]:

# Confusion matrix for the model
# Rows are the actual classes and columns the predicted classes, both in
# sorted label order (False, True), i.e. [[TN, FP], [FN, TP]].
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)


[[134  51]
 [ 48 167]]


In [34]:
# Mean accuracy of the fitted classifier on each split; comparing the two
# numbers is a quick check for over- or under-fitting.
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)
print(
    f"Score on training set : {train_score}\nScore on test set : {test_score}")

Score on training set : 0.7422852376980817
Score on test set : 0.7525


In [35]:
from sklearn.model_selection import KFold, cross_val_score

# Number of cross-validation folds.
k = 5

# Standardized copy of the full feature matrix, for the runs that compare
# scaled against unscaled inputs.
# NOTE(review): the scaler is fit on every row before cross-validation, so each
# fold's held-out rows contribute to the scaling statistics. Wrapping the scaler
# and the model in a Pipeline passed to cross_val_score would avoid that leak.
X_scaler1 = StandardScaler()
X1 = X_scaler1.fit_transform(X)

# Consecutive, unshuffled folds (KFold's defaults), so the split is deterministic.
kf = KFold(n_splits=k)


In [36]:
# liblinear: the "Library for Large Linear Classification" solver. Some solvers
# do well in spite of unscaled features, but scaling still gives slightly
# better accuracy here.
model1 = LogisticRegression(solver='liblinear')

# Baseline: k-fold accuracy on the raw, unscaled features.
result = cross_val_score(model1, X, y, cv=kf)

print("Avg accuracy before scaling: {}".format(result.mean()))

# Same model and folds, now on the standardized features.
result = cross_val_score(model1, X1, y, cv=kf)

print("Avg accuracy after scaling: {}".format(result.mean()))
# Scaling improves the average accuracy only marginally.


Avg accuracy before scaling: 0.72983934169279
Avg accuracy after scaling: 0.7323432601880878


In [37]:
# newton-cg: a Newton-type solver, evaluated here on the unscaled features.
model2 = LogisticRegression(solver='newton-cg')

result = cross_val_score(model2, X, y, cv=kf)

# The different solver options give similar average accuracy.
print("Avg accuracy: {}".format(result.mean()))

Avg accuracy: 0.7385971786833856


In [38]:
# sag: Stochastic Average Gradient descent. Its fast convergence is only
# guaranteed when features share a similar scale, so the standardized X1 is used.
model3 = LogisticRegression(solver='sag')

result = cross_val_score(model3, X1, y, cv=kf)

# The different solver options give similar average accuracy.
print("Avg accuracy: {}".format(result.mean()))


Avg accuracy: 0.7323432601880878


In [39]:
# Default solver is lbfgs (Limited-memory Broyden–Fletcher–Goldfarb–Shanno);
# it converges more reliably on scaled data, so the standardized X1 is used.
model4 = LogisticRegression()

result = cross_val_score(model4, X1, y, cv=kf)

# The different solver options give similar average accuracy.
print("Avg accuracy: {}".format(result.mean()))


Avg accuracy: 0.7323432601880878


In [40]:
# Accuracy depends on how the data is scaled.
# Normalizer rescales each sample (row) to unit norm instead of standardizing
# each feature (column); with this preprocessing the average accuracy drops
# noticeably compared with the StandardScaler runs.
X_norm2 = Normalizer()
X4 = X_norm2.fit_transform(X)

# NOTE(review): this rebinds `model4`, replacing the estimator created for the lbfgs run.
model4 = LogisticRegression()

result = cross_val_score(model4, X4, y, cv=kf)

print("Avg accuracy: {}".format(result.mean()))


Avg accuracy: 0.619739420062696
