In [611]:
import pandas as pd
from pathlib import Path

data = pd.read_csv(Path("winequality-red.csv"))


In [612]:
from pandas.plotting import scatter_matrix

corr_matrix = data.corr()

corr_matrix.style.background_gradient(cmap="twilight")


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
fixed acidity,1.0,-0.256131,0.671703,0.114777,0.093705,-0.153794,-0.113181,0.668047,-0.682978,0.183006,-0.061668,0.124052
volatile acidity,-0.256131,1.0,-0.552496,0.001918,0.061298,-0.010504,0.07647,0.022026,0.234937,-0.260987,-0.202288,-0.390558
citric acid,0.671703,-0.552496,1.0,0.143577,0.203823,-0.060978,0.035533,0.364947,-0.541904,0.31277,0.109903,0.226373
residual sugar,0.114777,0.001918,0.143577,1.0,0.05561,0.187049,0.203028,0.355283,-0.085652,0.005527,0.042075,0.013732
chlorides,0.093705,0.061298,0.203823,0.05561,1.0,0.005562,0.0474,0.200632,-0.265026,0.37126,-0.221141,-0.128907
free sulfur dioxide,-0.153794,-0.010504,-0.060978,0.187049,0.005562,1.0,0.667666,-0.021946,0.070377,0.051658,-0.069408,-0.050656
total sulfur dioxide,-0.113181,0.07647,0.035533,0.203028,0.0474,0.667666,1.0,0.071269,-0.066495,0.042947,-0.205654,-0.1851
density,0.668047,0.022026,0.364947,0.355283,0.200632,-0.021946,0.071269,1.0,-0.341699,0.148506,-0.49618,-0.174919
pH,-0.682978,0.234937,-0.541904,-0.085652,-0.265026,0.070377,-0.066495,-0.341699,1.0,-0.196648,0.205633,-0.057731
sulphates,0.183006,-0.260987,0.31277,0.005527,0.37126,0.051658,0.042947,0.148506,-0.196648,1.0,0.093595,0.251397


There seems to be no correlation between quality and:
- pH
- free sulfur dioxide
- residual sugar
- fixed acidity

In [613]:
import numpy as np

# mark quality as either bad (0) or good (1)
data["quality"] = np.where(data["quality"] > 5, 1, 0)

data = data.drop("pH", axis=1)
data = data.drop("free sulfur dioxide", axis=1)
data = data.drop("residual sugar", axis=1)


In [614]:
from sklearn.model_selection import train_test_split

x = data.drop("quality", axis=1)
y = data["quality"]

x_train, x_test, y_train, y_test = train_test_split(
    x, y, shuffle=True, test_size=0.2, random_state=42)


In [615]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)


In [616]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

lr = LogisticRegression(max_iter=1000)
lr.fit(x_train, y_train)

lr_predict = lr.predict(x_test)
lr_conf_matrix = confusion_matrix(y_test, lr_predict)
lr_acc_score = accuracy_score(y_test, lr_predict)

print(lr_conf_matrix)
print(lr_acc_score)


[[109  32]
 [ 48 131]]
0.75


In [617]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()
dt.fit(x_train, y_train)
dt_predict = dt.predict(x_test)

dt_conf_matrix = confusion_matrix(y_test, dt_predict)
dt_acc_score = accuracy_score(y_test, dt_predict)
print(dt_conf_matrix)
print(dt_acc_score)


[[ 94  47]
 [ 46 133]]
0.709375


In [618]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()
nb.fit(x_train, y_train)
nb_predict = nb.predict(x_test)
nb_conf_matrix = confusion_matrix(y_test, nb_predict)
nb_acc_score = accuracy_score(y_test, nb_predict)
print(nb_conf_matrix)
print(nb_acc_score)


[[102  39]
 [ 44 135]]
0.740625


In [619]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(max_features="sqrt")
rf.fit(x_train, y_train)
rf_predict = rf.predict(x_test)

rf_conf_matrix = confusion_matrix(y_test, rf_predict)
rf_acc_score = accuracy_score(y_test, rf_predict)
print(rf_conf_matrix)
print(rf_acc_score)


[[108  33]
 [ 32 147]]
79.6875


In [620]:
from sklearn.svm import SVC

rbf_svc = SVC(kernel="rbf")
rbf_svc.fit(x_train, y_train)
rbf_svc = rf.predict(x_test)
rbf_svc_conf_matrix = confusion_matrix(y_test, rf_predict)
rbf_svc_acc_score = accuracy_score(y_test, rf_predict)
print(rbf_svc_conf_matrix)
print(rbf_svc_acc_score)


[[108  33]
 [ 32 147]]
79.6875


In [621]:
from sklearn.linear_model import SGDClassifier

sgd = SGDClassifier()
sgd.fit(x_train, y_train)
sgd_predict = sgd.predict(x_test)
sgd_conf_matrix = confusion_matrix(y_test, sgd_predict)
sgd_acc_score = accuracy_score(y_test, sgd_predict)
print(sgd_conf_matrix)
print(sgd_acc_score)

[[ 54  87]
 [ 18 161]]
0.671875


In [622]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(x_train, y_train)
knn_predict = knn.predict(x_test)

knn_conf_matrix = confusion_matrix(y_test, knn_predict)
knn_acc_score = accuracy_score(y_test, knn_predict)

print(knn_conf_matrix)
print(knn_acc_score)


[[ 95  46]
 [ 42 137]]
0.725
