In [None]:
# importando bibliotecas
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, make_scorer, f1_score,precision_score,recall_score, classification_report, confusion_matrix, plot_confusion_matrix

In [None]:
data = pd.read_csv("wine_quality.csv")

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed.acidity         6497 non-null   float64
 1   volatile.acidity      6497 non-null   float64
 2   citric.acid           6497 non-null   float64
 3   residual.sugar        6497 non-null   float64
 4   chlorides             6497 non-null   float64
 5   free.sulfur.dioxide   6497 non-null   float64
 6   total.sulfur.dioxide  6497 non-null   float64
 7   density               6497 non-null   float64
 8   pH                    6497 non-null   float64
 9   sulphates             6497 non-null   float64
 10  alcohol               6497 non-null   float64
 11  quality               6497 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 609.2 KB


In [None]:
data.head()

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [None]:
def print_distribution(arr, print_nl=True):
    arr = np.unique(arr, return_counts=True)[1] / len(arr)
    for i in range(arr.shape[0]):
        print("Classe %d: %.2f%%" %(i, arr[i]*100))

    if print_nl:
        print("\n")

In [None]:
def taste(quality):
  if quality <= 5:
    return 1
  elif quality > 5 and quality <= 7:
    return 2
  else:
    return 3

In [None]:
new_data = data
new_data["taste"] = data["quality"].apply(taste)

In [None]:
# separação de target e features
X = new_data.iloc[:, 0:11].values
y = new_data.iloc[:, 12].values

In [None]:
print("Proporções por classe no dataset em geral:")
print_distribution(y)

kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=199)
fold = 0
for train, test in kf.split(X, y):
    print("Fold %d" %(fold))
    print_distribution(y[train], print_nl=(fold != 2))
    fold += 1

Proporções por classe no dataset em geral:
Classe 0: 36.69%
Classe 1: 60.26%
Classe 2: 3.05%


Fold 0
Classe 0: 36.69%
Classe 1: 60.27%
Classe 2: 3.04%


Fold 1
Classe 0: 36.69%
Classe 1: 60.27%
Classe 2: 3.04%


Fold 2
Classe 0: 36.69%
Classe 1: 60.27%
Classe 2: 3.04%
Fold 3
Classe 0: 36.69%
Classe 1: 60.27%
Classe 2: 3.04%


Fold 4
Classe 0: 36.70%
Classe 1: 60.25%
Classe 2: 3.04%


Fold 5
Classe 0: 36.70%
Classe 1: 60.25%
Classe 2: 3.04%


Fold 6
Classe 0: 36.70%
Classe 1: 60.25%
Classe 2: 3.04%


Fold 7
Classe 0: 36.70%
Classe 1: 60.24%
Classe 2: 3.06%


Fold 8
Classe 0: 36.70%
Classe 1: 60.24%
Classe 2: 3.06%


Fold 9
Classe 0: 36.70%
Classe 1: 60.26%
Classe 2: 3.04%




In [None]:
kf_results = list()
kf_conf_mat = list()
kf_prec = list()
kf_f1 = list()
kf_rec = list()

scores = {
  'accuracy': make_scorer(accuracy_score),
  'precision': make_scorer(precision_score, average = 'macro'),
  'recall': make_scorer(recall_score, average = 'macro'),
  'f1_macro': make_scorer(f1_score, average = 'macro'),
  'f1_weighted': make_scorer(f1_score, average = 'weighted')
}

In [23]:
for train_ix, test_ix in kf.split(X,y):
  X_train, X_test = X[train_ix, :], X[test_ix, :]
  y_train, y_test = y[train_ix], y[test_ix]
  
  classifier = SVC()

  parameters = {
    "C": [1], 
    "kernel": ["linear"]
  }
  # define search space
  search = GridSearchCV(estimator=classifier, param_grid=parameters, scoring="accuracy", cv=3, refit=True)
  # execute search
  result = search.fit(X_train, y_train)
  # get the best performing model fit on the whole training set
  best_model = result.best_estimator_
  # evaluate model on the hold out dataset
  yhat = best_model.predict(X_test)
  # evaluate the model
  acc = accuracy_score(y_test, yhat)
  conf_mat = confusion_matrix(y_test, yhat)
  precision = precision_score(y_test, yhat, average = "macro")
  f1 = f1_score(y_test, yhat, average = "weighted")
  recall = recall_score(y_test, yhat, average = "macro")
  # store the result
  #kf_class_rep.append(class_rep)
  kf_conf_mat.append(conf_mat)
  kf_results.append(acc)
  kf_prec.append(precision)
  kf_f1.append(f1)
  kf_rec.append(recall)
  # report progress
  print(">acc=%.3f,>prec=%.3f,>f1=%.3f,>rec=%.3f est=%.3f, cfg=%s" % (acc,precision,f1,recall, result.best_score_, result.best_params_))
  print(conf_mat)

  _warn_prf(average, modifier, msg_start, len(result))


>acc=0.746,>prec=0.494,>f1=0.731,>rec=0.495 est=0.682, cfg={'C': 1, 'kernel': 'linear'}
[[150  89   0]
 [ 56 335   0]
 [  0  20   0]]


  _warn_prf(average, modifier, msg_start, len(result))


>acc=0.709,>prec=0.467,>f1=0.692,>rec=0.465 est=0.683, cfg={'C': 1, 'kernel': 'linear'}
[[132 107   0]
 [ 62 329   0]
 [  1  19   0]]


In [None]:
print('Accuracy: %.3f (%.3f)' % (np.mean(kf_results), np.std(kf_results)))
print('Precision: %.3f (%.3f)' % (np.mean(kf_prec), np.std(kf_prec)))
print('F1-score: %.3f (%.3f)' % (np.mean(kf_f1), np.std(kf_f1)))
print('Recall: %.3f (%.3f)' % (np.mean(kf_rec), np.std(kf_rec)))

In [None]:
kf_conf_mat

In [None]:
# matriz de confusão
plot_confusion_matrix(result, X_test, y_test)
plt.show()

In [None]:
model = SVC(C=1, kernel="linear")
model.fit(X_train, y_train)

SVC(C=1, kernel='linear')

In [None]:
print(classification_report(y_test, model.predict(X_test)))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00        10
           4       0.00      0.00      0.00        51
           5       0.62      0.59      0.60       659
           6       0.50      0.78      0.61       837
           7       0.00      0.00      0.00       335
           8       0.00      0.00      0.00        55
           9       0.00      0.00      0.00         3

    accuracy                           0.54      1950
   macro avg       0.16      0.20      0.17      1950
weighted avg       0.42      0.54      0.47      1950



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
