In [1]:
import pandas as pd
from inflection import underscore, parameterize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from yellowbrick.classifier import ConfusionMatrix
from sklearn.metrics import accuracy_score, classification_report
from statsmodels.stats.descriptivestats import describe, Description

In [2]:
df = pd.read_csv('data/winequality-red.csv', sep=';', low_memory=False)
df.rename(columns={x: underscore(parameterize(x)) for x in df.columns}, inplace=True)
df.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [9]:
describe(df, stats=['nobs', 'missing', 'mean', 'median','mode','std','iqr','skew',
                    'kurtosis', 'min', 'max', 'coef_var','distinct'], percentiles=['25','50','75'])

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
nobs,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
missing,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
iqr,2.1,0.25,0.33,0.7,0.02,14.0,40.0,0.002235,0.19,0.18,1.6,1.0
coef_var,0.209276,0.339244,0.718888,0.555351,0.538095,0.658911,0.707917,0.001893,0.046627,0.257551,0.102242,0.143287
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
skew,0.981829,0.670962,0.318039,4.536395,5.675017,1.249394,1.514109,0.071221,0.193502,2.426393,0.860021,0.217597
kurtosis,4.124856,4.217963,2.209717,31.524438,44.581708,5.01349,6.794172,3.927411,3.800671,14.679884,3.195654,3.292031


In [6]:
df.dtypes

fixed_acidity           float64
volatile_acidity        float64
citric_acid             float64
residual_sugar          float64
chlorides               float64
free_sulfur_dioxide     float64
total_sulfur_dioxide    float64
density                 float64
ph                      float64
sulphates               float64
alcohol                 float64
quality                   int64
dtype: object

In [3]:
df.isnull().sum()

fixed_acidity           0
volatile_acidity        0
citric_acid             0
residual_sugar          0
chlorides               0
free_sulfur_dioxide     0
total_sulfur_dioxide    0
density                 0
ph                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [14]:
df.corr(method='pearson')

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
fixed_acidity,1.0,-0.256131,0.671703,0.114777,0.093705,-0.153794,-0.113181,0.668047,-0.682978,0.183006,-0.061668,0.124052
volatile_acidity,-0.256131,1.0,-0.552496,0.001918,0.061298,-0.010504,0.07647,0.022026,0.234937,-0.260987,-0.202288,-0.390558
citric_acid,0.671703,-0.552496,1.0,0.143577,0.203823,-0.060978,0.035533,0.364947,-0.541904,0.31277,0.109903,0.226373
residual_sugar,0.114777,0.001918,0.143577,1.0,0.05561,0.187049,0.203028,0.355283,-0.085652,0.005527,0.042075,0.013732
chlorides,0.093705,0.061298,0.203823,0.05561,1.0,0.005562,0.0474,0.200632,-0.265026,0.37126,-0.221141,-0.128907
free_sulfur_dioxide,-0.153794,-0.010504,-0.060978,0.187049,0.005562,1.0,0.667666,-0.021946,0.070377,0.051658,-0.069408,-0.050656
total_sulfur_dioxide,-0.113181,0.07647,0.035533,0.203028,0.0474,0.667666,1.0,0.071269,-0.066495,0.042947,-0.205654,-0.1851
density,0.668047,0.022026,0.364947,0.355283,0.200632,-0.021946,0.071269,1.0,-0.341699,0.148506,-0.49618,-0.174919
ph,-0.682978,0.234937,-0.541904,-0.085652,-0.265026,0.070377,-0.066495,-0.341699,1.0,-0.196648,0.205633,-0.057731
sulphates,0.183006,-0.260987,0.31277,0.005527,0.37126,0.051658,0.042947,0.148506,-0.196648,1.0,0.093595,0.251397


In [16]:
df['quality'].value_counts()

5    681
6    638
7    199
4     53
8     18
3     10
Name: quality, dtype: int64

In [4]:
df.describe()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [8]:
X_df = df.iloc[:,:11]
y_df = df['quality']

scaler = MinMaxScaler()
df_scale = scaler.fit_transform(X_df)

X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.3, random_state=1)

In [12]:
df_scale.min()

0.0

In [17]:
clf_KNN = KNeighborsClassifier(n_neighbors=5)
clf_KNN.fit(X_train, y_train)
previsoes = clf_KNN.predict(X_test)

print(accuracy_score(y_test, previsoes) * 100)

# cm = ConfusionMatrix(clf_KNN)
# cm.fit(X_train, y_train)
# cm.score(X_test, y_test)

# print(classification_report(y_test, previsoes))

50.416666666666664


In [7]:
clf_arvore = DecisionTreeClassifier()
clf_arvore.fit(X_train, y_train)
previsoes = clf_arvore.predict(X_test)

print(accuracy_score(y_test, previsoes) * 100)

# cm = ConfusionMatrix(clf_arvore)
# cm.fit(X_train, y_train)
# cm.score(X_test, y_test)

# print(classification_report(y_test, previsoes))

60.20833333333333


In [8]:
clf_floresta = RandomForestClassifier(max_depth=10, random_state=1)
clf_floresta.fit(X_train, y_train)
previsoes = clf_floresta.predict(X_test)

print(accuracy_score(y_test, previsoes) * 100)

# cm = ConfusionMatrix(clf_floresta)
# cm.fit(X_train, y_train)
# cm.score(X_test, y_test)

# print(classification_report(y_test, previsoes))

68.125


In [34]:
clf_svm=SVC(gamma='auto',kernel='rbf')
clf_svm.fit(X_train, y_train)
previsoes = clf_svm.predict(X_test)

print(accuracy_score(y_test, previsoes) * 100)

# cm = ConfusionMatrix(clf_svm)
# cm.fit(X_train, y_train)
# cm.score(X_test, y_test)

# print(classification_report(y_test, previsoes))

56.458333333333336


In [33]:
clf_mlp = MLPClassifier( alpha=1e-5, hidden_layer_sizes=(5,5), random_state=1)
clf_mlp.fit(X_train, y_train)
previsoes = clf_mlp.predict(X_test)

print(accuracy_score(y_test, previsoes) * 100)

# cm = ConfusionMatrix(clf_mlp)
# cm.fit(X_train, y_train)
# cm.score(X_test, y_test)

# print(classification_report(y_test, previsoes))

48.75




In [27]:
df['qualit'] = df['quality'].apply(lambda x: 'bom' if x>5 else 'ruim')

X_df = df.iloc[:,:11]
y_df = df.iloc[:,12]

scaler = MinMaxScaler()
df_scale = scaler.fit_transform(X_df)

X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.3, random_state=1)

clf_mlp = MLPClassifier( alpha=1e-5, hidden_layer_sizes=(5,5), random_state=1)
clf_mlp.fit(X_train, y_train)
previsoes = clf_mlp.predict(X_test)

print(accuracy_score(y_test, previsoes) * 100)

61.458333333333336


