In [216]:
# Importando bibliotecas padrão
import pandas as pd
import numpy as np

# Importando classificadores diversos
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

# Importando train/test split e Scaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [217]:
# Colocando os dados em um dataframe
df = pd.read_csv('winequality-red.csv', sep=';')

In [218]:
# Observando os 5 primeiros exemplos
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [219]:
# Shape do dataframe
# 1599 exemplos
# 12 colunas 
df.shape

(1599, 12)

In [220]:
# Verificando tipos de dados unicos
df.dtypes.unique()

array([dtype('float64'), dtype('int64')], dtype=object)

In [221]:
# Valores nulos em cada coluna
df.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [222]:
# % de valores nulos
df.isnull().mean()

fixed acidity           0.0
volatile acidity        0.0
citric acid             0.0
residual sugar          0.0
chlorides               0.0
free sulfur dioxide     0.0
total sulfur dioxide    0.0
density                 0.0
pH                      0.0
sulphates               0.0
alcohol                 0.0
quality                 0.0
dtype: float64

In [223]:
# Desvio padrão de cada coluna
df.std()

fixed acidity            1.741096
volatile acidity         0.179060
citric acid              0.194801
residual sugar           1.409928
chlorides                0.047065
free sulfur dioxide     10.460157
total sulfur dioxide    32.895324
density                  0.001887
pH                       0.154386
sulphates                0.169507
alcohol                  1.065668
quality                  0.807569
dtype: float64

In [224]:
# Mediana de cada coluna
df.median()

fixed acidity            7.90000
volatile acidity         0.52000
citric acid              0.26000
residual sugar           2.20000
chlorides                0.07900
free sulfur dioxide     14.00000
total sulfur dioxide    38.00000
density                  0.99675
pH                       3.31000
sulphates                0.62000
alcohol                 10.20000
quality                  6.00000
dtype: float64

In [225]:
# Correlação
df.corr()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
fixed acidity,1.0,-0.256131,0.671703,0.114777,0.093705,-0.153794,-0.113181,0.668047,-0.682978,0.183006,-0.061668,0.124052
volatile acidity,-0.256131,1.0,-0.552496,0.001918,0.061298,-0.010504,0.07647,0.022026,0.234937,-0.260987,-0.202288,-0.390558
citric acid,0.671703,-0.552496,1.0,0.143577,0.203823,-0.060978,0.035533,0.364947,-0.541904,0.31277,0.109903,0.226373
residual sugar,0.114777,0.001918,0.143577,1.0,0.05561,0.187049,0.203028,0.355283,-0.085652,0.005527,0.042075,0.013732
chlorides,0.093705,0.061298,0.203823,0.05561,1.0,0.005562,0.0474,0.200632,-0.265026,0.37126,-0.221141,-0.128907
free sulfur dioxide,-0.153794,-0.010504,-0.060978,0.187049,0.005562,1.0,0.667666,-0.021946,0.070377,0.051658,-0.069408,-0.050656
total sulfur dioxide,-0.113181,0.07647,0.035533,0.203028,0.0474,0.667666,1.0,0.071269,-0.066495,0.042947,-0.205654,-0.1851
density,0.668047,0.022026,0.364947,0.355283,0.200632,-0.021946,0.071269,1.0,-0.341699,0.148506,-0.49618,-0.174919
pH,-0.682978,0.234937,-0.541904,-0.085652,-0.265026,0.070377,-0.066495,-0.341699,1.0,-0.196648,0.205633,-0.057731
sulphates,0.183006,-0.260987,0.31277,0.005527,0.37126,0.051658,0.042947,0.148506,-0.196648,1.0,0.093595,0.251397


In [226]:
# Correlação entre 'fixed acidity' e 'pH'
df.corr()['fixed acidity']['pH']

-0.6829781945685299

In [227]:
# Correlação entre 'quality' e 'alcohol'
df.corr()['quality']['alcohol']

0.47616632400114156

In [228]:
# Numero de vinhos com quality = 5
(df['quality'] == 5).sum()

681

In [229]:
# Quality será nosso y, o restante os X.
y = df['quality']
X = df.drop('quality', axis = 1)

In [230]:
# Garantindo que dividiu
X.shape, y.shape

((1599, 11), (1599,))

In [239]:
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

In [238]:
# Valor minimo da coluna Fixed Acidity
X_scaled[:,0].min()

0.0

In [195]:
# Dividindo em train e test
X_train, X_test, y_train, y_test = train_test_split(X_scaled,y, test_size=0.3, random_state=1)

In [196]:
# Verificando shapes de saída
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1119, 11), (480, 11), (1119,), (480,))

In [197]:
## Instanciando todos modelos

# Algoritmo KNN
clf_KNN = KNeighborsClassifier(n_neighbors=5)
# Algoritmo Árvore de Decisão
clf_arvore = DecisionTreeClassifier()
# Algoritmo Floresta Randômica
clf_floresta = RandomForestClassifier(max_depth=10, random_state=1)
# Algoritmo SVM
clf_svm = SVC(gamma='auto', kernel='rbf')
# Algoritmo MLP
clf_mlp = MLPClassifier(alpha=1e-5, hidden_layer_sizes=(5,5), random_state=1)

In [198]:
# Fitando os algoritmos
clf_KNN.fit(X_train,y_train)
clf_arvore.fit(X_train,y_train)
clf_floresta.fit(X_train,y_train)
clf_svm.fit(X_train,y_train)
clf_mlp.fit(X_train,y_train);



In [199]:
print('KNN: ', clf_KNN.score(X_test,y_test))
print('Árvore de Decisão: ', clf_arvore.score(X_test,y_test))
print('Floresta Randômica: ', clf_floresta.score(X_test,y_test))
print('SVM: ', clf_svm.score(X_test,y_test))
print('MLP :', clf_mlp.score(X_test,y_test));

KNN:  0.5645833333333333
Árvore de Decisão:  0.6020833333333333
Floresta Randômica:  0.6791666666666667
SVM:  0.5854166666666667
MLP : 0.6


# Supondo saídas binárias para quality
#### Quality com nota > 5 = Bom
#### Quality com nota < 5 = Ruim

In [200]:
# Colocando valores binários
df.loc[df['quality'] > 5, 'quality_bin'] = 'bom'
df.loc[df['quality'] <= 5, 'quality_bin'] = 'ruim'

In [201]:
#Novo df
df = df.drop('quality', axis = 1)
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality_bin
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,ruim
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,ruim
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,ruim
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,bom
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,ruim
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,ruim
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,bom
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,bom
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,ruim


In [209]:
# Alocando X e y
X = df.drop('quality_bin', axis = 1)
y = df['quality_bin']

In [210]:
# Aplicando scaler
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

In [211]:
# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=1)

In [214]:
# Instanciando e fitando a floresta aleatória
clf_floresta = RandomForestClassifier(max_depth=10, random_state=1)
clf_floresta.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=10, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

In [215]:
# Score final
clf_floresta.score(X_test, y_test)

0.8041666666666667