Importando Bibliotecas necessárias

In [57]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

Imporando arquivo para execução das análises

In [None]:
df = pd.read_csv('winequality-red.csv', sep = ';' )
df.head()

Analisando Arquivo 

In [None]:
#Analisando tipo de Dados
df.info()

In [None]:
#Analisando a quantidade de valores nulos
df.isnull().sum()

In [8]:
#Analisando quantidade de tipos diferentes de dados
qtd = len(df.dtypes.value_counts())
print('Quantidade de tipos de dados diferentes:', qtd)

Quantidade de tipos de dados diferentes: 2


Desvio padrão contido na variável "fixed acidity" </br>
Mediana para a variável "residual sugar"

```
# Isto está formatado como código
```



In [None]:
df.describe()
print('Desvio padrão para a variável fixed-acidity:' , df['fixed acidity'].std())
print('Valor da mediana para a variável residual-sugar:', df['residual sugar'].median())

Coeficiente de correlação de Pearson entre "fixed acidity" e "pH"

In [None]:
df[['fixed acidity', 'pH']].corr()

In [None]:
#Calculando a correlação de Pearson
x = df['fixed acidity']
y = df['pH']
sns.regplot(x, y)
plt.xlabel('Fixed Acidity')
plt.ylabel('pH')
plt.show()


In [None]:
#Calculando a correlação de Person "quality" e "alcohol"
df[['quality', 'alcohol']].corr()

In [None]:
#Calculando dados
x = df['quality']
y = df['alcohol']
sns.regplot(x, y)
plt.xlabel('Quality')
plt.ylabel('Alcohol')
plt.show()

In [19]:
#Verificando a quantidade de instâncias existem para que a qualidade seja igual a 5
qual_5 = len(df[df['quality'] == 5])
print('Quantidade de instâncias na qual a qualidade do vinho se refere ao fator 5:', qual_5)

Quantidade de instâncias na qual a qualidade do vinho se refere ao fator 5: 681


Aplicando normalização MinMaxScaler (valores default) com os dados de entrada

In [27]:
#Menor valor para variável "fixed Acidity"
#Efetuando a normalização dos dados e exibindo
X = df.iloc[:,:-1]
y = df.iloc[:,-1]
scaler = MinMaxScaler()
array_scaler = scaler.fit_transform(X)
col = X.columns
df_normalizado = pd.DataFrame(array_scaler, columns = col)
df_normalizado.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,0.247788,0.39726,0.0,0.068493,0.106845,0.140845,0.09894,0.567548,0.606299,0.137725,0.153846
1,0.283186,0.520548,0.0,0.116438,0.143573,0.338028,0.215548,0.494126,0.362205,0.209581,0.215385
2,0.283186,0.438356,0.04,0.09589,0.133556,0.197183,0.169611,0.508811,0.409449,0.191617,0.215385
3,0.584071,0.109589,0.56,0.068493,0.105175,0.225352,0.190813,0.582232,0.330709,0.149701,0.215385
4,0.247788,0.39726,0.0,0.068493,0.106845,0.140845,0.09894,0.567548,0.606299,0.137725,0.153846


In [28]:
#Localizando o referido valor minímo
df_normalizado['fixed acidity'].min()

0.0

Verificando acurácia e o KNN

In [30]:
#Efetuando a separação dos dados em treino e teste
X_train, X_test, Y_train, Y_test = train_test_split(df_normalizado, y, test_size = 0.3, random_state = 1)

In [53]:
#Calculando o KNN
c_knn = KNeighborsClassifier(n_neighbors = 5)
c_knn.fit(X_train, Y_train)
y_pred = c_knn.predict(X_test)

In [35]:
#Localizando Acurácia
acuracia = accuracy_score(Y_test, y_pred)
print('Acurácia do DataFrame:', acuracia)

Acurácia do DataFrame: 0.5645833333333333


Aplicando árvore de Decisão

In [52]:
#Árvore de Decisão
arvore = DecisionTreeClassifier()
arvore.fit(X_train, Y_train)
arvore_y = arvore.predict(X_test)
#Localizando a Acurácia
acuracia = accuracy_score(Y_test, arvore_y)
print('Acurácia da Árvore de Decisão:', acuracia)

Acurácia da Árvore de Decisão: 0.6041666666666666


Apicando Algoritmo de Floresta Randomica

In [49]:
#Localizando acurácia para floresta randomica
floresta = RandomForestClassifier(max_depth = 10, random_state = 1)
floresta.fit(X_train, Y_train)
floresta_y = floresta.predict(X_test)
acuracia = accuracy_score(Y_test, floresta_y)
#Exibindo Acurácia
print('Acurádia para Floresta Randomica:', acuracia)

Acurádia para Floresta Randomica: 0.6791666666666667


Algoritmo SVM

In [None]:
#Aplicando algoritmo SVM e localizando Acurácia
svm = SVC(gamma = 'auto', kernel = 'rbf')
svm.fit(X_train, Y_train)
svm_y = svm.predict(X_test)
acuracia_svm = accuracy_score(svm_y)
#Exibindo Acurácia
print('Acurácia do Algoritmo SVM:', acuracia_svm)

Algoritmo MLP

In [61]:
mlp = MLPClassifier(alpha = 1e-5, hidden_layer_sizes = (5,5), random_state = 1)
mlp.fit(X_train, Y_train)
mlp_y = mlp.predict(X_test)
acuracia = accuracy_score(Y_test, mlp_y)
#Exibindo Acurácia
print('Acurácia do Algoritmo MLP:', acuracia)

Acurácia do Algoritmo MLP: 0.6




Modifique o dataset para um sistema binário que considere vinhos com nota
maior do que 5 (6, 7, 8) sejam bons e menores ou igual a 5 ruins (5, 4, 3). Aplique
o modelo da floresta randômica

In [64]:
#Efetuando a cópia do DataFrame
df_modificado = df.copy()
df_modificado.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [70]:
#Segmentando os dados
Novo_X = df_modificado.iloc[:,:11]
Novo_y = df_modificado.quality
#Dicionario de Valores
Novo_Y = Novo_y.replace({6:1,7:1,8:1,5:0,4:0,3:0})
#Efetuando Normalização
new_scaler = MinMaxScaler()
new_array = new_scaler.fit_transform(Novo_X)
df_new = pd.DataFrame(new_array, columns = Novo_X.columns)
#Exibindo DataFrame modificado
df_new.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,0.247788,0.39726,0.0,0.068493,0.106845,0.140845,0.09894,0.567548,0.606299,0.137725,0.153846
1,0.283186,0.520548,0.0,0.116438,0.143573,0.338028,0.215548,0.494126,0.362205,0.209581,0.215385
2,0.283186,0.438356,0.04,0.09589,0.133556,0.197183,0.169611,0.508811,0.409449,0.191617,0.215385
3,0.584071,0.109589,0.56,0.068493,0.105175,0.225352,0.190813,0.582232,0.330709,0.149701,0.215385
4,0.247788,0.39726,0.0,0.068493,0.106845,0.140845,0.09894,0.567548,0.606299,0.137725,0.153846


In [75]:
#Separando dados para Treino e Teste
X_train, X_test, Y_train, Y_test = train_test_split(new_array, Novo_Y, test_size = 0.3, random_state = 1)
#Aplicando o RandomFlorest
florest = RandomForestClassifier(max_depth = 10, random_state = 1)
florest.fit(X_train, Y_train)
florest_y = florest.predict(X_test)
acuracia = accuracy_score(Y_test, floresta_y)
print('Acuracia da Floresta Randomica:', acuracia)

Acuracia da Floresta Randomica: 0.0
