In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the mushroom data set from the working directory. The file is expected
# to carry a header row: the frame previewed in the next cell has named columns.
DATA_PATH = 'agaricus-lepiota.data'
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first three rows to check how the columns were parsed.
df.head(3)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m


In [4]:
# Number of distinct category values found in each column.
df.nunique()

class                        2
cap-shape                    6
cap-surface                  4
cap-color                   10
bruises                      2
odor                         9
gill-attachment              2
gill-spacing                 2
gill-size                    2
gill-color                  12
stalk-shape                  2
stalk-root                   5
stalk-surface-above-ring     4
stalk-surface-below-ring     4
stalk-color-above-ring       9
stalk-color-below-ring       9
veil-type                    1
veil-color                   4
ring-number                  3
ring-type                    5
spore-print-color            9
population                   6
habitat                      7
dtype: int64

In [5]:
# Discard every sample that has a missing feature: '?' entries are turned into
# NaN first, then any row containing a NaN is dropped.
df = df.replace('?', np.nan).dropna()
df.shape

(5644, 23)

In [6]:
# Remove the 'veil-type' column: every value in it is 'p', so it cannot help
# to tell the classes apart.
df = df.drop(columns='veil-type')

In [7]:
# List the distinct target labels present in the 'class' column.
df['class'].unique()

array(['p', 'e'], dtype=object)

In [8]:
# Separate the target column from the features: y is the 'class' column,
# X is every other column of df.
y = df['class']
X = df.drop(columns=['class'])

In [9]:
# Preview the feature frame (the 'class' column is no longer part of it).
X.head(3)

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,x,s,n,t,p,f,c,n,k,e,...,s,s,w,w,w,o,p,k,s,u
1,x,s,y,t,a,f,c,b,k,e,...,s,s,w,w,w,o,p,n,n,g
2,b,s,w,t,l,f,c,b,n,e,...,s,s,w,w,w,o,p,n,n,m


In [10]:
# Count the distinct 'cap-color' values that remain after the rows with
# missing features were dropped (returned as a one-element shape tuple).
X['cap-color'].unique().shape

(8,)

In [11]:
# Target as a NumPy column vector of shape (n_samples, 1).
# Built from df['class'] (the same data `y` was assigned from) instead of from
# the current `y`, so re-running this cell does not fail once `y` has already
# been converted to a NumPy array.
# NOTE(review): the label-encoding cell further down reassigns `y` from
# df['class'], so this reshaped array is not what the model is trained on.
y = df['class'].to_numpy().reshape(-1, 1)

### Pré-processamento dos dados:

In [12]:
from sklearn.preprocessing import LabelEncoder

In [13]:
# Encoder for the target column.
le_y = LabelEncoder()

# One independent encoder per feature column of X; encoders_x[k] is the one
# that is later fitted on the k-th column.
encoders_x = [LabelEncoder() for _ in range(X.shape[1])]

In [14]:
# Every column is categorical, so each one is label-encoded in turn.
y = le_y.fit_transform(df['class'])

# Walk X's own columns rather than df.columns[1:]: the latter only lines up
# with X while 'class' is df's first column and df has exactly X's columns in
# the same order. Pairing each encoder with the column it is fitted on removes
# that hidden coupling.
if len(encoders_x) != X.shape[1]:
    raise ValueError(
        f"expected one encoder per feature column: "
        f"{len(encoders_x)} encoders for {X.shape[1]} columns"
    )

# NOTE(review): X is overwritten in place, so running this cell a second time
# re-fits the encoders on the integer codes and the original letters can no
# longer be recovered through inverse_transform.
for encoder, nome_da_coluna in zip(encoders_x, X.columns):
    X[nome_da_coluna] = encoder.fit_transform(X[nome_da_coluna])

In [15]:
# Sanity check: decoding the encoded target gives back the class letters.
le_y.inverse_transform(y)

array(['p', 'e', 'e', ..., 'e', 'p', 'p'], dtype=object)

In [16]:
# Example: encoders_x[0] holds the encoder for column 0 (cap-shape) of X,
# so it can decode that column back to its original letters.
encoders_x[0].inverse_transform(X['cap-shape'])

array(['x', 'x', 'b', ..., 'x', 'x', 'f'], dtype=object)

In [17]:
# Preview the encoded features: the columns now hold integer codes.
X.head(2)

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,5,2,4,1,6,1,0,1,2,0,...,2,2,5,5,0,1,3,1,3,5
1,5,2,7,1,0,1,0,0,2,0,...,2,2,5,5,0,1,3,2,2,1


In [18]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the samples for testing.
# random_state fixes the shuffle so the same rows land in each partition on
# every run; stratify=y keeps the class proportions equal in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y
)

### RNA (Rede Neural Artificial):

In [19]:
import tensorflow as tf
from tensorflow import keras

In [20]:
# Fully connected binary classifier: three ReLU hidden layers narrowing
# 11 -> 5 -> 3, followed by a single sigmoid output unit.
# The input width is read from the training data instead of being hard-coded,
# so the model keeps matching X if a feature column is added or removed.
n_features = X_train.shape[1]

model = keras.Sequential([
    keras.layers.Dense(11, input_shape=(n_features,), activation='relu', name='oculta1'),
    keras.layers.Dense(5, activation='relu', name='oculta2'),
    keras.layers.Dense(3, activation='relu', name='oculta3'),
    keras.layers.Dense(1, activation='sigmoid', name='saida'),  # author's note: try a single sigmoid output
])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
oculta1 (Dense)              (None, 11)                242       
_________________________________________________________________
oculta2 (Dense)              (None, 5)                 60        
_________________________________________________________________
oculta3 (Dense)              (None, 3)                 18        
_________________________________________________________________
saida (Dense)                (None, 1)                 4         
Total params: 324
Trainable params: 324
Non-trainable params: 0
_________________________________________________________________


In [21]:
from tensorflow.keras.optimizers import Adam

In [22]:
# Binary cross-entropy matches the single sigmoid output unit; accuracy is
# tracked as the reported metric.
# `learning_rate` is the supported argument name: `lr` is a deprecated alias
# that newer Keras releases no longer accept.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [23]:
# Train for 15 epochs on the training partition; the returned History object
# is displayed as the cell's output.
model.fit(x=X_train, y=y_train, epochs=15)

Train on 3781 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f54bc3b14d0>

In [24]:
# Score the held-out partition; with the metrics given at compile time the
# result is [loss, accuracy].
model.evaluate(x=X_test, y=y_test)



[0.22194279333595557, 0.94632316]