In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import sklearn as sk

# Carregando a Base de Dados

https://www.kaggle.com/fedesoriano/stroke-prediction-dataset

In [2]:
dataset = pd.read_csv("stroke.csv")

dataset

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


# Pré-processamento da base de dados

1. Remoção de linhas nulas
2. Separação dos dados em X (variáveis independentes) e y (variável dependente)
3. Transformação das variáveis categóricas em numéricas (LabelEncoder e OneHotEncoder)
4. Separação em Teste e Treino
5. Normalização

In [3]:
dataset[dataset.isnull().any(axis=1)]

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
8,27419,Female,59.0,0,0,Yes,Private,Rural,76.15,,Unknown,1
13,8213,Male,78.0,0,1,Yes,Private,Urban,219.84,,Unknown,1
19,25226,Male,57.0,0,1,No,Govt_job,Urban,217.08,,Unknown,1
27,61843,Male,58.0,0,0,Yes,Private,Rural,189.84,,Unknown,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5039,42007,Male,41.0,0,0,No,Private,Rural,70.15,,formerly smoked,0
5048,28788,Male,40.0,0,0,Yes,Private,Urban,191.15,,smokes,0
5093,32235,Female,45.0,1,0,Yes,Govt_job,Rural,95.02,,smokes,0
5099,7293,Male,40.0,0,0,Yes,Private,Rural,83.94,,smokes,0


In [4]:
#dataset[dataset.isnull().any(axis=1)] quantas linhas nulas?
dataset = dataset.dropna()

dataset

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5104,14180,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [5]:
x = dataset.iloc[:, 1:-1]
y = dataset.iloc[:, -1]

x

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked
5,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked
...,...,...,...,...,...,...,...,...,...,...
5104,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown
5106,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked
5107,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked
5108,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked


In [6]:
x.dtypes

gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
dtype: object

In [7]:
x['smoking_status'].unique()

array(['formerly smoked', 'never smoked', 'smokes', 'Unknown'],
      dtype=object)

In [8]:
from sklearn.preprocessing import LabelEncoder

cols = ['ever_married', 'Residence_type'] #unique pra descobrir quantos rótulos únicos
le = LabelEncoder()
x[cols] = x[cols].apply(le.fit_transform)

x.head(10)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,Male,67.0,0,1,1,Private,1,228.69,36.6,formerly smoked
2,Male,80.0,0,1,1,Private,0,105.92,32.5,never smoked
3,Female,49.0,0,0,1,Private,1,171.23,34.4,smokes
4,Female,79.0,1,0,1,Self-employed,0,174.12,24.0,never smoked
5,Male,81.0,0,0,1,Private,1,186.21,29.0,formerly smoked
6,Male,74.0,1,1,1,Private,0,70.09,27.4,never smoked
7,Female,69.0,0,0,0,Private,1,94.39,22.8,never smoked
9,Female,78.0,0,0,1,Private,1,58.57,24.2,Unknown
10,Female,81.0,1,0,1,Private,0,80.43,29.7,never smoked
11,Female,61.0,0,1,1,Govt_job,0,120.46,36.8,smokes


In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder 

#https://towardsdatascience.com/columntransformer-in-scikit-for-labelencoding-and-onehotencoding-in-machine-learning-c6255952731b
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), ['gender', 'work_type', 'smoking_status'])], 
                       remainder='passthrough')
x = ct.fit_transform(x)

x[0]

array([  0.  ,   1.  ,   0.  ,   0.  ,   0.  ,   1.  ,   0.  ,   0.  ,
         0.  ,   1.  ,   0.  ,   0.  ,  67.  ,   0.  ,   1.  ,   1.  ,
         1.  , 228.69,  36.6 ])

In [10]:
colunas = ['ohe1', 'ohe2', 'ohe3', 'ohe4', 'ohe5', 'ohe6', 'ohe7', 'ohe8', 'ohe9', 'ohe10', 'ohe11', 
           'ohe12', 'age', 'hypertension', 'heart_disease', 'ever_married', 'Residence_type', 
           'avg_glucose_level', 'bmi']
pd.DataFrame(x, columns=colunas)

Unnamed: 0,ohe1,ohe2,ohe3,ohe4,ohe5,ohe6,ohe7,ohe8,ohe9,ohe10,ohe11,ohe12,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi
0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,67.0,0.0,1.0,1.0,1.0,228.69,36.6
1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,80.0,0.0,1.0,1.0,0.0,105.92,32.5
2,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,49.0,0.0,0.0,1.0,1.0,171.23,34.4
3,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,79.0,1.0,0.0,1.0,0.0,174.12,24.0
4,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,81.0,0.0,0.0,1.0,1.0,186.21,29.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4904,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,13.0,0.0,0.0,0.0,0.0,103.08,18.6
4905,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,81.0,0.0,0.0,1.0,1.0,125.20,40.0
4906,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,35.0,0.0,0.0,1.0,0.0,82.99,30.6
4907,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,51.0,0.0,0.0,1.0,0.0,166.29,25.6


In [11]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.9)

In [12]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

x_train

array([[-1.14512914,  1.14512914,  0.        , ...,  0.91389605,
        -0.28240474,  0.83321273],
       [ 0.87326395, -0.87326395,  0.        , ..., -1.09421635,
        -0.52591188, -0.19577335],
       [ 0.87326395, -0.87326395,  0.        , ...,  0.91389605,
        -0.639836  , -0.27809224],
       ...,
       [ 0.87326395, -0.87326395,  0.        , ..., -1.09421635,
        -1.08736263, -1.03268204],
       [ 0.87326395, -0.87326395,  0.        , ...,  0.91389605,
        -0.78530485, -1.01896222],
       [-1.14512914,  1.14512914,  0.        , ...,  0.91389605,
        -0.29216318, -0.85432445]])

# Arquitetura de Rede Neural (MLP)

In [13]:
tf.__version__

'2.6.0'

In [14]:
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier

# GridSearch na MLP

https://machinelearningmastery.com/grid-search-hyperparameters-deep-learning-models-python-keras/

In [15]:
def create_model(optimizer='adam'):
    ann = Sequential()
    ann.add(Dense(units=6, activation='relu', kernel_initializer='he_normal'))
    ann.add(Dense(units=6, activation='relu', kernel_initializer='he_normal'))
    ann.add(Dense(units=1, activation='sigmoid'))
    ann.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    
    return ann

In [16]:
# create model
model = KerasClassifier(build_fn=create_model, verbose=2)

In [18]:
# define the grid search parameters
optimizer = ['SGD', 'Adam']
batch_size = [16, 32, 64]
epochs = [10, 20, 30, 40, 50]
param_grid = dict(optimizer=optimizer, batch_size=batch_size, epochs=epochs)

grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=2, cv=5)
grid_result = grid.fit(x_train, y_train)

Epoch 1/10
31/31 - 3s - loss: 0.9849 - accuracy: 0.5551
Epoch 2/10
31/31 - 0s - loss: 0.6663 - accuracy: 0.7776
Epoch 3/10
31/31 - 0s - loss: 0.5080 - accuracy: 0.8755
Epoch 4/10
31/31 - 0s - loss: 0.4133 - accuracy: 0.9367
Epoch 5/10
31/31 - 0s - loss: 0.3521 - accuracy: 0.9571
Epoch 6/10
31/31 - 0s - loss: 0.3102 - accuracy: 0.9592
Epoch 7/10
31/31 - 0s - loss: 0.2806 - accuracy: 0.9633
Epoch 8/10
31/31 - 0s - loss: 0.2589 - accuracy: 0.9633
Epoch 9/10
31/31 - 0s - loss: 0.2422 - accuracy: 0.9653
Epoch 10/10
31/31 - 0s - loss: 0.2294 - accuracy: 0.9653


# Resultados

In [19]:
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.967347 using {'batch_size': 16, 'epochs': 10, 'optimizer': 'SGD'}
0.967347 (0.011900) with: {'batch_size': 16, 'epochs': 10, 'optimizer': 'SGD'}
0.959184 (0.014431) with: {'batch_size': 16, 'epochs': 10, 'optimizer': 'Adam'}
0.967347 (0.011900) with: {'batch_size': 16, 'epochs': 20, 'optimizer': 'SGD'}
0.965306 (0.013841) with: {'batch_size': 16, 'epochs': 20, 'optimizer': 'Adam'}
0.967347 (0.011900) with: {'batch_size': 16, 'epochs': 30, 'optimizer': 'SGD'}
0.967347 (0.011900) with: {'batch_size': 16, 'epochs': 30, 'optimizer': 'Adam'}
0.967347 (0.011900) with: {'batch_size': 16, 'epochs': 40, 'optimizer': 'SGD'}
0.967347 (0.011900) with: {'batch_size': 16, 'epochs': 40, 'optimizer': 'Adam'}
0.967347 (0.011900) with: {'batch_size': 16, 'epochs': 50, 'optimizer': 'SGD'}
0.965306 (0.012245) with: {'batch_size': 16, 'epochs': 50, 'optimizer': 'Adam'}
0.963265 (0.004999) with: {'batch_size': 32, 'epochs': 10, 'optimizer': 'SGD'}
0.930612 (0.039468) with: {'batch_size': 32, 'epochs