# Algoritmos genéticos para otimização de hiper parâmetros

In [1]:

import random
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from scipy import stats
from sklearn_genetic import GASearchCV
from sklearn_genetic.space import Continuous, Categorical, Integer


## Dataset

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

df = pd.read_csv("diabetes.csv")

## Dataset pré-processamento

In [3]:
limit = 3

z_scores = stats.zscore(df['Pregnancies'])
outliers_p = df['Pregnancies'][abs(z_scores) > limit]

z_scores = stats.zscore(df['Glucose'])
outliers_g = df['Glucose'][abs(z_scores) > limit]

z_scores = stats.zscore(df['BloodPressure'])
outliers_b = df['BloodPressure'][abs(z_scores) > limit]

# body mass index (18.5 - 24.9 [Obesity > 30])
z_scores = stats.zscore(df['BMI'])
outliers_bmi = df['BMI'][abs(z_scores) > limit]

# Probability of diabetes based on family history
z_scores = stats.zscore(df['DiabetesPedigreeFunction'])
outliers_dpf = df['DiabetesPedigreeFunction'][abs(z_scores) > limit]

z_scores = stats.zscore(df['Age'])
outliers_a = df['Age'][abs(z_scores) > limit]

unique_outliers = list(set(outliers_g.index) | set(outliers_b.index) | set([9,49,60,81,145,371,426,494,522,684,706]))
df.drop(unique_outliers, inplace=True)

X = df.drop("Outcome", axis=1)
y = df["Outcome"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_scaled = scaler.transform(X)

## Aplicação algoritmo genético

In [4]:
# Defines the possible values to search
param_grid = {'n_neighbors': Integer(5,20),
              'weights': Categorical(['uniform', 'distance']),
              'algorithm': Categorical(['ball_tree', 'kd_tree', 'brute', 'auto']),
              'leaf_size': Integer(30, 80)}


evolved_estimator = GASearchCV(estimator=KNeighborsClassifier(),
                               cv=None,
                               scoring="recall",
                               population_size=100,
                               generations=1000,
                               crossover_probability=0.2,
                               mutation_probability=0.7,
                               param_grid=param_grid)

evolved_estimator.fit(X_scaled, y)

print(evolved_estimator.best_params_)
print(evolved_estimator.best_score_)

gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	100   	0.511993	0.0283139  	0.554367   	0.429551   
1  	179   	0.533997	0.0111536  	0.554367   	0.481878   
2  	189   	0.540588	0.01019    	0.554367   	0.517959   
3  	181   	0.547814	0.00642353 	0.554367   	0.530122   
4  	179   	0.55205 	0.00217863 	0.554367   	0.542041   
5  	178   	0.553544	0.00159598 	0.554367   	0.550449   
6  	186   	0.55425 	0.000668423	0.554367   	0.550449   
7  	181   	0.554367	0          	0.554367   	0.554367   
8  	186   	0.554367	0          	0.554367   	0.554367   
9  	176   	0.554367	0          	0.554367   	0.554367   
10 	181   	0.55352 	0.00593845 	0.554367   	0.509959   
11 	181   	0.554367	0          	0.554367   	0.554367   
12 	180   	0.554367	0          	0.554367   	0.554367   
13 	178   	0.554367	0          	0.554367   	0.554367   
14 	180   	0.554367	0          	0.554367   	0.554367   
15 	184   	0.554367	0          	0.554367   	0.554367   
16 	186   	0.553964	0.00401244 	0.554367   	0.51