In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import OneHotEncoder

# Carregamento e limpeza dos dados

In [None]:
# Read the raw dataset from a CSV file located in the working directory
# and preview its first ten rows.
dados = pd.read_csv(filepath_or_buffer="diabetes_prediction_dataset.csv")
dados.head(n=10)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
5,Female,20.0,0,0,never,27.32,6.6,85,0
6,Female,44.0,0,0,never,19.31,6.5,200,1
7,Female,79.0,0,0,No Info,23.86,5.7,85,0
8,Male,42.0,0,0,never,33.64,4.8,145,0
9,Female,32.0,0,0,never,27.32,5.0,100,0


Inicialmente, precisamos entender os dados apresentados:
1) Gender: sexo biológico do indivíduo
2) Age: idade do indivíduo
3) Hypertension: se o paciente foi diagnosticado com hipertensão anteriormente
4) Heart disease: se o paciente possui alguma doença cardíaca diagnosticada
5) Smoking history: a relação do paciente com o cigarro
6) BMI: índice de massa corporal
7) HbA1c level: média da quantidade de açúcar no sangue do paciente nos últimos 3 meses
8) Blood glucose level: quantidade de glicose no sangue do paciente no momento da última medição
9) Diabetes: se o paciente possui ou não diagnóstico de diabetes

In [None]:
def sexo_binario(sexo):
    """Encode a gender label as a binary integer.

    Args:
        sexo: The gender value to encode (compared against the string 'Female').

    Returns:
        0 when ``sexo`` equals 'Female'; 1 for every other value, including
        differently-cased spellings, unexpected categories and missing values.
    """
    return 0 if sexo == 'Female' else 1
    
# Encode gender as 0/1 via sexo_binario and cast age (loaded as float) to int.
# Run this cell only once per load: on a second run the gender column already
# holds 0/1, none of which equals 'Female', so every row would become 1.
dados = dados.assign(
    gender=dados['gender'].apply(sexo_binario),
    age=dados['age'].astype(int),
)

dados.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,80,0,1,never,25.19,6.6,140,0
1,0,54,0,0,No Info,27.32,6.6,80,0
2,1,28,0,0,never,27.32,5.7,158,0
3,0,36,0,0,current,23.45,5.0,155,0
4,1,76,1,1,current,20.14,4.8,155,0


In [None]:
# List the distinct smoking_history categories before encoding them.
dados.loc[:, 'smoking_history'].unique()

array(['never', 'No Info', 'current', 'former', 'ever', 'not current'],
      dtype=object)

In [None]:
# One-hot encode smoking_history into one 0/1 column per category.
# LabelBinarizer emits its columns in the order of its fitted `classes_`
# attribute (the sorted labels), so the column names are built from that
# attribute instead of a hand-typed list that has to match the order exactly.
# Lower-casing yields the same names as before (e.g. 'smoking_no info').
binarizador_fumo = LabelBinarizer()
sh = pd.DataFrame(
    data=binarizador_fumo.fit_transform(dados['smoking_history']),
    columns=[f"smoking_{classe.lower()}" for classe in binarizador_fumo.classes_],
    index=dados.index,  # keep the encoded rows aligned with dados' own index
)
sh.head()

Unnamed: 0,smoking_no info,smoking_current,smoking_ever,smoking_former,smoking_never,smoking_not current
0,0,0,0,0,1,0
1,1,0,0,0,0,0
2,0,0,0,0,1,0
3,0,1,0,0,0,0
4,0,1,0,0,0,0


In [None]:
# Attach the one-hot smoking columns, drop the original categorical column and
# translate the column names to Portuguese.
#
# NOTE(review): the mapping used to contain 'glucose' -> 'glicose', but no
# column is named 'glucose' (the actual column is 'blood_glucose_level'), so
# rename() silently ignored it. The dead entry is removed so the resulting
# column names stay exactly as they were. If the translated name is wanted,
# add 'blood_glucose_level': 'glicose' and update any cell that still uses the
# English name.
# errors='raise' makes a future mistyped key fail loudly instead of being skipped.
dados = (
    dados
    .join(sh)
    .drop(columns='smoking_history')
    .rename(
        columns={
            'gender': 'sexo',
            'age': 'idade',
            'hypertension': 'hipertensao',
            'heart_disease': 'doenca cardiaca',
            'bmi': 'imc',
            'HbA1c_level': 'a1c',
            'diabetes': 'diabetes',
            'smoking_no info': 'fumo_sem informacao',
            'smoking_current': 'fumo_atualmente',
            'smoking_ever': 'fumo_sempre',
            'smoking_former': 'fumo_ex',
            'smoking_never': 'fumo_nunca',
            'smoking_not current': 'fumo_nao atualmente',
        },
        errors='raise',
    )
)

dados

Unnamed: 0,sexo,idade,hipertensao,doenca cardiaca,imc,a1c,blood_glucose_level,diabetes,fumo_sem informacao,fumo_atualmente,fumo_sempre,fumo_ex,fumo_nunca,fumo_nao atualmente
0,0,80,0,1,25.19,6.6,140,0,0,0,0,0,1,0
1,0,54,0,0,27.32,6.6,80,0,1,0,0,0,0,0
2,1,28,0,0,27.32,5.7,158,0,0,0,0,0,1,0
3,0,36,0,0,23.45,5.0,155,0,0,1,0,0,0,0
4,1,76,1,1,20.14,4.8,155,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,80,0,0,27.32,6.2,90,0,1,0,0,0,0,0
99996,0,2,0,0,17.37,6.5,100,0,1,0,0,0,0,0
99997,1,66,0,0,27.83,5.7,155,0,0,0,0,1,0,0
99998,0,24,0,0,35.42,4.0,100,0,0,0,0,0,1,0


# Análise exploratória