In [45]:
# Importando as bibliotecas necessárias

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier

In [5]:
# Load the exam dataset from the CSV file in the working directory

csv_path = 'exames.csv'
base = pd.read_csv(csv_path)
base.head(5)

Unnamed: 0.1,Unnamed: 0,id,diagnostico,exame_1,exame_2,exame_3,exame_4,exame_5,exame_6,exame_7,...,exame_24,exame_25,exame_26,exame_27,exame_28,exame_29,exame_30,exame_31,exame_32,exame_33
0,0,842302,M,17.99,10.38,122.8,103.78,1001.0,0.1184,0.2776,...,184.6,2019.0,0.1622,0.6656,0.7119,0.786,0.2654,0.4601,0.1189,
1,1,842517,M,20.57,17.77,132.9,103.78,1326.0,0.08474,0.07864,...,158.8,1956.0,0.1238,0.1866,0.2416,0.786,0.186,0.275,0.08902,
2,2,84300903,M,19.69,21.25,130.0,103.78,1203.0,0.1096,0.1599,...,152.5,1709.0,0.1444,0.4245,0.4504,0.786,0.243,0.3613,0.08758,
3,3,84348301,M,11.42,20.38,77.58,103.78,386.1,0.1425,0.2839,...,98.87,567.7,0.2098,0.8663,0.6869,0.786,0.2575,0.6638,0.173,
4,4,84358402,M,20.29,14.34,135.1,103.78,1297.0,0.1003,0.1328,...,152.2,1575.0,0.1374,0.205,0.4,0.786,0.1625,0.2364,0.07678,0.854454


In [6]:
# Remove leftover index columns
#
# The loaded frame carries more than one index-artifact column
# ('Unnamed: 0.1' and 'Unnamed: 0'). Dropping every label that starts with
# 'Unnamed' (instead of the single hard-coded 'Unnamed: 0') keeps row
# counters out of the feature matrix built later, and lets this cell be
# re-run without raising a KeyError for an already-removed column.

unnamed_cols = [col for col in base.columns if str(col).startswith('Unnamed')]
base = base.drop(columns=unnamed_cols)
base.head()

Unnamed: 0,id,diagnostico,exame_1,exame_2,exame_3,exame_4,exame_5,exame_6,exame_7,exame_8,...,exame_24,exame_25,exame_26,exame_27,exame_28,exame_29,exame_30,exame_31,exame_32,exame_33
0,842302,M,17.99,10.38,122.8,103.78,1001.0,0.1184,0.2776,0.3001,...,184.6,2019.0,0.1622,0.6656,0.7119,0.786,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,103.78,1326.0,0.08474,0.07864,0.0869,...,158.8,1956.0,0.1238,0.1866,0.2416,0.786,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,103.78,1203.0,0.1096,0.1599,0.1974,...,152.5,1709.0,0.1444,0.4245,0.4504,0.786,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,103.78,386.1,0.1425,0.2839,0.2414,...,98.87,567.7,0.2098,0.8663,0.6869,0.786,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,103.78,1297.0,0.1003,0.1328,0.198,...,152.2,1575.0,0.1374,0.205,0.4,0.786,0.1625,0.2364,0.07678,0.854454


In [15]:
# Count missing values per column and show only the columns that have any
nulos = base.isna().sum()
nulos.loc[nulos.gt(0)]

exame_33    419
dtype: int64

In [16]:
# Discard exame_33, the only column reported with missing values (419 of them)
base = base.drop(columns=['exame_33'])

In [37]:
# Build the model inputs (x) and the target (y).
# The train/test split itself is done in the cells that follow.
# NOTE(review): every column other than 'id' and 'diagnostico' becomes a
# feature, so any index-like column still present in `base` is included too.

non_feature_cols = ['id', 'diagnostico']
x = base.drop(columns=non_feature_cols).values
y = base.loc[:, 'diagnostico'].values.ravel()

In [41]:
# Seed value passed to np.random.seed in the modelling cells so runs are repeatable
SEED = 30

In [43]:
# Seed numpy's global RNG. Neither the split nor the forest below is given a
# random_state of its own, so both rely on this global seed for repeatability.
np.random.seed(SEED)

# Hold out 25% of the rows for testing (the train_test_split default, spelled out)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

# Build a 100-tree random forest and fit it on the training rows
rfc = RandomForestClassifier(n_estimators=100).fit(x_train, y_train)

# Accuracy on the held-out rows, expressed as a percentage
score_rfc = 100 * rfc.score(x_test, y_test)

print(f'O modelo teve um score de {score_rfc:.2f}%')

O modelo teve um score de 95.80%


In [44]:
# Repeat the experiment with a larger test set: 30% of the rows (the default is 25%)

# Re-seed numpy's global RNG so this run starts from the same state as the previous one
np.random.seed(SEED)

# Split again, this time holding out 30% of the rows
holdout_fraction = 0.3
x_train2, x_test2, y_train2, y_test2 = train_test_split(
    x, y, test_size=holdout_fraction
)

# Build a fresh 100-tree random forest and fit it on the new training rows
rfc2 = RandomForestClassifier(n_estimators=100).fit(x_train2, y_train2)

# Accuracy on the new held-out rows, expressed as a percentage
score_rfc2 = 100 * rfc2.score(x_test2, y_test2)

print(f'O modelo teve um score de {score_rfc2:.2f}%')

O modelo teve um score de 96.49%


In [46]:
# Baseline: a dummy classifier using the 'most_frequent' strategy, evaluated
# on the same 25% split (x_train / x_test) as the first random forest

# Re-seed numpy's global RNG, as in the other modelling cells
np.random.seed(SEED)

# Build the dummy classifier and fit it on the training rows
dummy = DummyClassifier(strategy='most_frequent').fit(x_train, y_train)

# Accuracy on the held-out rows, expressed as a percentage
score_dummy = 100 * dummy.score(x_test, y_test)

print(f'O modelo teve um score de {score_dummy:.2f}%')

O modelo teve um score de 63.64%


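Portanto, a baseline definida pelo DummyClassifier (estratégia `most_frequent`) é um score de 63,64%. O Random Forest obteve 95,80% no mesmo conjunto de teste (e 96,49% quando 30% dos dados foram usados para teste), ficando bem acima dessa baseline.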