## Comparativo entre Técnicas de Classificação: Regressão Logística

### Pipeline de Classificação

Importando as packages e funções:

In [0]:
from sklearn.metrics import f1_score, recall_score, accuracy_score, precision_score
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

Importando os dados. O conjunto de dados inclui informações demográficas hábitos e registros médicos históricos de 858 pacientes. O objetivo é predizer se um determinado paciente tem indicadores de câncer cervical.

Mais informações a respeito do dataset: [UCL](https://archive.ics.uci.edu/ml/datasets/Cervical+cancer+%28Risk+Factors%29)

In [0]:
df = pd.read_csv('https://raw.githubusercontent.com/intelligentagents/aprendizagem-supervisionada/master/data/risk_factors_cervical_cancer.csv')

Visualizando e descrevendo  o dataset

In [17]:
# Exporando o dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 858 entries, 0 to 857
Data columns (total 36 columns):
age                                   858 non-null int64
number_of_sexual_partners             858 non-null object
first_sexual_intercourse              858 non-null object
Num_of_pregnancies                    858 non-null object
Smokes                                858 non-null object
Smokes_years                          858 non-null object
Smokes_packs_year                     858 non-null object
Hormonal_Contraceptives               858 non-null object
Hormonal_Contraceptive_years          858 non-null object
IUD                                   858 non-null object
IUD_years                             858 non-null object
STDs                                  858 non-null object
STDs_number                           858 non-null object
STDs_condylomatosis                   858 non-null object
STDs_cervical condylomatosis          858 non-null object
STDs_vaginal condylomatosi

In [18]:
df.head(5)

Unnamed: 0,age,number_of_sexual_partners,first_sexual_intercourse,Num_of_pregnancies,Smokes,Smokes_years,Smokes_packs_year,Hormonal_Contraceptives,Hormonal_Contraceptive_years,IUD,IUD_years,STDs,STDs_number,STDs_condylomatosis,STDs_cervical condylomatosis,STDs_vaginal condylomatosis,STDs_vulvo-perineal condylomatosis,STDs_syphilis,STDs_pelvic inflammatory disease,STDs_genital herpes,STDs_molluscum contagiosum,STDs_AIDS,STDs_HIV,STDs_Hepatitis B,STDs_HPV,STDs_ Number of diagnosis,STDs_ Time since first diagnosis,STDs_ Time since last diagnosis,Dx_Cancer,Dx_CIN,Dx_HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
0,18,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,?,?,0,0,0,0,0,0,0,0
1,15,1.0,14.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,?,?,0,0,0,0,0,0,0,0
2,34,1.0,?,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,?,?,0,0,0,0,0,0,0,0
3,52,5.0,16.0,4.0,1.0,37.0,37.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,?,?,1,0,1,0,0,0,0,0
4,46,3.0,21.0,4.0,0.0,0.0,0.0,1.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,?,?,0,0,0,0,0,0,0,0


Deletando colunas relacionadas a timestamp, visto que estão praticamente nulas:

In [0]:
df = df.drop(['STDs_ Time since first diagnosis', 'STDs_ Time since last diagnosis'], axis = 1)

 Preenchendo os valores númericos nulos (?) com a mediana:

In [0]:
# Substituindo valores com ? por NaN:
df = df.replace('?', np.NaN)

# Transformando todas as colunas em númericas:
df = df.apply(pd.to_numeric)
# Preenchendo
df = df.fillna(df.mean())

Visualizando o dataset após as transformações:

In [23]:
df.head(5)

Unnamed: 0,age,number_of_sexual_partners,first_sexual_intercourse,Num_of_pregnancies,Smokes,Smokes_years,Smokes_packs_year,Hormonal_Contraceptives,Hormonal_Contraceptive_years,IUD,IUD_years,STDs,STDs_number,STDs_condylomatosis,STDs_cervical condylomatosis,STDs_vaginal condylomatosis,STDs_vulvo-perineal condylomatosis,STDs_syphilis,STDs_pelvic inflammatory disease,STDs_genital herpes,STDs_molluscum contagiosum,STDs_AIDS,STDs_HIV,STDs_Hepatitis B,STDs_HPV,STDs_ Number of diagnosis,Dx_Cancer,Dx_CIN,Dx_HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
0,18,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0
1,15,1.0,14.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0
2,34,1.0,16.9953,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0
3,52,5.0,16.0,4.0,1.0,37.0,37.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,1,0,0,0,0,0
4,46,3.0,21.0,4.0,0.0,0.0,0.0,1.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0


Definindo as variáveis indepedentes e dependentes

In [0]:
X = df.iloc[:, :35].values
y = df.iloc[:, -1].values

Dividindo o dataset em conjunto de treinamento e testes

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)


Criando o dicionário contendo todos os classificadores:

In [0]:
estimators = {'Decision Tree': DecisionTreeClassifier(criterion = 'entropy', random_state = 0),
              'KNN': KNeighborsClassifier(n_neighbors = 5, metric = 'euclidean'),
              'SVC': SVC(kernel = 'rbf', random_state = 0),
              'Random Forest' : RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0) ,
              'Naive Bayes' : GaussianNB(),
              'Logistic Regression' : LogisticRegression(random_state = 0)}

Criando dataframe que irá guardar os resultados finais dos classificadores:

In [0]:
df_results = pd.DataFrame(columns=['classifier', 'accuracy', 'precision', 'recall', 'f1'], index=None)

Percorrendo o dicionário e treinando e avaliando os modelos:

In [31]:
for name, estim in estimators.items():
    
    # print("Treinando Estimador {0}: ".format(name))
    
    # Treinando os classificadores com Conjunto de Treinamento
    estim.fit(X_train, y_train)
    
    # Prevendo os resultados do modelo criado com o conjunto de testes
    y_pred = estim.predict(X_test)
    
    
    # Armazenando as métricas de cada classificador em um dataframe
    df_results.loc[len(df_results), :] = [name, accuracy_score(y_test, y_pred), precision_score (y_test, y_pred, average = 'macro'),
                   recall_score(y_test, y_pred,  average = 'macro'), f1_score(y_test, y_pred,  average = 'macro')]

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Exibindo os resultados finais:

In [32]:
df_results

Unnamed: 0,classifier,accuracy,precision,recall,f1
0,Decision Tree,1.0,1.0,1.0,1.0
1,KNN,0.953488,0.476744,0.5,0.488095
2,SVC,0.953488,0.476744,0.5,0.488095
3,Random Forest,0.988372,0.993976,0.875,0.925541
4,Naive Bayes,1.0,1.0,1.0,1.0
5,Logistic Regression,1.0,1.0,1.0,1.0
