# Importações

- Importação das bibliotecas que utilizaremos no case 

In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

- Criação do dataframe e inserção do nome das colunas

In [2]:
df = pd.read_csv("breast-cancer.data",  names= ['Class', 
                                                'Age', 
                                                'Menopause',
                                                'Tumor_size', 
                                                'Inv_nodes', 
                                                'Node_caps', 
                                                'Deg_malig', 
                                                'Breast', 
                                                'Breast_quad', 
                                                'Irradiant'])

# Tratamento dos dados

- Ordenamos o datframe por idade reorganizando as colunas para que a amostra de treino tenha tanto recorrencia e não-recorrencia

In [3]:
df = df.sort_values('Inv_nodes', ascending= False)
df

Unnamed: 0,Class,Age,Menopause,Tumor_size,Inv_nodes,Node_caps,Deg_malig,Breast,Breast_quad,Irradiant
130,no-recurrence-events,40-49,premeno,35-39,9-11,yes,2,right,right_up,yes
184,no-recurrence-events,50-59,ge40,30-34,9-11,?,3,left,left_low,yes
183,no-recurrence-events,50-59,ge40,30-34,9-11,?,3,left,left_up,yes
230,recurrence-events,50-59,premeno,50-54,9-11,yes,2,right,left_up,no
233,recurrence-events,70-79,ge40,15-19,9-11,?,1,left,left_low,yes
...,...,...,...,...,...,...,...,...,...,...
97,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
98,no-recurrence-events,40-49,premeno,30-34,0-2,no,1,left,right_up,no
99,no-recurrence-events,30-39,premeno,25-29,0-2,no,2,left,left_low,no
100,no-recurrence-events,40-49,ge40,20-24,0-2,no,3,left,left_low,no


- Foi feita uma contagem nos lados em que os tumores ocorreram para entender se ouve alguma tendencia

In [4]:
valores_interrog = []
for coluna in df:
  contagem_interrog = len(df.loc[df[coluna]== 'right'])
  valores_interrog.append(contagem_interrog)

valores_interrog

[0, 0, 0, 0, 0, 0, 0, 134, 0, 0]

- Procuramos valores NaN no dataframe


In [5]:
valores_nan = []
for coluna in df:
  contagem_nan = len(df.loc[df[coluna]== np.nan])
  valores_nan.append(contagem_nan)
valores_nan

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

- Procuramos valores Null no dataframe


In [6]:
valores_none = []
for coluna in df:
  contagem_none = len(df.loc[df[coluna]== None])
  valores_none.append(contagem_none)
valores_none

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

- Fizemos uma varredura nas colunas do dataframe procurando por '?'

In [7]:
valores_interrog = []
for coluna in df:
  contagem_interrog = len(df.loc[df[coluna]== '?'])
  valores_interrog.append(contagem_interrog)
valores_interrog

[0, 0, 0, 0, 0, 8, 0, 0, 1, 0]

- Como obtemos apenas 9 linhas com valores perdidos, decidimos descartar essas linhas, tranformamos os '?' em NaN para facilitar a eliminação das linhas

In [8]:
df.drop(df.loc[df['Node_caps']=='?'].index, inplace=True)
df.drop(df.loc[df['Breast_quad']=='?'].index, inplace=True)

- Verificamos se os valores '?' realmente foram removidos


In [9]:
valores_interrog = []
for coluna in df:
  contagem_interrog = len(df.loc[df[coluna]== '?'])
  valores_interrog.append(contagem_interrog)

valores_interrog

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [10]:
df.tail(25)

Unnamed: 0,Class,Age,Menopause,Tumor_size,Inv_nodes,Node_caps,Deg_malig,Breast,Breast_quad,Irradiant
77,no-recurrence-events,30-39,premeno,25-29,0-2,no,1,left,central,no
78,no-recurrence-events,50-59,premeno,25-29,0-2,no,2,left,left_low,no
79,no-recurrence-events,40-49,premeno,25-29,0-2,no,2,right,central,no
80,no-recurrence-events,50-59,ge40,10-14,0-2,no,2,right,left_low,no
81,no-recurrence-events,60-69,ge40,10-14,0-2,no,1,left,left_up,no
82,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_low,no
83,no-recurrence-events,50-59,ge40,15-19,0-2,no,2,right,left_low,no
84,no-recurrence-events,40-49,premeno,20-24,0-2,no,1,left,right_low,no
86,no-recurrence-events,60-69,ge40,25-29,0-2,no,2,right,left_low,no
101,no-recurrence-events,50-59,ge40,30-34,0-2,no,3,right,left_low,no


# Análise

- Utilizamos a função unique para verificar se os valores das colunas são contínuos ou categóricos


In [11]:
df['Class'].unique()   

array(['no-recurrence-events', 'recurrence-events'], dtype=object)

In [12]:
df['Age'].unique()

array(['40-49', '50-59', '30-39', '60-69', '70-79', '20-29'], dtype=object)

In [13]:
df['Menopause'].unique()

array(['premeno', 'ge40', 'lt40'], dtype=object)

In [14]:
df['Tumor_size'].unique()

array(['35-39', '50-54', '30-34', '25-29', '15-19', '20-24', '40-44',
       '10-14', '45-49', '0-4', '5-9'], dtype=object)

In [15]:
df['Inv_nodes'].unique()

array(['9-11', '6-8', '3-5', '24-26', '15-17', '12-14', '0-2'],
      dtype=object)

In [16]:
df['Node_caps'].unique()

array(['yes', 'no'], dtype=object)

In [17]:
df['Deg_malig'].unique()

array([2, 3, 1])

In [18]:
df['Breast'].unique()

array(['right', 'left'], dtype=object)

In [19]:
df['Breast_quad'].unique()

array(['right_up', 'left_up', 'left_low', 'right_low', 'central'],
      dtype=object)

In [20]:
df['Irradiant'].unique()

array(['yes', 'no'], dtype=object)

- Criamos a função one_hot para transformar os valores categóricos em vetores


In [21]:
def one_hot(categ, valor):
  categ = list(set(categ))
  indice = categ.index(valor)
  vetor = np.zeros(len(categ))
  vetor[indice] = 1.0
  return vetor

- Utilizamos a função one hot em cada uma das colunas, pois todas possuem dados categóricos 


In [23]:
df.Class = [ one_hot(df.Class, _) for _ in df.Class ]

df.Age = [ one_hot(df.Age, _) for _ in df.Age ]

df.Menopause = [ one_hot(df.Menopause, _) for _ in df.Menopause ]

df.Tumor_size = [ one_hot(df.Tumor_size, _) for _ in df.Tumor_size ]

df.Inv_nodes = [ one_hot(df.Inv_nodes, _) for _ in df.Inv_nodes ]

df.Node_caps = [ one_hot(df.Node_caps, _) for _ in df.Node_caps ]

df.Deg_malig = [ one_hot(df.Deg_malig, _) for _ in df.Deg_malig ]

df.Breast = [ one_hot(df.Breast, _) for _ in df.Breast ]

df.Breast_quad = [ one_hot(df.Breast_quad, _) for _ in df.Breast_quad ]

df.Irradiant = [ one_hot(df.Irradiant, _) for _ in df.Irradiant ]

In [24]:
df.head(10)

Unnamed: 0,Class,Age,Menopause,Tumor_size,Inv_nodes,Node_caps,Deg_malig,Breast,Breast_quad,Irradiant
130,"[0.0, 1.0]","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0]","[0.0, 0.0, 1.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[0.0, 1.0]","[0.0, 1.0, 0.0]","[0.0, 1.0]","[0.0, 0.0, 0.0, 1.0, 0.0]","[0.0, 1.0]"
230,"[1.0, 0.0]","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]","[0.0, 0.0, 1.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[0.0, 1.0]","[0.0, 1.0, 0.0]","[0.0, 1.0]","[0.0, 1.0, 0.0, 0.0, 0.0]","[1.0, 0.0]"
129,"[0.0, 1.0]","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0]","[0.0, 0.0, 1.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[0.0, 1.0]","[0.0, 1.0, 0.0]","[0.0, 1.0]","[0.0, 1.0, 0.0, 0.0, 0.0]","[0.0, 1.0]"
245,"[1.0, 0.0]","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[0.0, 0.0, 1.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[1.0, 0.0]","[0.0, 1.0, 0.0]","[0.0, 1.0]","[0.0, 1.0, 0.0, 0.0, 0.0]","[0.0, 1.0]"
265,"[1.0, 0.0]","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[0.0, 0.0, 1.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[0.0, 1.0]","[0.0, 0.0, 1.0]","[1.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 1.0]","[1.0, 0.0]"
275,"[1.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0]","[0.0, 0.0, 1.0]","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[0.0, 1.0]","[0.0, 0.0, 1.0]","[0.0, 1.0]","[0.0, 1.0, 0.0, 0.0, 0.0]","[1.0, 0.0]"
247,"[1.0, 0.0]","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]","[1.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[0.0, 1.0]","[0.0, 0.0, 1.0]","[1.0, 0.0]","[1.0, 0.0, 0.0, 0.0, 0.0]","[0.0, 1.0]"
126,"[0.0, 1.0]","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[0.0, 0.0, 1.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]","[0.0, 1.0]","[0.0, 1.0, 0.0]","[0.0, 1.0]","[0.0, 0.0, 0.0, 1.0, 0.0]","[1.0, 0.0]"
127,"[0.0, 1.0]","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[0.0, 0.0, 1.0]","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]","[0.0, 1.0]","[0.0, 1.0, 0.0]","[0.0, 1.0]","[0.0, 1.0, 0.0, 0.0, 0.0]","[0.0, 1.0]"
258,"[1.0, 0.0]","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]","[1.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]","[0.0, 1.0]","[0.0, 1.0, 0.0]","[1.0, 0.0]","[1.0, 0.0, 0.0, 0.0, 0.0]","[0.0, 1.0]"


- Retiramos a coluna "seio", pois definimos que não é relevante na análise


In [25]:
df = df.drop(columns='Breast')

In [26]:
df.head()

Unnamed: 0,Class,Age,Menopause,Tumor_size,Inv_nodes,Node_caps,Deg_malig,Breast_quad,Irradiant
130,"[0.0, 1.0]","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0]","[0.0, 0.0, 1.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[0.0, 1.0]","[0.0, 1.0, 0.0]","[0.0, 0.0, 0.0, 1.0, 0.0]","[0.0, 1.0]"
230,"[1.0, 0.0]","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]","[0.0, 0.0, 1.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[0.0, 1.0]","[0.0, 1.0, 0.0]","[0.0, 1.0, 0.0, 0.0, 0.0]","[1.0, 0.0]"
129,"[0.0, 1.0]","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0]","[0.0, 0.0, 1.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[0.0, 1.0]","[0.0, 1.0, 0.0]","[0.0, 1.0, 0.0, 0.0, 0.0]","[0.0, 1.0]"
245,"[1.0, 0.0]","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[0.0, 0.0, 1.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[1.0, 0.0]","[0.0, 1.0, 0.0]","[0.0, 1.0, 0.0, 0.0, 0.0]","[0.0, 1.0]"
265,"[1.0, 0.0]","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[0.0, 0.0, 1.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[0.0, 1.0]","[0.0, 0.0, 1.0]","[0.0, 0.0, 0.0, 0.0, 1.0]","[1.0, 0.0]"


- Criamos um novo dataframe com a coluna classe e descartamos essa coluna do dataframe original


In [27]:
labels = df.Class.values
df = df.drop(columns= 'Class')
df.head(10)

Unnamed: 0,Age,Menopause,Tumor_size,Inv_nodes,Node_caps,Deg_malig,Breast_quad,Irradiant
130,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0]","[0.0, 0.0, 1.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[0.0, 1.0]","[0.0, 1.0, 0.0]","[0.0, 0.0, 0.0, 1.0, 0.0]","[0.0, 1.0]"
230,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]","[0.0, 0.0, 1.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[0.0, 1.0]","[0.0, 1.0, 0.0]","[0.0, 1.0, 0.0, 0.0, 0.0]","[1.0, 0.0]"
129,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0]","[0.0, 0.0, 1.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[0.0, 1.0]","[0.0, 1.0, 0.0]","[0.0, 1.0, 0.0, 0.0, 0.0]","[0.0, 1.0]"
245,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[0.0, 0.0, 1.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[1.0, 0.0]","[0.0, 1.0, 0.0]","[0.0, 1.0, 0.0, 0.0, 0.0]","[0.0, 1.0]"
265,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[0.0, 0.0, 1.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[0.0, 1.0]","[0.0, 0.0, 1.0]","[0.0, 0.0, 0.0, 0.0, 1.0]","[1.0, 0.0]"
275,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0]","[0.0, 0.0, 1.0]","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[0.0, 1.0]","[0.0, 0.0, 1.0]","[0.0, 1.0, 0.0, 0.0, 0.0]","[1.0, 0.0]"
247,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]","[1.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[0.0, 1.0]","[0.0, 0.0, 1.0]","[1.0, 0.0, 0.0, 0.0, 0.0]","[0.0, 1.0]"
126,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[0.0, 0.0, 1.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]","[0.0, 1.0]","[0.0, 1.0, 0.0]","[0.0, 0.0, 0.0, 1.0, 0.0]","[1.0, 0.0]"
127,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[0.0, 0.0, 1.0]","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]","[0.0, 1.0]","[0.0, 1.0, 0.0]","[0.0, 1.0, 0.0, 0.0, 0.0]","[0.0, 1.0]"
258,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]","[1.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]","[0.0, 1.0]","[0.0, 1.0, 0.0]","[1.0, 0.0, 0.0, 0.0, 0.0]","[0.0, 1.0]"


- Criamos a função de vetorizar para transformar os valores que passaram a ser listas depois da função one hot em vetor para podermos analisar

In [28]:
def vetorizar(Age, Menopause, Tumor_size, Inv_nodes, Node_caps, Deg_malig, Breast_quad, Irradiant):
    return Age.tolist() + Menopause.tolist() + Tumor_size.tolist() + Inv_nodes.tolist() + Node_caps.tolist() \
    + Deg_malig.tolist() + Breast_quad.tolist() + Irradiant.tolist()

- Aplicamos a vetorização do datafame


In [29]:
entrada = [vetorizar(*_) for _ in df.values]

- Salvamos o vetor em um arquivo csv


In [30]:
np.savetxt("/home/entrada_treino.csv", entrada, delimiter=',')

- Salvamos o dataframe labels como uma lista em um arquivo csv


In [31]:
np.savetxt("/home/labels.csv", labels.tolist(), delimiter=',')

- Carregamos o vetor de entrada na variavel x

In [32]:
x = np.loadtxt("/home/entrada_treino.csv", delimiter = ',')

- Carregamos a lista labels na variavel y


In [33]:
y = np.loadtxt("/home/labels.csv", delimiter = ',')

- Calculamos o número de vizinhos para aplicar no KNN


In [34]:
num_viz = int(round((len(df.index))**(1/2),0))
num_viz

17

- Inserimos na variavel modelo o KNeighborsClassifier com o número de vizinhos calcualdo


In [35]:
modelo = KNeighborsClassifier(n_neighbors= num_viz)

- Definimos o tamanho da amostra de treino em 75%


In [36]:
vol_treino = int(x.shape[0]*0.75)

- Separamos as amostras de treino e de teste sendo _t para amostras de treino e _v para amostras de teste

In [37]:
x_t = x[ :vol_treino]
y_t = y[ :vol_treino]

x_v = x[vol_treino: ]
y_v = y[vol_treino: ]

- Aplicamos o KNN nas amostras de treino


In [38]:
modelo.fit(x_t, y_t)

KNeighborsClassifier(n_neighbors=17)

- Aplicamos o KNN nas amostras de teste e verificando a eficácia do modelo


In [39]:
efic_modelo = modelo.score(x_v, y_v)
print(f'A eficácia do modelo aplicado é de {round(efic_modelo, 4)*100}%')

A eficácia do modelo aplicado é de 98.57000000000001%
