# KNN para classificar se uma pessoa possui doença cardíaca
Foi utilizado este dataset https://archive.ics.uci.edu/ml/datasets/Heart+Disease

In [1]:
from __future__ import division
import pandas as pd
import numpy as np
import math

In [2]:
# Reading the csv
data = pd.read_csv('heart.csv')
data['age'] = data['age'].astype(float)
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37.0,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41.0,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56.0,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57.0,0,0,120,354,0,1,163,1,0.6,2,0,2,1


## Normalizando os atributos numéricos

In [3]:
from sklearn.preprocessing import MinMaxScaler

In [4]:
scaler = MinMaxScaler()

In [5]:
df = data.iloc[:,:].values.astype(float)

In [6]:
scaled = scaler.fit_transform(df)

In [7]:
df_scaled = x_train_normalized = pd.DataFrame(scaled,columns=['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach','exang', 'oldpeak', 'slope', 'ca', 'thal','target'])

In [8]:
df_scaled.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,0.708333,1.0,1.0,0.481132,0.244292,1.0,0.0,0.603053,0.0,0.370968,0.0,0.0,0.333333,1.0
1,0.166667,1.0,0.666667,0.339623,0.283105,0.0,0.5,0.885496,0.0,0.564516,0.0,0.0,0.666667,1.0
2,0.25,0.0,0.333333,0.339623,0.178082,0.0,0.0,0.770992,0.0,0.225806,1.0,0.0,0.666667,1.0
3,0.5625,1.0,0.333333,0.245283,0.251142,0.0,0.5,0.816794,0.0,0.129032,1.0,0.0,0.666667,1.0
4,0.583333,0.0,0.0,0.245283,0.520548,0.0,0.5,0.70229,1.0,0.096774,1.0,0.0,0.666667,1.0


## Separando dados entre treino e teste

In [9]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(data.iloc[:,0:13], data.iloc[:,13], test_size = 0.2, shuffle = True, stratify = data['target'])

## Criando o algoritmo

In [10]:
class element():
    def __init__(self,target,value):
        self.target = target
        self.value = value

In [14]:
def sim_list(data,real,target):
    similarity_list = []
    
    for index, row in data.iterrows():
        a = abs(row['age'] - target['age'])
        b = abs(row['sex'] - target['sex'])
        d = abs(row['trestbps'] - target['trestbps'])
        e = abs(row['chol'] - target['chol'])
        f = abs(row['fbs'] - target['fbs'])
        h = abs(row['thalach'] - target['thalach'])
        i = abs(row['exang'] - target['exang'])
        j = abs(row['oldpeak'] - target['oldpeak'])
        l = abs(row['ca'] - target['ca'])
        
        value = (a*a)+(b*b)+(d*d)+(e*e)+(f*f)+(h*h)+(i*i)+(j*j)+(l*l)
        value = math.sqrt(value)
        
        x = real.loc[index]
        
        similarity_list.append(element(x,value))
        
    return similarity_list

In [15]:
def knn(k,x_train,y_train,x_test):
    
    if k%2 == 0:
        k += 1
    
    predictions = []
    
    for index, row in x_test.iterrows():
        
        similarity_list = sim_list(x_train,y_train,row)
        
        similarity_list.sort(key=lambda x: x.value, reverse=True)
        
        x = 0.0
        
        for i in range(k):
            pred = similarity_list[k].target
            if pred == 1:
                x +=1
            else:
                x -= 1
        
        if x>0:
            x  = 1.0
        
        else:
            x = 0.0
                
        predictions.append((x,index))
        
    return predictions

In [95]:
predictions = knn(3,x_train,y_train,x_test)
predictions

[(1.0, 200),
 (0.0, 278),
 (0.0, 190),
 (1.0, 269),
 (1.0, 94),
 (0.0, 123),
 (1.0, 205),
 (1.0, 74),
 (1.0, 282),
 (0.0, 93),
 (0.0, 288),
 (0.0, 167),
 (0.0, 85),
 (0.0, 182),
 (1.0, 268),
 (0.0, 4),
 (1.0, 47),
 (1.0, 184),
 (1.0, 133),
 (1.0, 18),
 (1.0, 15),
 (1.0, 108),
 (1.0, 146),
 (0.0, 222),
 (0.0, 73),
 (1.0, 166),
 (0.0, 299),
 (1.0, 302),
 (1.0, 261),
 (0.0, 40),
 (1.0, 275),
 (1.0, 34),
 (1.0, 203),
 (0.0, 262),
 (1.0, 49),
 (0.0, 240),
 (1.0, 125),
 (1.0, 67),
 (1.0, 55),
 (1.0, 199),
 (1.0, 280),
 (1.0, 114),
 (0.0, 12),
 (1.0, 21),
 (1.0, 128),
 (1.0, 151),
 (0.0, 248),
 (1.0, 211),
 (0.0, 60),
 (1.0, 80),
 (1.0, 120),
 (0.0, 179),
 (0.0, 236),
 (1.0, 256),
 (1.0, 249),
 (1.0, 27),
 (1.0, 137),
 (1.0, 157),
 (1.0, 26),
 (1.0, 124),
 (1.0, 126)]

## Avaliando o algoritmo

In [96]:
tp = 0
fp = 0
tn = 0
fn = 0
count = 0

In [97]:
for member in predictions:
    index = member[1]
    real = y_test.loc[index]
    predicted = member[0]
    
    if real == 1:
        if real == predicted:
            tp += 1
        else:
            fn += 1
            
    else:
        if real == predicted:
            tn += 1
        else:
            fp += 1
            
    count += 1        

In [98]:
acertos = tp + tn
acertos

37

In [107]:
print('Acurácia:', acertos/count)

Acurácia: 0.6065573770491803


In [108]:
print('Recall:', tp/(tp+fn))

Recall: 0.7575757575757576


In [None]:
print('Precisão:' tp/(tp+tn))