# Padronização (Z-Score) e k-NN

In [1]:
import pandas as pd
import numpy as np

In [2]:
dataset = pd.read_csv('credit_data.csv')

In [3]:
dataset.dropna(inplace=True)

In [4]:
dataset.head()

Unnamed: 0,i#clientid,income,age,loan,c#default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.64226,0
4,5,66952.688845,18.584336,8770.099235,1


## k-NN Sem padronização

In [5]:
X = dataset.iloc[:, 1:4].values
X

array([[6.61559251e+04, 5.90170151e+01, 8.10653213e+03],
       [3.44151540e+04, 4.81171531e+01, 6.56474502e+03],
       [5.73171701e+04, 6.31080495e+01, 8.02095330e+03],
       ...,
       [4.43114493e+04, 2.80171669e+01, 5.52278669e+03],
       [4.37560566e+04, 6.39717958e+01, 1.62272260e+03],
       [6.94365796e+04, 5.61526170e+01, 7.37883360e+03]])

In [7]:
y = dataset['c#default'].values
y

array([0, 0, 0, ..., 1, 0, 0], dtype=int64)

In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [13]:
np.mean(X_train[0]), np.median(X_train[0]), np.std(X_train[0]) # Observando métricas do salario na base de treinamento

(13980.843594187325, 6361.97343810153, 15474.364889896373)

In [14]:
np.mean(X_test[0]), np.median(X_test[0]), np.std(X_test[0])  # Observando métricas do salario na base de teste

(17386.93209227748, 5671.45055982651, 20662.087938692024)

In [15]:
knn = KNeighborsClassifier()

In [16]:
knn.fit(X_train, y_train)

In [17]:
previsao = knn.predict(X_test)

In [19]:
accuracy_score(y_test, previsao) # Acurácia do modelo

0.83

## k-NN com padronização

In [20]:
from sklearn.preprocessing import StandardScaler # Aplica o cálculo da padronização Z-score

In [21]:
z_score_treinamento = StandardScaler()
z_score_teste = StandardScaler()

In [22]:
X_train_p = z_score_treinamento.fit_transform(X_train)
X_test_p = z_score_teste.fit_transform(X_test)

In [24]:
X_train_p, X_test_p # Valores padronizados na base de dados de treinamento e teste

(array([[-0.66082947, -1.25041953,  0.62475176],
        [ 0.7435586 , -1.09753504,  0.38674164],
        [-1.7130126 , -0.37672635, -1.23173314],
        ...,
        [ 0.68874615,  1.08860545,  1.59130831],
        [ 0.00388944, -0.21690201,  0.3555415 ],
        [ 0.28333908, -1.18861029, -0.34095483]]),
 array([[ 0.01446334,  1.58170762,  0.41285843],
        [ 0.1302742 ,  0.71568899,  0.56578624],
        [-1.61948216,  0.78539722,  0.05522585],
        ...,
        [ 1.4801205 ,  0.59012887, -0.46674275],
        [-0.0905643 ,  0.83047036,  0.04238753],
        [-1.60254219, -1.26989512, -0.80304803]]))

In [25]:
min(X_train_p[0]), max(X_train_p[0])

(-1.2504195330011658, 0.6247517591196471)

In [26]:
np.mean(X_train_p[0]), np.median(X_train_p[0]), np.std(X_train_p[0])  

(-0.42883241546220835, -0.6608294725051065, 0.7829149888103791)

In [27]:
np.mean(X_test_p[0]), np.median(X_test_p[0]), np.std(X_test_p[0])  

(0.6696764641618066, 0.41285843374560083, 0.6650966295385563)

In [28]:
knn = KNeighborsClassifier()

In [29]:
knn.fit(X_train_p, y_train)

In [30]:
previsoes_p = knn.predict(X_test_p)

In [31]:
accuracy_score(y_test, previsoes_p)

0.9825

 Percebe-se que o modelo aumentou a sua acurácia. 

OBS: É importante ressaltar que nem sempre a acurácia é a melhor métrica para determinação se o modelo obteve uma boa performance.