### KNN에서 k값 설정

In [1]:
k = 3

In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('diabetes.csv')
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [4]:
(df['Outcome']==0).sum()

500

In [5]:
(df['Outcome']==1).sum()

268

# 불균형데이터로 KNN

In [6]:
from sklearn import preprocessing

x_before = df.values.astype(float)
min_max_scaler = preprocessing.MinMaxScaler()
x_after = min_max_scaler.fit_transform(x_before)
df = pd.DataFrame(x_after, columns=df.columns)
df['Outcome'] = df['Outcome'].astype(int)
df['Outcome'] = df['Outcome'].astype(int)
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0.352941,0.743719,0.590164,0.353535,0.000000,0.500745,0.234415,0.483333,1
1,0.058824,0.427136,0.540984,0.292929,0.000000,0.396423,0.116567,0.166667,0
2,0.470588,0.919598,0.524590,0.000000,0.000000,0.347243,0.253629,0.183333,1
3,0.058824,0.447236,0.540984,0.232323,0.111111,0.418778,0.038002,0.000000,0
4,0.000000,0.688442,0.327869,0.353535,0.198582,0.642325,0.943638,0.200000,1
...,...,...,...,...,...,...,...,...,...
763,0.588235,0.507538,0.622951,0.484848,0.212766,0.490313,0.039710,0.700000,0
764,0.117647,0.613065,0.573770,0.272727,0.000000,0.548435,0.111870,0.100000,0
765,0.294118,0.608040,0.590164,0.232323,0.132388,0.390462,0.071307,0.150000,0
766,0.058824,0.633166,0.491803,0.000000,0.000000,0.448584,0.115713,0.433333,1


In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

df_train, df_test = train_test_split(df, test_size=0.1, random_state=42)
df_train['Outcome'] = df_train['Outcome'].astype(int)
df_test['Outcome'] = df_test['Outcome'].astype(int)

x_train = df_train.drop('Outcome',axis=1)
y_train = df_train['Outcome'].values
x_test = df_test.drop('Outcome',axis=1)
y_test = df_test['Outcome'].values

knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(x_train,y_train)

y_pred = knn.predict(x_test)
print('accuracy: %.2f%%' %(metrics.accuracy_score(y_test, y_pred)*100))
print('precision: %.2f%%' %(metrics.precision_score(y_test, y_pred)*100))
print('recall: %.2f%%' %(metrics.recall_score(y_test, y_pred)*100))

accuracy: 62.34%
precision: 46.67%
recall: 51.85%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


# SMOTE-ENN방식

## 1. (Start of SMOTE) Choose random data from the minority class.
## 2. Calculate the distance between the random data and its k nearest neighbors.
## 3. Multiply the difference with a random number between 0 and 1, then add the result to the minority class as a synthetic sample.
## 4. Repeat step number 2–3 until the desired proportion of minority class is met. (End of SMOTE)

In [8]:
minority = df_train[df_train['Outcome']==1].copy()
minority

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
192,0.411765,0.798995,0.540984,0.000000,0.000000,0.453055,0.130231,0.250000,1
448,0.000000,0.522613,0.524590,0.373737,0.075650,0.500745,0.184458,0.016667,1
748,0.176471,0.939698,0.573770,0.222222,0.236407,0.542474,0.140905,0.250000,1
155,0.411765,0.763819,0.721311,0.444444,0.000000,0.745156,0.110589,0.250000,1
309,0.117647,0.623116,0.557377,0.282828,0.242317,0.490313,0.340307,0.150000,1
...,...,...,...,...,...,...,...,...,...
458,0.588235,0.743719,0.688525,0.484848,0.280142,0.560358,0.394108,0.500000,1
214,0.529412,0.562814,0.672131,0.323232,0.206856,0.509687,0.077711,0.250000,1
614,0.647059,0.693467,0.606557,0.262626,0.170213,0.538003,0.204526,0.483333,1
270,0.588235,0.507538,0.704918,0.373737,0.000000,0.679583,0.451751,0.283333,1


In [9]:
n = (df_train['Outcome']==0).sum() - (df_train['Outcome']==1).sum()

minority_feature_columns = minority.columns[:-1]

count = 0
    
while True:
    random_index = np.random.randint(0,767)
    # 랜덤으로 하나의 인덱스를 뽑는다.
    
    if random_index not in minority.index:
        continue
    # 만약 random_index가 minority라는 DataFrame에 없으면 다시 뽑는다.
        
    random_index_value = minority[minority_feature_columns].loc[random_index,:].values
    # 랜덤으로 뽑은 인덱스의 값을 받는다.
    
    nn_index = knn.kneighbors([minority[minority_feature_columns].loc[random_index,:].values])[1][0][1]
    # 랜덤으로 뽑은 인덱스의 값과 가장 가까운 인덱스를 받는다.
    
    if nn_index not in minority.index:
        continue
    # 만약 nn_index가 minority라는 DataFrame에 없으면 다시 뽑는다.
    
    nn_index_value = minority[minority_feature_columns].loc[nn_index,:].values
    ## 가장 가까운 인덱스의 값을 받는다.
    
    distance = nn_index_value - random_index_value
    ## 거리를 구한다.(diff라고 많이 하더라.)
    
    new_index_value = random_index_value + np.random.rand()*distance
    # oversampling 값을 뽑는다.
    
    new_index_value = np.append(new_index_value, 1)
    # oversampling 값에 마지막에 1(양성)을 추가시킨다.
    
    new_index = np.max(df_train.index) + 1 # 수정 완료
    # new_index는 매 step당 +2씩 증가 => step이 +2씩 증가하는 경우가 있었는데, 그렇게 되어지지 않도록 코드를 아예 수정했습니다.
    # new_index는 기존의 df_train의 index(data)를 덮어 씌워 raw data가 훼손됨 => df_train의 index의 max값 다음부터 추가하도록 수정하여 문제 해결했습니다.
    # oversapling 값에 index 번호를 매긴다.
    
    df_train.loc[new_index,:] = new_index_value
    # DataFrame에 oversampling된 값을 넣는다.
    
    count += 1
    
    if count == n:
        break
    # 양성데이터와 음성데이터가 1:1이 될 때까지 돌린다.

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [10]:
(df_train['Outcome']==0).sum()

450

In [11]:
(df_train['Outcome']==1).sum()

450

In [12]:
x_train = df_train.drop('Outcome',axis=1)
y_train = df_train['Outcome'].values

knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(x_train,y_train)

y_pred = knn.predict(x_test)
print('accuracy: %.2f%%' %(metrics.accuracy_score(y_test, y_pred)*100))
print('precision: %.2f%%' %(metrics.precision_score(y_test, y_pred)*100))
print('recall: %.2f%%' %(metrics.recall_score(y_test, y_pred)*100))

accuracy: 62.34%
precision: 47.50%
recall: 70.37%


## 5.(Start of ENN) Determine K, as the number of nearest neighbors. If not determined, then K=3.
## 6. Find the K-nearest neighbor of the observation among the other observations in the dataset, then return the majority class from the K-nearest neighbor.
## 7. If the class of the observation and the majority class from the observation’s K-nearest neighbor is different, then the observation and its K-nearest neighbor are deleted from the dataset.
## 8. Repeat step 2 and 3 until the desired proportion of each class is fulfilled. (End of ENN)

In [13]:
y_train_pred = knn.predict(x_train)
y_train_pred = y_train_pred.astype(int)

y_train_answer = df_train.loc[x_train.index]['Outcome'].values
y_train_answer = y_train_answer.astype(int)

issame = (y_train_pred == y_train_answer)

drop_index = x_train.index[np.where(issame==False)]

df_train = df_train.drop(index=drop_index, axis=0)
df_train

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
396,0.176471,0.482412,0.459016,0.343434,0.135934,0.368107,0.369769,0.300000,0.0
528,0.000000,0.587940,0.540984,0.313131,0.222222,0.459016,0.177199,0.016667,0.0
10,0.235294,0.552764,0.754098,0.000000,0.000000,0.560358,0.048249,0.150000,0.0
192,0.411765,0.798995,0.540984,0.000000,0.000000,0.453055,0.130231,0.250000,1.0
448,0.000000,0.522613,0.524590,0.373737,0.075650,0.500745,0.184458,0.016667,1.0
...,...,...,...,...,...,...,...,...,...
972,0.360039,0.716922,0.525643,0.000000,0.000000,0.560243,0.201885,0.560634,1.0
973,0.411765,0.544695,0.500667,0.249446,0.014653,0.401667,0.101179,0.140414,1.0
974,0.338521,0.663870,0.733156,0.000000,0.000000,0.677379,0.105679,0.133441,1.0
975,0.132444,0.960876,0.549028,0.396105,0.538072,0.463543,0.080881,0.462061,1.0


## 결과물에서 unbalanced가 해결되지 않음 => 500:268로 unbalance하다고 의견주셨는데, 반복적으로 돌려가며 확인한 결과 아래와 같이 after enn도 양성과 음성의 개수가 balance하게 나타납니다.

In [14]:
(df_train['Outcome']==0).sum()

384

In [15]:
(df_train['Outcome']==1).sum()

415

In [16]:
x_train = df_train.drop('Outcome',axis=1)
y_train = df_train['Outcome'].values

knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(x_train,y_train)

y_pred = knn.predict(x_test)
print('accuracy: %.2f%%' %(metrics.accuracy_score(y_test, y_pred)*100))
print('precision: %.2f%%' %(metrics.precision_score(y_test, y_pred)*100))
print('recall: %.2f%%' %(metrics.recall_score(y_test, y_pred)*100))

accuracy: 61.04%
precision: 46.51%
recall: 74.07%


## 결과
### 불균형데이터 -> SMOTE -> SMOTE+ENN

accuracy : 62.34% -> 59.74% -> 62.34%  
precision: 46.67% -> 44.74% -> 47.50%  
recall: 51.85% -> 62.96% -> 70.37%  

여러번 돌려본 결과  
accuracy : 60초반 -> 50후반 -> 60초반  
precision: 40중반 -> 40중반 -> 40후반  
recall: 50초반 -> 60초반 -> 70초반

처음에 recall 값이 굉장히 낮았는데, 이는 소수 클래스 레이블을 올바르게 예측하기 위한 모델 성능이 충분하지 않음을 의미한다.  
불균형데이터 -> SMOTE -> SMOTE+ENN으로 갈 수록 recall 값이 크게크게 늘어난다.  
accuracy와 precision은 거의 변동이 없다.