# Read dataset

In [7]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report,confusion_matrix

## Read dataset from directory
dir_data = './data/'
raw_data = os.path.join(dir_data, 'crx.data')
data = np.genfromtxt(raw_data, delimiter=",", dtype=str)
label = ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'A11', 'A12', 'A13', 'A14', 'A15', 'A16']

# Put txt files into DataFrame

In [8]:
arrange_data = []
for line in data:
    # processing with missimg value
    for index in range(len(line)):
        if(line[index] == '?'):
            line[index] = '0'  # replace missing value with 0
    arrange_data.append(line)
    
df = pd.DataFrame(arrange_data)
df.columns = label
df=df.astype({'A2':'float32',
              'A3':'float32',
              'A8':'float32',})
df

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.830000,0.000,u,g,w,v,1.250,t,t,01,f,g,00202,0,+
1,a,58.669998,4.460,u,g,q,h,3.040,t,t,06,f,g,00043,560,+
2,a,24.500000,0.500,u,g,q,h,1.500,t,f,0,f,g,00280,824,+
3,b,27.830000,1.540,u,g,w,v,3.750,t,t,05,t,g,00100,3,+
4,b,20.170000,5.625,u,g,w,v,1.710,t,f,0,f,s,00120,0,+
5,b,32.080002,4.000,u,g,m,v,2.500,t,f,0,t,g,00360,0,+
6,b,33.169998,1.040,u,g,r,h,6.500,t,f,0,t,g,00164,31285,+
7,a,22.920000,11.585,u,g,cc,v,0.040,t,f,0,f,g,00080,1349,+
8,b,54.419998,0.500,y,p,k,h,3.960,t,f,0,f,g,00180,314,+
9,b,42.500000,4.915,y,p,w,v,3.165,t,f,0,t,g,00052,1442,+


# Split training data & testing data

In [12]:
crx_data = df[['A2', 'A3', 'A8', 'A11', 'A14', 'A15']]
crx_label = df['A16']
train_data , test_data , train_label , test_label = train_test_split(crx_data, crx_label, test_size=0.33, stratify=crx_label)
train_data

Unnamed: 0,A2,A3,A8,A11,A14,A15
46,41.000000,2.040000,0.125,23,00455,1236
545,44.250000,11.000000,1.500,0,00000,0
11,29.920000,1.835000,4.335,0,00260,200
218,53.919998,9.625000,8.665,05,00000,0
161,44.000000,2.000000,1.750,02,00000,15
261,52.169998,0.000000,0.000,0,00000,0
283,16.500000,1.250000,0.250,01,00108,98
611,27.580000,3.250000,5.085,02,00369,1
69,35.169998,25.125000,1.625,01,00515,500
403,22.670000,0.335000,0.750,0,00160,0


# KNN

In [13]:
knn = KNeighborsClassifier()
knn.fit(train_data,train_label)
pred = knn.predict(test_data)

# Confusion matrix

In [14]:
print(confusion_matrix(test_label,pred))

[[68 33]
 [28 99]]


# Prediction

In [15]:
print(classification_report(test_label,pred))

              precision    recall  f1-score   support

           +       0.71      0.67      0.69       101
           -       0.75      0.78      0.76       127

   micro avg       0.73      0.73      0.73       228
   macro avg       0.73      0.73      0.73       228
weighted avg       0.73      0.73      0.73       228

