In [1]:
import pandas as pd

'''
create an example dataset
''' 
# Toy weather dataset: four categorical features plus a two-valued class
# column ('P' / 'NP'). Every list holds one column, one entry per sample.
table = {
    'Outlook': ['Sunny', 'Sunny', 'Overcast', 'Overcast', 'Rain', 'Rain'],
    'Temperature': ['Hi', 'Hi', 'Hi', 'Lo', 'Lo', 'Lo'],
    'Humidity': ['Hi', 'Hi', 'Lo', 'Lo', 'Hi', 'Hi'],
    'Windy': ['No', 'Yes', 'No', 'Yes', 'No', 'Yes'],
    'Decision': ['NP', 'NP', 'P', 'P', 'P', 'NP'],
}

# Per-column aliases (the very same list objects stored in `table`);
# the encoding step further down works on these names.
outlook = table['Outlook']
temp = table['Temperature']
humid = table['Humidity']
wind = table['Windy']
decision = table['Decision']

# Build the DataFrame and leave it as the last expression so the cell renders it.
df = pd.DataFrame(data=table)
df

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Decision
0,Sunny,Hi,Hi,No,NP
1,Sunny,Hi,Hi,Yes,NP
2,Overcast,Hi,Lo,No,P
3,Overcast,Lo,Lo,Yes,P
4,Rain,Lo,Hi,No,P
5,Rain,Lo,Hi,Yes,NP


In [2]:
'''
preprocess the datasets
'''
from sklearn import preprocessing

# A single LabelEncoder is reused for every column. Each fit_transform call
# refits it, so once this cell finishes `le` only holds the classes of the
# last column it saw (decision).
le = preprocessing.LabelEncoder()

# Outlook: encode, then remap the encoder's code 2 to -1. The comprehension
# turns the result into a plain Python list (the other columns stay ndarrays).
outlook_encoded = [-1 if code == 2 else code for code in le.fit_transform(outlook)]

# Remaining columns keep the encoder's codes unchanged.
temp_encoded = le.fit_transform(temp)
humid_encoded = le.fit_transform(humid)
wind_encoded = le.fit_transform(wind)
label_encoded = le.fit_transform(decision)

# Show each encoded column; the trailing comments give the resulting mapping.
print('outlook:', outlook_encoded)  # sunny:-1 overcast:0 rain:1
print('temp:', temp_encoded)        # high:0 low:1
print('humid:', humid_encoded)      # high:0 low:1
print('wind:', wind_encoded)        # no:0 yes:1
print('decision:', label_encoded)   # no play:0 play:1

outlook: [-1, -1, 0, 0, 1, 1]
temp: [0 0 0 1 1 1]
humid: [0 0 1 1 0 0]
wind: [0 1 0 1 0 1]
decision: [0 0 1 1 1 0]


In [3]:
import numpy as np
# Assemble the feature matrix as a 2-D array: one row per sample, one column
# per encoded feature, in the order outlook, temperature, humidity, wind.
features = np.column_stack((outlook_encoded, temp_encoded, humid_encoded, wind_encoded))
print('features:', features)

features: [[-1  0  0  0]
 [-1  0  0  1]
 [ 0  0  1  0]
 [ 0  1  1  1]
 [ 1  1  0  0]
 [ 1  1  0  1]]


In [6]:
import math
from statistics import mode

# Discrete KNN classifier: the distance between two samples is the number of features whose values differ
def KNN_discrete_predict(train_set, label_set, predict_set, n_neighbors=2):
    """Classify one sample by majority vote among its nearest training rows.

    The distance between a training row and ``predict_set`` is the number of
    positions (over ``range(len(predict_set))``) at which the two differ, so
    only equality of feature values matters, not their numeric size.

    Parameters
    ----------
    train_set : sequence of indexable rows
        Training samples; each row is compared position by position with
        ``predict_set``.
    label_set : indexable
        ``label_set[i]`` is the class label of ``train_set[i]``.
    predict_set : sequence
        Feature values of the sample to classify.
    n_neighbors : int, default 2
        Number of nearest rows that vote. Values larger than the number of
        training rows are clamped, so every row votes at most once.

    Returns
    -------
    The label with the most votes among the selected neighbours. On a tie,
    the tied label whose first voter is nearest wins.

    Raises
    ------
    ValueError
        If ``train_set`` is empty or ``n_neighbors`` is smaller than 1.

    Side effects
    ------------
    Prints the indices of the selected neighbours, nearest first.
    """
    n_samples = len(train_set)
    if n_samples == 0:
        raise ValueError('train_set must contain at least one sample')
    if n_neighbors < 1:
        raise ValueError('n_neighbors must be at least 1')

    # Mismatch count per training row: 1 for each differing feature, 0 otherwise.
    n_features = len(predict_set)
    distance = [
        sum(1 for i in range(n_features) if row[i] != predict_set[i])
        for row in train_set
    ]

    # Never select more neighbours than there are rows; otherwise a row would
    # have to be picked (and vote) more than once.
    k = min(n_neighbors, n_samples)

    # sorted() is stable, so rows at equal distance keep ascending index order.
    better_neighbors = sorted(range(n_samples), key=lambda idx: distance[idx])[:k]
    print('better_neighbors:', better_neighbors)

    # Tally votes in nearest-first order. Dicts preserve insertion order and
    # max() returns the first maximal key, so ties resolve deterministically
    # in favour of the label encountered first.
    votes = {}
    for idx in better_neighbors:
        label = label_set[idx]
        votes[label] = votes.get(label, 0) + 1
    return max(votes, key=votes.get)

# Classify one query sample using its 3 nearest neighbours. The query uses the
# same encoding as the training rows: outlook=-1, temperature=0, humidity=1, wind=0.
predicted = KNN_discrete_predict(features, label_encoded, [-1, 0, 1, 0], 3)

# Label 0 means "No Play"; any other label is reported as "Play".
print('predicted: No Play' if predicted == 0 else 'predicted: Play')

better_neighbors: [0, 2, 1]
predicted: No Play
