# Put dataset into DataFrame

In [49]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt

label = np.array(['ID', 'Outlook', 'Temperature', 'Humidity', 'Windy', 'Play'])
ID = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', 'A', 'B', 'C', 'D']
outlook = ['Sunny', 'Rainy', 'Sunny', 'Rainy', 'Sunny', 'Overcast', 'Rainy', 'Overcast', 'Rainy', 'Overcast', 'Sunny', 'Rainy', 'Overcast', 'Sunny']
temperatue = [85, 65, 72, 70, 69, 83, 75, 81, 68, 64, 80, 71, 72, 75]
humidity = [85, 70, 95, 96, 70, 86, 80, 75, 80, 65, 90, 91, 90, 70]
windy = ['F', 'Y', 'F', 'F', 'F', 'F', 'F', 'F', 'N', 'Y', 'Y', 'Y', 'Y', 'Y']
play = ['No', 'No', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'No', 'Yes', 'Yes']

data = {
    "ID": ID,
    "Outlook": outlook,
    "Temperature": temperatue,
    "Humidity": humidity,
    "Windy": windy,
    "Play": play
}

df = pd.DataFrame(data)
df.columns = label

df

Unnamed: 0,ID,Outlook,Temperature,Humidity,Windy,Play
0,1,Sunny,85,85,F,No
1,2,Rainy,65,70,Y,No
2,3,Sunny,72,95,F,No
3,4,Rainy,70,96,F,Yes
4,5,Sunny,69,70,F,Yes
5,6,Overcast,83,86,F,Yes
6,7,Rainy,75,80,F,Yes
7,8,Overcast,81,75,F,Yes
8,9,Rainy,68,80,N,Yes
9,10,Overcast,64,65,Y,Yes


# Split training data & testing data

In [50]:
train_data = df[df.columns[1:-1]].loc[0:9]
train_label = df['Play'].loc[0:9]
test_data = df[df.columns[1:-1]].loc[10:13]
test_label = df['Play'].loc[10:13]

# Convert dataframe into list
train_data = train_data.values.tolist()
train_label = train_label.values.tolist()
test_data = test_data.values.tolist()
test_label = test_label.values.tolist()

# KNN Algorithm(K = 1)

In [51]:
# Calculate the Jaccard similarity & Cosine similarity between two vectors
def jaccard_similarity(row1, row2):
    # Jaccard similarity
    s1 = set(row1[0:4:3])
    s2 = set(row2[0:4:3])
    jaccard = 1 - (len(s1.intersection(s2)) / len(s1.union(s2)))
    
    # Cosine similarity
    dot = np.dot(row1[1:3], row2[1:3])
    norma = np.linalg.norm(row1[1:3])
    normb = np.linalg.norm(row2[1:3])
    cosine = dot / (norma * normb)
    
    return jaccard * 0.5 + cosine * 0.5

# Locate the most similar neighbors
def get_neighbors(train_data, train_label, test_row, num_neighbors):
    distances = []
    for index in range(len(train_data)):
        dist = jaccard_similarity(test_row, train_data[index])
        distances.append((train_data[index], dist, train_label[index]))
    distances.sort(key=lambda tup: tup[1])
    neighbors = []
    for i in range(num_neighbors):
        neighbors.append([distances[i][0], distances[i][-1]])
    return neighbors

# Make a prediction with neighbors
def predict_classification(train_data, train_label, test_row, num_neighbors):
    neighbors = get_neighbors(train_data, train_label, test_row, num_neighbors)
    output_values = [row[-1] for row in neighbors]
    prediction = max(set(output_values), key=output_values.count)
    return prediction

# kNN Algorithm
def k_nearest_neighbors(train_data, train_label, test_data, num_neighbors):
    predictions = []
    for row in test_data:
        output = predict_classification(train_data, train_label, row, num_neighbors)
        predictions.append(output)
    return predictions

# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0

# define model parameter
num_neighbors = 1

pred = k_nearest_neighbors(train_data, train_label, test_data, num_neighbors)
print("Accuracy: %.2f %%" % accuracy_metric(test_label, pred))
print("\nConfusion Matrix:")
print(confusion_matrix(test_label, pred))
print("\nClassification Report:")
print(classification_report(test_label, pred))
print("\nClassification Result: ")
print(pred)

Accuracy: 75.00 %

Confusion Matrix:
[[2 0]
 [1 1]]

Classification Report:
              precision    recall  f1-score   support

          No       0.67      1.00      0.80         2
         Yes       1.00      0.50      0.67         2

   micro avg       0.75      0.75      0.75         4
   macro avg       0.83      0.75      0.73         4
weighted avg       0.83      0.75      0.73         4


Classification Result: 
['No', 'No', 'Yes', 'No']


# KNN Algorithm(K = 3)

In [52]:
# Calculate the Jaccard similarity & Cosine similarity between two vectors
def jaccard_similarity(row1, row2):
    # Jaccard similarity
    s1 = set(row1[0:4:3])
    s2 = set(row2[0:4:3])
    jaccard = 1 - (len(s1.intersection(s2)) / len(s1.union(s2)))
    
    # Cosine similarity
    dot = np.dot(row1[1:3], row2[1:3])
    norma = np.linalg.norm(row1[1:3])
    normb = np.linalg.norm(row2[1:3])
    cosine = dot / (norma * normb)
    
    return jaccard * 0.5 + cosine * 0.5

# Locate the most similar neighbors
def get_neighbors(train_data, train_label, test_row, num_neighbors):
    distances = []
    for index in range(len(train_data)):
        dist = jaccard_similarity(test_row, train_data[index])
        distances.append((train_data[index], dist, train_label[index]))
    distances.sort(key=lambda tup: tup[1])
    neighbors = []
    for i in range(num_neighbors):
        neighbors.append([distances[i][0], distances[i][-1]])
    return neighbors

# Make a prediction with neighbors
def predict_classification(train_data, train_label, test_row, num_neighbors):
    neighbors = get_neighbors(train_data, train_label, test_row, num_neighbors)
    output_values = [row[-1] for row in neighbors]
    prediction = max(set(output_values), key=output_values.count)
    return prediction

# kNN Algorithm
def k_nearest_neighbors(train_data, train_label, test_data, num_neighbors):
    predictions = []
    for row in test_data:
        output = predict_classification(train_data, train_label, row, num_neighbors)
        predictions.append(output)
    return predictions

# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0

# define model parameter
num_neighbors = 3

pred = k_nearest_neighbors(train_data, train_label, test_data, num_neighbors)
print("Accuracy: %.2f %%" % accuracy_metric(test_label, pred))
print("\nConfusion Matrix:")
print(confusion_matrix(test_label, pred))
print("\nClassification Report:")
print(classification_report(test_label, pred))
print("\nClassification Result: ")
print(pred)

Accuracy: 50.00 %

Confusion Matrix:
[[1 1]
 [1 1]]

Classification Report:
              precision    recall  f1-score   support

          No       0.50      0.50      0.50         2
         Yes       0.50      0.50      0.50         2

   micro avg       0.50      0.50      0.50         4
   macro avg       0.50      0.50      0.50         4
weighted avg       0.50      0.50      0.50         4


Classification Result: 
['No', 'Yes', 'Yes', 'No']
