In [54]:
import numpy as np
import math
from scipy.stats import mode
np.set_printoptions(precision=5, suppress=True)


## Dataset

In [55]:
# Labeled training data. The first row holds the column names; every following
# row is one movie: title, number of kicks, number of kisses, and its genre.
# Mixing text and numbers in one NumPy array makes every cell a string, so the
# numeric columns have to be converted back before doing any arithmetic.
dataset = np.array([
    ["Movie Title", "kicks", "kisses", "movie type"],
    ["California Man", 3, 104, "Romance"],
    ["He's not reall into dudes", 2, 100, "Romance"],
    ["Beautiful Woman", 1, 81, "Romance"],
    ["Kevin Longblade", 101, 10, "Action"],
    ["Robo Slayer 3000", 99, 5, "Action"],
    ["Amped", 98, 2, "Action"],
])


## Functions

In [56]:
def normalize_column(col_):
    """Min-max scale values to the range [0, 1] along axis 0.

    Args:
        col_: Array-like of numbers. A 1-D input is scaled as a whole; for a
            2-D input every column is scaled independently.

    Returns:
        A float ndarray of the same shape as ``col_`` where the smallest value
        maps to 0 and the largest to 1. If all values are equal, the result is
        all zeros instead of NaN.

    Raises:
        ValueError: If ``col_`` is empty (NumPy cannot take min/max of it).
    """
    col_ = np.asarray(col_, dtype=float)
    min_ = col_.min(axis=0)
    span_ = col_.max(axis=0) - min_
    # A constant column has span 0; dividing by it would yield 0/0 = NaN.
    # Substitute 1 so that such a column normalizes to 0 everywhere.
    safe_span_ = np.where(span_ == 0, 1.0, span_)
    return (col_ - min_) / safe_span_

def normalize_columns(data):
    """Min-max scale every column of a 2-D array to the range [0, 1].

    Args:
        data: 2-D array-like of numbers with shape (rows, columns).

    Returns:
        A new float ndarray with the same shape as ``data`` in which each
        column has been scaled independently so that its minimum is 0 and its
        maximum is 1. Columns whose values are all equal become all zeros.
        The input itself is not modified.

    Raises:
        ValueError: If ``data`` has columns but no rows.
    """
    # Work on a float copy: writing the scaled values back into an integer
    # array would silently truncate them to 0 or 1, and the caller's data
    # should not change as a side effect.
    values = np.array(data, dtype=float)
    col_min = values.min(axis=0)
    col_span = values.max(axis=0) - col_min
    # Guard against 0/0 for constant columns.
    col_span = np.where(col_span == 0, 1.0, col_span)
    return (values - col_min) / col_span

def normalize(points_, point_):
    """Min-max normalize a set of points together with one extra point.

    The extra point is appended as the last row of ``points_`` and the combined
    array is normalized column by column with ``normalize_columns``. The
    minimum and maximum used for scaling are therefore taken over the points
    and the extra point together.

    Args:
        points_: 2-D array-like of numbers, one point per row.
        point_: The single point to append, either as a 1-D sequence of
            coordinates or as a 2-D array-like with one row. It must have as
            many coordinates as ``points_`` has columns.

    Returns:
        A tuple ``(points_norm, point_norm)``: the normalized points (all rows
        but the last) and the normalized extra point as a 2-D array holding
        only the last row.
    """
    # Force float so integer inputs cannot be truncated during normalization.
    points_ = np.asarray(points_, dtype=float)
    # Promote a flat point to a single row so it can be stacked under points_.
    point_ = np.atleast_2d(np.asarray(point_, dtype=float))
    data_ = np.concatenate((points_, point_), axis=0)
    data_ = normalize_columns(data_)
    points_norm = data_[:-1]  # every row except the appended one
    point_norm = data_[-1:]   # only the appended row, kept 2-D
    return points_norm, point_norm

def calc_distance(points_, custom_point_):
    """Compute the Euclidean distance from every point to one query point.

    Args:
        points_: 2-D array-like of numbers with shape (rows, columns), one
            point per row.
        custom_point_: The query point, either as a 1-D sequence of
            coordinates or as a 2-D array-like whose first row is the point.
            Only the first row is used.

    Returns:
        A 1-D float ndarray with one distance per row of ``points_``, in the
        same order. It is empty when ``points_`` has no rows.
    """
    points_ = np.asarray(points_, dtype=float)
    # Accept both a flat point and a 1 x n row; row 0 is the query.
    target_ = np.atleast_2d(np.asarray(custom_point_, dtype=float))[0]
    # Broadcasting subtracts the query from every row at once; summing the
    # squared differences per row and taking the root gives the distance.
    return np.sqrt(((points_ - target_) ** 2).sum(axis=1))

## Normalization
Normalize the points in the dataset AND the point we want to categorize.

In [57]:
# Query point to classify, given as a single (kicks, kisses) row.
custom_point = [[18, 90]]

# Take the two feature columns (kicks, kisses) from the labeled dataset,
# skipping the header row, and convert the cells to float.
features = dataset[1:, 1:3].astype(float)

# Scale the dataset rows and the query point together.
features_norm, custom_point_norm = normalize(features, custom_point)

print("Normalized features: \n\n", features_norm,    "\n\n")
print("Normalized point:    \n\n", custom_point_norm)

Normalized features: 

 [[0.02    1.     ]
 [0.01    0.96078]
 [0.      0.77451]
 [1.      0.07843]
 [0.98    0.02941]
 [0.97    0.     ]] 


Normalized point:    

 [[0.17    0.86275]]


## Calculate distances

In [58]:
# Distance from every normalized movie to the normalized query point.
distance = calc_distance(features_norm, custom_point_norm)
print("distances: ", distance)

# Build a single column made of the "Distance" header followed by one value
# per movie, then attach it as the new right-most column of the dataset.
# Mixing the header text with the numbers turns the distances into strings.
# NOTE(review): running this cell again appends another "Distance" column.
distance_col = np.concatenate((["Distance"], distance)).reshape(-1, 1)
dataset = np.concatenate((dataset, distance_col), axis=1)
dataset

distances:  [0.20332 0.18765 0.19153 1.14195 1.16213 1.17658]


array([['Movie Title', 'kicks', 'kisses', 'movie type', 'Distance'],
       ['California Man', '3', '104', 'Romance', '0.20331971894596085'],
       ["He's not reall into dudes", '2', '100', 'Romance',
        '0.18764777593240972'],
       ['Beautiful Woman', '1', '81', 'Romance', '0.19153450636380823'],
       ['Kevin Longblade', '101', '10', 'Action', '1.141949219533124'],
       ['Robo Slayer 3000', '99', '5', 'Action', '1.162129271830137'],
       ['Amped', '98', '2', 'Action', '1.176575158751321']], dtype='<U32')

## Sort by distance ascending

In [59]:
# Split off the header row so that only the movie rows get reordered.
headline = dataset[0]
data = dataset[1:]
# The last column holds the distances as text. Convert them to float before
# sorting: argsort on the raw strings orders them lexicographically, which
# puts "10.0" before "2.0" and "1e-05" after "0.5". A stable sort keeps rows
# with equal distance in their original relative order.
data = data[np.argsort(data[:, -1].astype(float), kind="stable")]
dataset = np.vstack((headline, data))
dataset

array([['Movie Title', 'kicks', 'kisses', 'movie type', 'Distance'],
       ["He's not reall into dudes", '2', '100', 'Romance',
        '0.18764777593240972'],
       ['Beautiful Woman', '1', '81', 'Romance', '0.19153450636380823'],
       ['California Man', '3', '104', 'Romance', '0.20331971894596085'],
       ['Kevin Longblade', '101', '10', 'Action', '1.141949219533124'],
       ['Robo Slayer 3000', '99', '5', 'Action', '1.162129271830137'],
       ['Amped', '98', '2', 'Action', '1.176575158751321']], dtype='<U32')

## Calculate k
k is the square root of the row count with the decimal part truncated.
k must be odd so that a majority vote between two classes cannot end in a tie, so add 1 if it is even.

In [60]:
# Start from the truncated square root of the number of labeled rows.
k = int(math.sqrt(len(data)))
# Move an even value up to the next odd number; odd values stay as they are.
k = k if k % 2 else k + 1
print("Value of k is: ", k)

Value of k is:  3


## Find the majority class among the k nearest neighbours

In [61]:
# Class labels (second-to-last column) of the first k rows of the data,
# which was sorted by ascending distance beforehand.
nearest_labels = data[:k, -2]
# Count the votes with np.unique instead of scipy.stats.mode: mode() no longer
# supports string arrays in recent SciPy releases, and the shape of its result
# (and therefore the [0][0] indexing) depends on the SciPy version.
# np.unique returns the labels sorted, and argmax picks the first maximum, so
# a tie resolves to the alphabetically smallest label.
labels, votes = np.unique(nearest_labels, return_counts=True)
majority_class = labels[np.argmax(votes)]
print("majority_class: ", majority_class)




majority_class:  Romance
