In [27]:
# This is a binary KNN classifier originally designed to classify "Good" (1) or "Bad" (0)
# movies from imdb data
# For example, movies with high box office earnings, and high ratings were labeled (1),
# and vice versa for (0)


# Euclidean distance calculation
def euclidean_distance(pt1, pt2):
    """Return the Euclidean (straight-line) distance between two points.

    Each coordinate of ``pt1`` is paired with the coordinate of ``pt2`` at
    the same index, so only the first ``len(pt1)`` coordinates of ``pt2``
    take part in the calculation.

    Args:
        pt1: Sequence of numbers; its length sets the number of dimensions.
        pt2: Indexable sequence of numbers with at least as many items.

    Returns:
        The square root of the sum of squared coordinate differences.

    Raises:
        IndexError: If ``pt2`` has fewer coordinates than ``pt1``.
    """
    squared_total = 0
    for axis, coordinate in enumerate(pt1):
        squared_total += (coordinate - pt2[axis]) ** 2
    return squared_total ** 0.5

# KNN Classifiers require normalized data so that one feature doesn't dominate the
# others simply by virtue of relative integer sizes
def min_max_normalize(lst):
    """Rescale a sequence of numbers linearly so the values span [0, 1].

    The smallest value maps to 0 and the largest to 1; everything else is
    placed proportionally in between.

    Args:
        lst: Sequence of numbers to rescale.

    Returns:
        A new list with one normalized value per input value, in the same
        order. An empty input yields an empty list. If every value is
        identical (so there is no spread to scale by), every output is 0.0.
    """
    # len() rather than truthiness so array-like inputs are handled too.
    if len(lst) == 0:
        return []

    minimum = min(lst)
    spread = max(lst) - minimum

    # A constant feature carries no information; mapping it to 0.0 avoids
    # dividing by zero while keeping the output length intact.
    if spread == 0:
        return [0.0 for _ in lst]

    return [(value - minimum) / spread for value in lst]

# This function will take an unlabeled point, and perform the binary classification dependent
# upon the known classification of its neighbors
def classify(unlabeled_point, dataset, labels, k):
    """Classify a point as "1" or "0" by majority vote of its nearest neighbors.

    Args:
        unlabeled_point: Feature vector of the point to classify.
        dataset: Collection that yields keys when iterated and returns the
            feature vector for a key when indexed (e.g. a dict).
        labels: Lookup from those same keys to the known label; only labels
            equal to 1 or 0 are counted as votes.
        k: Number of closest neighbors allowed to vote.

    Returns:
        The string "1" when strictly more voting neighbors are labeled 1
        than 0; otherwise the string "0" (this includes ties).
    """
    # Distance comes first in each pair so sorting orders by proximity;
    # the key follows so the neighbor's label can be looked up afterwards.
    ranked = sorted(
        [euclidean_distance(dataset[key], unlabeled_point), key]
        for key in dataset
    )

    # k must be large enough to give a fair picture of the neighborhood, but
    # small enough that far-away, unrelated points do not get a vote.
    votes_for_1 = 0
    votes_for_0 = 0
    for _, key in ranked[:k]:
        if labels[key] == 1:
            votes_for_1 += 1
        elif labels[key] == 0:
            votes_for_0 += 1

    return "1" if votes_for_1 > votes_for_0 else "0"

# We can test the accuracy of our model by splitting the imdb data into training and test
# sets and labels

# find_test_accuracy checks each test set classification and compares it to the known test
# labels
def find_test_accuracy(training_set, training_labels, test_set, test_labels, k):
    """Return the fraction of test points that the classifier labels correctly.

    Every point in ``test_set`` is classified against the training data and
    the prediction is compared with its known label.

    Args:
        training_set: Keyed feature vectors used as the neighbors.
        training_labels: Labels for ``training_set``, looked up by the same keys.
        test_set: Keyed feature vectors to classify.
        test_labels: Known labels for ``test_set``, looked up by the same keys.
        k: Number of neighbors passed on to ``classify``.

    Returns:
        A float between 0.0 and 1.0. An empty ``test_set`` yields 0.0.
    """
    # Nothing to evaluate; avoid dividing by zero below.
    if len(test_set) == 0:
        return 0.0

    num_correct = 0.0
    for key in test_set:
        guess = classify(test_set[key], training_set, training_labels, k)
        # classify() reports its answer as the string "1"/"0" while the known
        # labels are numeric 1/0. A str never equals an int, so comparing them
        # directly would count every prediction as wrong; compare as ints.
        if int(guess) == int(test_labels[key]):
            num_correct += 1
    return num_correct / len(test_set)

# get_k_vector obtains the test accuracy for a range of k values, allowing us to select the 
# k which produces the most accurate results.
def get_k_vector(k):
    """Return the test accuracy for each neighbor count 0, 1, ..., k - 1.

    Reads the module-level ``training_set``, ``training_labels``,
    ``test_set`` and ``test_labels``, which must be defined before calling.

    Args:
        k: Number of neighbor counts to evaluate (exclusive upper bound).

    Returns:
        A 1-D float NumPy array of length ``k``; the entry at index ``i`` is
        the accuracy obtained when ``i`` neighbors vote. The array is empty
        when ``k`` is 0.
    """
    # Collect in a list and convert once: np.append copies the whole array on
    # every call, and skipping it entirely for k == 0 used to leave a plain
    # list as the return value instead of an array.
    accuracies = [
        find_test_accuracy(training_set, training_labels, test_set, test_labels, n_neighbors)
        for n_neighbors in range(k)
    ]
    return np.array(accuracies, dtype=float)

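# Plot the k vector to visualize classification results and select the optimum k.
# Note: k and k_vals must be defined first (e.g. k_vals = get_k_vector(k));
# otherwise the plotting call below raises a NameError.
#from matplotlib import pyplot as plt

#plt.scatter(range(k), k_vals)
#plt.show()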

NameError: name 'k' is not defined