In [None]:
import numpy as np
import pandas as pd
import itertools
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Read the four CSV files (inputs and labels for the train and test splits)
# and convert each loaded frame to a NumPy array.
# NOTE(review): read_csv is called with its default header handling, so the
# first line of every file is consumed as column names -- confirm the files
# really start with a header row.
train_in, train_out, test_in, test_out = (
    np.array(pd.read_csv(csv_path))
    for csv_path in ("train_in.csv", "train_out.csv", "test_in.csv", "test_out.csv")
)

In [None]:
def generate_point(pixel_data, label_set):
    """Calculate the pixel-wise average of a set of images.

    Parameters
    ----------
    pixel_data : array-like
        Collection of images indexed along the first axis; each entry
        ``pixel_data[k]`` is one image.
    label_set : sequence of int
        Indices (into the first axis of ``pixel_data``) of the images to
        average.

    Returns
    -------
    numpy.ndarray
        Floating-point array with the shape of a single image, holding the
        mean of the selected images. If ``label_set`` is empty the result is
        filled with NaN (the mean of zero images is undefined).
    """
    pixel_data = np.asarray(pixel_data)
    # Explicit integer dtype so that an empty selection is still a valid index.
    indices = np.asarray(label_set, dtype=np.intp)
    if indices.size == 0:
        # Shape is taken from the data instead of assuming 256 pixels.
        return np.full(pixel_data.shape[1:], np.nan)
    return pixel_data[indices].mean(axis=0, dtype=np.float64)

def distance(point1, point2):
    """Return the norm of the difference between two points.

    The difference ``point1 - point2`` is passed to ``np.linalg.norm`` with
    its default arguments, which for 1-D inputs is the Euclidean length.
    """
    offset = point1 - point2
    return np.linalg.norm(offset)

In [None]:
"""Splitting up the labeled (train) set into separate label sets for each digit
"""
# One bucket of row indices per digit; position d collects the rows labeled d.
all_labels = [[] for _ in range(10)]

for row_index, label_row in enumerate(train_out):
    for value in label_row:
        # File the row under the first digit the value equals; a value that
        # equals none of 0-9 is skipped.
        for digit, bucket in enumerate(all_labels):
            if value == digit:
                bucket.append(row_index)
                break

# Keep the individual per-digit names bound to the same list objects.
(labels0, labels1, labels2, labels3, labels4,
 labels5, labels6, labels7, labels8, labels9) = all_labels

In [None]:
"""Calculates the average of the sets for each digit
"""

# Created here and filled by the later cell that computes pairwise distances.
distances = []
# One averaged image per index set, in the same order as all_labels.
average_points = [generate_point(train_in, index_set) for index_set in all_labels]

In [None]:
"""Make predictions using the given averages, classifying each piece of
test data as the digit it is closest to (in 256-dimensional brightness-space)
"""


# One slot per test sample. The size comes from the data so that every sample
# gets a prediction and no unused slot is left at its initial 0.
predictions = np.zeros(len(test_in))

for j, test in enumerate(test_in):
    # Distance from this sample to each average point, in list order.
    digit_dists = np.array([distance(test, point) for point in average_points])
    # The position of the nearest average point is the predicted digit.
    predictions[j] = int(np.argmin(digit_dists))

In [None]:
"""Obtain the accuracy of said prediction
"""

print(len(test_out))

# Flatten the label column once, then count the positions where the prediction
# equals the label at the same index.
expected = np.squeeze(test_out)
corr_p = sum(1 for idx, guess in enumerate(predictions) if guess == expected[idx])

print(corr_p)

In [None]:
"""Does k nearest neightbor classification using
the built in sklearn package
"""

# Fit a 12-neighbor classifier on the training set and label the test inputs.
KNN = KNeighborsClassifier(n_neighbors=12)
KNN.fit(train_in, train_out.ravel())
KNN_predictions = KNN.predict(test_in)

# Count the positions where the classifier's output equals the test label.
knn_expected = np.squeeze(test_out)
corr_p_knn = sum(
    1 for idx, guess in enumerate(KNN_predictions) if guess == knn_expected[idx]
)

print(corr_p_knn)

In [None]:
"""Obtain the distances (in 256-dim brightness space) matrix between each digit
"""

# Collect the pair distances in a sequence built inside this cell, so the cell
# does not depend on a `distances` list created elsewhere and can be re-run
# after `distances` has been rebound to an array below.
point_edges = itertools.product(average_points, repeat=2)
distance_array = np.array([distance(x, y) for x, y in point_edges])

# product() yields the pairs in row-major order, so the flat array reshapes
# into a square matrix whose entry [a, b] is the distance between average
# point a and average point b.
n_points = len(average_points)
distances = np.reshape(distance_array, (n_points, n_points))

print(distance_array)

# Every point is at distance zero from itself, so an argmin over all entries
# would always select a diagonal entry. Mask the diagonal on a copy and report
# the (row, column) of the closest pair of distinct points instead.
if n_points > 1:
    off_diagonal = distances.astype(float)  # astype returns a copy
    np.fill_diagonal(off_diagonal, np.inf)
    closest_pair = np.unravel_index(np.argmin(off_diagonal), off_diagonal.shape)
    print(tuple(int(k) for k in closest_pair))

In [None]:
# Show a single value from the training inputs: row 51, column 218.
print(train_in[51, 218])