In [1]:
import pandas as pd
import numpy as np
import itertools 
import matplotlib.pyplot as plt

In [2]:
# Read both CSV files and convert each loaded frame straight to a NumPy array,
# so the names are bound only once and never hold a DataFrame.
train_in = np.array(pd.read_csv("train_in.csv"))
train_out = np.array(pd.read_csv("train_out.csv"))

In [4]:
def get_distances(train_in, train_out, n_digits=10):
    """Compute the pairwise distances between the mean images of each digit.

    Every image is treated as a point in feature space (256 dimensions for
    16x16 pixel images), where each dimension is the brightness of a specific
    pixel. All points that share a label are averaged into a single mean point
    per digit, and the Euclidean distance between every pair of mean points is
    returned.

    Arguments:
        train_in (array): An array of images, one sample per first-axis entry,
                          represented by pixel brightness
        train_out (array): An array of labels indicating which digit is
                           represented by the same indexed data point in
                           train_in
        n_digits (int): Number of digit classes; labels 0..n_digits-1 are
                        considered. Defaults to 10 (digits 0-9).

    Returns:
        distances (array): An n_digits x n_digits array holding the distance
                           between the mean points of every pair of digits.
                           Entries involving a digit that has no samples are
                           NaN.

    Raises:
        ValueError: If n_digits is smaller than 1.
    """
    if n_digits < 1:
        raise ValueError("n_digits must be at least 1")

    samples = np.asarray(train_in)
    labels = np.asarray(train_out)

    centroids = []
    for digit in range(n_digits):
        # First-axis indices of every sample labelled `digit`; works for
        # labels stored either as a flat vector or as a single column.
        rows = np.where(labels == digit)[0]
        if rows.size == 0:
            # No samples for this digit: mark its mean explicitly as NaN
            # rather than relying on np.mean's "empty slice" warning.
            centroids.append(np.full(samples.shape[1:], np.nan))
        else:
            centroids.append(np.mean(samples[rows], axis=0))

    # One flattened mean point per row, so the vector 2-norm below matches the
    # norm of the (possibly multi-dimensional) difference of two mean images.
    centroids = np.stack(centroids).reshape(n_digits, -1)

    # Broadcast to all (i, j) pairs at once: shape (n_digits, n_digits, features).
    diffs = centroids[:, np.newaxis, :] - centroids[np.newaxis, :, :]
    distances = np.linalg.norm(diffs, axis=-1)

    return distances