In [1]:
import csv
import random
import math
import operator
from minist_dataset import generate_training_test, split_into_x_and_y_minist, subsample


def loadDataset(filename, split):
    """Load a CSV file and randomly partition its rows into two lists.

    The first four columns of every non-empty row are converted to ``float``
    in place; any further columns keep the string value produced by
    ``csv.reader``. Each row is then assigned to the training set when
    ``random.random() < split`` and to the test set otherwise.

    Args:
        filename: Path of the CSV file to read.
        split: Probability threshold in [0, 1] for a row to land in the
            training set.

    Returns:
        A ``(trainingSet, testSet)`` tuple of lists of rows.

    Raises:
        ValueError: If one of the first four columns is not numeric.
        IndexError: If a non-empty row has fewer than four columns.
    """
    trainingSet = []
    testSet = []
    # The csv module needs a text-mode file opened with newline='' on
    # Python 3; binary mode makes csv.reader raise on the first row.
    with open(filename, newline='') as csvfile:
        for row in csv.reader(csvfile):
            # Skip blank lines (e.g. a trailing newline at the end of the
            # file) instead of unconditionally discarding the last row.
            if not row:
                continue
            for column in range(4):
                row[column] = float(row[column])
            if random.random() < split:
                trainingSet.append(row)
            else:
                testSet.append(row)

    return trainingSet, testSet


def split_into_x_and_y(data):
    """Separate rows into feature vectors and labels.

    Args:
        data: Sequence of rows whose first four entries are the features and
            whose fifth entry (index 4) is the label.

    Returns:
        A ``(x, y)`` tuple where ``x`` is a list of four-element feature
        slices and ``y`` is the list of matching labels.
    """
    # Build real lists rather than lazy ``map`` objects: the callers in this
    # file take len() of the results and index into them.
    x = [item[:4] for item in data]
    y = [item[4] for item in data]
    return x, y

def euclideanDistance(instance1, instance2):
    """Return the Euclidean distance between two indexable vectors.

    Only the first ``len(instance1)`` positions are compared, so
    ``instance2`` must be at least as long as ``instance1``.
    """
    squared_total = sum(
        pow(instance1[pos] - instance2[pos], 2)
        for pos in range(len(instance1))
    )
    return math.sqrt(squared_total)

def getAccuracy(y_set, predictions):
    """Return the percentage of predictions that equal the true labels.

    Every predicted/actual pair is printed as it is compared.

    Args:
        y_set: Sequence of true labels.
        predictions: Sequence of predicted labels, indexed in parallel with
            ``y_set``.

    Returns:
        Accuracy as a float in the range 0.0-100.0. An empty ``y_set``
        yields 0.0 instead of raising ``ZeroDivisionError``.
    """
    total = len(y_set)
    # A random split can leave a set empty; there is nothing to score then.
    if total == 0:
        return 0.0

    correct = 0
    for i in range(total):
        print('predicted = '+str(predictions[i])+'   actual = '+str(y_set[i]))
        if y_set[i] == predictions[i]:
            correct += 1
    return (correct / float(total)) * 100.0


def getNeighbors(x_train, y_train, x_test_Instance, k):
    """Return the labels of the ``k`` training samples nearest to a query.

    Distances are measured with ``euclideanDistance``. Labels are returned
    from nearest to farthest; samples at equal distance keep their training
    order because the sort is stable.
    """
    scored = []
    for idx, sample in enumerate(x_train):
        gap = euclideanDistance(x_test_Instance, sample)
        scored.append((y_train[idx], gap))

    # Order the (label, distance) pairs by distance only.
    scored = sorted(scored, key=lambda pair: pair[1])
    return [scored[rank][0] for rank in range(k)]


def getResponse(neighbors):
    """Return the label that occurs most often among ``neighbors``.

    Votes are tallied per label and the tally is ranked by count, highest
    first; the label at the top of that ranking is returned.
    """
    tally = {}
    for label in neighbors:
        tally[label] = tally.get(label, 0) + 1

    ranked = list(tally.items())
    ranked.sort(key=lambda entry: entry[1], reverse=True)
    winner, _count = ranked[0]
    return winner


def predict(x_test, x_train, y_train, k):
    """Classify every sample in ``x_test`` by a vote among its k nearest neighbours.

    For each test sample the ``k`` closest training labels are collected with
    ``getNeighbors`` and reduced to a single label with ``getResponse``.
    Returns the list of predicted labels in the order of ``x_test``.
    """
    def classify(sample):
        # Nearest labels first, then the vote among them.
        return getResponse(getNeighbors(x_train, y_train, sample, k))

    return [classify(x_test[idx]) for idx in range(len(x_test))]


def main_minist():
    """Run the k-nearest-neighbour experiment and print train/test accuracy.

    The data comes from ``generate_training_test``; a subsample of the
    training portion serves as the reference set for both evaluations.
    """
    # prepare data
    # NOTE(review): presumably the fraction of rows assigned to training;
    # confirm against generate_training_test.
    train_fraction = 0.9999
    full_train, held_out = generate_training_test(train_fraction)
    # Just for testing: work with a small subsample of the training rows.
    # Pass full_train to split_into_x_and_y_minist below to use all of them.
    reference_rows = subsample(full_train, 0.002)

    x_train, y_train = split_into_x_and_y_minist(reference_rows)
    x_test, y_test = split_into_x_and_y_minist(held_out)

    # The size printed here is that of the full training set, not of the
    # subsample actually used as the reference set.
    print('Train set: ' + str(len(full_train)))
    print('Test set: ' + str(len(held_out)))

    # generate predictions
    neighbour_count = 3

    # Score the reference set against itself.
    train_predictions = predict(x_train, x_train, y_train, neighbour_count)
    accuracy_train = getAccuracy(y_train, train_predictions)
    print('Accuracy train: ' + str(accuracy_train) + '%')

    print('============================')

    # Score the held-out rows against the reference set.
    test_predictions = predict(x_test, x_train, y_train, neighbour_count)
    accuracy_test = getAccuracy(y_test, test_predictions)
    print('Accuracy test: ' + str(accuracy_test) + '%')

# Run the experiment as soon as this cell/module is executed.
main_minist()

len digits 60000
Train set: 59994
Test set: 6
predicted = 5   actual = 5
predicted = 0   actual = 0
predicted = 4   actual = 4
predicted = 1   actual = 1
predicted = 9   actual = 9
predicted = 9   actual = 2
predicted = 1   actual = 1
predicted = 3   actual = 3
predicted = 1   actual = 1
predicted = 4   actual = 4
predicted = 3   actual = 3
predicted = 5   actual = 5
predicted = 3   actual = 3
predicted = 6   actual = 6
predicted = 1   actual = 1
predicted = 7   actual = 7
predicted = 2   actual = 2
predicted = 8   actual = 8
predicted = 6   actual = 6
predicted = 9   actual = 9
predicted = 4   actual = 4
predicted = 0   actual = 0
predicted = 9   actual = 9
predicted = 1   actual = 1
predicted = 1   actual = 1
predicted = 2   actual = 2
predicted = 7   actual = 4
predicted = 3   actual = 3
predicted = 7   actual = 2
predicted = 7   actual = 7
predicted = 3   actual = 3
predicted = 8   actual = 8
predicted = 6   actual = 6
predicted = 9   actual = 9
predicted = 0   actual = 0
predicted