In [35]:
import csv
import random
import math
import operator
import numpy as np


def loadDataset(filename, split):
    """Read a CSV file and randomly partition its rows into train and test sets.

    The first line of the file is treated as a header and skipped. For each
    remaining row, the first three columns are converted to ``int`` in place;
    any further columns are left as the strings produced by ``csv.reader``.

    Args:
        filename: Path of the CSV file to read (comma-delimited).
        split: Threshold compared against ``random.random()``; a row goes to
            the training set when the random draw is below it, so it acts as
            the approximate fraction of rows used for training.

    Returns:
        A ``(trainingSet, testSet)`` tuple of lists of rows.

    Raises:
        ValueError: If one of the first three columns is not an integer.
        IndexError: If a non-blank row has fewer than three columns.
    """
    trainingSet = []
    testSet = []
    # newline='' is what the csv module expects, so it can handle line
    # endings (including ones embedded in quoted fields) itself.
    with open(filename, 'r', newline='') as csvfile:
        csvreader = csv.reader(csvfile, delimiter=',')
        # Skip the header; the default keeps an empty file from raising
        # StopIteration and simply yields two empty sets instead.
        next(csvreader, None)
        for row in csvreader:
            # csv.reader yields [] for blank lines (e.g. a trailing newline).
            if not row:
                continue
            for column in range(3):
                row[column] = int(row[column])
            if random.random() < split:
                trainingSet.append(row)
            else:
                testSet.append(row)

    return trainingSet, testSet

def euclideanDistance(instance1, instance2, length):
    """Return the Euclidean distance over the first ``length`` components.

    Args:
        instance1: Indexable sequence of numbers.
        instance2: Indexable sequence of numbers.
        length: Number of leading positions (0 .. length-1) to compare.

    Returns:
        The square root of the summed squared differences, as a float.
    """
    squared_total = sum(
        (instance1[idx] - instance2[idx]) ** 2 for idx in range(length)
    )
    return math.sqrt(squared_total)


def getNeighbors(trainingSet, testInstance, k):
    """Return the ``k`` training rows closest to ``testInstance``.

    Distance is measured with ``euclideanDistance`` over every component
    except the last one of ``testInstance`` (the last element is the value
    other functions in this file read as the row's target).

    Args:
        trainingSet: Sequence of candidate rows.
        testInstance: Row to find neighbours for.
        k: Number of neighbours to return.

    Returns:
        A list of the ``k`` nearest rows, nearest first.

    Raises:
        IndexError: If ``k`` is larger than the number of training rows.
    """
    feature_count = len(testInstance) - 1
    scored = [
        (candidate, euclideanDistance(testInstance, candidate, feature_count))
        for candidate in trainingSet
    ]
    # sorted() is stable, so ties keep their training-set order.
    ranked = sorted(scored, key=lambda pair: pair[1])
    return [ranked[rank][0] for rank in range(k)]


def getResponse(neighbors):
    """Return the mean of the last element of each neighbour row.

    Args:
        neighbors: Iterable of rows whose final element is numeric.

    Returns:
        The arithmetic mean computed by ``np.mean``.
    """
    return np.mean([neighbor[-1] for neighbor in neighbors])

def getRMSE(Y, Y_pred):
    """Return the root-mean-square error between ``Y`` and ``Y_pred``.

    Args:
        Y: Actual values; must support elementwise subtraction and ``len``
            (the caller in this file passes numpy arrays).
        Y_pred: Predicted values, aligned with ``Y``.

    Returns:
        The square root of the mean squared difference.
    """
    residuals = Y - Y_pred
    mean_squared_error = sum(residuals ** 2) / len(Y)
    return np.sqrt(mean_squared_error)

# Model Evaluation - R2 Score
def getR2(Y, Y_pred):
    """Return the R2 score (coefficient of determination) of the predictions.

    Computed as ``1 - ss_res / ss_tot``, where ``ss_res`` is the sum of
    squared differences between ``Y`` and ``Y_pred`` and ``ss_tot`` is the
    sum of squared differences between ``Y`` and its mean.

    Args:
        Y: Actual values; must support elementwise arithmetic (the caller
            in this file passes numpy arrays).
        Y_pred: Predicted values, aligned with ``Y``.

    Returns:
        The R2 score.
    """
    deviations_from_mean = Y - np.mean(Y)
    residuals = Y - Y_pred
    total_sum_squares = sum(deviations_from_mean ** 2)
    residual_sum_squares = sum(residuals ** 2)
    return 1 - (residual_sum_squares / total_sum_squares)

def predict(testSet, trainingSet, k):
    """Predict a value for every row of ``testSet`` from its nearest neighbours.

    For each row, the ``k`` nearest rows of ``trainingSet`` are found with
    ``getNeighbors`` and combined with ``getResponse``; the row's last
    element is recorded as the actual value.

    Args:
        testSet: Rows to predict for.
        trainingSet: Rows to search for neighbours.
        k: Number of neighbours used per prediction.

    Returns:
        A ``(actuals, predictions)`` tuple of numpy arrays, in ``testSet``
        order.
    """
    observed = []
    estimated = []
    for row in testSet:
        nearest = getNeighbors(trainingSet, row, k)
        estimated.append(getResponse(nearest))
        observed.append(row[-1])
    return np.array(observed), np.array(estimated)


def main():
    """Run the k-NN regression experiment and print train and test metrics.

    Loads '../linear_regression/student.csv', splits it randomly with a 0.6
    training threshold, and with k=3 prints the set sizes, then the array
    shapes, RMSE and R2 for predictions on the training rows, then a
    separator line, then RMSE and R2 for predictions on the test rows.
    """
    # prepare data
    split = 0.6
    trainingSet, testSet = loadDataset('../linear_regression/student.csv', split)
    print('Train set: ' + str(len(trainingSet)))
    print('Test set: ' + str(len(testSet)))

    k = 3

    # Training-set score: every row is also one of its own candidate
    # neighbours here, so this figure is optimistic.
    actuals, predictions = predict(trainingSet, trainingSet, k)
    print(actuals.shape)
    print(predictions.shape)

    rmse = getRMSE(actuals, predictions)
    print(rmse)
    r2 = getR2(actuals, predictions)
    print(r2)
    print('============================')

    # Test-set score: predict() returns (actuals, predictions), so both must
    # be unpacked and the metrics recomputed rather than reusing the
    # training-set values.
    actuals, predictions = predict(testSet, trainingSet, k)
    rmse = getRMSE(actuals, predictions)
    print(rmse)
    r2 = getR2(actuals, predictions)
    print(r2)


# Run the experiment as soon as this cell/script is executed.
main()


Train set: 617
Test set: 383
(617,)
(617,)
3.643139488690166
0.9423206207158628
