# In [0]:
#!/usr/bin/python3

import numpy as np
from sklearn.preprocessing import PolynomialFeatures
import glob
import csv
import time
import matplotlib.pyplot as plt

# Held-out test set; regress() reads these to score the selected degree.
testInputFile = "./testInput.csv"
testTargetFile = "./testTarget.csv"

# Training data split across several files.  Input file i is paired with
# target file i by position, which relies on both name patterns sorting in
# the same order.
trainInputFiles = sorted(glob.glob("./trainInput*.csv"))
trainTargetFiles = sorted(glob.glob("./trainTarget*.csv"))

# Candidate polynomial degrees: 1, 2, 3 and 4.
degrees = range(1, 5)
def readInData(file, arrayToPopulate):
    """Append every row of a CSV file to an existing list.

    Parameters:
        file: Path of the CSV file to read.
        arrayToPopulate: List that receives one list of string fields per
            CSV row.  It is modified in place; nothing is returned.
    """
    # newline="" is what the csv module asks for: without it, line endings
    # embedded in quoted fields are translated before the reader sees them.
    with open(file, "r", newline="") as csvfile:
        arrayToPopulate.extend(csv.reader(csvfile))
# Filled by cross(): one elapsed wall-clock time (seconds) per degree.
timeArray = []
# Filled by cross(): one mean cross-validation loss per degree; plotted at
# the end of the script.
averageArray = []

def sort(input):
    """Key function: return the element at index 0 of ``input``.

    cross() passes this as ``key=`` so that [mean loss, degree] pairs are
    compared by their mean loss only.
    """
    leading = input[0]
    return leading

def cross():
    """Pick the polynomial degree by cross-validation and report its test loss.

    Each (training input file, training target file) pair is one fold.  For
    every degree in ``degrees`` each fold is held out once, the model is
    fitted on the remaining folds, and the held-out loss is averaged over
    all folds.  The degree with the smallest mean loss wins (the lowest
    degree on ties) and is then scored on the test set via ``regress``.

    Side effects:
        Appends one mean loss per degree to ``averageArray`` and one elapsed
        time in seconds per degree to ``timeArray`` (the time covers feature
        expansion and fitting; the files are read once, beforehand), and
        prints the best degree and its test-set loss.

    Raises:
        ValueError: If fewer than two training folds are available.
    """
    # Read every fold exactly once instead of once per degree and per
    # held-out fold; zip stops at the shorter of the two file lists.
    folds = []
    for inputFile, targetFile in zip(trainInputFiles, trainTargetFiles):
        inputRows = []
        targetRows = []
        readInData(inputFile, inputRows)
        readInData(targetFile, targetRows)
        folds.append((inputRows, targetRows))

    if len(folds) < 2:
        raise ValueError(
            "cross-validation needs at least two training folds, found %d"
            % len(folds))

    degreeLosses = []  # [mean validation loss, degree] for every degree
    for d in degrees:
        start = time.time()
        foldLosses = []
        for heldOut, (validationRows, validationTarget) in enumerate(folds):
            # Training set: all other folds, kept in file order.
            trainRows = []
            trainTarget = []
            for k, (inputRows, targetRows) in enumerate(folds):
                if k != heldOut:
                    trainRows.extend(inputRows)
                    trainTarget.extend(targetRows)

            # csv yields strings; convert explicitly because newer
            # scikit-learn releases refuse to treat string arrays as numbers.
            trainInput = np.asarray(trainRows, dtype=float)
            validationInput = np.asarray(validationRows, dtype=float)

            poly = PolynomialFeatures(d)
            trainFeatures = poly.fit_transform(trainInput)
            # Fit on the training data only; just transform the held-out fold.
            validationFeatures = poly.transform(validationInput)
            foldLosses.append(calculateLoss(
                trainFeatures, trainTarget, validationFeatures, validationTarget))

        meanLoss = np.mean(foldLosses)
        averageArray.append(meanLoss)
        degreeLosses.append([meanLoss, d])
        timeArray.append(time.time() - start)

    bestDegree = min(degreeLosses, key=sort)[1]
    print("Best degree:")
    print(bestDegree)
    print("The loss of the best degree on test set:")
    print(regress(bestDegree))

def calculateLoss(trainInput, trainTarget, testInput, testTarget,
                  regularization=1.0):
    """Fit regularised least squares on the training set and score the test set.

    The weights solve ``(regularization * I + X^T X) w = X^T y`` where X is
    the training feature matrix and y the training targets.  The returned
    loss is ``0.5 * sum((X_test w - y_test) ** 2)``.

    Parameters:
        trainInput: 2-D array-like of training feature rows (numbers or
            numeric strings), already expanded with any basis functions.
        trainTarget: Sequence of rows whose first element is the target
            value (number or numeric string).
        testInput: 2-D array-like of test feature rows, same width as
            ``trainInput``.
        testTarget: Sequence of rows whose first element is the target.
        regularization: Weight of the identity term added to ``X^T X``.
            Defaults to 1.0, the value that used to be hard-coded.

    Returns:
        The loss as a float; 0.0 when ``testTarget`` is empty.

    Raises:
        ValueError: If ``trainInput`` is empty or not 2-D, or if the number
            of feature rows and target rows differ.
        numpy.linalg.LinAlgError: If the regularised system is singular
            (possible only when ``regularization`` is 0).
    """
    trainFeatures = np.asarray(trainInput, dtype=float)
    if trainFeatures.ndim != 2 or trainFeatures.shape[0] == 0:
        raise ValueError("trainInput must be a non-empty 2-D set of feature rows")
    trainValues = np.asarray([row[0] for row in trainTarget], dtype=float)
    if trainValues.shape[0] != trainFeatures.shape[0]:
        raise ValueError("trainInput and trainTarget must have the same number of rows")

    # Normal equations: A = X^T X and b = X^T y.
    A = trainFeatures.T @ trainFeatures
    b = trainFeatures.T @ trainValues
    # Solving the linear system is cheaper and numerically more accurate
    # than forming the explicit inverse.
    w = np.linalg.solve(regularization * np.identity(trainFeatures.shape[1]) + A, b)

    testValues = np.asarray([row[0] for row in testTarget], dtype=float)
    if testValues.size == 0:
        return 0.0
    testFeatures = np.asarray(testInput, dtype=float)
    # Checked explicitly: a length mismatch could otherwise broadcast silently.
    if testFeatures.ndim != 2 or testFeatures.shape[0] != testValues.shape[0]:
        raise ValueError("testInput and testTarget must have the same number of rows")

    residual = testFeatures @ w - testValues
    return 0.5 * np.dot(residual, residual)

def regress(bestDegree):
    """Train on all training files and return the loss on the test set.

    Parameters:
        bestDegree: Degree of the polynomial feature expansion to use.

    Returns:
        The loss computed by ``calculateLoss`` for a model fitted on every
        training file and evaluated on ``testInputFile``/``testTargetFile``.
    """
    testInput = []
    testTarget = []
    trainInput = []
    trainTarget = []

    readInData(testInputFile, testInput)
    readInData(testTargetFile, testTarget)
    for inputFile in trainInputFiles:
        readInData(inputFile, trainInput)
    for targetFile in trainTargetFiles:
        readInData(targetFile, trainTarget)

    # csv yields strings; convert explicitly because newer scikit-learn
    # releases refuse to treat string arrays as numbers.
    trainInput = np.asarray(trainInput, dtype=float)
    testInput = np.asarray(testInput, dtype=float)

    poly = PolynomialFeatures(bestDegree)
    trainFeatures = poly.fit_transform(trainInput)
    # Fit on the training data only; the test data is just transformed.
    testFeatures = poly.transform(testInput)
    return calculateLoss(trainFeatures, trainTarget, testFeatures, testTarget)

# Run model selection; this also fills averageArray with one mean loss per
# degree.
cross()

# Plot mean cross-validation loss against polynomial degree and save it.
plt.plot(degrees, averageArray)
plt.savefig("q2aError.png")