# In [0]:
#!/usr/bin/python3

import numpy as np
from sklearn.preprocessing import PolynomialFeatures
import glob
import csv
import time
import matplotlib.pyplot as plt

# Held-out test set; read by bayeRegress for the final evaluation.
testInputFile = "./testInput.csv"
testTargetFile = "./testTarget.csv"

# Training folds, one CSV file per fold. Both lists are sorted so that the
# i-th input file is paired with the i-th target file.
# NOTE(review): this assumes the two sets of file names sort in the same
# order -- confirm against the actual file names.
trainInputFiles = sorted(glob.glob("./trainInput*.csv"))
trainTargetFiles = sorted(glob.glob("./trainTarget*.csv"))

# Filled in by cross(): elapsed seconds per degree and mean cross-validation
# loss per degree, in the same order as `degrees`.
timeArray = []
averageArray = []
degrees = range(1, 5)  # polynomial degrees to try: 1, 2, 3, 4
def readInData(file, arrayToPopulate):
    '''Append every row of a CSV file to an existing list.

    Parameters:
        file: Path of the CSV file to extract from
        arrayToPopulate: List to add rows to; it is extended in place. Each
            row is a list of strings, exactly as produced by csv.reader.

    Returns nothing; the result is delivered through arrayToPopulate.'''
    # newline="" is what the csv module asks for: the reader then handles
    # line endings itself, so quoted fields containing line breaks are kept
    # intact instead of being rewritten by universal-newline translation.
    with open(file, "r", newline="") as csvfile:
        arrayToPopulate.extend(csv.reader(csvfile))

def listToNumpyList(list):
    '''Convert every entry of a list to a NumPy array, in place.

    Parameters:
        list: Mutable sequence whose entries are array-like, for example
            rows read from a CSV file

    Returns the same sequence object that was passed in, now holding one
    NumPy array per entry.'''
    # Assigning to the loop variable would only rebind a local name and leave
    # the container untouched, so each converted entry is written back by
    # index.
    for index, entry in enumerate(list):
        list[index] = np.array(entry)
    return list

def sort(input):
    '''Key function that ranks an indexable value by its first element.

    Parameters:
        input: Indexable value, such as a [loss, degree] pair

    Returns input[0].'''
    leading = input[0]
    return leading

def cross(folds=10):
    '''Choose the polynomial degree by k-fold cross-validation and report it.

    For every degree in `degrees`, each of the first `folds` training files
    is held out once while the remaining ones are used for fitting. The mean
    held-out loss of each degree is appended to the module-level
    `averageArray`, and the seconds spent on that degree to `timeArray`.
    The list of [mean loss, degree] pairs, the degree with the smallest mean
    loss, and the value returned by bayeRegress for that degree are printed.

    Parameters:
        folds: Number of training files to use as folds. Defaults to 10,
            the value that used to be hard-coded.

    Raises ValueError if folds is smaller than 2, because no training data
    would be left once a fold is held out.'''
    if folds < 2:
        raise ValueError("cross-validation needs at least 2 folds")

    # Read every fold exactly once. The files do not change between degrees,
    # so re-reading them for each (degree, fold) pair is wasted I/O; as a
    # consequence timeArray now measures feature expansion and fitting only.
    foldInputs = []
    foldTargets = []
    for k in range(folds):
        inputRows = []
        targetRows = []
        readInData(trainInputFiles[k], inputRows)
        readInData(trainTargetFiles[k], targetRows)
        # csv.reader yields strings. Convert explicitly: recent scikit-learn
        # versions refuse to treat string arrays as numeric data.
        foldInputs.append(np.array(inputRows, dtype=float))
        # Targets stay as rows of strings; calculateBayesianLoss converts them.
        foldTargets.append(targetRows)

    degreeMax = []  # [mean held-out loss, degree] pairs
    for d in degrees:
        start = time.time()
        accuracyAverage = []
        for j in range(folds):  # fold j is held out, the rest is training data
            others = [l for l in range(folds) if l != j]
            trainInput = np.concatenate([foldInputs[l] for l in others])
            trainTarget = [row for l in others for row in foldTargets[l]]
            poly = PolynomialFeatures(d)
            trainInput = poly.fit_transform(trainInput)
            # Reuse the expansion fitted on the training folds instead of
            # fitting a second time on the held-out fold.
            testInput = poly.transform(foldInputs[j])
            bayesianMean = calculateBayesianLoss(trainInput, trainTarget, testInput, foldTargets[j])
            accuracyAverage.append(bayesianMean)
        meanLoss = np.mean(accuracyAverage)
        averageArray.append(meanLoss)
        degreeMax.append([meanLoss, d])
        end = time.time()
        timeArray.append(end - start)
    print(degreeMax)
    # Lowest mean loss wins; `sort` extracts the loss from each pair.
    bestDegree = min(degreeMax, key=sort)[1]
    print("Best degree:")
    print(bestDegree)
    print("The mean of the best degree on test set:")
    print(bayeRegress(bestDegree))

def calculateBayesianLoss(trainInput, trainTarget, testInput, testTarget):
    '''Fit regularised linear regression and return half the squared test error.

    The weights solve (X^T X + I) w = X^T y, where X is trainInput, y is
    trainTarget and I is the identity matrix. The fitted weights are then
    used to predict testInput, and the predictions are compared with the
    first column of testTarget.

    Parameters:
        trainInput: 2-D array of training features, one row per sample
        trainTarget: Training targets, one row per sample; numeric strings
            are accepted and converted to float
        testInput: 2-D array of test features, one row per sample
        testTarget: Test targets, one row per sample; only the first entry
            of each row is used, and numeric strings are accepted

    Returns 0.5 times the sum of squared differences between targets and
    predictions, as an array with one entry per training-target column
    (shape (1,) for single-column targets), or 0.0 when testTarget is empty.

    Raises ValueError if testInput and testTarget differ in length.'''
    trainInput = np.asarray(trainInput, dtype=float)
    trainTarget = np.array(trainTarget, dtype=float)

    A = np.matmul(trainInput.T, trainInput) + np.identity(trainInput.shape[1])
    # Solve the linear system directly rather than forming an explicit
    # inverse: same weights, cheaper, and numerically better behaved.
    w_bar = np.linalg.solve(A, np.matmul(trainInput.T, trainTarget))

    if len(testTarget) == 0:
        return 0.0

    testInput = np.asarray(testInput, dtype=float)
    targets = np.array([float(row[0]) for row in testTarget])
    if len(testInput) != len(targets):
        raise ValueError("testInput and testTarget must have the same number of rows")

    predictions = np.matmul(testInput, w_bar)
    if predictions.ndim > 1:
        # One prediction column per target column: line the targets up as a
        # column so the subtraction broadcasts row by row.
        targets = targets[:, np.newaxis]
    residuals = targets - predictions
    return 0.5 * np.sum(residuals * residuals, axis=0)



def bayeRegress(bestDegree):
    '''Train on all training files and return the loss on the test set.

    Parameters:
        bestDegree: Polynomial degree used to expand the input features

    Returns whatever calculateBayesianLoss returns for a model fitted on the
    concatenation of every training file and evaluated on the test files.'''
    testInput = []
    testTarget = []
    trainInput = []
    trainTarget = []

    readInData(testInputFile, testInput)
    readInData(testTargetFile, testTarget)
    for inputFile in trainInputFiles:
        readInData(inputFile, trainInput)
    for targetFile in trainTargetFiles:
        readInData(targetFile, trainTarget)

    # csv.reader yields strings. Convert explicitly: recent scikit-learn
    # versions refuse to treat string arrays as numeric data.
    trainInput = np.array(trainInput, dtype=float)
    testInput = np.array(testInput, dtype=float)

    poly = PolynomialFeatures(bestDegree)
    trainInput = poly.fit_transform(trainInput)
    # Reuse the expansion fitted on the training data instead of fitting a
    # second time on the test set.
    testInput = poly.transform(testInput)
    return calculateBayesianLoss(trainInput, trainTarget, testInput, testTarget)


# Run the cross-validation. Besides printing its report, this fills
# averageArray with one mean loss per entry of `degrees`.
cross()

# Plot mean cross-validation loss against polynomial degree and save the
# figure to the working directory.
plt.plot(degrees, averageArray)
plt.savefig("q2bError.png")