# In [0]:
#!/usr/bin/python3
import csv
import glob
import numpy as np
from scipy import stats
import math
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

import sys
np.set_printoptions(threshold=sys.maxsize)

# Per-fold training files. glob.glob() returns matches in arbitrary order, so
# both lists are sorted to keep the i-th data file paired with the i-th label
# file (this relies on the two file families sharing the same suffixes).
trainDataPath = sorted(glob.glob("./trainData*.csv"))  # Paths of the training data folds
trainLabelPath = sorted(glob.glob("./trainLabels*.csv"))  # Paths of the training label folds

# Held-out test set used for the final accuracy figure.
testDataPath = "./testData.csv"
testLabelPath = "./testLabels.csv"

# Candidate regularisation strengths: 1.0, 2.0, ..., 6.0.
lambs = np.linspace(1, 6, num=6)
# Mean cross-validation accuracy for each entry of lambs; appended to by cross().
y_values = []

def readInData(file, arrayToPopulate):
    ''' Append every row of a CSV file to a list.

        Parameters:
        file: Path of the CSV file to extract from
        arrayToPopulate: List to add values to; it receives one list of
            strings per CSV row and is modified in place'''
    # newline="" hands line-ending handling to the csv module, as its
    # documentation requires; otherwise newlines embedded in quoted fields
    # are rewritten before the reader sees them.
    with open(file, "r", newline="") as csvfile:
        arrayToPopulate.extend(csv.reader(csvfile))

def readInDataWithOne(file, arrayToPopulate):
    ''' Append every row of a CSV file to a list, with the string '1'
        inserted as the first element of each row.

        Parameters:
        file: Path of the CSV file to extract from
        arrayToPopulate: List to add values to; it receives one list of
            strings per CSV row and is modified in place'''
    # newline="" hands line-ending handling to the csv module, as its
    # documentation requires; otherwise newlines embedded in quoted fields
    # are rewritten before the reader sees them.
    with open(file, "r", newline="") as csvfile:
        for row in csv.reader(csvfile):
            arrayToPopulate.append(['1'] + row)

def convertToNumpy(dataToChange):
    ''' Replace each row of dataToChange, in place, with a NumPy array
        holding that row's values converted to float.

        Parameters:
        dataToChange: List of rows whose items float() can parse'''
    for position, rawRow in enumerate(dataToChange):
        numbers = [float(item) for item in rawRow]
        dataToChange[position] = np.array(numbers)

def sigmoid(input):
    ''' Return the logistic function 1 / (1 + exp(-input)).

        Parameters:
        input: A number, or a one-element array (the branch test needs an
            unambiguous truth value)

        Each branch only exponentiates a non-positive value, so exp() cannot
        overflow. An input of exactly 0 takes the first branch and gives 0.5;
        previously it matched neither branch and the function returned None.'''
    if input >= 0:
        return 1 / (1 + np.exp(-input))
    expValue = np.exp(input)
    return expValue / (1 + expValue)

def sort(input):
    ''' Key function: return the first element of a sequence.

        cross() passes it to max() so that (accuracy, lambda) tuples are
        compared by accuracy only.'''
    leading = input[0]
    return leading

def calculateAccuracy(trainData, trainLabel, testData, testLabel, lamb):
    ''' Fit an L2-regularised logistic regression and score it on a test set.

        Label 5 is the positive class (target 1) and label 6 the negative
        class (target 0). The weights start from small random values and are
        refined with 10 Newton-Raphson steps. A test row is then predicted as
        5 when w . x >= 0 and as 6 when w . x < 0.

        Parameters:
        trainData: Non-empty sequence of equal-length numeric feature rows
            (the callers in this file put a constant 1 first as the bias input)
        trainLabel: Sequence of rows whose first element is the label 5 or 6,
            one per row of trainData
        testData: Non-empty sequence of feature rows, same width as trainData
        testLabel: Sequence of rows whose first element is the label, one per
            row of testData
        lamb: L2 regularisation strength

        Returns:
        The fraction of test rows classified correctly, as a float

        Raises:
        ValueError: if a data set is empty, if data and labels differ in
            length, or if trainLabel holds a label other than 5 or 6'''
    features = np.asarray(trainData, dtype=float)
    if features.ndim != 2 or features.shape[0] == 0:
        raise ValueError("trainData must be a non-empty sequence of equal-length rows")
    labels = np.asarray([row[0] for row in trainLabel], dtype=float)
    if labels.shape[0] != features.shape[0]:
        raise ValueError("trainData and trainLabel must have the same number of rows")
    isFive = labels == 5
    # An unexpected label used to be skipped, which shifted every later
    # target onto the wrong sample; fail loudly instead.
    if not np.all(isFive | (labels == 6)):
        raise ValueError("trainLabel may only contain the labels 5 and 6")
    if len(testData) == 0:
        raise ValueError("testData must not be empty")
    if len(testLabel) != len(testData):
        raise ValueError("testData and testLabel must have the same number of rows")

    # The width comes from the data instead of a hard-coded 65.
    featureCount = features.shape[1]
    y_n = isFive.astype(float).reshape(-1, 1)
    w = np.random.uniform(low=-0.01, high=0.01, size=(featureCount, 1))
    identity = np.identity(featureCount)

    for _ in range(10):  # Newton-Raphson steps towards the optimal weight w
        activation = features @ w
        # Logistic function written with tanh: it cannot overflow and is
        # defined for an activation of exactly 0.
        sigmoidValues = 0.5 * (1.0 + np.tanh(0.5 * activation))

        # The penalty term is added once so that the gradient matches the
        # Hessian below (it used to be added once per training sample).
        gradient = features.T @ (sigmoidValues - y_n) + lamb * w

        # X^T R X with R = diag(s * (1 - s)); scaling the rows of X directly
        # avoids building the dense N x N matrix R.
        rowWeights = sigmoidValues * (1.0 - sigmoidValues)
        H = features.T @ (features * rowWeights) + lamb * identity

        # Solve H * step = gradient rather than forming the inverse of H.
        w = w - np.linalg.solve(H, gradient)

    scores = (np.asarray(testData, dtype=float) @ w).ravel()
    actual = np.asarray([row[0] for row in testLabel], dtype=float)
    # A NaN score satisfies neither comparison and so never counts as correct.
    hits = ((scores >= 0) & (actual == 5.0)) | ((scores < 0) & (actual == 6.0))
    accuracy = float(np.count_nonzero(hits)) / len(testData)
    return accuracy

def regression(lamb):
    ''' Train on every training fold and return the accuracy on the test set.

        Parameters:
        lamb: L2 regularisation strength passed on to calculateAccuracy

        Returns:
        The accuracy that calculateAccuracy reports for the rows of
        testDataPath / testLabelPath'''
    testData = []
    testLabel = []
    trainData = []
    trainLabel = []

    readInDataWithOne(testDataPath, testData)
    readInData(testLabelPath, testLabel)
    # Both path lists are sorted so that data rows and label rows are
    # concatenated in the same file order.
    for dataFile in sorted(trainDataPath):
        readInDataWithOne(dataFile, trainData)
    for labelFile in sorted(trainLabelPath):
        readInData(labelFile, trainLabel)

    for table in (trainData, trainLabel, testData, testLabel):
        convertToNumpy(table)

    # calculateAccuracy takes the training set first and the test set second;
    # the two pairs used to be passed the wrong way round.
    return calculateAccuracy(trainData, trainLabel, testData, testLabel, lamb)

def cross():
    ''' Choose the regularisation strength by k-fold cross-validation.

        Each training file is one fold. For every value in lambs, every fold
        is held out once while the remaining folds train the model, and the
        mean held-out accuracy is appended to y_values. The lambda with the
        highest mean accuracy, and the test-set accuracy that regression()
        reports for it, are written to q1bAnswer.txt.

        Raises:
        ValueError: if the numbers of data and label files differ, or if
            fewer than two folds are available'''
    # Sorting keeps the i-th data file paired with the i-th label file.
    dataPaths = sorted(trainDataPath)
    labelPaths = sorted(trainLabelPath)
    foldCount = len(dataPaths)
    if foldCount != len(labelPaths):
        raise ValueError("found %d training data files but %d training label files"
                         % (foldCount, len(labelPaths)))
    if foldCount < 2:
        raise ValueError("cross-validation needs at least two training folds")

    # Read and convert every fold once; the folds are only re-combined below.
    foldData = []
    foldLabel = []
    for dataFile, labelFile in zip(dataPaths, labelPaths):
        rows = []
        readInDataWithOne(dataFile, rows)
        convertToNumpy(rows)
        foldData.append(rows)

        labels = []
        readInData(labelFile, labels)
        convertToNumpy(labels)
        foldLabel.append(labels)

    lambdaArray = []
    for lamb in lambs:
        accuracyArray = []
        for j in range(foldCount):  # fold j is the validation set
            trainData = []
            trainLabel = []
            for l in range(foldCount):
                if l != j:
                    trainData.extend(foldData[l])
                    trainLabel.extend(foldLabel[l])

            accuracy = calculateAccuracy(trainData, trainLabel, foldData[j], foldLabel[j], lamb)
            accuracyArray.append(accuracy)
        average = np.mean(accuracyArray)
        lambdaArray.append((average, lamb))
        y_values.append(average)

    bestLambda = max(lambdaArray, key=sort)[1]
    bestAccuracy = regression(bestLambda)
    # The with-block closes the file even if a write fails.
    with open("q1bAnswer.txt", "w+") as f:
        f.write("The best lambda is: %f\n" % bestLambda)
        f.write("The accuracy with the best lambda is: %f" % bestAccuracy)


# Run the cross-validation; this fills y_values and writes q1bAnswer.txt.
cross()

# Plot mean cross-validation accuracy against lambda and save the figure.
figure, axes = plt.subplots()
axes.plot(lambs, y_values)
figure.savefig("lambda")
plt.show()