This notebook serves as a sandbox for testing different hyperparameters for training the neural network. While this notebook can be run without an NVIDIA GPU, using one is recommended, as the notebook was written with running PyTorch on CUDA in mind.

In [8]:
#Import necessary libraries
import numpy as np
import pandas as pd
import torch as th

In [9]:
#Read data from dataset.csv and convert into numpy arrays
#Column 34 of the CSV is taken as the class label; the remaining columns are the features
dataFrame = pd.read_csv("dataset.csv")
dataArray = dataFrame.to_numpy()
labels = dataArray[:, 34].astype(str)
features = np.delete(dataArray, 34, 1)

#Clean up features array by removing biased features, one-hot encoding, and standardization
features = np.delete(features, [0, 1, 2, 6, 7, 8, 9, 10, 11, 12, 15, 18], 1)

#The number of samples is read from the data rather than hard-coded,
#so the cell keeps working if rows are added to or removed from dataset.csv
numSamples = features.shape[0]
rowIndex = np.arange(numSamples)

#One-hot encode the "course" (column 0) and "quali" (column 2) codes into 17 slots each
#A code k sets column k - 1 of its row, so the codes are assumed to be integers in 1 - 17
courseEncode = np.zeros((numSamples, 17), dtype = 'float64')
qualiEncode = np.zeros((numSamples, 17), dtype = 'float64')
courseEncode[rowIndex, features[:, 0].astype(np.intp) - 1] = 1
qualiEncode[rowIndex, features[:, 2].astype(np.intp) - 1] = 1
hotFeatures = np.concatenate((courseEncode, qualiEncode), axis = 1)

#Columns 1, 3, 4 and 5 are passed through unscaled (only converted to float64)
boolFeatures = features[:, [1, 3, 4, 5]].astype('float64')

#Every remaining column is standardized to zero mean and unit standard deviation
#np.std is used with its default settings, i.e. the population standard deviation
unstdFeatures = np.delete(features, [0, 1, 2, 3, 4, 5], 1).astype('float64')
stdFeatures = np.empty(unstdFeatures.shape, dtype = 'float64')
for i in range(unstdFeatures.shape[1]):
    column = unstdFeatures[:, i]
    mean = np.mean(column)
    std = np.std(column)
    #Whole-column arithmetic replaces the per-element inner loop
    stdFeatures[:, i] = (column - mean) / std

#Final layout: standardized columns, then pass-through columns, then one-hot columns
features = np.concatenate((stdFeatures, boolFeatures, hotFeatures), axis = 1)

#Function for getting training and validation sets using 8-fold cross validation
#features and labels are each cut into 8 equal consecutive pieces along axis 0
#vNumber is the index of the piece held out as the validation set and must be within the range 0 - 7
#Returns (training features, training labels, validation features, validation labels)
def getSets(vNumber, features, labels):
    foldCount = 8
    featureFolds = np.split(features, foldCount)
    labelFolds = np.split(labels, foldCount)

    #Take the validation fold out of each list; the folds left behind form the training set
    heldOutFeatures = featureFolds.pop(vNumber)
    heldOutLabels = labelFolds.pop(vNumber)

    trainFeatures = np.concatenate(featureFolds, axis = 0)
    trainLabels = np.concatenate(labelFolds, axis = 0)
    return trainFeatures, trainLabels, heldOutFeatures, heldOutLabels

#Dataset class used for creating datasets out of the training and validation data
#Sample idx is the pair (row idx of features, labels[idx])
class Dataset(th.utils.data.Dataset):
    def __init__(self, features, labels):
        #The arrays are stored as given; nothing is copied or converted here
        self.features, self.labels = features, labels

    def __len__(self):
        #The dataset has one sample per label
        sampleCount = len(self.labels)
        return sampleCount

    def __getitem__(self, idx):
        #Pair the feature row with the label at the same position
        sampleFeatures = self.features[idx, :]
        sampleLabel = self.labels[idx]
        return sampleFeatures, sampleLabel
    
#Function for creating dataloaders using the training and validation sets made with getSets
#The dataloader passes samples in batches of 79 as it is a factor of both 553 and 3871
#which are the number of entries in the validation and training sets respectively
def buildLoaders(tFeatures, tLabels, vFeatures, vLabels):
    tDataset = Dataset(tFeatures, tLabels)
    vDataset = Dataset(vFeatures, vLabels)
    tLoader = th.utils.data.DataLoader(tDataset, batch_size = 79, shuffle = True)
    vLoader = th.utils.data.DataLoader(vDataset, batch_size = 79, shuffle = True)
    