This notebook serves as a sandbox for testing different hyperparameters for training the neural network. While this notebook can  be ran without a Nvidia GPU, the usage of one is reccomended as this notebook was written with using CUDA when running PyTorch in mind.

In [8]:
#Import neccessary libraries
import numpy as np
import pandas as pd
import torch as th

In [21]:
#Read data from dataset.csv and convert into numpy arrays
dataFrame = pd.read_csv("dataset.csv")
dataArray = dataFrame.to_numpy()
labels = dataArray[:, 34].astype(str)
features = np.delete(dataArray, 34, 1)

#Convert strings in labels array into ints and set as type 'int64'
#Additionally, create a list containing the string values respective to the int representations for easy conversion later on
intLabels = np.empty(4424, dtype = 'int64')
for i in range(4424):
    if(labels[i] == "Dropout"):
        intLabels[i] = 0
    elif(labels[i] == "Graduate"):
        intLabels[i] = 1
    elif(labels[i] == "Enrolled"):
        intLabels[i] = 2
labels = intLabels
labelsRefrence =["Dropout", "Graduate", "Enrolled"]

#Clean up features array by removing biased features, one-hot encoding, and standardization
features = np.delete(features, [0, 1, 2, 6, 7, 8, 9, 10, 11, 12, 15, 18], 1)

courseEncode = np.zeros((4424, 17), dtype = 'float32')
qualiEncode = np.zeros((4424, 17), dtype = 'float32')
for i in range(4424):
    course = features[i, 0]
    courseEncode[i, (course - 1)] = 1
    quali = features[i, 2]
    qualiEncode[i, (quali - 1)] = 1
hotFeatures = np.concatenate((courseEncode, qualiEncode), axis = 1)

boolFeatures = np.vstack((features[:, 1], features[:, 3], features[:, 4], features[:, 5])).astype('float32').T
unstdFeatures = np.delete(features, [0, 1, 2, 3, 4, 5], 1).astype('float32')
stdFeatures = np.empty((4424, 16), dtype = 'float32')
for i in range(16):
    mean = np.mean(unstdFeatures[:, i])
    std = np.std(unstdFeatures[:, i])
    for j in range(4424):
        stdFeatures[j, i] = (unstdFeatures[j, i] - mean) / std

features = np.concatenate((stdFeatures, boolFeatures, hotFeatures), axis = 1)

#Function for getting training and validation sets using 8-fold cross validation
#vNumber is the index of the split list of feature arrays to be used for the validation set and must be within the range 0 - 7
def getSets(vNumber, features, labels):
    splitFeatures = np.split(features, 8)
    splitLabels = np.split(labels, 8)
    vFeatures = splitFeatures.pop(vNumber)
    vLabels = splitLabels.pop(vNumber)
    tFeatures = np.concatenate(splitFeatures, axis = 0)
    tLabels = np.concatenate(splitLabels, axis = 0)
    return tFeatures, tLabels, vFeatures, vLabels

#Dataset class used for creating datasets out of the training and validation data
class Dataset(th.utils.data.Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return self.features[idx, :], self.labels[idx]
    
#Function for creating dataloaders using the training and validation sets made with getSets
#The dataloader passes samples in batches of 79 as it is a factor of both 553 and 3871
#which are the number of entries in the validation and training sets respectively
def buildLoaders(tFeatures, tLabels, vFeatures, vLabels):
    tDataset = Dataset(tFeatures, tLabels)
    vDataset = Dataset(vFeatures, vLabels)
    tLoader = th.utils.data.DataLoader(tDataset, batch_size = 79, shuffle = True)
    vLoader = th.utils.data.DataLoader(vDataset, batch_size = 79, shuffle = True)
    return tLoader, vLoader

#NeuralNetwork class used for the building the model
#Neural Network sepcific hyperparameters are edited directly in this class
class NeuralNetwork(th.nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = th.nn.Flatten()
        self.linearReLUStack = th.nn.Sequential(
            th.nn.Linear(54, 28),
            th.nn.ReLU(),
            th.nn.Linear(28, 3)
        )
        
    def forward(self, x):
        x = self.flatten(x)
        logits = self.linearReLUStack(x)
        return logits
    
#Select the device to be used for training the model and send the model to it
#Uses GPU if a Nvdia GPU is detected and PyTorch was installed with the CUDA platform, uses CPU otherwise
device = "cuda" if th.cuda.is_available() else "cpu"
model = NeuralNetwork().to(device)

#Define a loss function and an optimizer for training the model
lossFunc = th.nn.CrossEntropyLoss()
optimizer = th.optim.Adam(model.parameters(), lr = 0.0005)

#Define training function
#Returns the average training error for the given training set
def train(tLoader, model, lossFunc, optimizer):
    tError = 0
    numBatches = len(tLoader)
    size = len(tLoader.dataset)
    model.train()
    for batch, (X, y) in enumerate(tLoader):
        X, y = X.to(device), y.to(device)
        prediction = model(X)
        loss = lossFunc(prediction, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if batch % 100 == 0:
            tError = tError + loss.item()
    return tError / numBatches

#Define testing fuction
#Returns the average testing error for the given training set
def test(vLoader, model, lossFunc):
    size = len(vLoader.dataset)
    numBatches = len(vLoader)
    model.eval()
    vError = 0
    with th.no_grad():
        for X, y in vLoader:
            X, y = X.to(device), y.to(device)
            prediction = model(X)
            vError = vError + lossFunc(prediction, y).item()
    return vError / numBatches

#Train the model for the given number of epochs using 8-fold cross-validation and print out the average training and testing
#error for each epoch across all validation sets
epochs = 10
results = np.empty((epochs * 2, 8), dtype = 'float32')
resultIndex = 0
for i in range(8):
    tFeatures, tLabels, vFeatures, vLabels = getSets(i, features, labels)
    tLoader, vLoader = buildLoaders(tFeatures, tLabels, vFeatures, vLabels)
    for j in range(epochs):
        results[resultIndex, i] = train(tLoader, model, lossFunc, optimizer)
        resultIndex += 1
        results[resultIndex, i] = test(vLoader, model, lossFunc)
        resultIndex += 1
    resultIndex = 0
for i in range(epochs):
    print("Epoch " + str(i + 1))
    print("-------------------------------")
    print("Average Training Error: " + str(np.mean(results[resultIndex, :])))
    resultIndex += 1
    print("Average Testing Error: " + str(np.mean(results[resultIndex, :])))
    resultIndex += 1
    print()

Epoch 0
-------------------------------
Average Training Error: 0.011634968
Average Testing Error: 0.5871174

Epoch 1
-------------------------------
Average Training Error: 0.011492988
Average Testing Error: 0.5710863

Epoch 2
-------------------------------
Average Training Error: 0.012091221
Average Testing Error: 0.55841196

Epoch 3
-------------------------------
Average Training Error: 0.01205002
Average Testing Error: 0.55124253

Epoch 4
-------------------------------
Average Training Error: 0.0126311015
Average Testing Error: 0.54670984

Epoch 5
-------------------------------
Average Training Error: 0.0112064425
Average Testing Error: 0.5443841

Epoch 6
-------------------------------
Average Training Error: 0.011708992
Average Testing Error: 0.5415697

Epoch 7
-------------------------------
Average Training Error: 0.010882707
Average Testing Error: 0.5406134

Epoch 8
-------------------------------
Average Training Error: 0.01014842
Average Testing Error: 0.53949565

Epoch 