This notebook serves as a sandbox for testing different hyperparameters for training the neural network. While this notebook can be run without an NVIDIA GPU, using one is recommended, as the notebook was written with running PyTorch on CUDA in mind.

In [1]:
#Import necessary libraries
import numpy as np
import pandas as pd
import torch as th

In [47]:
#Read data from dataset.csv and convert into numpy arrays
#Column 34 holds the label strings; every other column is a raw feature
dataFrame = pd.read_csv("dataset.csv")
dataArray = dataFrame.to_numpy()
labels = dataArray[:, 34].astype(str)
features = np.delete(dataArray, 34, 1)

#Convert strings in labels array into floats ('Dropout' -> 0.0, 'Graduate' -> 1.0) stored as 'float32'
#Additionally, for the purposes of this project, data that has the label 'Enrolled' will be ignored
#Only 'Dropout' and 'Graduate' rows are kept, and the row count is derived from the data instead of
#being hard-coded, so features and labels always stay aligned
keepMask = (labels == "Dropout") | (labels == "Graduate")
features = features[keepMask]
labels = (labels[keepMask] == "Graduate").astype('float32')
numSamples = labels.shape[0]

#Clean up features array by removing biased features, one-hot encoding, and standardization
features = np.delete(features, [0, 1, 2, 6, 7, 8, 9, 10, 11, 12, 15, 18], 1)

#One-hot encode columns 0 (course) and 2 (quali); their values are used as 1-based codes into 17 slots each
rowIndex = np.arange(numSamples)
courseEncode = np.zeros((numSamples, 17), dtype = 'float32')
courseEncode[rowIndex, features[:, 0].astype('int64') - 1] = 1
qualiEncode = np.zeros((numSamples, 17), dtype = 'float32')
qualiEncode[rowIndex, features[:, 2].astype('int64') - 1] = 1
hotFeatures = np.concatenate((courseEncode, qualiEncode), axis = 1)

#Columns 1, 3, 4 and 5 are passed through unscaled as float32
boolFeatures = features[:, [1, 3, 4, 5]].astype('float32')

#Standardize every remaining column to zero mean and unit (population) standard deviation
unstdFeatures = np.delete(features, [0, 1, 2, 3, 4, 5], 1).astype('float32')
stdFeatures = (unstdFeatures - unstdFeatures.mean(axis = 0)) / unstdFeatures.std(axis = 0)

#Final layout: standardized columns, then pass-through columns, then the two one-hot groups
features = np.concatenate((stdFeatures, boolFeatures, hotFeatures), axis = 1)

#Dataset class used for creating datasets out of the training and validation data
class Dataset(th.utils.data.Dataset):
    """Pairs a feature array with a label array so samples can be fetched by index."""

    def __init__(self, features, labels):
        #Keep references to both arrays; sample idx is (features[idx, :], labels[idx])
        self.features, self.labels = features, labels

    def __len__(self):
        #The number of samples is taken from the label array
        sampleCount = len(self.labels)
        return sampleCount

    def __getitem__(self, idx):
        #Return one (feature row, label) pair
        row = self.features[idx, :]
        target = self.labels[idx]
        return row, target
    
#Function for creating dataloaders from the training and validation features and labels
#The dataloaders pass samples in batches of batchSize (66 by default)
def buildLoaders(tFeatures, tLabels, vFeatures, vLabels, batchSize = 66):
    """Wrap the training and validation data in DataLoaders.

    Args:
        tFeatures: training features, indexable as tFeatures[idx, :].
        tLabels: training labels, one per row of tFeatures.
        vFeatures: validation features, indexable as vFeatures[idx, :].
        vLabels: validation labels, one per row of vFeatures.
        batchSize: number of samples per batch for both loaders.

    Returns:
        Tuple (tLoader, vLoader).
    """
    tDataset = Dataset(tFeatures, tLabels)
    vDataset = Dataset(vFeatures, vLabels)
    #No shuffle argument is passed, so both loaders iterate in dataset order
    tLoader = th.utils.data.DataLoader(tDataset, batch_size = batchSize)
    vLoader = th.utils.data.DataLoader(vDataset, batch_size = batchSize)
    return tLoader, vLoader

#NeuralNetwork class used for building the model
#Neural network specific hyperparameters are the constructor defaults below and can be overridden per instance
class NeuralNetwork(th.nn.Module):
    """Binary classifier: Flatten -> Linear -> ReLU -> Dropout -> Linear.

    forward returns one raw logit per sample (no sigmoid is applied here).

    Args:
        inFeatures: number of input features per sample.
        hiddenUnits: width of the single hidden layer.
        dropoutRate: dropout probability applied after the hidden activation.
    """

    def __init__(self, inFeatures = 54, hiddenUnits = 27, dropoutRate = 0.2):
        super().__init__()
        self.flatten = th.nn.Flatten()
        self.fc1 = th.nn.Linear(inFeatures, hiddenUnits)
        self.drop = th.nn.Dropout(dropoutRate)
        self.fc2 = th.nn.Linear(hiddenUnits, 1)

    def forward(self, x):
        """Return logits of shape (batch, 1) for a batch of samples x."""
        x = self.flatten(x)
        x = th.nn.functional.relu(self.fc1(x))
        x = self.drop(x)
        x = self.fc2(x)
        return x
    
#Pick the device the model is trained on and move a fresh model onto it
#CUDA is used when PyTorch reports it as available; otherwise everything runs on the CPU
if th.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model = NeuralNetwork()
model = model.to(device)

#Loss function and optimizer used for training the model
#BCEWithLogitsLoss takes raw logits; SGD is configured with momentum and weight decay
lossFunc = th.nn.BCEWithLogitsLoss()
optimizer = th.optim.SGD(
    model.parameters(),
    lr = 0.0007,
    momentum = 0.99,
    weight_decay = 0.01,
)

#Define training function
def train(tLoader, model, lossFunc, optimizer):
    """Run one training epoch over tLoader, updating model in place.

    Args:
        tLoader: iterable yielding (features, labels) tensor batches.
        model: module whose output has a trailing dimension of size 1, i.e. (batch, 1).
        lossFunc: callable taking (predictions, labels) and returning a scalar loss tensor.
        optimizer: optimizer that updates the parameters of model.
    """
    #Batches are moved to whichever device the model's parameters live on
    modelDevice = next(model.parameters()).device
    model.train()
    for X, y in tLoader:
        X, y = X.to(modelDevice), y.to(modelDevice)
        #Drop only the trailing size-1 dimension; a bare squeeze() would also drop the batch
        #dimension of a single-sample batch, and the shape would no longer match y
        prediction = model(X).squeeze(-1)
        loss = lossFunc(prediction, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

#Define testing function
def test(vLoader, model, lossFunc):
    """Evaluate model on vLoader, print the accuracy and return it.

    Args:
        vLoader: loader yielding (features, labels) tensor batches; its .dataset
            attribute must support len().
        model: module whose output has a trailing dimension of size 1, i.e. (batch, 1).
        lossFunc: not used by the evaluation; kept so existing calls keep working.

    Returns:
        Fraction of samples classified correctly, between 0 and 1.
    """
    size = len(vLoader.dataset)
    model.eval()
    correct = 0
    with th.no_grad():
        for X, y in vLoader:
            X, y = X.to(device), y.to(device)
            #Drop only the trailing size-1 dimension so predictions line up with y one-to-one
            prediction = model(X).squeeze(-1)
            #Turn logits into probabilities, then threshold at 0.5 to get 0/1 predictions
            prediction = th.sigmoid(prediction)
            prediction = (prediction > 0.5).type(th.float)
            correct += (prediction == y).type(th.float).sum().item()
    correct /= size
    print(f"Test Accuracy: {(100*correct):>0.1f}%\n")
    return correct

#Create datasets for training data and validation data
#A random 20% of the samples is held out for validation and the rest is used for training
#The sample count is read from the features array instead of being hard-coded
totalSamples = features.shape[0]
pIndex = np.random.permutation(totalSamples)
vNum = int(totalSamples * 0.2)
vIndex = pIndex[:vNum]
tIndex = pIndex[vNum:]
tFeatures = features[tIndex]
tLabels = labels[tIndex]
vFeatures = features[vIndex]
vLabels = labels[vIndex]

#Train the model for the given number of epochs and print out the validation accuracy after every epoch
epochs = 20
tLoader, vLoader = buildLoaders(tFeatures, tLabels, vFeatures, vLabels)
for i in range(epochs):
    print(f"Epoch {i+1}\n-------------------------------")
    train(tLoader, model, lossFunc, optimizer)
    test(vLoader, model, lossFunc)

Epoch 1
-------------------------------
Test Accuracy: 80.9%

Epoch 2
-------------------------------
Test Accuracy: 76.0%

Epoch 3
-------------------------------
Test Accuracy: 78.0%

Epoch 4
-------------------------------
Test Accuracy: 81.3%

Epoch 5
-------------------------------
Test Accuracy: 85.7%

Epoch 6
-------------------------------
Test Accuracy: 87.3%

Epoch 7
-------------------------------
Test Accuracy: 87.9%

Epoch 8
-------------------------------
Test Accuracy: 87.7%

Epoch 9
-------------------------------
Test Accuracy: 88.8%

Epoch 10
-------------------------------
Test Accuracy: 89.7%

Epoch 11
-------------------------------
Test Accuracy: 89.3%

Epoch 12
-------------------------------
Test Accuracy: 89.4%

Epoch 13
-------------------------------
Test Accuracy: 89.1%

Epoch 14
-------------------------------
Test Accuracy: 88.8%

Epoch 15
-------------------------------
Test Accuracy: 89.5%

Epoch 16
-------------------------------
Test Accuracy: 90.2%

E

With the current model, validation accuracy falls within the range of 90-91% after 20 epochs of training.