In [1]:
# load libraries
import tqdm
import os
import math
import pandas as pd
import numpy as np
import random
import time
import itertools
import torch
import torch.nn as nn

# Seed the Python, NumPy and PyTorch random number generators with the same fixed value.
random.seed(42)
np.random.seed(42)
# torch.manual_seed returns the Generator object, which is what this cell displays as its output.
torch.manual_seed(42)

<torch._C.Generator at 0x7f7a2acbb690>

In [2]:
# load data
# NOTE(review): unpickling can execute arbitrary code -- only load a trusted file here.
data = pd.read_pickle('data/full.pkl')

# filter out useless columns
data = data.drop(columns=['W', 'YearTerm'])

# filter out missing fields
data = data.dropna()

# letter grade -> grade points. The key order of this dict is the single
# source of truth for the grade-count columns used below.
grade_mapping = {
    'A+': 4.0,
    'A': 4.0,
    'A-': 3.7,
    'B+': 3.3,
    'B': 3.0,
    'B-': 2.7,
    'C+': 2.3,
    'C': 2.0,
    'C-': 1.7,
    'D+': 1.3,
    'D': 1.0,
    'D-': 0.7,
    'F': 0.0
}
grade_columns = list(grade_mapping)

# compute GPA of each course
# 'Student Number' is the number of letter-graded students in the record
# (columns are added left to right, in the order of grade_mapping).
data['Student Number'] = sum(data[col] for col in grade_columns)
# GPA = grade-point-weighted count divided by the number of graded students.
# NOTE(review): a record whose grade counts are all zero would give 0/0 = NaN
# here -- confirm the raw data contains no such record.
data['GPA'] = sum(grade_mapping[col] * data[col] for col in grade_columns) / data['Student Number']

# convert student number into percentage
for col in grade_columns:
    data[col] = data[col] / data['Student Number']

group_columns = ['Number', 'Course Title', 'Subject', 'Primary Instructor', 'Sched Type']
feature_columns = ['Year', 'Term']
# alternative target set (full grade distribution plus GPA):
# label_columns = ['A+', 'A', 'A-', 'B+', 'B', 'B-', 'C+', 'C', 'C-', 'D+', 'D', 'D-', 'F', 'GPA']
label_columns = ['GPA']

# convert term into integer
data["Term"] = data["Term"].map({"Winter": 0, "Spring": 1, "Summer": 2, "Fall": 3})

# data sort: order the records by (Year, Term) before they are grouped into series
data = data.sort_values(by=['Year', 'Term'], ascending=True)

# encoding feature columns: one-hot encoding
# (values are cast to str first, so every distinct Year / Term gets its own indicator column)
feature_encoded = pd.get_dummies(data[feature_columns].astype(str))
# from here on feature_columns holds the names of the one-hot columns
feature_columns = feature_encoded.columns
print("feature_encoded:", feature_encoded.columns, feature_encoded.shape)
# keep only the grouping keys, the encoded features and the label(s)
data = pd.concat([data[group_columns], feature_encoded, data[label_columns]], axis=1)
print("data:", data.shape)

# Partition the records into one series per distinct combination of the
# grouping columns (Number, Course Title, Subject, Primary Instructor, Sched Type).
groups = data.groupby(group_columns)
# Series with fewer than 4 records are discarded; the dict key is the
# 5-tuple of grouping values yielded by the groupby.
group_dict = {
    series_key: series
    for series_key, series in groups
    if not series.shape[0] < 4
}
print("# groups:", len(group_dict))

# generate data for NN
# Every kept series gets an integer course id equal to its position in group_dict.
series_list = list(group_dict.values())
course_ids = list(range(len(series_list)))
# per-series feature matrix, number of records, and target matrix
seqs = [series[feature_columns].to_numpy() for series in series_list]
lens = [series.shape[0] for series in series_list]
tgts = [series[label_columns].to_numpy() for series in series_list]

# split train/test
## train: every series without its last record
seqs_train = [seq[:-1] for seq in seqs]
lens_train = [length - 1 for length in lens]
tgts_train = [tgt[:-1] for tgt in tgts]
## test: the complete series (the last record is the held-out step)
seqs_test = list(seqs)
lens_test = list(lens)
tgts_test = list(tgts)

# to torch tensor
from torch.nn.utils.rnn import pad_sequence


def _pad_to_float32(arrays):
    """Pad variable-length arrays into one batch-first float32 tensor."""
    as_tensors = [torch.tensor(array) for array in arrays]
    return pad_sequence(as_tensors, batch_first=True).to(torch.float32)


course_ids = torch.tensor(course_ids).to(torch.int64)

# training tensors (series without their last record)
seqs_train = _pad_to_float32(seqs_train)
lens_train = torch.tensor(lens_train).to(torch.float32)
tgts_train = _pad_to_float32(tgts_train)

# test tensors (complete series)
seqs_test = _pad_to_float32(seqs_test)
lens_test = torch.tensor(lens_test).to(torch.float32)
tgts_test = _pad_to_float32(tgts_test)

print("train:", seqs_train.shape, lens_train.shape, tgts_train.shape, course_ids.shape)
print("test:", seqs_test.shape, lens_test.shape, tgts_test.shape, course_ids.shape)

feature_encoded: Index(['Year_2010', 'Year_2011', 'Year_2012', 'Year_2013', 'Year_2014',
       'Year_2015', 'Year_2016', 'Year_2018', 'Year_2019', 'Year_2020',
       'Year_2021', 'Year_2022', 'Year_2023', 'Year_2024', 'Term_0', 'Term_1',
       'Term_2', 'Term_3'],
      dtype='object') (58915, 18)
data: (58915, 24)
# groups: 4048
train: torch.Size([4048, 334, 18]) torch.Size([4048]) torch.Size([4048, 334, 1]) torch.Size([4048])
test: torch.Size([4048, 335, 18]) torch.Size([4048]) torch.Size([4048, 335, 1]) torch.Size([4048])


In [3]:

# training functions

class GPAPredictRNN(nn.Module):
    """LSTM regressor that emits one prediction per time step of a course series.

    A learned embedding of the course id is appended to the features of every
    time step before the sequence is passed through the LSTM.
    """

    def __init__(self, input_size, output_size, course_vocab_size, embedding_dim, hidden_size, num_layers, bias, dropout):
        """Build the embedding, the LSTM and the output head.

        Args:
            input_size: number of per-step input features (without the embedding).
            output_size: number of values predicted per step.
            course_vocab_size: number of distinct course ids.
            embedding_dim: width of the course embedding.
            hidden_size, num_layers, bias, dropout: forwarded to ``nn.LSTM``.
        """
        super().__init__()
        # Attribute names and creation order are kept as-is: they determine
        # the state_dict keys and the parameter initialisation order.
        self.embedding = nn.Embedding(course_vocab_size, embedding_dim)
        lstm_options = dict(
            input_size=input_size + embedding_dim,  # features + course embedding
            hidden_size=hidden_size,
            num_layers=num_layers,
            bias=bias,
            dropout=dropout,
            batch_first=True,
        )
        self.rnn = nn.LSTM(**lstm_options)
        self.fc = nn.Linear(hidden_size, output_size)
        self.tanh = nn.Tanh()

    def forward(self, seq, course_id):
        """Predict for every step of a batch of sequences.

        Args:
            seq: batch-first input of shape (batch, steps, input_size).
            course_id: integer course ids of shape (batch,).

        Returns:
            Tensor of shape (batch, steps, output_size); because of the
            ``4.0 * tanh`` squashing every value lies in (-4, 4).
        """
        num_steps = seq.shape[1]
        course_vec = self.embedding(course_id)
        # Repeat the course embedding along the time axis (as a view, no copy).
        per_step_vec = course_vec.unsqueeze(1).expand(-1, num_steps, -1)
        lstm_input = torch.cat((seq, per_step_vec), dim=-1)
        hidden_states, _ = self.rnn(lstm_input)
        return 4.0 * self.tanh(self.fc(hidden_states))

class GPADataset(torch.utils.data.Dataset):
    """Dataset of padded course series that also yields a per-step loss mask.

    In training mode the mask is 1 on the first ``len`` steps of a series; in
    test mode (``in_test=True``) it is 1 only on step ``len - 1``.
    """

    def __init__(self, seqs, tgts, lens, course_ids, device, in_test=False):
        """Store the tensors (indexed along their first dimension) and the mode."""
        self.seqs, self.tgts = seqs, tgts
        self.lens, self.course_ids = lens, course_ids
        self.device = device
        self.in_test = in_test

    def __len__(self):
        """Number of series."""
        return self.seqs.shape[0]

    def __getitem__(self, idx):
        """Return ``(seq, tgt, mask, course_id)`` for series ``idx``, moved to ``self.device``."""
        target_device = self.device
        seq = self.seqs[idx].to(target_device)
        tgt = self.tgts[idx].to(target_device)
        course_id = self.course_ids[idx].to(target_device)

        valid_steps = int(self.lens[idx])
        # One mask entry per time step (the feature axis is dropped).
        mask = torch.zeros_like(seq[..., 0])
        if not self.in_test:
            # training: every valid (non-padded) step contributes
            mask[:valid_steps] = 1
        else:
            # evaluation: only the last valid step contributes
            mask[valid_steps - 1] = 1
        return seq, tgt, mask, course_id

def loss_fn(tgt_pred, tgt_gt, mask):
    """Masked mean squared error on the last output channel (the GPA).

    Args:
        tgt_pred: predictions, shape (..., channels).
        tgt_gt: ground truth, same shape as ``tgt_pred``.
        mask: per-step weights, shape ``tgt_pred.shape[:-1]``.

    Returns:
        Scalar tensor: sum of squared masked errors divided by ``mask.sum()``.
    """
    # only the last channel (GPA) is scored
    masked_pred = tgt_pred[..., -1] * mask
    masked_gt = tgt_gt[..., -1] * mask
    squared_error = (masked_pred - masked_gt) ** 2
    return squared_error.sum() / mask.sum()

def get_model(input_size, output_size, course_vocab_size, embedding_dim, hidden_size, num_layers, bias, dropout):
    """Build a ``GPAPredictRNN`` from the given hyper-parameters and return it."""
    hyperparams = {
        "input_size": input_size,
        "output_size": output_size,
        "course_vocab_size": course_vocab_size,
        "embedding_dim": embedding_dim,
        "hidden_size": hidden_size,
        "num_layers": num_layers,
        "bias": bias,
        "dropout": dropout,
    }
    return GPAPredictRNN(**hyperparams)

In [4]:
# load ckpt
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Architecture hyper-parameters of the saved checkpoint; the sizes are taken
# from the tensors built earlier in the notebook.
model_config = dict(
    input_size=seqs_train.shape[-1],
    output_size=tgts_train.shape[-1],
    course_vocab_size=len(course_ids),
    embedding_dim=64,
    hidden_size=64,
    num_layers=10,
    bias=True,
    dropout=0.2,
)
model = get_model(**model_config)

# Weights are first loaded onto the CPU; the model is moved to `device` afterwards.
checkpoint_path = 'models/model-embedding_dim=64,hidden_size=64,num_layers=10,bias=True,dropout=0.2,optimizer=Adam,learning_rate=1e-05,batch_size=32.pt'
state_dict = torch.load(checkpoint_path, map_location='cpu', weights_only=True)
model.load_state_dict(state_dict)
# evaluation mode (disables dropout)
model.eval()
model = model.to(device)

In [None]:
def compute_loss(model, dataloader):
    """Average ``loss_fn`` over ``dataloader`` without tracking gradients.

    Each batch loss is weighted by the number of sequences in the batch, and
    the total is divided by the number of sequences seen.

    Args:
        model: callable taking ``(seqs_batch, course_id_batch)``.
        dataloader: iterable of ``(seqs, tgts, mask, course_id)`` batches.

    Returns:
        The size-weighted mean loss as a Python float.
    """
    weighted_loss_sum = 0
    sequences_seen = 0
    with torch.no_grad():
        for batch in dataloader:
            seqs_batch, tgts_batch, mask_batch, course_id_batch = batch
            batch_size = seqs_batch.shape[0]
            prediction = model(seqs_batch, course_id_batch)
            batch_loss = loss_fn(prediction, tgts_batch, mask_batch)
            sequences_seen += batch_size
            weighted_loss_sum += batch_loss.item() * batch_size
    return weighted_loss_sum / sequences_seen

def shuffle_tensor(tensor, dim):
    """Return a copy of ``tensor`` with its slices along ``dim`` in random order.

    One permutation is drawn (on the tensor's device) and applied to the
    whole axis, so each slice along ``dim`` is moved as a unit.
    """
    axis_length = tensor.shape[dim]
    permutation = torch.randperm(axis_length, device=tensor.device)
    return tensor.index_select(dim, permutation)

# Permutation feature importance: permute one input across courses at a time
# and measure how much the loss on the last step of every test series changes.
losses = []
num_permutation = 10  # permutations averaged per input

seqs_test = seqs_test.to(device=device)
course_ids = course_ids.to(device=device)
tgts_test = tgts_test.to(device=device)

# Mask that is 1 only on the last valid step of every series.
# Fix: it used to start from ones_like, so writing 1 into the last step was a
# no-op and the mask selected every position. (GPADataset builds its own mask,
# so the losses below never depended on this tensor.)
masks = torch.zeros_like(seqs_test[..., 0], device=device)
row_index = torch.arange(seqs_test.shape[0], device=device)
last_step = (lens_test.to(torch.int64) - 1).to(device)
masks[row_index, last_step] = 1

# feature importance

## baseline: loss on the unpermuted test set
dataset = GPADataset(seqs_test, tgts_test, lens_test, course_ids, device, in_test = True)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=512)
loss_baseline = compute_loss(model, dataloader)

## input feature dimension: 18+1
## (one run per one-hot feature column, plus a final run for the course id)
num_features = seqs_test.shape[-1]
for dim in range(num_features + 1):
    # print("dim", dim)
    loss_sum = 0.0
    for _ in range(num_permutation):
        if dim == num_features:
            # last "dimension": permute which course id goes with which series
            seqs_input = seqs_test
            course_ids_input = shuffle_tensor(course_ids, 0)
        else:
            # permute one feature column across series, leaving the rest intact
            seqs_input = seqs_test.clone()
            seqs_input[..., dim] = shuffle_tensor(seqs_test[..., dim], 0)
            course_ids_input = course_ids
        dataset = GPADataset(seqs_input, tgts_test, lens_test, course_ids_input, device, in_test = True)
        dataloader = torch.utils.data.DataLoader(dataset, batch_size=512)
        loss_sum += compute_loss(model, dataloader)
    # mean loss over the permutations of this input
    losses.append(float(loss_sum) / num_permutation)

# print feature importance
print("loss_baseline:", loss_baseline)
for i, loss in enumerate(losses):
    if i == num_features:
        print("permuted course_id:", loss)
    else:
        feature_name = feature_columns[i]
        print(f"permuted feature dim {i} ({feature_name})", loss)

loss_baseline: 0.050872107269242406
permuted feature dim 0 (Year_2010) 0.05103168191853911
permuted feature dim 1 (Year_2011) 0.05092426899354905
permuted feature dim 2 (Year_2012) 0.05105029370170087
permuted feature dim 3 (Year_2013) 0.050909606460481885
permuted feature dim 4 (Year_2014) 0.050954789575189355
permuted feature dim 5 (Year_2015) 0.05087237821426242
permuted feature dim 6 (Year_2016) 0.05101152113638818
permuted feature dim 7 (Year_2018) 0.050962435896508396
permuted feature dim 8 (Year_2019) 0.052149775857105854
permuted feature dim 9 (Year_2020) 0.05149591697845608
permuted feature dim 10 (Year_2021) 0.05224884555209428
permuted feature dim 11 (Year_2022) 0.05247324642259628
permuted feature dim 12 (Year_2023) 0.052360879234038295
permuted feature dim 13 (Year_2024) 0.050871631945483387
permuted feature dim 14 (Term_0) 0.05089383579324931
permuted feature dim 15 (Term_1) 0.05105609730817377
permuted feature dim 16 (Term_2) 0.05091075110249221
permuted feature dim 17 (

# Conclusion

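The most important feature is the course id: permuting it increases the test loss the most, whereas permuting any single Year or Term indicator leaves the loss close to the baseline, so those features are much less important.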