In [1]:
import gc

# Debugging aid: uncomment (after `import os`) to make CUDA kernel launches
# synchronous, so errors surface at the offending call.
# os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
# Force a full garbage-collection pass; the displayed value is the number of
# unreachable objects the collector found.
gc.collect()

0

In [2]:
from itertools import product
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from torch import nn

from data_prep import clean_data, dataset
from data_prep.utils import shuffle_by_timestep, shuffle_non_padded, masked_mae, compute_iou
from models import transformer
from train_test import train_test

In [3]:
# Pick the compute backend in order of preference: CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using cuda device


# Data loading

In [4]:
# Load the raw records and run them straight through the project's cleaning step.
df = clean_data.clean_data(clean_data.load_data())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['COURSE'] = df['SUBJECT'] + ' ' + df['COURSE_NUMBER']


Mappings between course / major names and their integer ids (both directions)

In [5]:
# Two lookup tables for courses: `course_id_map` is indexed by course name / special
# token (e.g. "<PAD>") and yields an integer id; `id_course_map` goes the other way.
course_id_map, id_course_map = clean_data.course_id_maps(df)

In [6]:
# Lookup tables for majors: `id_major_map` turns a major id into its name;
# `major_id_map` is presumably the inverse (name -> id) — verify in clean_data.
major_id_map, id_major_map = clean_data.major_id_maps(df)

In [7]:
# Ids of the padding, start-of-sequence and end-of-sequence tokens in the course vocabulary.
PAD_IDX, SOS_IDX, EOS_IDX = (
    course_id_map[token] for token in ("<PAD>", "<SOS>", "<EOS>")
)

Load tensors

In [8]:
# Per-student sequences built from the cleaned frame. The tensor lists are indexed by
# student; within one student the course, semester and grade tensors are parallel
# (one entry per course taken). Grades appear to be stored rescaled (they are
# multiplied by 4.3 when displayed later) — confirm in data_prep.dataset.
student_seqs, student_tensors, semester_tensors, major_tensors, grade_tensors = dataset.course_semester_tensors(df)

In [9]:
# Dataset dimensions: vocabulary sizes come from the id maps, the semester count
# from the number of distinct SEMESTER_RANK values in the cleaned frame.
n_students, n_courses, n_majors = len(student_seqs), len(course_id_map), len(major_id_map)
n_semesters = df['SEMESTER_RANK'].nunique()

print(f'{n_students} students, {n_courses} courses, {n_majors} majors, {n_semesters} "semesters"')

5326 students, 360 courses, 48 majors, 18 "semesters"


In [10]:
# Number of courses recorded for each student (length of the first tensor dimension).
student_tensor_sizes = [courses.shape[0] for courses in student_tensors]
print(f'Min courses: {min(student_tensor_sizes)}, Max courses: {max(student_tensor_sizes)}')

Min courses: 7, Max courses: 58


Load dataloaders

In [11]:
# Build the train / test loaders from the per-student tensors.
train_dataloader, test_dataloader = dataset.course_seq_dataloader(
    student_tensors,
    semester_tensors,
    major_tensors,
    grade_tensors,
    n_courses,
    SOS_IDX,
    EOS_IDX,
    PAD_IDX,
    batch_size=32,
)

# Model loading

In [12]:
# Hyperparameters identifying the saved fit to load (matched against config.csv files).
config = {
    'd_model': 256,
    'num_encoder_layers': 4,
    'num_decoder_layers': 4,
    'major_embedding_dim': 4,
    'nhead': 4,
}

def find_model_fits(config):
    """Return the fit directories under ``model_fits`` whose saved config matches ``config``.

    Every sub-directory of ``model_fits`` that contains a ``config.csv`` is
    inspected. The first row of that file holds the hyperparameters the run was
    trained with, under the same names used in ``config``. A directory matches
    when every key of ``config`` is present in that row with an equal value;
    additional keys in the saved config are ignored.

    Args:
        config: Mapping of hyperparameter name -> required value.

    Returns:
        List of ``pathlib.Path`` objects for the matching directories, sorted by
        path so that indexing the result (e.g. ``[0]``) is reproducible.

    Raises:
        FileNotFoundError: if the ``model_fits`` directory does not exist.
    """
    model_fits = Path('model_fits')
    fits = []
    # iterdir() yields entries in arbitrary order; sort for a stable result.
    for fit in sorted(model_fits.iterdir()):
        config_file = fit / 'config.csv'
        if not (fit.is_dir() and config_file.exists()):
            continue
        records = pd.read_csv(config_file).to_dict(orient='records')
        if not records:
            # Header-only config file: there is nothing to compare against.
            continue
        folder_config = records[0]
        # `and` short-circuits, so a key missing from the saved config counts as
        # a non-match instead of raising KeyError on the lookup.
        if all(key in folder_config and folder_config[key] == config[key] for key in config):
            fits.append(fit)
    return fits

In [13]:
# Use the first saved fit whose config matches; raises IndexError if none matches.
fit_dir = find_model_fits(config)[0]

In [14]:
def load_model(fit_dir):
    """Rebuild a trained model from a fit directory.

    Reads the hyperparameters from the first row of ``<fit_dir>/config.csv``,
    constructs a ``TransformerModelWithGrades`` with them (using the notebook's
    ``n_courses``, ``n_majors``, ``PAD_IDX`` and ``device``), and loads the
    weights stored in ``<fit_dir>/model.pyt``.

    Args:
        fit_dir: Directory (``str`` or ``pathlib.Path``) containing
            ``config.csv`` and ``model.pyt``.

    Returns:
        Tuple ``(model, config)``: the model moved to ``device`` with the saved
        weights loaded, and the config dict read from the CSV.
    """
    fit_dir = Path(fit_dir)
    config = pd.read_csv(fit_dir / 'config.csv').to_dict(orient='records')[0]
    model = transformer.TransformerModelWithGrades(
        n_courses=n_courses,
        n_majors=n_majors,
        max_len=100,
        config=config,
        PAD_IDX=PAD_IDX,
    ).to(device)

    # map_location remaps the stored tensors onto the current device, so a
    # checkpoint written on a GPU machine can still be loaded on CPU / MPS.
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    state_dict = torch.load(fit_dir / 'model.pyt', map_location=device)

    # Checkpoints saved from an nn.DataParallel wrapper prefix every key with
    # 'module.'. Strip it only as a prefix so a parameter path that merely
    # contains 'module.' further inside is left intact.
    prefix = 'module.'
    state_dict = {
        (key[len(prefix):] if key.startswith(prefix) else key): value
        for key, value in state_dict.items()
    }

    model.load_state_dict(state_dict)
    return model, config

In [15]:
# Load the trained model. Note that `config` is rebound here to the full config
# stored in the fit directory's config.csv, replacing the partial dict defined above.
model, config = load_model(fit_dir)

## Loss functions

In [16]:
# Course loss: negative log-likelihood averaged over elements, so the model is
# expected to emit log-probabilities — TODO confirm in models.transformer.
course_loss_fn = nn.NLLLoss(reduction='mean')
# GPA loss: element-wise squared error left unreduced (reduction='none');
# presumably masked / averaged inside train_test — verify.
gpa_loss_fn = nn.MSELoss(reduction='none')

# Load a single student

In [17]:
# Available per-student lists: student_tensors, semester_tensors, major_tensors, grade_tensors.
# Peek at the first student's semester tensor: one semester number per course taken.
semester_tensors[0]

tensor([1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4])

In [52]:
def find_student(min_semesters=4, min_course_per_semester=3, major=None, num_semesters_trim=1):
    """Return the first student matching the filters, split at their final semester.

    Scans the notebook-level ``student_tensors`` / ``semester_tensors`` /
    ``major_tensors`` / ``grade_tensors`` in order and stops at the first
    student who satisfies every filter.

    Args:
        min_semesters: Minimum value the student's largest semester number must reach.
        min_course_per_semester: Minimum number of courses required in each
            semester the student has records for.
        major: Major id the student must have, or ``None`` to accept any major.
        num_semesters_trim: Currently unused; kept so existing calls keep working.

    Returns:
        ``((src_courses, tgt_courses), (src_semesters, tgt_semesters),
        (src_grades, tgt_grades), major, student_df)``. ``src_*`` holds elements
        ``[:idx+1]`` and ``tgt_*`` elements ``[idx:]``, each with a leading
        batch dimension of 1, where ``idx`` is the position of the first entry
        of the student's highest semester. ``major`` is the student's entry in
        ``major_tensors`` and ``student_df`` lists courses, semesters and
        grades (grades multiplied by 4.3).

    Raises:
        ValueError: if no student satisfies the filters (the original fell off
            the end and returned ``None``, which made the caller's tuple
            unpacking fail with an unrelated-looking TypeError).
    """
    for i in range(len(student_tensors)):
        courses = student_tensors[i]
        semesters = semester_tensors[i]
        grades = grade_tensors[i]

        if semesters.max() < min_semesters:
            continue
        if major is not None and not bool(major_tensors[i] == major):
            continue

        student_df = pd.DataFrame({'courses': [id_course_map[x.item()] for x in courses],
                                   'semesters': semesters.tolist(),
                                   'grades': [4.3 * x for x in grades.tolist()]})
        # Smallest number of courses taken in any single semester.
        min_courses = student_df.groupby('semesters')['courses'].count().min()
        if min_courses < min_course_per_semester:
            continue

        # Index of the first entry belonging to the highest (last) semester.
        idx = semesters.max(0).indices.item()
        # NOTE(review): src ends at idx (inclusive) and tgt starts at idx, so the
        # first course of the last semester appears in both — confirm this overlap
        # is what train_test.predict expects.
        src_end = idx + 1
        student_major = major_tensors[i]
        return (
            (courses[:src_end].unsqueeze(0), courses[idx:].unsqueeze(0)),
            (semesters[:src_end].unsqueeze(0), semesters[idx:].unsqueeze(0)),
            (grades[:src_end].unsqueeze(0), grades[idx:].unsqueeze(0)),
            student_major,
            student_df,
        )

    raise ValueError(
        f'No student found with min_semesters={min_semesters}, '
        f'min_course_per_semester={min_course_per_semester}, major={major}'
    )

In [57]:
# Pick the first student with at least 8 semesters and split off the last semester as the target.
(
    (src_sample_courses, tgt_sample_courses),
    (src_sample_semesters, tgt_sample_semesters),
    (src_sample_grades, tgt_sample_grades),
    sample_major,
    student_df,
) = find_student(min_semesters=8)

print(f'Major: {id_major_map[sample_major.item()]}')

Major: Biochemistry


In [58]:
# Preview the sampled student's first few course records (course, semester, grade).
student_df.head()

Unnamed: 0,courses,semesters,grades
0,COMM 1317,1,0.0
1,PHIL 1301,1,3.0
2,CHEM 2123,1,4.0
3,CHEM 2323,1,3.0
4,<OTHER>,1,3.0


In [60]:
# Predict the next courses and grades from the student's source sequence.
pred_courses, pred_gpas = train_test.predict(
    device,
    model,
    sample_major,
    src_sample_courses,
    src_sample_grades,
    src_sample_semesters,
    SOS_IDX,
    PAD_IDX,
)



In [69]:
def _course_names(course_ids):
    """Translate course ids to names, dropping the PAD / SOS / EOS tokens."""
    names = []
    for course_id in course_ids:
        if course_id not in [PAD_IDX, SOS_IDX, EOS_IDX]:
            names.append(id_course_map[course_id.item()])
    return names


pred_courses_names = _course_names(pred_courses[0])
# Rescale predicted grades by 4.3 (the same factor used for display in find_student) and floor at 0.
pred_grades = [max(gpa.item() * 4.3, 0) for gpa in pred_gpas[0]]
tgt_courses_names = _course_names(tgt_sample_courses[0])

In [70]:
# Show predicted vs. actual course names side by side.
pred_courses_names, tgt_courses_names

(['<OTHER>', 'COSC 3327', 'CAPS 4360'], ['RELS 2323', 'COSC 3337', '<OTHER>'])

# Train models (hyperparameter grid search)

In [15]:
def fit(config, dir, id_course_map=None):
    """Train a fresh ``TransformerModelWithGrades`` and save the run under ``model_fits/<dir>``.

    Uses the notebook-level dataloaders, loss functions, vocabulary sizes,
    special-token ids and ``device``.

    Args:
        config: Dict of hyperparameters. This function reads ``'lr'``,
            ``'weight_decay'`` and ``'epochs'`` itself and passes the whole
            dict to the model constructor and to ``train_test.train`` /
            ``train_test.test``.
        dir: Name of the sub-directory of ``model_fits`` to write results to
            (created if missing).
        id_course_map: Forwarded to ``train_test.test``.

    Side effects:
        After the last epoch, writes ``config.csv``, ``model.pyt`` (state
        dict), ``losses.csv`` and ``ious.csv`` into ``model_fits/<dir>``.
        Nothing is written if training is interrupted earlier.
    """
    model = transformer.TransformerModelWithGrades(n_courses=n_courses,
                                                    n_majors=n_majors,
                                                    max_len=100,
                                                    config=config,
                                                    PAD_IDX=PAD_IDX).to(device)

    # Only wrap in DataParallel when several CUDA devices are actually present;
    # a hard-coded device_ids=[0, 1] fails on single-GPU and CPU/MPS machines.
    # Without the wrapper the saved keys simply lack the 'module.' prefix,
    # which load_model already tolerates.
    n_gpus = torch.cuda.device_count()
    if str(device).startswith('cuda') and n_gpus > 1:
        model = nn.DataParallel(model, device_ids=list(range(n_gpus)))

    optimizer = torch.optim.AdamW(model.parameters(), lr=config['lr'], weight_decay=config['weight_decay'])

    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=5, T_mult=5)

    epochs = config['epochs']
    train_losses = []
    test_losses = []
    test_ious = []

    for epoch in range(epochs):
        print(f'Epoch {epoch + 1}\n-------------------------------')
        train_loss = train_test.train(device, train_dataloader, model, course_loss_fn, gpa_loss_fn, optimizer, scheduler, config, EOS_IDX, PAD_IDX)
        test_loss, test_iou = train_test.test(device, test_dataloader, model, course_loss_fn, gpa_loss_fn, config, EOS_IDX, PAD_IDX, id_course_map)

        train_losses.append(train_loss)
        test_losses.append(test_loss)
        test_ious.append(test_iou)

    # Create the output directory if it does not already exist
    out_dir = Path(f'model_fits/{dir}')
    out_dir.mkdir(parents=True, exist_ok=True)

    # Save the config as a one-row CSV (read back by find_model_fits / load_model / read_configs)
    pd.DataFrame(config, index=[0]).to_csv(out_dir / 'config.csv', index=False)

    # Save the model state dict
    torch.save(model.state_dict(), out_dir / 'model.pyt')

    # Save the per-epoch train and test losses as a single CSV file
    losses = pd.DataFrame({'train_loss': train_losses, 'test_loss': test_losses})
    losses.to_csv(out_dir / 'losses.csv', index=False)

    # Save the per-epoch test IOU
    ious = pd.DataFrame({'test_iou': test_ious})
    ious.to_csv(out_dir / 'ious.csv', index=False)



In [16]:
def read_configs(dir='model_fits'):
    """Collect the saved hyperparameter configs found anywhere under ``dir``.

    Every ``config.csv`` below ``dir`` (searched recursively) contributes the
    first row of the file as a dict of column name -> value.

    The row is taken via ``to_dict(orient='records')`` rather than ``.iloc[0]``:
    selecting a row as a Series coerces all columns to one dtype, which turns
    integer hyperparameters into floats (256 -> 256.0) whenever a float column
    such as the learning rate is present.

    Args:
        dir: Root directory to search. Defaults to ``'model_fits'``.

    Returns:
        List of config dicts, ordered by the path of their ``config.csv``.
        Files with a header but no data row are skipped. Empty when ``dir``
        contains no ``config.csv``.
    """
    configs = []
    # rglob order is arbitrary; sort so the result is reproducible.
    for file in sorted(Path(dir).rglob('config.csv')):
        records = pd.read_csv(file).to_dict(orient='records')
        if records:
            configs.append(records[0])
    return configs

In [17]:
# Hyperparameter grid. Keys are listed from outermost to innermost loop, so runs
# are visited in the same order as the equivalent nested for-loops.
# NOTE: `config` must already hold the remaining keys fit() needs (e.g.
# 'weight_decay'), i.e. the full config returned by load_model above.
search_space = {
    'lr': [1e-4, 1e-3],
    'num_encoder_layers': [4],
    'num_decoder_layers': [4, 6],
    'major_embedding_dim': [2**2, 2**3, 2**4, 2**5],
    'course_embedding_dim': [2**7, 2**8],
    'gpa_embedding_dim': [2**3, 2**4],
    'nhead': [4, 8],
    'd_model': [2**8],
}

for values in product(*search_space.values()):
    overrides = dict(zip(search_space, values))

    # Build a fresh dict per run instead of mutating the shared `config` in place.
    run_config = {**config, **overrides, 'epochs': 30}
    run_config['dim_feedforward'] = 2 * run_config['d_model']

    # Skip combinations that already have a saved fit on disk.
    if run_config in read_configs():
        continue

    print(run_config)

    # The directory name must encode every varied hyperparameter. Leaving out
    # gpa_embedding_dim made runs that differ only in that value overwrite
    # each other's results.
    run_dir = (
        f"lr_{overrides['lr']}"
        f"_enc_{overrides['num_encoder_layers']}"
        f"_dec_{overrides['num_decoder_layers']}"
        f"_maj_{overrides['major_embedding_dim']}"
        f"_cou_{overrides['course_embedding_dim']}"
        f"_gpa_{overrides['gpa_embedding_dim']}"
        f"_nhead_{overrides['nhead']}"
        f"_dmodel_{overrides['d_model']}"
    )
    fit(run_config, run_dir, id_course_map)


{'d_model': 256, 'nhead': 8, 'num_encoder_layers': 4, 'num_decoder_layers': 4, 'dim_feedforward': 512, 'major_embedding_dim': 4, 'course_embedding_dim': 128, 'gpa_embedding_dim': 8, 'dropout': 0.1, 'lr': 0.0001, 'weight_decay': 0.1, 'course_loss_weight': 1, 'epochs': 30}
Epoch 1
-------------------------------


  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)


loss: 7.267307 [0.1%] (83.1% course, 16.9% gpa)
loss: 2.458854 [22.9%] (95.1% course, 4.9% gpa)
loss: 2.309185 [45.7%] (94.2% course, 5.8% gpa)
loss: 2.045460 [68.5%] (94.6% course, 5.4% gpa)
loss: 2.025536 [91.4%] (94.9% course, 5.1% gpa)
LR = 1.00E-04
Sample
	Predicted: ['<OTHER>', 'COMM 1317', 'BIOL 1307', 'CHEM 1340', 'BIOL 1307', 'MATH 2312']
	Actual: ['BIOL 1107', 'MATH 2413', 'CHEM 1140', 'CHEM 1340', 'BIOL 1307', 'FSEM 1409']
Test loss: 1.570552
Mean IOU: 29.6%

Epoch 2
-------------------------------
loss: 1.945214 [0.1%] (94.4% course, 5.6% gpa)
loss: 1.943330 [22.9%] (94.9% course, 5.1% gpa)
loss: 1.730824 [45.7%] (93.9% course, 6.1% gpa)
loss: 1.798135 [68.5%] (94.9% course, 5.1% gpa)
loss: 1.784542 [91.4%] (94.8% course, 5.2% gpa)
LR = 9.05E-05
Sample
	Predicted: ['<OTHER>', 'COMM 1317', 'MKTG 2301', 'FSTY 1310', 'COSC 1323']
	Actual: ['<OTHER>', 'WRIT 1301', 'MATH 2312', 'PSYC 2301', 'COSC 1123', 'COSC 1323']
Test loss: 1.355692
Mean IOU: 33.1%

Epoch 3
------------------



loss: 6.278923 [0.1%] (95.4% course, 4.6% gpa)
loss: 2.373814 [22.9%] (95.9% course, 4.1% gpa)


KeyboardInterrupt: 