In [3]:
import pandas as pd
import numpy as np
import glob

In [4]:
import matplotlib.pyplot as plt

# Load data

In [5]:
weekly_activity_df = pd.read_pickle('weekly_activity_format.pkl')

X = np.array(list(weekly_activity_df['X']))
y = np.array(weekly_activity_df['y'])
course_modules = list(weekly_activity_df['course_modules'])
results = list(weekly_activity_df['results'])

## Create dataloader

In [15]:
from torch import nn
import torch
import torch.optim as optim
import torch.nn.functional as F

In [16]:
# check if gpu is available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [18]:
from torch.utils.data import DataLoader, Dataset

In [19]:
class oversampdata(Dataset):
    def __init__(self, x_data, targets):
        self.data = x_data
        self.targets = targets

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        target = self.targets[index]
        data_val = self.data[index]
        
        return data_val, target


## Make and train pytorch model

In [20]:
class RNN(nn.Module): 
    def __init__(self, input_size, hidden_size, output_size, n_layers=1): 
        super(RNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        
        self.lstm = nn.GRU(input_size, hidden_size, n_layers, bidirectional = True)
        self.fc1 = nn.Linear(hidden_size*2*20, 16)
        self.fc2 = nn.Linear(16, output_size)
        
    def forward(self, inp): 
        output, hidden = self.lstm(inp)
        
        x = self.fc1(output.view(output.size(0), -1))
        x = F.relu(x)
        x = self.fc2(x)
        x = torch.sigmoid(x)
        
        return x

In [21]:
def binary_acc(y_pred, y_test):
    y_pred = y_pred.cpu()
    y_test = y_test.cpu()
    y_pred_tag = torch.round(y_pred.squeeze(1))

    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum/y_test.shape[0]
    #acc = torch.round(acc * 100)
    
    return acc

# Leave One Out Prediction for all Courses

In [59]:
def loop_course_testing(X_train, X_test, y_train, y_test): 
    # make validation set
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state = 3)

    # make datasets into torch tensors
    X_train, y_train = torch.from_numpy(X_train).float(), torch.tensor(y_train).float()
    X_test, y_test = torch.from_numpy(X_test).float(), torch.tensor(y_test).float()
    X_val, y_val = torch.from_numpy(X_val).float(), torch.tensor(y_val).float()

    X_train = X_train.to(device)
    y_train = y_train.to(device)
    X_test = X_test.to(device)
    y_test = y_test.to(device)
    X_val = X_val.to(device)
    y_val = y_val.to(device)
    
    # make dataloader
    train_dataset = oversampdata(X_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    
    
    # define model instance
    rnn_net = RNN(X_train[0].shape[1], 8, 1, n_layers = 1)
    rnn_net_best_val = RNN(X_train[0].shape[1], 8, 1, n_layers = 1)

    # set loss function, optimizer and criterion
    criterion = nn.BCELoss()
    optimizer = optim.Adam(rnn_net.parameters(), lr=0.01)
    loss_fn = nn.MSELoss()

    # set training to gpu if exists
    rnn_net.to(device)
    rnn_net_best_val.to(device)

    # training parameters
    n_steps = 10
    best_val_loss = 100

    # train model
    for i in range(n_steps):
        for i, data in enumerate(train_loader, 0):        
            inputs, labels = data        
            optimizer.zero_grad()
            y_pred_train = rnn_net(inputs)
            loss_train = criterion(y_pred_train,labels.unsqueeze(1))        
            loss_train.backward()        
            optimizer.step()

        # validation loss
        y_pred_val = rnn_net(X_val)
        loss_val = criterion(y_pred_val, y_val.unsqueeze(1))
        if (best_val_loss > loss_val.cpu().detach().numpy()): 
            rnn_net_best_val.load_state_dict(rnn_net.state_dict())
            best_val_loss = loss_val.cpu().detach().numpy()
            #print("Updated best val model")

    y_eval = rnn_net_best_val(X_test)
    print("Binary accuracy: ", binary_acc(y_eval, y_test))

    y_pred_binary = torch.round(y_eval.cpu().squeeze(1))

    print(classification_report(y_test.cpu().detach().numpy(), y_pred_binary.detach().numpy()))
    print()
    
    return 0

In [63]:
# loop through all the courses to test leave out prediction
for index, row in courses_df.iterrows(): 
    # define course 
    code_module = row['code_module']
    code_presentation = row['code_presentation']
    test_course = (code_module, code_presentation)
    print(test_course)
    
    # extract test course
    test_indecies = [course_module == test_course for course_module in course_modules]
    train_indecies = [course_module != test_course for course_module in course_modules]

    X_train, X_test = X[train_indecies], X[test_indecies]
    y_train, y_test = y[train_indecies], y[test_indecies]
    
    loop_course_testing(X_train, X_test, y_train, y_test)

('AAA', '2013J')
Binary accuracy:  tensor(0.9048)
              precision    recall  f1-score   support

         0.0       0.88      0.74      0.80       100
         1.0       0.91      0.96      0.94       278

    accuracy                           0.90       378
   macro avg       0.90      0.85      0.87       378
weighted avg       0.90      0.90      0.90       378


('AAA', '2014J')
Binary accuracy:  tensor(0.8873)
              precision    recall  f1-score   support

         0.0       0.89      0.70      0.78       102
         1.0       0.89      0.96      0.92       253

    accuracy                           0.89       355
   macro avg       0.89      0.83      0.85       355
weighted avg       0.89      0.89      0.88       355


('BBB', '2013J')
Binary accuracy:  tensor(0.9117)
              precision    recall  f1-score   support

         0.0       0.97      0.81      0.89       785
         1.0       0.88      0.98      0.93      1072

    accuracy                  

# STEM vs Social Science

In [24]:
social_science = ['AAA', 'BBB', 'GGG']
stem = ['CCC', 'DDD', 'EEE', 'FFF']

In [47]:
social_indecies = [course_module[0] in social_science for course_module in course_modules]
stem_indicies = [course_module[0] in stem for course_module in course_modules]

In [53]:
X_social, y_social = X[social_indecies], y[social_indecies]
X_stem, y_stem = X[stem_indicies], y[stem_indicies]

In [54]:
social_courses, stem_courses = [], []
for course_module in course_modules: 
    if(course_module[0] in social_science): 
        social_courses.append(course_module)
    else: 
        stem_courses.append(course_module)

In [55]:
len(stem_courses), len(X_stem)

(19330, 19330)

In [57]:
set(stem_courses)

{('CCC', '2014B'),
 ('CCC', '2014J'),
 ('DDD', '2013B'),
 ('DDD', '2013J'),
 ('DDD', '2014B'),
 ('DDD', '2014J'),
 ('EEE', '2013J'),
 ('EEE', '2014B'),
 ('EEE', '2014J'),
 ('FFF', '2013B'),
 ('FFF', '2013J'),
 ('FFF', '2014B'),
 ('FFF', '2014J')}

## Stem

In [60]:
# loop through all the courses to test leave out prediction
for test_course in set(stem_courses): 
    # define course 
    print(test_course)
    
    # extract test course
    test_indecies = [course_module == test_course for course_module in stem_courses]
    train_indecies = [course_module != test_course for course_module in stem_courses]

    X_train, X_test = X_stem[train_indecies], X_stem[test_indecies]
    y_train, y_test = y_stem[train_indecies], y_stem[test_indecies]
    
    loop_course_testing(X_train, X_test, y_train, y_test)

('CCC', '2014B')
Binary accuracy:  tensor(0.9151)
              precision    recall  f1-score   support

         0.0       0.95      0.90      0.93       998
         1.0       0.86      0.94      0.90       663

    accuracy                           0.92      1661
   macro avg       0.91      0.92      0.91      1661
weighted avg       0.92      0.92      0.92      1661


('DDD', '2013J')
Binary accuracy:  tensor(0.8976)
              precision    recall  f1-score   support

         0.0       0.97      0.83      0.90       928
         1.0       0.84      0.97      0.90       829

    accuracy                           0.90      1757
   macro avg       0.90      0.90      0.90      1757
weighted avg       0.91      0.90      0.90      1757


('FFF', '2013J')
Binary accuracy:  tensor(0.9433)
              precision    recall  f1-score   support

         0.0       0.99      0.89      0.94       986
         1.0       0.91      0.99      0.95      1095

    accuracy                  

## Social Science

In [61]:
# loop through all the courses to test leave out prediction
for test_course in set(social_courses): 
    # define course 
    print(test_course)
    
    # extract test course
    test_indecies = [course_module == test_course for course_module in social_courses]
    train_indecies = [course_module != test_course for course_module in social_courses]

    X_train, X_test = X_social[train_indecies], X_social[test_indecies]
    y_train, y_test = y_social[train_indecies], y_social[test_indecies]
    
    loop_course_testing(X_train, X_test, y_train, y_test)

('GGG', '2013J')
Binary accuracy:  tensor(0.8935)
              precision    recall  f1-score   support

         0.0       0.99      0.69      0.81       300
         1.0       0.86      0.99      0.93       592

    accuracy                           0.89       892
   macro avg       0.93      0.84      0.87       892
weighted avg       0.91      0.89      0.89       892


('BBB', '2013B')
Binary accuracy:  tensor(0.8675)
              precision    recall  f1-score   support

         0.0       0.86      0.86      0.86       721
         1.0       0.88      0.87      0.87       803

    accuracy                           0.87      1524
   macro avg       0.87      0.87      0.87      1524
weighted avg       0.87      0.87      0.87      1524


('BBB', '2014J')
Binary accuracy:  tensor(0.8850)
              precision    recall  f1-score   support

         0.0       0.84      0.88      0.86       763
         1.0       0.92      0.89      0.90      1150

    accuracy                  

# Half way-clicks predictions

In [67]:
half_vle_period = int(X.shape[2]/2)
X_half = X[:, :, :half_vle_period]

In [68]:
# loop through all the courses to test leave out prediction
for test_course in sorted(list(set(course_modules))): 
    # define course 
    print(test_course)
    
    # extract test course
    test_indecies = [course_module == test_course for course_module in course_modules]
    train_indecies = [course_module != test_course for course_module in course_modules]

    X_train, X_test = X_half[train_indecies], X_half[test_indecies]
    y_train, y_test = y[train_indecies], y[test_indecies]
    
    loop_course_testing(X_train, X_test, y_train, y_test)

('AAA', '2013J')
Binary accuracy:  tensor(0.8095)
              precision    recall  f1-score   support

         0.0       0.73      0.44      0.55       100
         1.0       0.82      0.94      0.88       278

    accuracy                           0.81       378
   macro avg       0.78      0.69      0.71       378
weighted avg       0.80      0.81      0.79       378


('AAA', '2014J')
Binary accuracy:  tensor(0.8423)
              precision    recall  f1-score   support

         0.0       0.84      0.56      0.67       102
         1.0       0.84      0.96      0.90       253

    accuracy                           0.84       355
   macro avg       0.84      0.76      0.78       355
weighted avg       0.84      0.84      0.83       355


('BBB', '2013B')
Binary accuracy:  tensor(0.7986)
              precision    recall  f1-score   support

         0.0       0.91      0.64      0.75       721
         1.0       0.74      0.95      0.83       803

    accuracy                  

## Stem

In [69]:
X_stem_half = X_stem[:, :, :half_vle_period]
# loop through all the courses to test leave out prediction
for test_course in set(stem_courses): 
    # define course 
    print(test_course)
    
    # extract test course
    test_indecies = [course_module == test_course for course_module in stem_courses]
    train_indecies = [course_module != test_course for course_module in stem_courses]

    X_train, X_test = X_stem_half[train_indecies], X_stem_half[test_indecies]
    y_train, y_test = y_stem[train_indecies], y_stem[test_indecies]
    
    loop_course_testing(X_train, X_test, y_train, y_test)

('CCC', '2014B')
Binary accuracy:  tensor(0.8043)
              precision    recall  f1-score   support

         0.0       0.93      0.73      0.82       998
         1.0       0.69      0.92      0.79       663

    accuracy                           0.80      1661
   macro avg       0.81      0.82      0.80      1661
weighted avg       0.84      0.80      0.81      1661


('DDD', '2013J')
Binary accuracy:  tensor(0.7951)
              precision    recall  f1-score   support

         0.0       0.89      0.70      0.78       928
         1.0       0.73      0.90      0.81       829

    accuracy                           0.80      1757
   macro avg       0.81      0.80      0.79      1757
weighted avg       0.81      0.80      0.79      1757


('FFF', '2013J')
Binary accuracy:  tensor(0.8284)
              precision    recall  f1-score   support

         0.0       0.88      0.74      0.80       986
         1.0       0.79      0.91      0.85      1095

    accuracy                  

## Social Science

In [70]:
X_social_half = X_social[:, :, :half_vle_period]
# loop through all the courses to test leave out prediction
for test_course in set(social_courses): 
    # define course 
    print(test_course)
    
    # extract test course
    test_indecies = [course_module == test_course for course_module in social_courses]
    train_indecies = [course_module != test_course for course_module in social_courses]

    X_train, X_test = X_social_half[train_indecies], X_social_half[test_indecies]
    y_train, y_test = y_social[train_indecies], y_social[test_indecies]
    
    loop_course_testing(X_train, X_test, y_train, y_test)

('GGG', '2013J')
Binary accuracy:  tensor(0.7791)
              precision    recall  f1-score   support

         0.0       0.78      0.48      0.60       300
         1.0       0.78      0.93      0.85       592

    accuracy                           0.78       892
   macro avg       0.78      0.71      0.72       892
weighted avg       0.78      0.78      0.76       892


('BBB', '2013B')
Binary accuracy:  tensor(0.7959)
              precision    recall  f1-score   support

         0.0       0.91      0.63      0.74       721
         1.0       0.74      0.95      0.83       803

    accuracy                           0.80      1524
   macro avg       0.83      0.79      0.79      1524
weighted avg       0.82      0.80      0.79      1524


('BBB', '2014J')
Binary accuracy:  tensor(0.8014)
              precision    recall  f1-score   support

         0.0       0.88      0.58      0.70       763
         1.0       0.77      0.95      0.85      1150

    accuracy                  