In [4]:
import os
import sys
from CodeDKT import *
from torch.utils.data import DataLoader
from readdata import data_reader, StudentDataset
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../../../Data")))
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../../..")))
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../..")))

from Data import *
from choosedataset import *
from helper import * 
from sklearn.model_selection import train_test_split


In [None]:
class Config:
    """Hyper-parameters and preprocessing sizes used throughout this notebook.

    Every setting is stored as a plain instance attribute.

    NOTE(review): other cells read ``config.questions``, which is not set
    here -- confirm where that attribute is supposed to come from.
    """

    def __init__(self):
        defaults = {
            "length": 100,             # passed to data_reader and lossFunc
            "lr": 0.0001,              # Adam learning rate
            "bs": 32,                  # DataLoader batch size
            "epochs": 15,              # epochs per fold in the k-fold loop
            "hidden": 128,             # 2nd positional argument of c2vRNNModel
            "layers": 1,               # 3rd positional argument of c2vRNNModel
            "code_path_length": 8,
            "code_path_width": 2,
            "dataset": 0,              # index into [Codeworkout, Falcon]
            "padding_size_code": 100,  # trailing items kept per prev_tasks entry
        }
        for attr_name, attr_value in defaults.items():
            setattr(self, attr_name, attr_value)

In [6]:
def create_questions_dict(df):
    """Map every task id that appears in ``df`` to a dense integer index.

    Ids are collected from ``df['new_task_id']`` (one id per row) and from
    ``df['prev_tasks_id']`` (an iterable of ids per row).

    Indices are assigned in sorted id order so that the mapping is the same
    on every run; raw set iteration order is not stable across interpreter
    sessions for string ids.

    Args:
        df: object indexable by the two column names above (e.g. a DataFrame).

    Returns:
        dict: ``{task_id: index}`` with indices ``0 .. n_unique - 1``.
    """
    all_problems = set(df['new_task_id'])
    for prev_ids in df['prev_tasks_id']:
        # In-place update avoids building a new set for every row.
        all_problems.update(prev_ids)

    try:
        ordered = sorted(all_problems)
    except TypeError:
        # Ids of mutually unorderable types: fall back to a type/repr key so
        # the order is still deterministic.
        ordered = sorted(all_problems, key=lambda task: (type(task).__name__, repr(task)))
    return {name: idx for idx, name in enumerate(ordered)}

In [7]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device_name = "cuda"
else:
    device_name = "cpu"
device = torch.device(device_name)

In [8]:
# Single settings object read by the data-loading, model and training cells.
config = Config()

In [None]:
# Instantiate the dataset class selected by config.dataset (0 -> Codeworkout, 1 -> Falcon).
data = [Codeworkout, Falcon][config.dataset]()
df = data.df
# Tab-separated file, later handed to data_reader as `code_df`.
# NOTE(review): the path is relative to the notebook's working directory.
code_df = pd.read_csv("codedkt/labeled_paths_all.tsv",sep="\t")
# For every element of each row's prev_tasks, keep only its last
# config.padding_size_code items.
df['prev_tasks'] = df['prev_tasks'].apply(lambda x: [i[-config.padding_size_code:] for i in x])

In [10]:
# Rename the code table's id/code columns in place, then build the
# task-id -> index lookup from the main frame.
code_df.rename(
    columns={
        'student_id': 'SubjectID',
        'clean_code': 'Code',
    },
    inplace=True,
)
question_dict = create_questions_dict(df)

In [11]:
def caculate_1loss(batch, model, device, criterion, loss_fn=None):
    """Run ``model`` on one batch and return either its loss or its predictions.

    Args:
        batch: mapping of name -> tensor. Every entry except ``'label'`` is
            passed to ``model`` positionally, in the mapping's iteration order.
        model: callable invoked as ``model(*inputs)``.
        device: device (``torch.device`` or device string) the batch tensors
            are moved to.
        criterion: callable invoked as
            ``criterion(model_output, batch['row'], label)``. When falsy
            (e.g. ``None``) no loss is computed.
        loss_fn: unused; kept so existing callers keep working.

    Returns:
        ``(model_output[1], label)`` when ``criterion`` is falsy, otherwise
        the value returned by ``criterion``.
    """
    dict_batch = {k: v.to(device) for k, v in batch.items()}
    model_params = {k: v for k, v in dict_batch.items() if k != 'label'}
    logits = model(*model_params.values())
    label = dict_batch['label'].float()
    if not criterion:
        return logits[1], label
    # NOTE(review): 'row' is read from the original batch, not from the copy
    # that was moved to `device` -- confirm this is what the criterion expects.
    loss = criterion(logits, batch['row'], label)
    # Drop local references before asking CUDA to release cached memory.
    del dict_batch, model_params, logits, label
    # CUDA housekeeping only makes sense on a CUDA device; calling
    # torch.cuda.synchronize() without CUDA raises.
    if torch.device(device).type == "cuda":
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
    return loss

In [28]:
# Per-batch function handed to the training and evaluation loops.
caculate_func = caculate_1loss
# Loss object built from the number of distinct task ids, config.length and the device.
criterion = lossFunc(len(question_dict), config.length, device)

# Train / validation / test split

In [None]:
def create_data_loader(df, dataset, question_dict=None, batch_size=32, create_split=True, random_state=42):
    """Build train / validation / test DataLoaders split by student id.

    With ``create_split=True`` the unique student ids are split 70 / 10 / 20
    (train / valid / test), stratified on each student's first ``Label``
    value. With ``create_split=False`` a previously saved split is loaded via
    ``load_ids(ids_filepath_prefix)``.

    Relies on the module-level ``code_df`` and ``config`` objects.

    Args:
        df: frame with at least ``student_id`` and ``Label`` columns.
        dataset: dataset class, called as ``dataset(frame, handler[, "test"])``.
        question_dict: task-id -> index mapping forwarded to ``data_reader``.
        batch_size: batch size for all three loaders.
        create_split: create a new split (True) or load a saved one (False).
        random_state: seed for both ``train_test_split`` calls so that the
            split is reproducible; pass ``None`` for a different split on
            every call.

    Returns:
        tuple: ``(train_dataloader, valid_dataloader, test_dataloader)``.
    """
    if not create_split:
        print("Load exist spliting")
        train_ids, valid_ids, test_ids = load_ids(ids_filepath_prefix)
    else:
        student_id = df['student_id'].unique()
        id_to_struggle = df.groupby('student_id')['Label'].first()
        # 30% of students are held out ...
        train_ids, holdout_ids = train_test_split(
            student_id,
            test_size=0.3,
            stratify=id_to_struggle[student_id],
            random_state=random_state,
        )
        # ... and that 30% is divided into 10% validation and 20% test.
        valid_ids, test_ids = train_test_split(
            holdout_ids,
            test_size=0.2/0.3,
            stratify=id_to_struggle[holdout_ids],
            random_state=random_state,
        )

    # The reader is given the training ids and, as one group, all held-out ids.
    handler = data_reader(df, code_df, question_dict, config.length, config.questions)
    handler.get_data(train_ids, set(valid_ids).union(set(test_ids)))

    train_df = df[df['student_id'].isin(train_ids)]
    valid_df = df[df['student_id'].isin(valid_ids)]
    test_df = df[df['student_id'].isin(test_ids)]

    # Wrap each split in the dataset class; validation and test both use "test" mode.
    train_dataset = dataset(train_df, handler)
    valid_dataset = dataset(valid_df, handler, "test")
    test_dataset = dataset(test_df, handler, "test")

    # Only the training loader is shuffled.
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    return train_dataloader, valid_dataloader, test_dataloader

In [17]:
# Create a fresh student-level split and the three DataLoaders built on it.
train_dataloader, valid_dataloader, test_dataloader = create_data_loader(df, StudentDataset, question_dict, batch_size=config.bs, create_split=True)

finish test 189
finish train 441


In [18]:
# Sanity checks on the split: batches per loader, label balance per split,
# students per split, and the pairwise student overlap between splits.
_split_loaders = (train_dataloader, valid_dataloader, test_dataloader)
print(*[len(_loader) for _loader in _split_loaders], flush=True)
for _loader in _split_loaders:
    print(_loader.dataset.df['Label'].value_counts())

_train_students, _valid_students, _test_students = [
    set(_loader.dataset.df['student_id']) for _loader in _split_loaders
]
print(len(_train_students), len(_valid_students), len(_test_students))
print(_train_students.intersection(_valid_students))
print(_train_students.intersection(_test_students))
print(_valid_students.intersection(_test_students))

264 36 76
Label
False    6409
True     2019
Name: count, dtype: int64
Label
False    879
True     266
Name: count, dtype: int64
Label
False    1831
True      591
Name: count, dtype: int64
441 62 127
set()
set()
set()


In [19]:
# Counts exposed by the training dataset; both are passed to c2vRNNModel.
node_count = train_dataloader.dataset.node_count
path_count = train_dataloader.dataset.path_count

In [26]:
# Build the model for the train/val/test run.
# NOTE(review): the 4th argument here is len(question_dict), whereas the
# k-fold cell passes config.questions in the same position -- confirm which
# one is intended. config.questions is also not set by Config in this notebook.
model = c2vRNNModel(config.questions * 2,
                    config.hidden,
                    config.layers,
                    len(question_dict),
                    node_count, path_count, device) 

In [None]:
# Run name forwarded to training_loop.
name = "code-dkt"
model = model.to(device)
print(model)
# Adam with L2 weight decay.
optimizer = torch.optim.Adam(model.parameters(), lr=config.lr, weight_decay=1e-4)
# The validation loader is passed as the loop's `test_dataloader`.
model = training_loop(model=model, train_dataloader=train_dataloader, test_dataloader=valid_dataloader, optimizer=optimizer, criterion=criterion, device=device, name=name, caculate_func=caculate_func)

In [21]:
from sklearn.metrics import roc_curve

In [22]:
# Score the validation split and compute its ROC curve.
all_labels, all_probs = eval_loop(model, valid_dataloader, device, caculate_func=caculate_func)

fpr, tpr, thresholds = roc_curve(all_labels, all_probs)

# Pick the validation threshold that maximises tpr - fpr (Youden's J statistic).
J = tpr - fpr
best_index = J.argmax()
best_threshold = thresholds[best_index]

# Score the held-out test split; the threshold above is applied to it later.
y_labels, y_probs = eval_loop(model, test_dataloader, device, caculate_func=caculate_func)

Test Batch 0 from 80
Test Batch 0 from 162
Test Batch 100 from 162


In [23]:
def results(threshold, y_true, y_prob):
    """Print classification metrics for ``y_prob`` binarised at ``threshold``.

    Prints one dict with the threshold, ROC-AUC, accuracy, precision, recall
    and F1, followed by the confusion matrix. The precision / recall / F1
    keys are suffixed with ``"0.5"`` when ``threshold == 0.5`` and with
    ``"best"`` otherwise.

    Args:
        threshold: decision threshold; scores at or above it are predicted 1.
        y_true: sequence of ground-truth binary labels.
        y_prob: sequence of predicted scores, same length as ``y_true``.

    Returns:
        None. The metrics are only printed.
    """
    y_prob = np.array(y_prob)
    y_true = np.array(y_true)
    # roc_curve defines each operating point as "score >= threshold", so the
    # same inclusive comparison is used when applying a threshold chosen from it.
    y_pred = np.where(y_prob >= threshold, 1, 0)
    roc_auc = roc_auc_score(y_true, y_prob)
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    best = "best"
    if threshold == 0.5:
        best = "0.5"
    print({"threshold": threshold, "roc_auc": roc_auc, "accuracy": accuracy, f"precision_{best}": precision, f"recall_{best}": recall, f"f1_{best}": f1})
    cm = confusion_matrix(y_true, y_pred)
    print(cm)

In [26]:
# Report test-set metrics at the threshold selected on the validation split.
results(best_threshold, y_labels, y_probs)

{'threshold': np.float32(0.58912784), 'roc_auc': np.float64(0.5504844865099237), 'accuracy': 0.5019305019305019, 'precision_best': np.float64(0.42108311613090066), 'recall_best': np.float64(0.7144963144963145), 'f1_best': np.float64(0.5298833819241983)}
[[1146 1999]
 [ 581 1454]]


# 5-fold cross-validation

In [None]:
def create_data_loader_k_fold(df, dataset, question_dict=None, batch_size=32, k=5, random_state=42):
    """Build one (train, test) DataLoader pair per stratified fold of students.

    Unique student ids are split into ``k`` shuffled folds, stratified on each
    student's first ``Label`` value. A separate ``data_reader`` is created for
    every fold from that fold's train / test student ids.

    Relies on the module-level ``code_df`` and ``config`` objects.

    Args:
        df: frame with at least ``student_id`` and ``Label`` columns.
        dataset: dataset class, called as ``dataset(frame, handler[, "test"])``.
        question_dict: task-id -> index mapping forwarded to ``data_reader``.
        batch_size: batch size for every loader.
        k: number of folds.
        random_state: seed for the fold shuffling (default matches the
            previously hard-coded 42).

    Returns:
        list of ``(train_dataloader, test_dataloader)`` tuples, one per fold.
    """
    kf = StratifiedKFold(n_splits=k, shuffle=True, random_state=random_state)
    student_id = df['student_id'].unique()
    id_to_struggle = df.groupby('student_id')['Label'].first()
    data_loaders = []

    for train_idx, test_idx in kf.split(student_id, id_to_struggle[student_id]):
        train_students = student_id[train_idx]
        test_students = student_id[test_idx]

        # Fold-specific reader built from this fold's student ids.
        handler = data_reader(df, code_df, question_dict, config.length, config.questions)
        handler.get_data(train_students, test_students)

        train_df = df[df['student_id'].isin(train_students)]
        test_df = df[df['student_id'].isin(test_students)]

        # Wrap each part in the dataset class; the held-out part uses "test" mode.
        train_dataset = dataset(train_df, handler)
        test_dataset = dataset(test_df, handler, "test")

        # Only the training loader is shuffled.
        train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

        data_loaders.append((train_dataloader, test_dataloader))
    return data_loaders

In [26]:
# One (train, test) DataLoader pair per stratified fold.
# Only arguments that create_data_loader_k_fold actually declares are passed.
data_loaders = create_data_loader_k_fold(df, StudentDataset, question_dict, batch_size=config.bs)

finish test 126
finish train 504
finish test 126
finish train 504
finish test 126
finish train 504
finish test 126
finish train 504
finish test 126
finish train 504


In [None]:
# Test metrics collected per fold; each list gets one entry per fold.
fold_results = {'ROC-AUC' : [], 'f1' : [], 'recall': [], "precision": []}

# Fixed decision threshold applied to every fold's test scores.
# NOTE(review): the saved output shows recall == 1.0 on every fold, i.e. at
# this threshold every positive is caught and precision is low -- consider
# selecting the threshold per fold instead of hard-coding it.
FOLD_DECISION_THRESHOLD = 0.25

# Fold-local names are used throughout so this loop does not overwrite the
# loaders, counts, optimizer and predictions of the train/val/test section.
for fold, (fold_train_loader, fold_test_loader) in enumerate(data_loaders):
    print(f"Fold {fold + 1}:")
    fold_node_count = fold_train_loader.dataset.node_count
    fold_path_count = fold_train_loader.dataset.path_count

    # Fresh model and optimizer for every fold.
    # NOTE(review): the 4th argument is config.questions here but
    # len(question_dict) in the train/val/test model -- confirm which is intended.
    m = c2vRNNModel(config.questions * 2,
                    config.hidden,
                    config.layers,
                    config.questions,
                    fold_node_count, fold_path_count, device)
    fold_optimizer = torch.optim.Adam(m.parameters(), lr=config.lr, weight_decay=1e-4)

    m = m.to(device)
    print(m)

    for epoch in range(config.epochs):
        total_loss = train_loop(m, fold_train_loader, device, fold_optimizer, criterion, caculate_func)

        # Report the mean batch loss every 10th epoch.
        if epoch % 10 == 0:
            print(f"Fold {fold + 1}, Epoch {epoch}: Loss = {total_loss / len(fold_train_loader)}")

    # Evaluate on this fold's held-out students.
    fold_labels, fold_probs = eval_loop(m, fold_test_loader, device, caculate_func=caculate_func)
    y_prob = np.array(fold_probs)
    y_true = np.array(fold_labels)
    y_pred = np.where(y_prob > FOLD_DECISION_THRESHOLD, 1, 0)

    fold_results['ROC-AUC'].append(roc_auc_score(y_true, y_prob))
    fold_results['precision'].append(precision_score(y_true, y_pred))
    fold_results['recall'].append(recall_score(y_true, y_pred))
    fold_results['f1'].append(f1_score(y_true, y_pred))

In [15]:
# Display the per-fold metric lists gathered by the k-fold loop.
fold_results

{'ROC-AUC': [np.float64(0.7038260755229249),
  np.float64(0.713637798528403),
  np.float64(0.6709280024788047),
  np.float64(0.7061674180904653),
  np.float64(0.7144690792863939)],
 'f1': [np.float64(0.4151067323481117),
  np.float64(0.37339635381498987),
  np.float64(0.4390728476821192),
  np.float64(0.3487352445193929),
  np.float64(0.3549843695727683)],
 'recall': [np.float64(1.0),
  np.float64(1.0),
  np.float64(1.0),
  np.float64(1.0),
  np.float64(1.0)],
 'precision': [np.float64(0.2619146290924161),
  np.float64(0.22955583229555832),
  np.float64(0.28128977513788717),
  np.float64(0.21119281045751634),
  np.float64(0.21579391891891891)]}