## Setting Custom Parameters

Modify the following parameters according to your folder organization. Example files for each folder can be found in this repository.

In [1]:
INSTRUCTOR_SOLUTIONS = "./instructor_solutions"
STUDENT_INTERACTION_DATA = "./student_interaction_data"
OUTPUT_FOLDER = "./datasets/experiment"

Select the number of students to be sampled from the set. Must be less than the total number of students in the dataset.

In [2]:
STUDENT_SAMPLE_SIZE = 400

List of semesters the data was collected from. Must match the names of the subfolders in STUDENT_INTERACTION_DATA.

In [3]:
LIST_OF_SEMESTERS = ['semester']

## Imports

In [4]:
import os

import numpy as np
import pandas as pd

from tqdm.auto import tqdm
from graphs import *

## Generate ASTs for Solution Space

In [5]:
sol_path = INSTRUCTOR_SOLUTIONS
sol_trees = {}
solutions = {}
for file in tqdm(os.listdir(sol_path)):
    try:
        with open(os.path.join(sol_path, file), 'rb') as f:
            program = f.read().decode("utf-8-sig").encode("utf-8")
        solutions[file[:-3]] = [program]
    except:
        print("Error while reading instructor solution:", file)

  0%|          | 0/461 [00:00<?, ?it/s]

In [6]:
def return_correct_submissions(problemName):
    '''
    Helper function for filtering a DataFrame for correct student submissions.
    '''
    return data[(data['Score'] == 1.0) & (data['ProblemName'] == problemName)]['Input'].unique()

In [7]:
for root, dirs, files in os.walk(STUDENT_INTERACTION_DATA):
    if len(dirs) > 0:
        continue
    users = {} 
    np.random.seed(42)
    random_set = None
    for file in tqdm(files):
        data = pd.read_csv(os.path.join(root, file))
        if len(data) <= 0:
            continue
        for problemName in data['ProblemName'].unique():
            problemInput = return_correct_submissions(problemName)
            try:
                solutions[problemName].extend(problemInput)
            except:
                print("Error while collecting the solutions for problem:", problemName)

  0%|          | 0/13 [00:00<?, ?it/s]

Error while collecting the solutions for problem: progPrint_F
Error while collecting the solutions for problem: progReturnSpecificNumber


  0%|          | 0/13 [00:00<?, ?it/s]

Error while collecting the solutions for problem: progPrint
Error while collecting the solutions for problem: progReturnSpecificNumber


  0%|          | 0/13 [00:00<?, ?it/s]

Error while collecting the solutions for problem: progPrint_C
Error while collecting the solutions for problem: progReturnSpecificNumber


  0%|          | 0/13 [00:00<?, ?it/s]

Error while collecting the solutions for problem: progPrint
Error while collecting the solutions for problem: progReturnSpecificNumber


  0%|          | 0/15 [00:00<?, ?it/s]

Error while collecting the solutions for problem: progPrint


  0%|          | 0/14 [00:00<?, ?it/s]

Error while collecting the solutions for problem: progPrint
Error while collecting the solutions for problem: progReturnSpecificNumber


  0%|          | 0/13 [00:00<?, ?it/s]

Error while collecting the solutions for problem: progPrint_F
Error while collecting the solutions for problem: progReturnSpecificNumber


In [8]:
solutions_embedded = {}
solutions_traversal = {}
tfidf_params = {}
for problem in tqdm(solutions):
    def parser_handler(content):
        try:
            return ast.parse(content)
        except:
            print("Error while parsing AST - returning empty tree...")
            return ast.Module()
    ast_trees = [parser_handler(solution) for solution in solutions[problem]]
    tfidf_vectors, all_nodes, idf = compute_tfidf(ast_trees)
    solutions_embedded[problem] = tfidf_vectors
    solutions_traversal[problem] = [dfs_traversal(tree) for tree in ast_trees]
    tfidf_params[problem] = (all_nodes, idf) 
    

  0%|          | 0/461 [00:00<?, ?it/s]

Error while parsing AST - returning empty tree...
Error while parsing AST - returning empty tree...
Error while parsing AST - returning empty tree...
Error while parsing AST - returning empty tree...
Error while parsing AST - returning empty tree...
Error while parsing AST - returning empty tree...
Error while parsing AST - returning empty tree...
Error while parsing AST - returning empty tree...
Error while parsing AST - returning empty tree...
Error while parsing AST - returning empty tree...


## Compare Student Submission to the Solution Space

In [9]:
cache = {}
def get_list_of_nodes(programName, dataInput):
    if programName not in solutions:
        return ":"

    min_op = 50
    try:
        submission = ast.parse(dataInput)
        raise Exception
    except:
        submission = None
        target = ast.parse(solutions[programName][0])
        incorrect_ops = tree_edit_distance_with_operations(target, submission)
    else:
        # find two closest candidates for comparing the trees
        submission_vec = compute_tfidf_ood(submission, *tfidf_params[programName])
        dfs_tree = dfs_traversal(submission)
        d_tfidf = []
        for vec in solutions_embedded[programName]:
            d_tfidf.append(euclidean_distance(submission_vec, vec))

        sort_counter = 0
        indice = np.argmin(d_tfidf)
        while True:
            sort_counter += 1
            try:
                sol_tfidf = solutions[programName][indice]
                _ = ast.parse(sol_tfidf)
                break
            except:
                indice = np.argsort(d_tfidf)[sort_counter:][0]


        d_align = []
        for tree in (solutions_traversal[programName]):
            d_align.append(calculate_dissimilarity(dfs_tree, tree))
        sol_align = solutions[programName][np.argmin(d_align)]
        ops_tfidf = tree_edit_distance_with_operations(ast.parse(sol_tfidf), submission)

        incorrect_ops = ops_tfidf
        target = ast.parse(sol_tfidf)
        

    correct_ops = set_of_children(target).difference(incorrect_ops)
    l = ','.join(list(correct_ops) + list(incorrect_ops)) + ':' + ','.join('1'*len(correct_ops)+'0'*len(incorrect_ops))
    return l

## Generate Transaction Data

In [10]:
cols = [
    'Transaction Id',
    'Anon Student Id',
    'Session Id', # set to 1 for all
    'Time',
    'Level (Unit)', # homework no
    'Problem Name',
    'Problem Start Time',
    'Input',
    'Step Name',
    'Outcome',
    'KC (Binary-Node)',
    'KC Category (Binary-Node)'
]

In [11]:
from hashlib import md5

def string_hash(string):
    return md5(string.encode()).hexdigest()


In [12]:
from collections import defaultdict
counters = defaultdict(int)

user_counts = {semester:{} for semester in LIST_OF_SEMESTERS}

import string
table = str.maketrans('', '', string.ascii_lowercase)

def user_counter(anonid, semester):
    if anonid not in user_counts[semester]:
        counters[semester] += 1
        user_counts[semester][anonid] = f"{semester.translate(table)}-S{counters[semester]:05d}"
    return user_counts[semester][anonid]

In [13]:
def populate_import_csv(data, semester, random_set=None):
    df = pd.DataFrame()
    df[cols[1]] = data['AnonID'].apply(user_counter, args=(semester,))
    
    if random_set is not None:
        df = df[df['Anon Student Id'].isin(random_set)]
    
    df[cols[2]] = 1
    df[cols[3]] = data['Timestamp']
    df[cols[4]] = data['Assessment']
    df[cols[5]] = data['ProblemName']
    df[cols[6]] = data['Timestamp'] # str apply [:-6]
    tqdm.pandas()

    series = data.progress_apply(lambda x: get_list_of_nodes(x['ProblemName'], x['Input']), axis=1)

    df[cols[9]] = series.apply(lambda x: x.split(':')[1])
    df[cols[10]] = series.apply(lambda x: x.split(':')[0])

    df[cols[11]] = ''
    df = df.drop_duplicates(subset=[cols[1], cols[5]])
    df = df.assign(**{cols[10]: df[cols[10]].str.split(','),
                      cols[9]: df[cols[9]].str.split(',').apply(lambda x: x if '' in x else [float(i) for i in x])}).explode(
        [cols[10], cols[9]])
    df[cols[9]] = df[cols[9]].apply(lambda x: 'CORRECT' if x == 1.0 else 'INCORRECT')
    df[cols[8]] = df[cols[10]]

    df.reset_index()
    return df

In [14]:
student_dict = {}

In [15]:
for root, dirs, files in os.walk(STUDENT_INTERACTION_DATA):
    if len(dirs) > 0:
        continue
    users = {} 
    np.random.seed(42)
    random_set = None
    student_list = []
    for file in tqdm(files):
        data = pd.read_csv(os.path.join(root, file))
        if len(data) <= 0:
            continue
        df = populate_import_csv(data, root.split('/')[-1], random_set)
        if random_set is None:
            random_set = np.random.choice(df['Anon Student Id'].unique(), STUDENT_SAMPLE_SIZE, replace=False)
            df = df[df['Anon Student Id'].isin(random_set)]
        student_list.extend(df['Anon Student Id'].unique())
        os.makedirs(os.path.join(OUTPUT_FOLDER, root.split('/')[-1]), exist_ok=True)
        df.to_csv(os.path.join(OUTPUT_FOLDER, root.split('/')[-1], file[:-4] + '.txt'), sep='\t')


  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/7623 [00:00<?, ?it/s]

  0%|          | 0/9080 [00:00<?, ?it/s]

  0%|          | 0/9924 [00:00<?, ?it/s]

  0%|          | 0/11837 [00:00<?, ?it/s]

  0%|          | 0/9159 [00:00<?, ?it/s]

  0%|          | 0/10605 [00:00<?, ?it/s]

  0%|          | 0/19780 [00:00<?, ?it/s]

  0%|          | 0/21157 [00:00<?, ?it/s]

  0%|          | 0/16350 [00:00<?, ?it/s]

  0%|          | 0/11470 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

KeyError: 'Fall22'