In [1]:
import pandas as pd
import numpy as np
from scipy.stats import zscore
import sklearn
from sklearn.preprocessing import LabelEncoder
from bisect import bisect

In [3]:
action_df = pd.read_csv('datashop_data.csv', index_col='Unnamed: 0')

In [4]:
# test a small sample of 10k actions
action_df = action_df[:10000]

In [5]:
action_df = action_df[['Anon Student Id', 'Time', 'Level (Workspace Id)', 'Problem Name', 'Step Name', 
                       'Outcome', 'Help Level', 'Attempt At Step', 'KC Model(MATHia)', 'CF (Skill New p-Known)',
                       'CF (Semantic Event Id)']]
action_df.columns = ['user_id', 'server_time', 'section_id', 'problem_id', 'goalnode_id', 
                     'tutor_outcome', 'help_level', 'attempt', 'skill', 'pknow', 'semantic_event_id']

In [6]:

def pknown_direct(row):
    if row[0]==1:
        return row[1]
    else:
        return -1

def incorrect(x):
    if x in ['OK', 'OK_AMBIGUOUS']:
        return 0
    else:
        return 1
def problem_step_count_last5(window):
    return (window == window.iloc[-1]).sum()
def prob_step_first_att(row):
    if (row[0], row[1]) in prob_step_set:
        return 0
    else:
        prob_step_set.add((row[0], row[1]))
        return 1

def step_first_att(row):
    if row in step_set:
        return 0
    else:
        step_set.add(row)
        return 1

def getFeatures(df):
    feature_df = df.copy()
    if len(feature_df) - len(feature_df.dropna())>0:
        raise ValueError('The input DataFrame contains NaNs')
    feature_df['duration'] = feature_df.server_time.diff().to_numpy()/1000
    feature_df.loc[feature_df[feature_df['user_id'].shift() != feature_df['user_id']].index, 'duration'] = np.nan
    feature_df.drop(feature_df[feature_df.duration<feature_df.duration.quantile(0.025)].index, inplace=True)
    feature_df.drop(feature_df[feature_df.duration>feature_df.duration.quantile(0.975)].index, inplace=True)
    feature_df['assess_OK'] = feature_df.tutor_outcome.apply(lambda x: 1 if x=='OK' else 0)
    feature_df['assess_BUG'] = feature_df.tutor_outcome.apply(lambda x: 1 if x=='JIT' else 0)
    feature_df['assess_ERROR'] = feature_df.tutor_outcome.apply(lambda x: 1 if x=='ERROR' else 0)
    feature_df['assess_INITIAL_HINT'] = feature_df.tutor_outcome.apply(lambda x: 1 if x=='INITIAL_HINT' else 0)
    feature_df['duration_sd'] = feature_df[['duration']].apply(zscore, nan_policy='omit')
    global prob_step_set, step_set
    prob_step_set = set()
    step_set = set()
    feature_df['prob_first_att'] = feature_df[['problem_id', 'goalnode_id']].apply(prob_step_first_att, axis=1).values
    feature_df['step_first_att'] = feature_df.goalnode_id.apply(step_first_att).values
    correct_attempts = feature_df[['user_id', 'goalnode_id', 'assess_OK']].groupby(['user_id', 'goalnode_id']).sum()
    count_attempts = feature_df[['user_id', 'goalnode_id', 'assess_OK']].groupby(['user_id', 'goalnode_id']).count()
    wrong_attempts = count_attempts - correct_attempts
    error_perc = wrong_attempts/count_attempts
    feature_df['wrong_attempts'] = feature_df[['user_id', 'goalnode_id']].apply(lambda x: wrong_attempts.loc[(x[0], x[1]), 'assess_OK'], axis=1)
    feature_df['error_perc'] = feature_df[['user_id', 'goalnode_id']].apply(lambda x: error_perc.loc[(x[0], x[1]), 'assess_OK'], axis=1)
    max_attempts = feature_df[['user_id', 'problem_id', 'goalnode_id', 'attempt']].groupby(['user_id', 'problem_id', 'goalnode_id']).max()[['attempt']]
    feature_df['numsteps'] = feature_df[['user_id', 'problem_id', 'goalnode_id']].apply(lambda x: max_attempts.loc[(x[0], x[1], x[2]), 'attempt'], axis=1)
    feature_df['help_or_error'] = feature_df.tutor_outcome.apply(incorrect)
    help_or_error = feature_df[['user_id', 'goalnode_id', 'help_or_error']].groupby(['user_id', 'goalnode_id']).sum()[['help_or_error']]
    feature_df['help_and_errors_count'] = feature_df[['user_id', 'goalnode_id']].apply(lambda x: help_or_error.loc[(x[0], x[1]), 'help_or_error'], axis=1)
    feature_df['error_count'] = feature_df['assess_BUG'] + feature_df['assess_ERROR']
    feature_df['error_count_last_5'] = feature_df.error_count.rolling(5).sum()
    feature_df['dur_sd_prev3'] = feature_df.duration_sd.rolling(3).sum()
    feature_df['dur_sd_prev5'] = feature_df.duration_sd.rolling(5).sum()
    feature_df['assess_HINT_LEVEL_CHANGE'] = feature_df.tutor_outcome.apply(lambda x: 1 if x=='HINT_LEVEL_CHANGE' else 0)
    feature_df['help_attempts_last_8'] = feature_df.assess_HINT_LEVEL_CHANGE.rolling(8).sum()
    le = LabelEncoder()
    feature_df['goalnode_id'] = feature_df['problem_id']+feature_df['goalnode_id']
    feature_df['encoded_goalnodes'] = le.fit_transform(feature_df.goalnode_id)
    feature_df['prob_step_last_5'] = feature_df[['encoded_goalnodes']].rolling(5).apply(problem_step_count_last5)
    return feature_df

def getClipsIDs(feat_df):
    feat = feat_df.copy()
    refined_clips = {}
    for size in [8,7,6,5,4,3,2,1]:
        clips = feat.loc[feat.loc[~feat.index.duplicated(keep='first')].duration.rolling(size).sum()<=20].index
        if len(clips)==0:
            continue
        refined_clips[size] = [clips[0]]
        for i in clips:
            if i-refined_clips[size][-1]>=size:
                refined_clips[size].append(i)
        dropIDs = []
        uniqueIDs = feat.loc[~feat.index.duplicated(keep='first')].index.to_list()
        for clip_id in refined_clips[size]:
            pos = bisect(uniqueIDs, clip_id)
            dropIDs.extend(uniqueIDs[pos-size:pos])
        feat = feat.drop(index=dropIDs)
    return refined_clips

def getSimpleClipsIDs(feat_df):
    feat = feat_df.copy()
    refined_clips = {}
    for size in [8,7,6,5,4,3,2]:
        clips = feat.loc[feat.duration.rolling(size).sum()<=20].index.to_list()
        if len(clips)==0:
            continue
        refined_clips[size] = [clips[0]]
        for i in clips:
            if i-refined_clips[size][-1]>=size:
                refined_clips[size].append(i)
        dropIDs = []
        uniqueIDs = feat.index.to_list()
        for clip_id in refined_clips[size]:
            pos = bisect(uniqueIDs, clip_id)
            dropIDs.extend(uniqueIDs[pos-size:pos])
        feat = feat.drop(index=dropIDs)
    for k, v in refined_clips.items():
        refined_clips[k] = feat_df.loc[v]['semantic_event_id'].to_list()
    return refined_clips

In [7]:
df_to_input = action_df.dropna().drop_duplicates().reset_index(drop=True)
features = getFeatures(df_to_input.sort_values(['user_id', 'server_time']))
features['pknow_direct'] = features[['attempt', 'pknow']].apply(pknown_direct, axis=1)
features['pknow'] = features.pknow.astype(float)
features['pknow_direct'] = features.pknow_direct.astype(float)
feats_per_user = {uid: features[features.user_id==uid] for uid in features.user_id.unique()}
clipIDs = {user_id: getClipsIDs(feats_per_user[user_id]) for user_id in feats_per_user.keys()}

In [8]:
def processClips(feat_df, clipIDs):
    feat = feat_df.copy()
    clips = feat.head().describe().unstack().to_frame().T
    funique = feat[~feat.index.duplicated()]
    for size in range(2, 9):
        if size in clipIDs.keys():
            for ix in clipIDs[size]:
                start = funique.loc[:ix].tail(size).index[0]
                cur = feat.loc[start:ix].describe().unstack().to_frame().T
                cur.index=[ix]
                clips = pd.concat((clips, cur))
    clips.columns = [f'{i[0]}_{i[1]}' for i in clips.columns]
    for f in feat.columns:
        clips[(f+'_sum')] = clips[(f+'_mean')] * clips[(f+'_count')]
    return clips

In [9]:
processed = {sid:processClips(feats_per_user[sid].drop(
    ['user_id', 'problem_id', 'tutor_outcome',
    'goalnode_id', 'server_time', 'attempt', 'help_level',
    'section_id', 'skill', 'help_or_error', 'semantic_event_id',
    'encoded_goalnodes', 'error_count'], axis=1), clipIDs[sid]) for sid in feats_per_user.keys()}

In [10]:
# the order of features that the classifiers expect
order = ['assess_BUG_25%', 'assess_BUG_50%', 'assess_BUG_75%',
       'assess_BUG_count', 'assess_BUG_max', 'assess_BUG_mean',
       'assess_BUG_min', 'assess_BUG_std', 'assess_BUG_sum',
       'assess_OK_25%', 'assess_OK_50%', 'assess_OK_75%',
       'assess_OK_count', 'assess_OK_max', 'assess_OK_mean',
       'assess_OK_min', 'assess_OK_std', 'assess_OK_sum',
       'assess_ERROR_25%', 'assess_ERROR_50%', 'assess_ERROR_75%',
       'assess_ERROR_count', 'assess_ERROR_max', 'assess_ERROR_mean',
       'assess_ERROR_min', 'assess_ERROR_std', 'assess_ERROR_sum',
       'assess_INITIAL_HINT_25%', 'assess_INITIAL_HINT_50%',
       'assess_INITIAL_HINT_75%', 'assess_INITIAL_HINT_count',
       'assess_INITIAL_HINT_max', 'assess_INITIAL_HINT_mean',
       'assess_INITIAL_HINT_min', 'assess_INITIAL_HINT_std',
       'assess_INITIAL_HINT_sum', 'dur_sd_prev3_25%', 'dur_sd_prev3_50%',
       'dur_sd_prev3_75%', 'dur_sd_prev3_count', 'dur_sd_prev3_max',
       'dur_sd_prev3_mean', 'dur_sd_prev3_min', 'dur_sd_prev3_std',
       'dur_sd_prev3_sum', 'dur_sd_prev5_25%', 'dur_sd_prev5_50%',
       'dur_sd_prev5_75%', 'dur_sd_prev5_count', 'dur_sd_prev5_max',
       'dur_sd_prev5_mean', 'dur_sd_prev5_min', 'dur_sd_prev5_std',
       'dur_sd_prev5_sum', 'duration_sd_25%', 'duration_sd_50%',
       'duration_sd_75%', 'duration_sd_count', 'duration_sd_max',
       'duration_sd_mean', 'duration_sd_min', 'duration_sd_std',
       'duration_sd_sum', 'error_count_last_5_25%',
       'error_count_last_5_50%', 'error_count_last_5_75%',
       'error_count_last_5_count', 'error_count_last_5_max',
       'error_count_last_5_mean', 'error_count_last_5_min',
       'error_count_last_5_std', 'error_count_last_5_sum',
       'error_perc_25%', 'error_perc_50%', 'error_perc_75%',
       'error_perc_count', 'error_perc_max', 'error_perc_mean',
       'error_perc_min', 'error_perc_std', 'error_perc_sum',
       'help_and_errors_count_25%', 'help_and_errors_count_50%',
       'help_and_errors_count_75%', 'help_and_errors_count_count',
       'help_and_errors_count_max', 'help_and_errors_count_mean',
       'help_and_errors_count_min', 'help_and_errors_count_std',
       'help_and_errors_count_sum', 'help_attempts_last_8_25%',
       'help_attempts_last_8_50%', 'help_attempts_last_8_75%',
       'help_attempts_last_8_count', 'help_attempts_last_8_max',
       'help_attempts_last_8_mean', 'help_attempts_last_8_min',
       'help_attempts_last_8_std', 'help_attempts_last_8_sum',
       'numsteps_25%', 'numsteps_50%', 'numsteps_75%', 'numsteps_count',
       'numsteps_max', 'numsteps_mean', 'numsteps_min', 'numsteps_std',
       'numsteps_sum', 'pknow_25%', 'pknow_50%', 'pknow_75%',
       'pknow_count', 'pknow_direct_25%', 'pknow_direct_50%',
       'pknow_direct_75%', 'pknow_direct_count', 'pknow_direct_max',
       'pknow_direct_mean', 'pknow_direct_min', 'pknow_direct_std',
       'pknow_direct_sum', 'pknow_max', 'pknow_mean', 'pknow_min',
       'pknow_std', 'pknow_sum', 'prob_first_att_25%',
       'prob_first_att_50%', 'prob_first_att_75%', 'prob_first_att_count',
       'prob_first_att_max', 'prob_first_att_mean', 'prob_first_att_min',
       'prob_first_att_std', 'prob_first_att_sum', 'prob_step_last_5_25%',
       'prob_step_last_5_50%', 'prob_step_last_5_75%',
       'prob_step_last_5_count', 'prob_step_last_5_max',
       'prob_step_last_5_mean', 'prob_step_last_5_min',
       'prob_step_last_5_std', 'prob_step_last_5_sum',
       'step_first_att_25%', 'step_first_att_50%', 'step_first_att_75%',
       'step_first_att_count', 'step_first_att_max',
       'step_first_att_mean', 'step_first_att_min', 'step_first_att_std',
       'step_first_att_sum', 'wrong_attempts_25%', 'wrong_attempts_50%',
       'wrong_attempts_75%', 'wrong_attempts_count', 'wrong_attempts_max',
       'wrong_attempts_mean', 'wrong_attempts_min', 'wrong_attempts_std',
       'wrong_attempts_sum']

In [11]:
%store -r classifiers

In [12]:
classifiers[-3].predict(processed['some_student_id'][order].dropna())[:100]

array([1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0])