# CFAR Feature Extract

In [206]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from utils import *

## Train Set

In [51]:
# Get the data:
train_filepath = 'data/train.csv'
traindf = pd.read_table(train_filepath)

  This is separate from the ipykernel package so we can avoid doing imports until


In [52]:
# Inspect the schema
traindf.columns

Index(['Row', 'Anon Student Id', 'Problem Hierarchy', 'Problem Name',
       'Problem View', 'Step Name', 'Step Start Time',
       'First Transaction Time', 'Correct Transaction Time', 'Step End Time',
       'Step Duration (sec)', 'Correct Step Duration (sec)',
       'Error Step Duration (sec)', 'Correct First Attempt', 'Incorrects',
       'Hints', 'Corrects', 'KC(Default)', 'Opportunity(Default)'],
      dtype='object')

In [None]:
# Inspect the head of data
traindf.head(100)

In [54]:
# Apply df as alias for training data in the following computation
df = traindf
df.dtypes

Row                              int64
Anon Student Id                 object
Problem Hierarchy               object
Problem Name                    object
Problem View                     int64
Step Name                       object
Step Start Time                 object
First Transaction Time          object
Correct Transaction Time        object
Step End Time                   object
Step Duration (sec)            float64
Correct Step Duration (sec)    float64
Error Step Duration (sec)      float64
Correct First Attempt            int64
Incorrects                       int64
Hints                            int64
Corrects                         int64
KC(Default)                     object
Opportunity(Default)            object
dtype: object

In [216]:
# Clean up in dataframe, to remove redundant columns
# maybe some columns are deprecated, and no more used,
# fill them out of the following list and run this cell
# to do cleanup
columns_to_remove = ["CFA | Problem Name,Step Name"]
df = df.drop(columns=columns_to_remove)
df.columns

Index(['Row', 'Anon Student Id', 'Problem Hierarchy', 'Problem Name',
       'Problem View', 'Step Name', 'Step Start Time',
       'First Transaction Time', 'Correct Transaction Time', 'Step End Time',
       'Step Duration (sec)', 'Correct Step Duration (sec)',
       'Error Step Duration (sec)', 'Correct First Attempt', 'Incorrects',
       'Hints', 'Corrects', 'KC(Default)', 'Opportunity(Default)',
       'CFA | Anon Student Id', 'CFA | Step Name', 'CFA | Problem Name',
       'CFA | KC(Default)', 'CFA | Anon Student Id,Problem Name',
       'CFA | Anon Student Id,KC(Default)'],
      dtype='object')

## Split out problem unit and problem section

In [218]:
split_unit = lambda row: row["Problem Hierarchy"].split(',')[0].strip()
split_section = lambda row: row["Problem Hierarchy"].split(',')[1].strip()
df.insert(3, "Problem Unit", df.apply(split_unit, axis="columns"))
df.insert(4, "Problem Section", df.apply(split_section, axis="columns"))

## Correct First Attemp Rate

In [90]:
def CFAR(df, idx, columns, newcol, itercount, totalcount):
    if len(columns) == 0:
        cfa = df.loc[idx & (df["Correct First Attempt"] == 1)]
        df.loc[idx, newcol] = len(cfa) / len(df.loc[idx]) if len(df.loc[idx]) != 0 else 0.0
        
        # Update progress
        print("%.1f%%" % (itercount / totalcount * 100.0), end='\r')
        itercount+=1
    else:
        col = columns[0]
        keys = set(df[col])
        print("%s: %d" % (col, len(keys)))
        for k in keys:
            if idx is None:
                local_idx = (df[col] == k)
            else:
                local_idx = idx & (df[col] == k)
            CFAR(df, local_idx, columns[1:], newcol)
            
def computeCFAR(df, columns):
    newcol = nameOfCFAR(columns)
    df[newcol] = pd.Series(np.zeros(len(df)), index=df.index)
    total = 1
    for col in columns:
        total *= len(set(df[col]))
    CFAR(df, None, columns, newcol, 0, total)

In [91]:
# CFA | Student Name
computeCFAR(df, ["Anon Student Id"])

99.4%

In [95]:
# CFA | Step Name
computeCFAR(df, ["Step Name"])

100.0%

In [92]:
# CFA | Problem Name
computeCFAR(df, ["Problem Name"])

99.9%

In [93]:
# CFA | KC
computeCFAR(df, ["KC(Default)"])

99.7%

In [156]:
# CFA | Student Name, Problem Name
computeCFAR(df, ["Anon Student Id", "Problem Name"])

100.0%

In [220]:
# CFA | Student Name, Problem Unit
computeCFAR(df, ["Anon Student Id", "Problem Unit"])

100.0%

In [94]:
# CFA | Student Name, KC
computeCFAR(df, ["Anon Student Id", "KC(Default)"])

100.0%

In [221]:
# Inspect the head of new features
df.head(100)

Unnamed: 0,Row,Anon Student Id,Problem Hierarchy,Problem Unit,Problem Section,Problem Name,Problem View,Step Name,Step Start Time,First Transaction Time,...,Corrects,KC(Default),Opportunity(Default),CFA | Anon Student Id,CFA | Step Name,CFA | Problem Name,CFA | KC(Default),"CFA | Anon Student Id,Problem Name","CFA | Anon Student Id,KC(Default)","CFA | Anon Student Id,Problem Unit"
0,9938,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",Unit CTA1_13,Section CTA1_13-1,PROP04,1,R1C1,2005-09-09 12:23:34.0,2005-09-09 12:24:07.0,...,1,,,0.748749,0.840631,0.710197,0.000000,0.666667,0.000000,0.762376
1,9939,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",Unit CTA1_13,Section CTA1_13-1,PROP04,1,R1C2,2005-09-09 12:24:07.0,2005-09-09 12:24:22.0,...,1,,,0.748749,0.830699,0.710197,0.000000,0.666667,0.000000,0.762376
2,9940,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",Unit CTA1_13,Section CTA1_13-1,PROP04,1,R3C1,2005-09-09 12:24:22.0,2005-09-09 12:25:16.0,...,1,Define Variable,1,0.748749,0.966979,0.710197,0.966979,0.666667,0.965517,0.762376
3,9941,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",Unit CTA1_13,Section CTA1_13-1,PROP04,1,R3C2,2005-09-09 12:25:40.0,2005-09-09 12:25:51.0,...,1,"Using small numbers~~Write expression, positiv...",1~~1~~1,0.748749,0.404477,0.710197,0.443541,0.666667,0.413793,0.762376
4,9942,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",Unit CTA1_13,Section CTA1_13-1,PROP04,1,R4C1,2005-09-09 12:27:24.0,2005-09-09 12:27:30.0,...,1,Entering a given,1,0.748749,0.760107,0.710197,0.817953,0.666667,0.815287,0.762376
5,9943,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",Unit CTA1_13,Section CTA1_13-1,PROP04,1,R5C1,2005-09-09 12:27:30.0,2005-09-09 12:27:41.0,...,1,Entering a given,2,0.748749,0.755559,0.710197,0.817953,0.666667,0.815287,0.762376
6,9944,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",Unit CTA1_13,Section CTA1_13-1,PROP04,1,R6C1,2005-09-09 12:27:41.0,2005-09-09 12:28:41.0,...,1,Entering a given,3,0.748749,0.694107,0.710197,0.817953,0.666667,0.815287,0.762376
7,9945,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",Unit CTA1_13,Section CTA1_13-1,PROP04,1,R7C2,2005-09-09 12:28:50.0,2005-09-09 12:28:58.0,...,1,Entering a given,4,0.748749,0.757615,0.710197,0.817953,0.666667,0.815287,0.762376
8,9946,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",Unit CTA1_13,Section CTA1_13-1,PROP04,1,ValidEquations,2005-09-09 12:31:39.0,2005-09-09 12:31:52.0,...,2,,,0.748749,0.880437,0.710197,0.000000,0.666667,0.000000,0.762376
9,9947,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",Unit CTA1_13,Section CTA1_13-1,PROP04,1,7/10*X = 1400,2005-09-09 12:31:52.0,2005-09-09 12:32:28.0,...,1,,,0.748749,0.571429,0.710197,0.000000,0.666667,0.000000,0.762376


In [222]:
# Save these precious features into disk !!
df.to_csv("data/CFAR_features_train.csv", sep='\t')

## Test Set

In [100]:
# Get the test data:
test_filepath = 'data/test.csv'
testdf = pd.read_table(test_filepath)

  This is separate from the ipykernel package so we can avoid doing imports until


In [228]:
testdf.insert(3, "Problem Unit", df.apply(split_unit, axis="columns"))
testdf.insert(4, "Problem Section", df.apply(split_section, axis="columns"))

In [229]:
# Inspect schema
testdf.columns
# Inspect head of testdf
testdf.head(100)

Unnamed: 0,Row,Anon Student Id,Problem Hierarchy,Problem Unit,Problem Section,Problem Name,Problem View,Step Name,Step Start Time,First Transaction Time,...,Hints,Corrects,KC(Default),Opportunity(Default),CFA | Anon Student Id,CFA | Step Name,CFA | Problem Name,CFA | KC(Default),"CFA | Anon Student Id,Problem Name","CFA | Anon Student Id,KC(Default)"
14,21616,8o0mJnIpFH,"Unit CTA1_06, Section CTA1_06-3",Unit CTA1_13,Section CTA1_13-1,JAN05,2,R5C1,,,...,,,Using simple numbers~~Using large numbers~~Fin...,24~~15~~18,0.820012,0.755559,0.777679,0.686542,0.750000,0.631579
15,22178,8o0mJnIpFH,"Unit ES_03, Section ES_03-6",Unit CTA1_13,Section CTA1_13-1,EG51A,2,constant termsR2,,,...,,,,,0.820012,0.870237,0.885030,0.000000,0.809524,0.000000
16,22320,8o0mJnIpFH,"Unit CTA1_08, Section CTA1_08-2",Unit CTA1_13,Section CTA1_13-1,FEB03,1,yMin,,,...,,,Changing axis bounds,98,0.820012,0.818575,0.763982,0.775749,0.833333,0.904762
17,22862,8o0mJnIpFH,"Unit ES_04, Section ES_04-15",Unit CTA1_13,Section CTA1_13-1,LIT58,1,7+y*(b+n) = r+f,,,...,,,[SkillRule: Eliminate Parens; {CLT nested; CLT...,70,0.820012,0.761905,0.741007,0.865864,0.000000,0.875000
18,23277,8o0mJnIpFH,"Unit CTA1_10, Section CTA1_10-5",Unit CTA1_13,Section CTA1_13-1,DISTFB08_SP,1,R3C2,,,...,,,"Using simple numbers~~Write expression, positi...",78~~45~~56,0.820012,0.404477,0.806701,0.460606,1.000000,0.125000
19,23522,8o0mJnIpFH,"Unit ES_07, Section ES_07-4",Unit CTA1_13,Section CTA1_13-1,LIT63A,1,n*t-h*r = c*j+z*n,,,...,,,"[SkillRule: Remove constant; {ax+b=c, positive...",228,0.820012,0.769231,0.821018,0.787275,0.000000,0.827586
20,23621,8o0mJnIpFH,"Unit CTA1_12, Section CTA1_12-2",Unit CTA1_13,Section CTA1_13-1,SYS03,1,R3C3,,,...,,,Using simple numbers~~Using small numbers~~Wri...,94~~104~~55,0.820012,0.445225,0.667472,0.518980,0.625000,0.666667
21,35682,g209g5Vve6,"Unit CTA1_08, Section CTA1_08-3",Unit CTA1_13,Section CTA1_13-1,REAL37,1,Slope Field 1,,,...,,,Entering the slope,20,0.707545,0.787045,0.823810,0.787045,1.000000,0.684211
22,36252,g209g5Vve6,"Unit ES_04, Section ES_04-14",Unit CTA1_13,Section CTA1_13-1,LIT23A,1,(n*j+56)/n = r/n,,,...,,,[SkillRule: Multiply/Divide; [Typein Skill: {R...,28,0.707545,0.000000,0.784946,0.912086,1.000000,0.982759
23,36745,g209g5Vve6,"Unit CTA1_10, Section CTA1_10-4",Unit CTA1_13,Section CTA1_13-1,DIST02_SP,1,ValidEquations,,,...,,,,,0.707545,0.880437,0.829431,0.000000,0.777778,0.000000


In [127]:
# Query CFAR through the specified columns as keys
def queryCFAR(row, df, columns):
    idx = None
    CFAR = 0.0
    for col in columns:
        if idx is None:
            idx = (df[col] == row[col])
        else:
            idx &= (df[col] == row[col])
    sub = df.loc[idx]
    if len(sub) != 0:
        CFAR = sub.iloc[0].loc[nameOfCFAR(columns)]
    return CFAR

In [235]:
CFAR_features = [ ["Anon Student Id"], ["Step Name"], ["Problem Name"], ["KC(Default)"]
    , ["Anon Student Id", "Problem Name"], ["Anon Student Id", "Problem Unit"], ["Anon Student Id", "KC(Default)"]
]
for f in CFAR_features:
    testdf[nameOfCFAR(f)] = testdf.apply(queryCFAR, axis='columns', args=(df, f))

In [236]:
# Inspect the head of new features
testdf.head(100)

Unnamed: 0,Row,Anon Student Id,Problem Hierarchy,Problem Unit,Problem Section,Problem Name,Problem View,Step Name,Step Start Time,First Transaction Time,...,Corrects,KC(Default),Opportunity(Default),CFA | Anon Student Id,CFA | Step Name,CFA | Problem Name,CFA | KC(Default),"CFA | Anon Student Id,Problem Name","CFA | Anon Student Id,KC(Default)","CFA | Anon Student Id,Problem Unit"
14,21616,8o0mJnIpFH,"Unit CTA1_06, Section CTA1_06-3",Unit CTA1_13,Section CTA1_13-1,JAN05,2,R5C1,,,...,,Using simple numbers~~Using large numbers~~Fin...,24~~15~~18,0.820012,0.755559,0.777679,0.686542,0.750000,0.631579,0.000000
15,22178,8o0mJnIpFH,"Unit ES_03, Section ES_03-6",Unit CTA1_13,Section CTA1_13-1,EG51A,2,constant termsR2,,,...,,,,0.820012,0.870237,0.885030,0.000000,0.809524,0.000000,0.000000
16,22320,8o0mJnIpFH,"Unit CTA1_08, Section CTA1_08-2",Unit CTA1_13,Section CTA1_13-1,FEB03,1,yMin,,,...,,Changing axis bounds,98,0.820012,0.818575,0.763982,0.775749,0.833333,0.904762,0.000000
17,22862,8o0mJnIpFH,"Unit ES_04, Section ES_04-15",Unit CTA1_13,Section CTA1_13-1,LIT58,1,7+y*(b+n) = r+f,,,...,,[SkillRule: Eliminate Parens; {CLT nested; CLT...,70,0.820012,0.761905,0.741007,0.865864,0.000000,0.875000,0.000000
18,23277,8o0mJnIpFH,"Unit CTA1_10, Section CTA1_10-5",Unit CTA1_13,Section CTA1_13-1,DISTFB08_SP,1,R3C2,,,...,,"Using simple numbers~~Write expression, positi...",78~~45~~56,0.820012,0.404477,0.806701,0.460606,1.000000,0.125000,0.000000
19,23522,8o0mJnIpFH,"Unit ES_07, Section ES_07-4",Unit CTA1_13,Section CTA1_13-1,LIT63A,1,n*t-h*r = c*j+z*n,,,...,,"[SkillRule: Remove constant; {ax+b=c, positive...",228,0.820012,0.769231,0.821018,0.787275,0.000000,0.827586,0.000000
20,23621,8o0mJnIpFH,"Unit CTA1_12, Section CTA1_12-2",Unit CTA1_13,Section CTA1_13-1,SYS03,1,R3C3,,,...,,Using simple numbers~~Using small numbers~~Wri...,94~~104~~55,0.820012,0.445225,0.667472,0.518980,0.625000,0.666667,0.000000
21,35682,g209g5Vve6,"Unit CTA1_08, Section CTA1_08-3",Unit CTA1_13,Section CTA1_13-1,REAL37,1,Slope Field 1,,,...,,Entering the slope,20,0.707545,0.787045,0.823810,0.787045,1.000000,0.684211,0.000000
22,36252,g209g5Vve6,"Unit ES_04, Section ES_04-14",Unit CTA1_13,Section CTA1_13-1,LIT23A,1,(n*j+56)/n = r/n,,,...,,[SkillRule: Multiply/Divide; [Typein Skill: {R...,28,0.707545,0.000000,0.784946,0.912086,1.000000,0.982759,0.000000
23,36745,g209g5Vve6,"Unit CTA1_10, Section CTA1_10-4",Unit CTA1_13,Section CTA1_13-1,DIST02_SP,1,ValidEquations,,,...,,,,0.707545,0.880437,0.829431,0.000000,0.777778,0.000000,0.000000


In [238]:
# Save these precious features into disk !!
testdf.to_csv("data/CFAR_features_test.csv", sep='\t')