# Feature Extract

In [4]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

In [51]:
# Get the data:
train_filepath = 'data/train.csv'
traindf = pd.read_table(train_filepath)

  This is separate from the ipykernel package so we can avoid doing imports until


In [52]:
# Inspect the schema
traindf.columns

Index(['Row', 'Anon Student Id', 'Problem Hierarchy', 'Problem Name',
       'Problem View', 'Step Name', 'Step Start Time',
       'First Transaction Time', 'Correct Transaction Time', 'Step End Time',
       'Step Duration (sec)', 'Correct Step Duration (sec)',
       'Error Step Duration (sec)', 'Correct First Attempt', 'Incorrects',
       'Hints', 'Corrects', 'KC(Default)', 'Opportunity(Default)'],
      dtype='object')

In [53]:
# Inspect the head of data
traindf.head(10)

Unnamed: 0,Row,Anon Student Id,Problem Hierarchy,Problem Name,Problem View,Step Name,Step Start Time,First Transaction Time,Correct Transaction Time,Step End Time,Step Duration (sec),Correct Step Duration (sec),Error Step Duration (sec),Correct First Attempt,Incorrects,Hints,Corrects,KC(Default),Opportunity(Default)
0,9938,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",PROP04,1,R1C1,2005-09-09 12:23:34.0,2005-09-09 12:24:07.0,2005-09-09 12:24:07.0,2005-09-09 12:24:07.0,33.0,33.0,,1,0,0,1,,
1,9939,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",PROP04,1,R1C2,2005-09-09 12:24:07.0,2005-09-09 12:24:22.0,2005-09-09 12:24:22.0,2005-09-09 12:24:22.0,15.0,15.0,,1,0,0,1,,
2,9940,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",PROP04,1,R3C1,2005-09-09 12:24:22.0,2005-09-09 12:25:16.0,2005-09-09 12:25:40.0,2005-09-09 12:25:40.0,78.0,,78.0,0,2,0,1,Define Variable,1
3,9941,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",PROP04,1,R3C2,2005-09-09 12:25:40.0,2005-09-09 12:25:51.0,2005-09-09 12:27:24.0,2005-09-09 12:27:24.0,104.0,,104.0,0,4,9,1,"Using small numbers~~Write expression, positiv...",1~~1~~1
4,9942,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",PROP04,1,R4C1,2005-09-09 12:27:24.0,2005-09-09 12:27:30.0,2005-09-09 12:27:30.0,2005-09-09 12:27:30.0,6.0,6.0,,1,0,0,1,Entering a given,1
5,9943,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",PROP04,1,R5C1,2005-09-09 12:27:30.0,2005-09-09 12:27:41.0,2005-09-09 12:27:41.0,2005-09-09 12:27:41.0,11.0,11.0,,1,0,0,1,Entering a given,2
6,9944,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",PROP04,1,R6C1,2005-09-09 12:27:41.0,2005-09-09 12:28:41.0,2005-09-09 12:28:41.0,2005-09-09 12:28:41.0,60.0,60.0,,1,0,0,1,Entering a given,3
7,9945,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",PROP04,1,R7C2,2005-09-09 12:28:50.0,2005-09-09 12:28:58.0,2005-09-09 12:28:58.0,2005-09-09 12:28:58.0,8.0,8.0,,1,0,0,1,Entering a given,4
8,9946,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",PROP04,1,ValidEquations,2005-09-09 12:31:39.0,2005-09-09 12:31:52.0,2005-09-09 12:31:52.0,2005-09-09 12:31:52.0,13.0,13.0,,1,0,0,2,,
9,9947,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",PROP04,1,7/10*X = 1400,2005-09-09 12:31:52.0,2005-09-09 12:32:28.0,2005-09-09 12:32:35.0,2005-09-09 12:32:35.0,43.0,,43.0,0,1,0,1,,


In [54]:
# Apply df as alias for training data in the following computation
df = traindf
df.dtypes

Row                              int64
Anon Student Id                 object
Problem Hierarchy               object
Problem Name                    object
Problem View                     int64
Step Name                       object
Step Start Time                 object
First Transaction Time          object
Correct Transaction Time        object
Step End Time                   object
Step Duration (sec)            float64
Correct Step Duration (sec)    float64
Error Step Duration (sec)      float64
Correct First Attempt            int64
Incorrects                       int64
Hints                            int64
Corrects                         int64
KC(Default)                     object
Opportunity(Default)            object
dtype: object

In [36]:
# Clean up in dataframe, to remove redundant columns
# maybe some columns are deprecated, and no more used,
# fill them out of the following list and run this cell
# to do cleanup
columns_to_remove = ["Anon Student Id CFAR", "CFA | A,n,o,n, ,S,t,u,d,e,n,t, ,I,d"]
df = df.drop(columns=columns_to_remove)
df.columns

Index(['Row', 'Anon Student Id', 'Problem Hierarchy', 'Problem Name',
       'Problem View', 'Step Name', 'Step Start Time',
       'First Transaction Time', 'Correct Transaction Time', 'Step End Time',
       'Step Duration (sec)', 'Correct Step Duration (sec)',
       'Error Step Duration (sec)', 'Correct First Attempt', 'Incorrects',
       'Hints', 'Corrects', 'KC(Default)', 'Opportunity(Default)',
       'CFA | Anon Student Id'],
      dtype='object')

In [90]:
def CFAR(df, idx, columns, newcol, itercount, totalcount):
    if len(columns) == 0:
        cfa = df.loc[idx & (df["Correct First Attempt"] == 1)]
        df.loc[idx, newcol] = len(cfa) / len(df.loc[idx]) if len(df.loc[idx]) != 0 else 0.0
        
        # Update progress
        print("%.1f%%" % (itercount / totalcount * 100.0), end='\r')
        itercount+=1
    else:
        col = columns[0]
        keys = set(df[col])
        for k in keys:
            if idx is None:
                local_idx = (df[col] == k)
            else:
                local_idx = idx & (df[col] == k)
            itercount = CFAR(df, local_idx, columns[1:], newcol, itercount, totalcount)
    return itercount
            
def nameOfCFAR(columns):
    return "CFA | " + ','.join(columns)
            
def computeCFAR(df, columns):
    newcol = nameOfCFAR(columns)
    df[newcol] = pd.Series(np.zeros(len(df)), index=df.index)
    total = 1
    for col in columns:
        total *= len(set(df[col]))
    CFAR(df, None, columns, newcol, 0, total)

In [91]:
# CFA | Student Name
computeCFAR(df, ["Anon Student Id"])

99.4%

In [95]:
# CFA | Step Name
computeCFAR(df, ["Step Name"])

100.0%

In [92]:
# CFA | Problem Name
computeCFAR(df, ["Problem Name"])

99.9%

In [93]:
# CFA | KC
computeCFAR(df, ["KC(Default)"])

99.7%

In [156]:
# CFA | Student Name, Problem Name
computeCFAR(df, ["Anon Student Id", "Problem Name"])

100.0%

In [94]:
# CFA | Student Name, KC
computeCFAR(df, ["Anon Student Id", "KC(Default)"])

100.0%

In [69]:
# Inspect the head of new features
df.head(100)

Unnamed: 0,Row,Anon Student Id,Problem Hierarchy,Problem Name,Problem View,Step Name,Step Start Time,First Transaction Time,Correct Transaction Time,Step End Time,...,Hints,Corrects,KC(Default),Opportunity(Default),CFA | Anon Student Id,CFA | Step Name,CFA | Problem Name,CFA | KC(Default),"CFA | Anon Student Id,Problem Name","CFA | Anon Student Id,KC(Default)"
232644,1076832,e95f4UtF4I,"Unit EXPT-PRODUCT-SIMP-A_ES, Section EXPT-PROD...",EG-EPS08,1,(3x^2)(4x^4),2006-03-22 09:45:51.0,2006-03-22 09:47:30.0,2006-03-22 09:47:30.0,2006-03-22 09:47:30.0,...,0,1,,,0.830189,1.000000,0.939252,0.000000,1.000000,0.000000
232645,1076833,e95f4UtF4I,"Unit EXPT-PRODUCT-SIMP-A_ES, Section EXPT-PROD...",EG-EPS08,1,FinalAnswer,2006-03-22 09:47:30.0,2006-03-22 09:47:49.0,2006-03-22 09:47:49.0,2006-03-22 09:47:49.0,...,0,1,perform-mult-sp,7,0.830189,0.649245,0.939252,0.867826,1.000000,0.900000
232646,1076834,e95f4UtF4I,"Unit EXPT-PRODUCT-SIMP-A_ES, Section EXPT-PROD...",EG-EPS07,1,(2y^4)(y^3),2006-03-22 09:48:01.0,2006-03-22 09:48:08.0,2006-03-22 09:48:08.0,2006-03-22 09:48:08.0,...,0,1,,,0.830189,1.000000,0.943005,0.000000,1.000000,0.000000
232647,1076835,e95f4UtF4I,"Unit EXPT-PRODUCT-SIMP-A_ES, Section EXPT-PROD...",EG-EPS07,1,FinalAnswer,2006-03-22 09:48:08.0,2006-03-22 09:48:18.0,2006-03-22 09:48:18.0,2006-03-22 09:48:18.0,...,0,1,perform-mult-sp,8,0.830189,0.649245,0.943005,0.867826,1.000000,0.900000
232648,1076836,e95f4UtF4I,"Unit EXPT-PRODUCT-SIMP-A_ES, Section EXPT-PROD...",EG-EPS09,1,(4y^4)(2y^5),2006-03-22 09:48:25.0,2006-03-22 09:48:58.0,2006-03-22 09:49:05.0,2006-03-22 09:49:05.0,...,0,1,,,0.830189,0.000000,0.891566,0.000000,0.750000,0.000000
232649,1076837,e95f4UtF4I,"Unit EXPT-PRODUCT-SIMP-A_ES, Section EXPT-PROD...",EG-EPS09,1,FinalAnswer,2006-03-22 09:49:05.0,2006-03-22 09:49:23.0,2006-03-22 09:49:23.0,2006-03-22 09:49:23.0,...,0,1,perform-mult-sp,9,0.830189,0.649245,0.891566,0.867826,0.750000,0.900000
232650,1076838,e95f4UtF4I,"Unit EXPT-PRODUCT-SIMP-A_ES, Section EXPT-PROD...",EG-EPS09,2,(4y^6)(3y^5),2006-03-22 09:49:30.0,2006-03-22 09:49:36.0,2006-03-22 09:49:36.0,2006-03-22 09:49:36.0,...,0,1,,,0.830189,1.000000,0.891566,0.000000,0.750000,0.000000
232651,1076839,e95f4UtF4I,"Unit EXPT-PRODUCT-SIMP-A_ES, Section EXPT-PROD...",EG-EPS09,2,FinalAnswer,2006-03-22 09:49:36.0,2006-03-22 09:49:46.0,2006-03-22 09:49:46.0,2006-03-22 09:49:46.0,...,0,1,perform-mult-sp,10,0.830189,0.649245,0.891566,0.867826,0.750000,0.900000
232652,1076840,e95f4UtF4I,"Unit EXPT-PRODUCT-SIMP-A_ES, Section EXPT-PROD...",EG-EPS01-FIXED,1,(4(x^2)(y^4))(3(x^3)(y^3)),2006-03-22 09:49:53.0,2006-03-22 09:50:01.0,2006-03-22 09:50:01.0,2006-03-22 09:50:01.0,...,0,1,,,0.830189,0.954545,0.939189,0.000000,1.000000,0.000000
232653,1076848,e95f4UtF4I,"Unit EXPT-QUOTIENT-SIMP-A_ES, Section EXPT-QUO...",EG-EQS04-FIXED,1,(x^4)/x^2,2006-03-22 09:52:26.0,2006-03-22 09:52:43.0,2006-03-22 09:52:43.0,2006-03-22 09:52:43.0,...,0,1,,,0.830189,0.604651,0.562914,0.000000,1.000000,0.000000


In [70]:
# Save these precious features into disk !!
df.to_csv("data/CFAR_Features.csv")

# Machine Learning

In [82]:
CFAR_features = [ ["Anon Student Id"], ["Step Name"], ["Problem Name"], ["KC(Default)"]
    , ["Anon Student Id", "Problem Name"], ["Anon Student Id", "KC(Default)"]
]
features = [nameOfCFAR(v) for v in CFAR_features]
labels = ["Correct First Attempt"]
X = traindf[features]
Y = traindf[labels]

## Decision Tree

In [85]:
from sklearn import tree
dt_model = tree.DecisionTreeClassifier()

In [86]:
dt_model = dt_model.fit(X, Y)

## Random Forest

In [197]:
from sklearn import ensemble
est_count = 70
rf_model = ensemble.RandomForestClassifier(n_estimators = est_count, criterion="entropy")

In [198]:
rf_model = rf_model.fit(X, Y)

  """Entry point for launching an IPython kernel.


# Testing

In [186]:
# Root Mean Squared Error
# Here, we consider using numpy as a powerful
# utility to solve the RMSE
def RMSE(P, Y):
    return np.sqrt(np.sum(np.square(P - Y)) / len(Y))

## Generate Features for Test

In [100]:
# Get the test data:
test_filepath = 'data/test.csv'
testdf = pd.read_table(test_filepath)

  This is separate from the ipykernel package so we can avoid doing imports until


In [101]:
# Inspect schema
testdf.columns
# Inspect head of testdf
testdf.head(20)

Unnamed: 0,Row,Anon Student Id,Problem Hierarchy,Problem Name,Problem View,Step Name,Step Start Time,First Transaction Time,Correct Transaction Time,Step End Time,Step Duration (sec),Correct Step Duration (sec),Error Step Duration (sec),Correct First Attempt,Incorrects,Hints,Corrects,KC(Default),Opportunity(Default)
0,10039,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",PROP01,1,2/300*X = 10,,,,,,,,,,,,,
1,10482,52vEY7f17k,"Unit CTA1_06, Section CTA1_06-3",FOR05,1,xScale,,,,,,,,,,,,Changing axis intervals,28
2,11024,52vEY7f17k,"Unit ES_03, Section ES_03-6",EG52,1,2.9 = -2x+3.7+x,,,,,,,,,,,,"[SkillRule: Consolidate vars, no coeff; CLT]",27
3,11476,52vEY7f17k,"Unit CTA1_08, Section CTA1_08-3",REAL10,1,yScale,,,,,,,,,,,,Changing axis intervals,65
4,11948,52vEY7f17k,"Unit ES_04, Section ES_04-12",EG58,7,xR2,,,,,,,,,,,,,
5,12532,52vEY7f17k,"Unit CTA1_10, Section CTA1_10-5",DIST05_SP,1,R1C2,,,,,,,,,,,,,
6,12955,52vEY7f17k,"Unit ES_07, Section ES_07-4",LIT69A,4,z*u-n*j+z*g = 16n,,,,,,,,,,,,"[SkillRule: Consolidate vars, any; {Combine va...",42
7,19200,6W08a98ZQV,"Unit ES_02, Section ES_02-8",EG41,2,FinalAnswer,,,,,,,,,,,,perform-mult-sp,9
8,19384,6W08a98ZQV,"Unit CTA1_13, Section CTA1_13-1",PROP10,1,R1C2,,,,,,,,,,,,,
9,19835,6W08a98ZQV,"Unit CTA1_06, Section CTA1_06-3",FOR05,2,XLabel,,,,,,,,,,,,Labelling the axes,33


In [127]:
# Query CFAR through the specified columns as keys
def queryCFAR(row, df, columns):
    idx = None
    CFAR = 0.0
    for col in columns:
        if idx is None:
            idx = (df[col] == row[col])
        else:
            idx &= (df[col] == row[col])
    sub = df.loc[idx]
    if len(sub) != 0:
        CFAR = sub.iloc[0].loc[nameOfCFAR(columns)]
    return CFAR

In [129]:
CFAR_features = [ ["Anon Student Id"], ["Step Name"], ["Problem Name"], ["KC(Default)"]
    , ["Anon Student Id", "Problem Name"], ["Anon Student Id", "KC(Default)"]
]
for f in CFAR_features:
    testdf[nameOfCFAR(f)] = testdf.apply(queryCFAR, axis='columns', args=(traindf, f))

In [130]:
# Inspect the head of new features
testdf.head(100)

Unnamed: 0,Row,Anon Student Id,Problem Hierarchy,Problem Name,Problem View,Step Name,Step Start Time,First Transaction Time,Correct Transaction Time,Step End Time,...,Hints,Corrects,KC(Default),Opportunity(Default),CFA | Anon Student Id,CFA | Step Name,CFA | Problem Name,CFA | KC(Default),"CFA | Anon Student Id,Problem Name","CFA | Anon Student Id,KC(Default)"
0,10039,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",PROP01,1,2/300*X = 10,,,,,...,,,,,0.748749,0.800000,0.858006,0.000000,1.000000,0.000000
1,10482,52vEY7f17k,"Unit CTA1_06, Section CTA1_06-3",FOR05,1,xScale,,,,,...,,,Changing axis intervals,28,0.748749,1.000000,0.766129,1.000000,0.666667,1.000000
2,11024,52vEY7f17k,"Unit ES_03, Section ES_03-6",EG52,1,2.9 = -2x+3.7+x,,,,,...,,,"[SkillRule: Consolidate vars, no coeff; CLT]",27,0.748749,0.000000,0.877049,0.895683,1.000000,0.790698
3,11476,52vEY7f17k,"Unit CTA1_08, Section CTA1_08-3",REAL10,1,yScale,,,,,...,,,Changing axis intervals,65,0.748749,1.000000,0.812217,1.000000,1.000000,1.000000
4,11948,52vEY7f17k,"Unit ES_04, Section ES_04-12",EG58,7,xR2,,,,,...,,,,,0.748749,0.666041,0.869446,0.000000,0.805556,0.000000
5,12532,52vEY7f17k,"Unit CTA1_10, Section CTA1_10-5",DIST05_SP,1,R1C2,,,,,...,,,,,0.748749,0.830699,0.807692,0.000000,0.000000,0.000000
6,12955,52vEY7f17k,"Unit ES_07, Section ES_07-4",LIT69A,4,z*u-n*j+z*g = 16n,,,,,...,,,"[SkillRule: Consolidate vars, any; {Combine va...",42,0.748749,0.250000,0.727554,0.566392,0.750000,0.500000
7,19200,6W08a98ZQV,"Unit ES_02, Section ES_02-8",EG41,2,FinalAnswer,,,,,...,,,perform-mult-sp,9,0.777308,0.649245,0.793210,0.867826,0.709091,0.000000
8,19384,6W08a98ZQV,"Unit CTA1_13, Section CTA1_13-1",PROP10,1,R1C2,,,,,...,,,,,0.777308,0.830699,0.844037,0.000000,1.000000,0.000000
9,19835,6W08a98ZQV,"Unit CTA1_06, Section CTA1_06-3",FOR05,2,XLabel,,,,,...,,,Labelling the axes,33,0.777308,0.894722,0.766129,0.859338,0.625000,0.802817


## Test Error

In [159]:
# Extract used columns(features)
features = [nameOfCFAR(v) for v in CFAR_features]
labels = ["Correct First Attempt"]
# Drop unknown values
testdf = testdf.dropna(axis="index", subset=["Correct First Attempt"])
X_ = testdf[features]
Y_ = testdf[labels]

In [199]:
P = rf_model.predict(X_)
P = P.reshape(P.size, 1).astype(float)
Y_ = np.array(Y_).astype(float)
print(RMSE(P, Y_))

0.6003002251876642
