# Feature Extraction

In [8]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

In [9]:
# Load the training data.
# The file carries a .csv extension, but the original call (pd.read_table) parsed
# it with read_table's default tab delimiter, so the separator is spelled out here.
# read_csv with sep='\t' reads the same data and avoids the deprecation warning
# that some pandas releases emit for read_table.
train_filepath = 'data/train.csv'
traindata = pd.read_csv(train_filepath, sep='\t')

  This is separate from the ipykernel package so we can avoid doing imports until


In [10]:
# Inspect the schema: show the column labels of the loaded training frame.
traindata.columns

Index(['Row', 'Anon Student Id', 'Problem Hierarchy', 'Problem Name',
       'Problem View', 'Step Name', 'Step Start Time',
       'First Transaction Time', 'Correct Transaction Time', 'Step End Time',
       'Step Duration (sec)', 'Correct Step Duration (sec)',
       'Error Step Duration (sec)', 'Correct First Attempt', 'Incorrects',
       'Hints', 'Corrects', 'KC(Default)', 'Opportunity(Default)'],
      dtype='object')

In [11]:
# Preview the first 10 rows of the training data.
traindata.head(10)

Unnamed: 0,Row,Anon Student Id,Problem Hierarchy,Problem Name,Problem View,Step Name,Step Start Time,First Transaction Time,Correct Transaction Time,Step End Time,Step Duration (sec),Correct Step Duration (sec),Error Step Duration (sec),Correct First Attempt,Incorrects,Hints,Corrects,KC(Default),Opportunity(Default)
0,9938,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",PROP04,1,R1C1,2005-09-09 12:23:34.0,2005-09-09 12:24:07.0,2005-09-09 12:24:07.0,2005-09-09 12:24:07.0,33.0,33.0,,1,0,0,1,,
1,9939,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",PROP04,1,R1C2,2005-09-09 12:24:07.0,2005-09-09 12:24:22.0,2005-09-09 12:24:22.0,2005-09-09 12:24:22.0,15.0,15.0,,1,0,0,1,,
2,9940,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",PROP04,1,R3C1,2005-09-09 12:24:22.0,2005-09-09 12:25:16.0,2005-09-09 12:25:40.0,2005-09-09 12:25:40.0,78.0,,78.0,0,2,0,1,Define Variable,1
3,9941,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",PROP04,1,R3C2,2005-09-09 12:25:40.0,2005-09-09 12:25:51.0,2005-09-09 12:27:24.0,2005-09-09 12:27:24.0,104.0,,104.0,0,4,9,1,"Using small numbers~~Write expression, positiv...",1~~1~~1
4,9942,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",PROP04,1,R4C1,2005-09-09 12:27:24.0,2005-09-09 12:27:30.0,2005-09-09 12:27:30.0,2005-09-09 12:27:30.0,6.0,6.0,,1,0,0,1,Entering a given,1
5,9943,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",PROP04,1,R5C1,2005-09-09 12:27:30.0,2005-09-09 12:27:41.0,2005-09-09 12:27:41.0,2005-09-09 12:27:41.0,11.0,11.0,,1,0,0,1,Entering a given,2
6,9944,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",PROP04,1,R6C1,2005-09-09 12:27:41.0,2005-09-09 12:28:41.0,2005-09-09 12:28:41.0,2005-09-09 12:28:41.0,60.0,60.0,,1,0,0,1,Entering a given,3
7,9945,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",PROP04,1,R7C2,2005-09-09 12:28:50.0,2005-09-09 12:28:58.0,2005-09-09 12:28:58.0,2005-09-09 12:28:58.0,8.0,8.0,,1,0,0,1,Entering a given,4
8,9946,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",PROP04,1,ValidEquations,2005-09-09 12:31:39.0,2005-09-09 12:31:52.0,2005-09-09 12:31:52.0,2005-09-09 12:31:52.0,13.0,13.0,,1,0,0,2,,
9,9947,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",PROP04,1,7/10*X = 1400,2005-09-09 12:31:52.0,2005-09-09 12:32:28.0,2005-09-09 12:32:35.0,2005-09-09 12:32:35.0,43.0,,43.0,0,1,0,1,,


In [19]:
# Use `df` as a short alias for the training data in the computations below.
# NOTE: this binds a second name to the same DataFrame object (no copy is made),
# so any column added through `df` is also visible through `traindata`.
df = traindata
# Show the dtype of every column.
df.dtypes

Row                              int64
Anon Student Id                 object
Problem Hierarchy               object
Problem Name                    object
Problem View                     int64
Step Name                       object
Step Start Time                 object
First Transaction Time          object
Correct Transaction Time        object
Step End Time                   object
Step Duration (sec)            float64
Correct Step Duration (sec)    float64
Error Step Duration (sec)      float64
Correct First Attempt            int64
Incorrects                       int64
Hints                            int64
Corrects                         int64
KC(Default)                     object
Opportunity(Default)            object
Anon Student Id CFAR           float64
dtype: object

In [20]:
# Per-student Correct First Attempt Rate (CFAR):
# for every row, store the fraction of that student's rows whose
# "Correct First Attempt" equals 1, i.e. (#rows with CFA == 1) / (#rows).
numrows = len(df)
keys = set(df["Anon Student Id"])  # kept in case later cells reference it -- TODO confirm

# 1.0 where the first attempt was correct, 0.0 otherwise; the per-student mean of
# this indicator is exactly the ratio described above.
first_attempt_correct = df["Correct First Attempt"].eq(1).astype(float)

# One grouped pass, broadcast back to every row with transform, instead of
# rebuilding full-frame boolean masks several times for each student.
df["Anon Student Id CFAR"] = (
    first_attempt_correct.groupby(df["Anon Student Id"]).transform("mean")
)


In [24]:
# Inspect the tail of the data (last 100 rows)
df.tail(100)

Unnamed: 0,Row,Anon Student Id,Problem Hierarchy,Problem Name,Problem View,Step Name,Step Start Time,First Transaction Time,Correct Transaction Time,Step End Time,Step Duration (sec),Correct Step Duration (sec),Error Step Duration (sec),Correct First Attempt,Incorrects,Hints,Corrects,KC(Default),Opportunity(Default),Anon Student Id CFAR
232644,1076832,e95f4UtF4I,"Unit EXPT-PRODUCT-SIMP-A_ES, Section EXPT-PROD...",EG-EPS08,1,(3x^2)(4x^4),2006-03-22 09:45:51.0,2006-03-22 09:47:30.0,2006-03-22 09:47:30.0,2006-03-22 09:47:30.0,99.0,99.0,,1,0,0,1,,,0.830189
232645,1076833,e95f4UtF4I,"Unit EXPT-PRODUCT-SIMP-A_ES, Section EXPT-PROD...",EG-EPS08,1,FinalAnswer,2006-03-22 09:47:30.0,2006-03-22 09:47:49.0,2006-03-22 09:47:49.0,2006-03-22 09:47:49.0,19.0,19.0,,1,0,0,1,perform-mult-sp,7,0.830189
232646,1076834,e95f4UtF4I,"Unit EXPT-PRODUCT-SIMP-A_ES, Section EXPT-PROD...",EG-EPS07,1,(2y^4)(y^3),2006-03-22 09:48:01.0,2006-03-22 09:48:08.0,2006-03-22 09:48:08.0,2006-03-22 09:48:08.0,7.0,7.0,,1,0,0,1,,,0.830189
232647,1076835,e95f4UtF4I,"Unit EXPT-PRODUCT-SIMP-A_ES, Section EXPT-PROD...",EG-EPS07,1,FinalAnswer,2006-03-22 09:48:08.0,2006-03-22 09:48:18.0,2006-03-22 09:48:18.0,2006-03-22 09:48:18.0,10.0,10.0,,1,0,0,1,perform-mult-sp,8,0.830189
232648,1076836,e95f4UtF4I,"Unit EXPT-PRODUCT-SIMP-A_ES, Section EXPT-PROD...",EG-EPS09,1,(4y^4)(2y^5),2006-03-22 09:48:25.0,2006-03-22 09:48:58.0,2006-03-22 09:49:05.0,2006-03-22 09:49:05.0,40.0,,40.0,0,1,0,1,,,0.830189
232649,1076837,e95f4UtF4I,"Unit EXPT-PRODUCT-SIMP-A_ES, Section EXPT-PROD...",EG-EPS09,1,FinalAnswer,2006-03-22 09:49:05.0,2006-03-22 09:49:23.0,2006-03-22 09:49:23.0,2006-03-22 09:49:23.0,18.0,18.0,,1,0,0,1,perform-mult-sp,9,0.830189
232650,1076838,e95f4UtF4I,"Unit EXPT-PRODUCT-SIMP-A_ES, Section EXPT-PROD...",EG-EPS09,2,(4y^6)(3y^5),2006-03-22 09:49:30.0,2006-03-22 09:49:36.0,2006-03-22 09:49:36.0,2006-03-22 09:49:36.0,6.0,6.0,,1,0,0,1,,,0.830189
232651,1076839,e95f4UtF4I,"Unit EXPT-PRODUCT-SIMP-A_ES, Section EXPT-PROD...",EG-EPS09,2,FinalAnswer,2006-03-22 09:49:36.0,2006-03-22 09:49:46.0,2006-03-22 09:49:46.0,2006-03-22 09:49:46.0,10.0,10.0,,1,0,0,1,perform-mult-sp,10,0.830189
232652,1076840,e95f4UtF4I,"Unit EXPT-PRODUCT-SIMP-A_ES, Section EXPT-PROD...",EG-EPS01-FIXED,1,(4(x^2)(y^4))(3(x^3)(y^3)),2006-03-22 09:49:53.0,2006-03-22 09:50:01.0,2006-03-22 09:50:01.0,2006-03-22 09:50:01.0,8.0,8.0,,1,0,0,1,,,0.830189
232653,1076848,e95f4UtF4I,"Unit EXPT-QUOTIENT-SIMP-A_ES, Section EXPT-QUO...",EG-EQS04-FIXED,1,(x^4)/x^2,2006-03-22 09:52:26.0,2006-03-22 09:52:43.0,2006-03-22 09:52:43.0,2006-03-22 09:52:43.0,17.0,17.0,,1,0,0,1,,,0.830189
