In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

# Feature Extraction

In [5]:
# Read the tab-separated training set, then report its dimensions and column labels
train_filepath = "data/train.csv"
traindf = pd.read_csv(train_filepath, sep='\t')
print("Table:", traindf.shape)
print("Columns:", traindf.columns)

Table: (232744, 19)
Columns: Index(['Row', 'Anon Student Id', 'Problem Hierarchy', 'Problem Name',
       'Problem View', 'Step Name', 'Step Start Time',
       'First Transaction Time', 'Correct Transaction Time', 'Step End Time',
       'Step Duration (sec)', 'Correct Step Duration (sec)',
       'Error Step Duration (sec)', 'Correct First Attempt', 'Incorrects',
       'Hints', 'Corrects', 'KC(Default)', 'Opportunity(Default)'],
      dtype='object')


## Split unit and section out

In [10]:
# Split "Problem Hierarchy" (e.g. "Unit CTA1_13, Section CTA1_13-1") on commas:
# the first piece becomes "Problem Unit", the second "Problem Section".
# Vectorised .str accessors replace the row-wise apply(axis=1), which invoked a
# Python lambda once per row. Rows with a missing value or without a comma now
# yield NaN instead of raising.
hierarchy_parts = traindf["Problem Hierarchy"].str.split(",")
traindf["Problem Unit"] = hierarchy_parts.str[0].str.strip()
traindf["Problem Section"] = hierarchy_parts.str[1].str.strip()

In [11]:
# Display the first 100 rows of the training frame, including the two derived columns
traindf.head(n=100)

Unnamed: 0,Row,Anon Student Id,Problem Hierarchy,Problem Name,Problem View,Step Name,Step Start Time,First Transaction Time,Correct Transaction Time,Step End Time,...,Correct Step Duration (sec),Error Step Duration (sec),Correct First Attempt,Incorrects,Hints,Corrects,KC(Default),Opportunity(Default),Problem Unit,Problem Section
0,9938,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",PROP04,1,R1C1,2005-09-09 12:23:34.0,2005-09-09 12:24:07.0,2005-09-09 12:24:07.0,2005-09-09 12:24:07.0,...,33.0,,1,0,0,1,,,Unit CTA1_13,Section CTA1_13-1
1,9939,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",PROP04,1,R1C2,2005-09-09 12:24:07.0,2005-09-09 12:24:22.0,2005-09-09 12:24:22.0,2005-09-09 12:24:22.0,...,15.0,,1,0,0,1,,,Unit CTA1_13,Section CTA1_13-1
2,9940,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",PROP04,1,R3C1,2005-09-09 12:24:22.0,2005-09-09 12:25:16.0,2005-09-09 12:25:40.0,2005-09-09 12:25:40.0,...,,78.0,0,2,0,1,Define Variable,1,Unit CTA1_13,Section CTA1_13-1
3,9941,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",PROP04,1,R3C2,2005-09-09 12:25:40.0,2005-09-09 12:25:51.0,2005-09-09 12:27:24.0,2005-09-09 12:27:24.0,...,,104.0,0,4,9,1,"Using small numbers~~Write expression, positiv...",1~~1~~1,Unit CTA1_13,Section CTA1_13-1
4,9942,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",PROP04,1,R4C1,2005-09-09 12:27:24.0,2005-09-09 12:27:30.0,2005-09-09 12:27:30.0,2005-09-09 12:27:30.0,...,6.0,,1,0,0,1,Entering a given,1,Unit CTA1_13,Section CTA1_13-1
5,9943,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",PROP04,1,R5C1,2005-09-09 12:27:30.0,2005-09-09 12:27:41.0,2005-09-09 12:27:41.0,2005-09-09 12:27:41.0,...,11.0,,1,0,0,1,Entering a given,2,Unit CTA1_13,Section CTA1_13-1
6,9944,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",PROP04,1,R6C1,2005-09-09 12:27:41.0,2005-09-09 12:28:41.0,2005-09-09 12:28:41.0,2005-09-09 12:28:41.0,...,60.0,,1,0,0,1,Entering a given,3,Unit CTA1_13,Section CTA1_13-1
7,9945,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",PROP04,1,R7C2,2005-09-09 12:28:50.0,2005-09-09 12:28:58.0,2005-09-09 12:28:58.0,2005-09-09 12:28:58.0,...,8.0,,1,0,0,1,Entering a given,4,Unit CTA1_13,Section CTA1_13-1
8,9946,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",PROP04,1,ValidEquations,2005-09-09 12:31:39.0,2005-09-09 12:31:52.0,2005-09-09 12:31:52.0,2005-09-09 12:31:52.0,...,13.0,,1,0,0,2,,,Unit CTA1_13,Section CTA1_13-1
9,9947,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",PROP04,1,7/10*X = 1400,2005-09-09 12:31:52.0,2005-09-09 12:32:28.0,2005-09-09 12:32:35.0,2005-09-09 12:32:35.0,...,,43.0,0,1,0,1,,,Unit CTA1_13,Section CTA1_13-1


## Extract one-hot features

In [30]:
def one_dim_split(df, col, sparse=False):
    """One-hot encode a single column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to encode.
    col : str
        Label of the column whose distinct values become indicator columns.
    sparse : bool, default False
        If True, the indicator columns are backed by pandas sparse arrays
        rather than dense ones. Use this for high-cardinality columns, where
        a dense ``len(df) x n_values`` float matrix may not fit in memory.

    Returns
    -------
    pandas.DataFrame
        Float-valued frame (1.0 where the row holds that value, 0.0 elsewhere)
        sharing ``df``'s index, with one column per distinct non-missing value.
        Columns come out in a deterministic order (sorted when the values are
        sortable). Rows whose value is missing are all zeros.
    """
    values = df[col]

    # nunique() skips missing values, matching the columns get_dummies creates.
    numcols = values.nunique()
    print("Column %s has %d values, so we will be adding as many columns to this dataframe" % (col, numcols))

    # get_dummies builds every indicator in one pass, instead of one boolean
    # scan of the whole column per distinct value, and does not depend on set
    # iteration order, so the column layout is reproducible between runs.
    return pd.get_dummies(values, dtype=float, sparse=sparse)

### One-dimensional Features

#### Problem Units

In [33]:
# Indicator columns for each distinct problem unit
pu_features = one_dim_split(df=traindf, col="Problem Unit")

Column Problem Unit has 32 values, so we will be adding as many columns to this dataframe


In [34]:
# Persist the problem-unit indicator columns (index included) as CSV
pu_features.to_csv(path_or_buf="data/pu-sparse-features.csv")

#### Problem Sections

In [35]:
# Indicator columns for each distinct problem section
ps_features = one_dim_split(df=traindf, col="Problem Section")

Column Problem Section has 138 values, so we will be adding as many columns to this dataframe


In [36]:
# Persist the problem-section indicator columns (index included) as CSV
ps_features.to_csv(path_or_buf="data/ps-sparse-features.csv")

#### Problem Name

In [39]:
# Indicator columns for each distinct problem name
pn_features = one_dim_split(df=traindf, col="Problem Name")

Column Problem Name has 1021 values, so we will be adding as many columns to this dataframe


In [40]:
# Persist the problem-name indicator columns (index included) as CSV
pn_features.to_csv(path_or_buf="data/pn-sparse-features.csv")

#### Step Name

In [41]:
# "Step Name" has tens of thousands of distinct values (60709 in the recorded run).
# A dense float64 one-hot frame of 232744 rows x 60709 columns needs roughly 113 GB,
# and the original dense call had to be interrupted. Build the indicator columns
# with sparse storage instead, so only the non-zero entries are kept in memory.
step_names = traindf["Step Name"]
print("Column %s has %d values, so we will be adding as many columns to this dataframe" % ("Step Name", step_names.nunique()))
sn_features = pd.get_dummies(step_names, dtype=float, sparse=True)

Column Step Name has 60709 values, so we will be adding as many columns to this dataframe


KeyboardInterrupt: 

In [21]:
# Save the Step Name indicator columns as CSV.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt. With one
# column per distinct step name, the CSV is presumably far too large to write in
# practical time; consider a sparse on-disk format instead. TODO confirm.
sn_features.to_csv("data/sn-sparse-features.csv")

KeyboardInterrupt: 