In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import math

# Extract Features

In [2]:
# Load training data
train_filepath = "data/train.csv"
traindf = pd.read_csv(train_filepath, sep='\t')
print("Table:", np.shape(traindf))
print("Columns:", traindf.columns)

Table: (232744, 19)
Columns: Index(['Row', 'Anon Student Id', 'Problem Hierarchy', 'Problem Name',
       'Problem View', 'Step Name', 'Step Start Time',
       'First Transaction Time', 'Correct Transaction Time', 'Step End Time',
       'Step Duration (sec)', 'Correct Step Duration (sec)',
       'Error Step Duration (sec)', 'Correct First Attempt', 'Incorrects',
       'Hints', 'Corrects', 'KC(Default)', 'Opportunity(Default)'],
      dtype='object')


In [3]:
# Load testing data
test_filepath = "data/test.csv"
testdf = pd.read_csv(test_filepath, sep='\t')
print("Table:", np.shape(testdf))
print("Columns:", testdf.columns)

Table: (1140, 19)
Columns: Index(['Row', 'Anon Student Id', 'Problem Hierarchy', 'Problem Name',
       'Problem View', 'Step Name', 'Step Start Time',
       'First Transaction Time', 'Correct Transaction Time', 'Step End Time',
       'Step Duration (sec)', 'Correct Step Duration (sec)',
       'Error Step Duration (sec)', 'Correct First Attempt', 'Incorrects',
       'Hints', 'Corrects', 'KC(Default)', 'Opportunity(Default)'],
      dtype='object')


In [4]:
# Concatenate training set and testing set
# First combine two set to ensure the one-hot keys form a universal set of all categories
df = pd.concat((traindf, testdf), axis=0, sort=False)

# Cast columns
df["KC(Default)"] = df["KC(Default)"].astype(str)
df["Opportunity(Default)"] = df["Opportunity(Default)"].astype(str)

In [5]:
print("Table:", df.shape)

Table: (233884, 19)


## Split unit and section out

In [6]:
# Split by comma
df.insert(3, "Problem Unit", df.apply(lambda row: row["Problem Hierarchy"].split(',')[0].strip(), axis=1))
df.insert(4, "Problem Section", df.apply(lambda row: row["Problem Hierarchy"].split(',')[1].strip(), axis=1))

In [7]:
# Overview of train data
print("Table:", df.shape)
df.head()
# df[["Problem Name", "Step Name", "Incorrects", "Hints", "Corrects"]].head(50)

Table: (233884, 21)


Unnamed: 0,Row,Anon Student Id,Problem Hierarchy,Problem Unit,Problem Section,Problem Name,Problem View,Step Name,Step Start Time,First Transaction Time,...,Step End Time,Step Duration (sec),Correct Step Duration (sec),Error Step Duration (sec),Correct First Attempt,Incorrects,Hints,Corrects,KC(Default),Opportunity(Default)
0,9938,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",Unit CTA1_13,Section CTA1_13-1,PROP04,1,R1C1,2005-09-09 12:23:34.0,2005-09-09 12:24:07.0,...,2005-09-09 12:24:07.0,33.0,33.0,,1.0,0.0,0.0,1.0,,
1,9939,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",Unit CTA1_13,Section CTA1_13-1,PROP04,1,R1C2,2005-09-09 12:24:07.0,2005-09-09 12:24:22.0,...,2005-09-09 12:24:22.0,15.0,15.0,,1.0,0.0,0.0,1.0,,
2,9940,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",Unit CTA1_13,Section CTA1_13-1,PROP04,1,R3C1,2005-09-09 12:24:22.0,2005-09-09 12:25:16.0,...,2005-09-09 12:25:40.0,78.0,,78.0,0.0,2.0,0.0,1.0,Define Variable,1
3,9941,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",Unit CTA1_13,Section CTA1_13-1,PROP04,1,R3C2,2005-09-09 12:25:40.0,2005-09-09 12:25:51.0,...,2005-09-09 12:27:24.0,104.0,,104.0,0.0,4.0,9.0,1.0,"Using small numbers~~Write expression, positiv...",1~~1~~1
4,9942,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",Unit CTA1_13,Section CTA1_13-1,PROP04,1,R4C1,2005-09-09 12:27:24.0,2005-09-09 12:27:30.0,...,2005-09-09 12:27:30.0,6.0,6.0,,1.0,0.0,0.0,1.0,Entering a given,1


## One-hot feature

In [8]:
def one_dim_split___(df, col):
    cols = set(df[col])
    numcols = len(cols)
    print("Column %s has %d values, so we will be adding as many columns to this dataframe" % (col, numcols))
    
    # Row number
    numrows = len(df)

    # Create df
    newdf = pd.DataFrame(np.zeros((numrows, numcols)), index=df.index, columns=list(cols))

    # For each value in the dataframe, mark the corresponding column as 1
    for c in cols:
        newdf.loc[df[col] == c, c] = 1

    return newdf

def one_dim_split(df, col, cata=None):
    dummies = pd.get_dummies(df[col], prefix=cata, sparse=True)
    print("Column %s has %d values, so we will add as many columns to this dataframe." % (col, dummies.shape[1]))
    return dummies

### One-dimensional Feature

In [9]:
# Problem Unit
pu_features = one_dim_split(df, "Problem Unit")
pu_features.head()

Column Problem Unit has 32 values, so we will add as many columns to this dataframe.


Unnamed: 0,Unit CTA1_01,Unit CTA1_02,Unit CTA1_04,Unit CTA1_06,Unit CTA1_08,Unit CTA1_10,Unit CTA1_12,Unit CTA1_13,Unit CTA1_14,Unit CTA1_15,...,Unit ES_06,Unit ES_07,Unit EXPT-PRODUCT-SIMP-A_ES,Unit EXPT-QUOTIENT-SIMP-A_ES,Unit LINEAR-INEQUALITY-GRAPHING,Unit QUAD-ADD-AREA-ALG1,Unit QUAD-VERTICAL-MOTION,Unit QUADRATICS-FACTORING_ES,Unit QUADRATICS-SOLVING_ES,Unit QuadLinearFuncTrans
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Problem Section
ps_features = one_dim_split(df, "Problem Section")
ps_features.head()

Column Problem Section has 138 values, so we will add as many columns to this dataframe.


Unnamed: 0,Section CTA1_01-1,Section CTA1_01-2,Section CTA1_01-3,Section CTA1_01-4,Section CTA1_02-1,Section CTA1_02-2,Section CTA1_02-3,Section CTA1_02-4,Section CTA1_04-1,Section CTA1_04-2,...,Section QLFuncTrans5,Section QLFuncTrans6,Section QLFuncTrans7,Section QLFuncTrans8,Section QLFuncTrans9,Section QUAD-ADD-AREA-ALG1-1,Section QUAD-VERTICAL-MOTION-1,Section QUADRATICS-FACTORING_ES-1,Section QUADRATICS-FACTORING_ES-2,Section QUADRATICS-SOLVING_ES-1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Problem Name
pn_features = one_dim_split(df, "Problem Name")
pn_features.head()

Column Problem Name has 1023 values, so we will add as many columns to this dataframe.


Unnamed: 0,1PTB02,1PTB03,1PTB04,1PTB05,1PTB06,1PTB07,1PTB08,1PTB09,1PTFB10,1PTFB11,...,YGT-4X-10,YGT5X+12,YGTX-7,YLE-3X-13,YLE-7X+20,YLEX+7,YLT-X-4,YLT250,YLT2X-5,YLT3X+5
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Step Name
sn_features = one_dim_split(df, "Step Name")
sn_features.head()

Column Step Name has 60934 values, so we will add as many columns to this dataframe.


Unnamed: 0,((2y-3)/2)/2 = ((10/2)/2)/2,((2y-3)/2)/2 = (5/2)/2,((7r+y)/7)/(1/7) = (b/7)/(1/7),((m*h+56)/m)/(1/m) = (q/m)/(1/m),((s*v+r)/s)/(1/s) = (y/s)/(1/s),((y-3)-m*-7)/m = m*x/m,(-(-13y+1)/13)/(-1/13) = (1/13)/(-1/13),(-(-5)+sqrt(5^2-4*1*6))/2 = H,(-(-5)+sqrt(5^2-4*3*1))/6 = x,(-(-5)-sqrt(5^2-12))/6 = x,...,z-y = c,z-y = c+y-y,z-y = y+c-y,z-z-c = y-z,z/(-u) = -u*y/(-u),z/y = -u,z/y*y = -u*y,z1R1,z2R1,zR2
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Student ID
sid_features = one_dim_split(df, "Anon Student Id")
sid_features.head()

Column Anon Student Id has 174 values, so we will add as many columns to this dataframe.


Unnamed: 0,02i5jCrfQK,0GlR30c2Mt,0KS4yy9G96,0nF0z1401O,12M70dm49w,162lfGskK2,16zN4n2fFO,183ppvOXp8,1bJbgQ32E3,1k3GIfwBw6,...,xH487WMLoS,y5009rpZ12,yA79w15tnU,yB480DNZ70,yDO9pl8GC0,yG447121nm,z2zuhnARi6,z9svx3mA4s,zUlk9c7UEv,zfYl0YECr0
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Two-dimensional Features

In [14]:
def join_columns(row, cols):
    return ",".join([str(row[col]) for col in cols])
def joint_name(cols):
    return ",".join(cols)

In [15]:
# Student ID and Problem Unit
cols = ["Anon Student Id", "Problem Unit"]
df[joint_name(cols)] = df.apply(join_columns, axis=1, args=(cols,))
sid_pu_features = one_dim_split(df, joint_name(cols))
sid_pu_features.head()

Column Anon Student Id,Problem Unit has 1130 values, so we will add as many columns to this dataframe.


Unnamed: 0,"02i5jCrfQK,Unit CTA1_01","02i5jCrfQK,Unit CTA1_02","02i5jCrfQK,Unit CTA1_04","02i5jCrfQK,Unit CTA1_06","02i5jCrfQK,Unit CTA1_08","02i5jCrfQK,Unit CTA1_10","02i5jCrfQK,Unit CTA1_12","02i5jCrfQK,Unit CTA1_13","02i5jCrfQK,Unit CTA1_14","02i5jCrfQK,Unit CTA1_15",...,"zUlk9c7UEv,Unit CTA1_08","zUlk9c7UEv,Unit CTA1_10","zUlk9c7UEv,Unit ES_03","zUlk9c7UEv,Unit ES_04","zfYl0YECr0,Unit CTA1_02","zfYl0YECr0,Unit CTA1_04","zfYl0YECr0,Unit CTA1_06","zfYl0YECr0,Unit CTA1_13","zfYl0YECr0,Unit ES_01","zfYl0YECr0,Unit ES_02"
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Problem Unit and Problem Section
cols = ["Problem Unit", "Problem Section"]
df[joint_name(cols)] = df.apply(join_columns, axis=1, args=(cols,))
pu_ps_features = one_dim_split(df, joint_name(cols))
pu_ps_features.head()

Column Problem Unit,Problem Section has 138 values, so we will add as many columns to this dataframe.


Unnamed: 0,"Unit CTA1_01,Section CTA1_01-1","Unit CTA1_01,Section CTA1_01-2","Unit CTA1_01,Section CTA1_01-3","Unit CTA1_01,Section CTA1_01-4","Unit CTA1_02,Section CTA1_02-1","Unit CTA1_02,Section CTA1_02-2","Unit CTA1_02,Section CTA1_02-3","Unit CTA1_02,Section CTA1_02-4","Unit CTA1_04,Section CTA1_04-1","Unit CTA1_04,Section CTA1_04-2",...,"Unit QuadLinearFuncTrans,Section QLFuncTrans10","Unit QuadLinearFuncTrans,Section QLFuncTrans11","Unit QuadLinearFuncTrans,Section QLFuncTrans2","Unit QuadLinearFuncTrans,Section QLFuncTrans3","Unit QuadLinearFuncTrans,Section QLFuncTrans4","Unit QuadLinearFuncTrans,Section QLFuncTrans5","Unit QuadLinearFuncTrans,Section QLFuncTrans6","Unit QuadLinearFuncTrans,Section QLFuncTrans7","Unit QuadLinearFuncTrans,Section QLFuncTrans8","Unit QuadLinearFuncTrans,Section QLFuncTrans9"
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Problem Section and Problem Name
cols = ["Problem Section", "Problem Name"]
df[joint_name(cols)] = df.apply(join_columns, axis=1, args=(cols,))
ps_pn_features = one_dim_split(df, joint_name(cols))
ps_pn_features.head()

Column Problem Section,Problem Name has 1122 values, so we will add as many columns to this dataframe.


Unnamed: 0,"Section CTA1_01-1,LDEMO_WKST","Section CTA1_01-2,BH1T08","Section CTA1_01-2,BH1T08A","Section CTA1_01-2,BH1T08B","Section CTA1_01-2,BH1T08C","Section CTA1_01-2,BH1T15","Section CTA1_01-2,BH1T15A","Section CTA1_01-2,BH1T15B","Section CTA1_01-2,BH1T15C","Section CTA1_01-2,BH1T17",...,"Section QUADRATICS-SOLVING_ES-1,EG-S-FACTOR21","Section QUADRATICS-SOLVING_ES-1,EG-S-FACTOR22","Section QUADRATICS-SOLVING_ES-1,EG-S-FACTOR23","Section QUADRATICS-SOLVING_ES-1,EG-S-FACTOR24","Section QUADRATICS-SOLVING_ES-1,EG-S-FACTOR25","Section QUADRATICS-SOLVING_ES-1,EG-S-FACTOR26","Section QUADRATICS-SOLVING_ES-1,EG-S-FACTOR27","Section QUADRATICS-SOLVING_ES-1,EG-S-FACTOR28","Section QUADRATICS-SOLVING_ES-1,EG-S-FACTOR29","Section QUADRATICS-SOLVING_ES-1,EG-S-FACTOR30"
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Problem Name and Step Name
cols = ["Problem Name", "Step Name"]
df[joint_name(cols)] = df.apply(join_columns, axis=1, args=(cols,))
pn_sn_features = one_dim_split(df, joint_name(cols))
pn_sn_features.head()

Column Problem Name,Step Name has 74835 values, so we will add as many columns to this dataframe.


Unnamed: 0,"1PTB02,-160 = -40X","1PTB02,-400 = -40X","1PTB02,-40A = -160","1PTB02,-40A = -400","1PTB02,-40A+400 = 0","1PTB02,-40A+400 = 240","1PTB02,-40B = -160","1PTB02,-40B = -400","1PTB02,-40B+400 = 0","1PTB02,-40B+400 = 240",...,"YLT3X+5,XIntercept1","YLT3X+5,YCoordinate1_1","YLT3X+5,YCoordinate1_2","YLT3X+5,YIntercept1","YLT3X+5,xMax","YLT3X+5,xMin","YLT3X+5,y = 0+5","YLT3X+5,y = 3*0+5","YLT3X+5,yMax","YLT3X+5,yMin"
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Hyper-dimensional Features

In [19]:
# Student Name and Problem Name and Section Name
cols = ["Anon Student Id", "Problem Name", "Problem Section"]
df[joint_name(cols)] = df.apply(join_columns, axis=1, args=(cols,))
sid_pu_ps_features = one_dim_split(df, joint_name(cols))
sid_pu_ps_features.head()

Column Anon Student Id,Problem Name,Problem Section has 18699 values, so we will add as many columns to this dataframe.


Unnamed: 0,"02i5jCrfQK,1PTB02,Section CTA1_14-2","02i5jCrfQK,1PTB03,Section CTA1_14-2","02i5jCrfQK,1PTB05,Section CTA1_20-1","02i5jCrfQK,1PTB07,Section CTA1_14-2","02i5jCrfQK,1PTB08,Section CTA1_20-2","02i5jCrfQK,1PTB09,Section CTA1_14-2","02i5jCrfQK,1PTFB10,Section CTA1_20-2","02i5jCrfQK,1PTFB11,Section CTA1_14-2","02i5jCrfQK,1PTFB13,Section CTA1_14-2","02i5jCrfQK,1PTFB14,Section CTA1_14-2",...,"zfYl0YECr0,PERCENT01C,Section CTA1_04-2","zfYl0YECr0,PROP03,Section CTA1_13-1","zfYl0YECr0,PROP04,Section CTA1_13-1","zfYl0YECr0,PROP05,Section CTA1_13-1","zfYl0YECr0,PROP07,Section CTA1_13-1","zfYl0YECr0,PROP08,Section CTA1_13-1","zfYl0YECr0,PROP12,Section CTA1_13-1","zfYl0YECr0,REAL13C,Section CTA1_04-2","zfYl0YECr0,REAL19B,Section CTA1_02-4","zfYl0YECr0,REAL28,Section CTA1_02-4"
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Problem Unit and Problem Section and Problem Name
cols = ["Problem Unit", "Problem Section", "Problem Name"]
df[joint_name(cols)] = df.apply(join_columns, axis=1, args=(cols,))
pu_ps_pn_features = one_dim_split(df, joint_name(cols))
pu_ps_pn_features.head()

Column Problem Unit,Problem Section,Problem Name has 1122 values, so we will add as many columns to this dataframe.


Unnamed: 0,"Unit CTA1_01,Section CTA1_01-1,LDEMO_WKST","Unit CTA1_01,Section CTA1_01-2,BH1T08","Unit CTA1_01,Section CTA1_01-2,BH1T08A","Unit CTA1_01,Section CTA1_01-2,BH1T08B","Unit CTA1_01,Section CTA1_01-2,BH1T08C","Unit CTA1_01,Section CTA1_01-2,BH1T15","Unit CTA1_01,Section CTA1_01-2,BH1T15A","Unit CTA1_01,Section CTA1_01-2,BH1T15B","Unit CTA1_01,Section CTA1_01-2,BH1T15C","Unit CTA1_01,Section CTA1_01-2,BH1T17",...,"Unit QuadLinearFuncTrans,Section QLFuncTrans9,TRANSFORMN007","Unit QuadLinearFuncTrans,Section QLFuncTrans9,TRANSFORMN008","Unit QuadLinearFuncTrans,Section QLFuncTrans9,TRANSFORMN009","Unit QuadLinearFuncTrans,Section QLFuncTrans9,TRANSFORMN010","Unit QuadLinearFuncTrans,Section QLFuncTrans9,TRANSFORMN011","Unit QuadLinearFuncTrans,Section QLFuncTrans9,TRANSFORMN012","Unit QuadLinearFuncTrans,Section QLFuncTrans9,TRANSFORMN013","Unit QuadLinearFuncTrans,Section QLFuncTrans9,TRANSFORMN014","Unit QuadLinearFuncTrans,Section QLFuncTrans9,TRANSFORMN015","Unit QuadLinearFuncTrans,Section QLFuncTrans9,TRANSFORMN016"
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Problem Section and Problem Name and Step Name
cols = ["Problem Section", "Problem Name", "Step Name"]
df[joint_name(cols)] = df.apply(join_columns, axis=1, args=(cols,))
ps_pn_sn_features = one_dim_split(df, joint_name(cols))
ps_pn_sn_features.head()

Column Problem Section,Problem Name,Step Name has 75405 values, so we will add as many columns to this dataframe.


Unnamed: 0,"Section CTA1_01-1,LDEMO_WKST,R1C1","Section CTA1_01-1,LDEMO_WKST,R1C2","Section CTA1_01-1,LDEMO_WKST,R2C1","Section CTA1_01-1,LDEMO_WKST,R2C2","Section CTA1_01-1,LDEMO_WKST,R3C1","Section CTA1_01-1,LDEMO_WKST,R3C2","Section CTA1_01-1,LDEMO_WKST,R4C1","Section CTA1_01-1,LDEMO_WKST,R4C2","Section CTA1_01-1,LDEMO_WKST,R5C1","Section CTA1_01-1,LDEMO_WKST,R5C2",...,"Section QUADRATICS-SOLVING_ES-1,EG-S-FACTOR30,-2.38516481/10 = x","Section QUADRATICS-SOLVING_ES-1,EG-S-FACTOR30,5x^2-3x = 1","Section QUADRATICS-SOLVING_ES-1,EG-S-FACTOR30,5x^2-3x-1 = 0","Section QUADRATICS-SOLVING_ES-1,EG-S-FACTOR30,8.38516481/10 = x","Section QUADRATICS-SOLVING_ES-1,EG-S-FACTOR30,a-factor-node","Section QUADRATICS-SOLVING_ES-1,EG-S-FACTOR30,b-factor-node","Section QUADRATICS-SOLVING_ES-1,EG-S-FACTOR30,c-factor-node","Section QUADRATICS-SOLVING_ES-1,EG-S-FACTOR30,den1-factor-node","Section QUADRATICS-SOLVING_ES-1,EG-S-FACTOR30,num1-factor-node","Section QUADRATICS-SOLVING_ES-1,EG-S-FACTOR30,num2-factor-node"
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# Student Name and Problem Section and Problem Name and Step Name
cols = ["Anon Student Id", "Problem Unit", "Problem Section", "Problem Name"]
df[joint_name(cols)] = df.apply(join_columns, axis=1, args=(cols,))
sid_pu_ps_pn_features = one_dim_split(df, joint_name(cols))
sid_pu_ps_pn_features.head()

Column Anon Student Id,Problem Unit,Problem Section,Problem Name has 18699 values, so we will add as many columns to this dataframe.


Unnamed: 0,"02i5jCrfQK,Unit CTA1_01,Section CTA1_01-1,LDEMO_WKST","02i5jCrfQK,Unit CTA1_01,Section CTA1_01-2,BH1T08","02i5jCrfQK,Unit CTA1_01,Section CTA1_01-2,BH1T17C","02i5jCrfQK,Unit CTA1_01,Section CTA1_01-2,BH1T22A","02i5jCrfQK,Unit CTA1_01,Section CTA1_01-2,BH1T24","02i5jCrfQK,Unit CTA1_01,Section CTA1_01-2,BH1T26A","02i5jCrfQK,Unit CTA1_01,Section CTA1_01-2,RXMX1","02i5jCrfQK,Unit CTA1_01,Section CTA1_01-3,BH1T19A","02i5jCrfQK,Unit CTA1_01,Section CTA1_01-3,BH1T27B","02i5jCrfQK,Unit CTA1_01,Section CTA1_01-3,BH1T28C",...,"zfYl0YECr0,Unit ES_02,Section ES_02-3,EG34A","zfYl0YECr0,Unit ES_02,Section ES_02-3,EG35A","zfYl0YECr0,Unit ES_02,Section ES_02-3,EG35B","zfYl0YECr0,Unit ES_02,Section ES_02-4,EG34A","zfYl0YECr0,Unit ES_02,Section ES_02-4,EG35B","zfYl0YECr0,Unit ES_02,Section ES_02-4,EG35C","zfYl0YECr0,Unit ES_02,Section ES_02-5,EG40","zfYl0YECr0,Unit ES_02,Section ES_02-6,EG40","zfYl0YECr0,Unit ES_02,Section ES_02-7,EG41","zfYl0YECr0,Unit ES_02,Section ES_02-8,EG41"
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Problem Unit and Problem Section and Problem Name and Step Name
cols = ["Problem Unit", "Problem Section", "Problem Name", "Step Name"]
df[joint_name(cols)] = df.apply(join_columns, axis=1, args=(cols,))
pu_ps_pn_sn_features = one_dim_split(df, joint_name(cols))
pu_ps_pn_sn_features.head()

Column Problem Unit,Problem Section,Problem Name,Step Name has 75405 values, so we will add as many columns to this dataframe.


Unnamed: 0,"Unit CTA1_01,Section CTA1_01-1,LDEMO_WKST,R1C1","Unit CTA1_01,Section CTA1_01-1,LDEMO_WKST,R1C2","Unit CTA1_01,Section CTA1_01-1,LDEMO_WKST,R2C1","Unit CTA1_01,Section CTA1_01-1,LDEMO_WKST,R2C2","Unit CTA1_01,Section CTA1_01-1,LDEMO_WKST,R3C1","Unit CTA1_01,Section CTA1_01-1,LDEMO_WKST,R3C2","Unit CTA1_01,Section CTA1_01-1,LDEMO_WKST,R4C1","Unit CTA1_01,Section CTA1_01-1,LDEMO_WKST,R4C2","Unit CTA1_01,Section CTA1_01-1,LDEMO_WKST,R5C1","Unit CTA1_01,Section CTA1_01-1,LDEMO_WKST,R5C2",...,"Unit QuadLinearFuncTrans,Section QLFuncTrans9,TRANSFORMN016,ALGEBRAIC-TRANSFORMATIONS-refl-v","Unit QuadLinearFuncTrans,Section QLFuncTrans9,TRANSFORMN016,CurrFunctionText","Unit QuadLinearFuncTrans,Section QLFuncTrans9,TRANSFORMN016,GRAPHICAL-TRANSFORMATIONS-a","Unit QuadLinearFuncTrans,Section QLFuncTrans9,TRANSFORMN016,GRAPHICAL-TRANSFORMATIONS-h","Unit QuadLinearFuncTrans,Section QLFuncTrans9,TRANSFORMN016,GRAPHICAL-TRANSFORMATIONS-refl-v","Unit QuadLinearFuncTrans,Section QLFuncTrans9,TRANSFORMN016,ParentCurveChoice","Unit QuadLinearFuncTrans,Section QLFuncTrans9,TRANSFORMN016,ParentDescChoice","Unit QuadLinearFuncTrans,Section QLFuncTrans9,TRANSFORMN016,ParentFnChoice","Unit QuadLinearFuncTrans,Section QLFuncTrans9,TRANSFORMN016,TransformButton-a","Unit QuadLinearFuncTrans,Section QLFuncTrans9,TRANSFORMN016,TransformButton-h"
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# Student Name and Problem Section
cols = ["Anon Student Id", "Problem Section"]
df[joint_name(cols)] = df.apply(join_columns, axis=1, args=(cols,))
sid_ps_features = one_dim_split(df, joint_name(cols))
sid_ps_features.head()

Column Anon Student Id,Problem Section has 4760 values, so we will add as many columns to this dataframe.


Unnamed: 0,"02i5jCrfQK,Section CTA1_01-1","02i5jCrfQK,Section CTA1_01-2","02i5jCrfQK,Section CTA1_01-3","02i5jCrfQK,Section CTA1_02-1","02i5jCrfQK,Section CTA1_02-2","02i5jCrfQK,Section CTA1_02-3","02i5jCrfQK,Section CTA1_02-4","02i5jCrfQK,Section CTA1_04-1","02i5jCrfQK,Section CTA1_04-2","02i5jCrfQK,Section CTA1_06-1",...,"zfYl0YECr0,Section ES_01-7","zfYl0YECr0,Section ES_01-8","zfYl0YECr0,Section ES_02-1","zfYl0YECr0,Section ES_02-2","zfYl0YECr0,Section ES_02-3","zfYl0YECr0,Section ES_02-4","zfYl0YECr0,Section ES_02-5","zfYl0YECr0,Section ES_02-6","zfYl0YECr0,Section ES_02-7","zfYl0YECr0,Section ES_02-8"
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# Student Name and Problem Name
cols = ["Anon Student Id", "Problem Name"]
df[joint_name(cols)] = df.apply(join_columns, axis=1, args=(cols,))
sid_pn_features = one_dim_split(df, joint_name(cols))
sid_pn_features.head()

Column Anon Student Id,Problem Name has 16236 values, so we will add as many columns to this dataframe.


Unnamed: 0,"02i5jCrfQK,1PTB02","02i5jCrfQK,1PTB03","02i5jCrfQK,1PTB05","02i5jCrfQK,1PTB07","02i5jCrfQK,1PTB08","02i5jCrfQK,1PTB09","02i5jCrfQK,1PTFB10","02i5jCrfQK,1PTFB11","02i5jCrfQK,1PTFB13","02i5jCrfQK,1PTFB14",...,"zfYl0YECr0,PERCENT01C","zfYl0YECr0,PROP03","zfYl0YECr0,PROP04","zfYl0YECr0,PROP05","zfYl0YECr0,PROP07","zfYl0YECr0,PROP08","zfYl0YECr0,PROP12","zfYl0YECr0,REAL13C","zfYl0YECr0,REAL19B","zfYl0YECr0,REAL28"
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# Student Name and Step Name
cols = ["Anon Student Id", "Step Name"]
df[joint_name(cols)] = df.apply(join_columns, axis=1, args=(cols,))
sid_sn_features = one_dim_split(df, joint_name(cols))
sid_sn_features.head()

Column Anon Student Id,Step Name has 91699 values, so we will add as many columns to this dataframe.


Unnamed: 0,"02i5jCrfQK,(-1+8.54400375)/2 = x","02i5jCrfQK,(-1+sqrt(1-(-72)))/2 = x","02i5jCrfQK,(-1+sqrt(1-4*1*-18))/2 = x","02i5jCrfQK,(-1+sqrt(73))/2 = x","02i5jCrfQK,(-1-8.54400375)/2 = x","02i5jCrfQK,(-1-sqrt(1-(-72)))/2 = x","02i5jCrfQK,(-1-sqrt(1-4*1*-18))/2 = x","02i5jCrfQK,(-1-sqrt(73))/2 = x","02i5jCrfQK,(-1000+sqrt(1000000-4*-16*-1950))/-32 = x","02i5jCrfQK,(-1000+sqrt(1000000-4*-16*50))/-32 = x",...,"zfYl0YECr0,y/-3 = 1","zfYl0YECr0,y/-3-9 = -8","zfYl0YECr0,y/-3-9+9 = -8+9","zfYl0YECr0,y/-3-9+9 = 1","zfYl0YECr0,y/9 = 1","zfYl0YECr0,y/9-5 = -4","zfYl0YECr0,y/9-5+5 = -4+5","zfYl0YECr0,yMax","zfYl0YECr0,yMin","zfYl0YECr0,yScale"
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# Student Name and Problem Unit and Problem Section and Problem Name and Step Name
cols = ["Anon Student Id", "Problem Unit", "Problem Section", "Problem Name", "Step Name"]
df[joint_name(cols)] = df.apply(join_columns, axis=1, args=(cols,))
sid_pu_ps_pn_sn_features = one_dim_split(df, joint_name(cols))
sid_pu_ps_pn_sn_features.head()

Column Anon Student Id,Problem Unit,Problem Section,Problem Name,Step Name has 218050 values, so we will add as many columns to this dataframe.


Unnamed: 0,"02i5jCrfQK,Unit CTA1_01,Section CTA1_01-1,LDEMO_WKST,R1C1","02i5jCrfQK,Unit CTA1_01,Section CTA1_01-1,LDEMO_WKST,R1C2","02i5jCrfQK,Unit CTA1_01,Section CTA1_01-1,LDEMO_WKST,R2C1","02i5jCrfQK,Unit CTA1_01,Section CTA1_01-1,LDEMO_WKST,R2C2","02i5jCrfQK,Unit CTA1_01,Section CTA1_01-1,LDEMO_WKST,R3C1","02i5jCrfQK,Unit CTA1_01,Section CTA1_01-1,LDEMO_WKST,R3C2","02i5jCrfQK,Unit CTA1_01,Section CTA1_01-1,LDEMO_WKST,R4C1","02i5jCrfQK,Unit CTA1_01,Section CTA1_01-1,LDEMO_WKST,R4C2","02i5jCrfQK,Unit CTA1_01,Section CTA1_01-1,LDEMO_WKST,R5C1","02i5jCrfQK,Unit CTA1_01,Section CTA1_01-1,LDEMO_WKST,R5C2",...,"zfYl0YECr0,Unit ES_02,Section ES_02-7,EG41,y/-3-9+9 = -8+9","zfYl0YECr0,Unit ES_02,Section ES_02-7,EG41,y/-3-9+9 = 1","zfYl0YECr0,Unit ES_02,Section ES_02-8,EG41,(y/9)/(1/9) = 1/(1/9)","zfYl0YECr0,Unit ES_02,Section ES_02-8,EG41,5 = x/10","zfYl0YECr0,Unit ES_02,Section ES_02-8,EG41,5/(1/10) = (x/10)/(1/10)","zfYl0YECr0,Unit ES_02,Section ES_02-8,EG41,8 = x/10+3","zfYl0YECr0,Unit ES_02,Section ES_02-8,EG41,8-3 = x/10+3-3","zfYl0YECr0,Unit ES_02,Section ES_02-8,EG41,y/9 = 1","zfYl0YECr0,Unit ES_02,Section ES_02-8,EG41,y/9-5 = -4","zfYl0YECr0,Unit ES_02,Section ES_02-8,EG41,y/9-5+5 = -4+5"
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [84]:
# Student Name and KCs
def join_columns_with_kcs(row, cols):
    base = ",".join([str(row[col]) for col in cols])
    return "~~".join([base + "," + s for s in str(row["KC(Default)"]).split("~~")])
df["Anon Student Id,KC(Default)"] = df.apply(join_columns_with_kcs, axis=1, args=(["Anon Student Id"],))

### Knowledge Components

In [28]:
def get_kc_dummies(row, headers, col):
    tokens = row[col].split("~~")
    opps = np.asarray([int(s) if s.lower() != "nan" else 0 for s in row["Opportunity(Default)"].split("~~")])
    opps = np.log(opps + 1)
    sr = pd.Series(np.zeros(len(headers)), index = headers)
    sr[tokens] = opps
    return sr.to_sparse(fill_value=0)

In [29]:
compound_KCs = set(df["KC(Default)"])
print("KC(Default) has %d compound values." % len(compound_KCs))
KCs = set()
for s in compound_KCs:
    KCs = KCs.union(set(s.split("~~")))

print("There are %d atomic KCs, so we will be adding as many columns to this dataframe" % len(KCs))

# Cast and split columns
kc_features = df.apply(get_kc_dummies, axis=1, result_type="expand", args=(KCs, "KC(Default)"))
kc_features.head()

KC(Default) has 349 compound values.
There are 111 atomic KCs, so we will be adding as many columns to this dataframe


Unnamed: 0,[SkillRule: invert-mult; {ivm}],"Positive Constants, SIF","Write expression, ratio",Shading GLF equation with positive slope,Edit Algebraic a,"Find X, positive slope","[SkillRule: Eliminate Parens; {CLT nested; CLT nested, parens; Distribute Mult right; Distribute Mult left; (+/-x +/-a)/b=c, mult; (+/-x +/-a)*b=c, div; [var expr]/[const expr] = [const expr], multiply; Distribute Division left; Distribute Division right; Distribute both mult left; Distribute both mult right; Distribute both divide left; Distribute both divide right; Distribute subex}]",Shading SIF equation with negative slope,Entering a computed linear value,Placing coordinate point,...,"[SkillRule: ax+b=c, negative; ax+b=c, negative]",Choose Graphical a,"[SkillRule: Consolidate vars, no coeff; CLT]",Including the line when shading,Using small numbers,"Convert unit, multiplier",[SkillRule: Select Multiply; {MT; MT no fraction coeff}],"Find Y, negative slope","Entering x-intercept, SIF",Using difficult numbers
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.693147,0.0,0.0,0.0,0.0,0.693147
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
compound_KCs = set(df["Anon Student Id,KC(Default)"])
print("Anon Student Id,KC(Default) has %d compound values." % len(compound_KCs))
KCs = set()
for s in compound_KCs:
    KCs = KCs.union(set(s.split("~~")))

print("There are %d atomic KCs, so we will be adding as many columns to this dataframe" % len(KCs))

# Cast and split columns
sid_kc_features = df.apply(get_kc_dummies, axis=1, result_type="expand", args=(KCs, "Anon Student Id,KC(Default)"))
sid_kc_features.head()

Anon Student Id,KC(Default) has 9539 compound values.
There are 7255 atomic KCs, so we will be adding as many columns to this dataframe


### Numerical Features

In [30]:
# Problem View
def norm_problem_view(row):
    return math.log(int(row["Problem View"]) + 1)
pv_features = df.apply(norm_problem_view, axis=1)

In [31]:
# Opportunity
def norm_opportunity(row):
    opps = np.asarray([int(s) if s.lower() != "nan" else 0 for s in row["Opportunity(Default)"].split("~~")])
    return math.log(np.min(opps) + 1)
opp_features = df.apply(norm_opportunity, axis=1)

In [32]:
numerical_features = pd.concat((pv_features, opp_features), axis=1)

## Aggregation and Split

In [33]:
from scipy import sparse
features = (pu_features, ps_features
    , sn_features, sid_features, kc_features
    , numerical_features
    , sid_pu_features, pu_ps_features
    , ps_pn_features, pn_sn_features
    , sid_pu_ps_features, pu_ps_pn_features
    , ps_pn_sn_features, sid_pu_ps_pn_features
    , pu_ps_pn_sn_features
    , sid_ps_features, sid_pn_features
    , sid_sn_features, sid_pu_ps_pn_sn_features
)
aggdf = sparse.hstack([f.to_sparse(fill_value=0).to_coo().tocsr() for f in features], format="csr", dtype=float)

In [34]:
# Split training set and testing set
X = aggdf[0:len(traindf)]
Y = traindf["Correct First Attempt"]
print("Table X:", X.shape)
print("Table Y:", Y.shape)

Table X: (232744, 658691)
Table Y: (232744,)


In [35]:
X_ = aggdf[len(traindf):]
Y_ = testdf["Correct First Attempt"]
print("Table X_:", X_.shape)
print("Table Y_:", Y_.shape)

Table X_: (1140, 658691)
Table Y_: (1140,)


# Machine Learning

## Logistic Classification

In [36]:
from sklearn import linear_model
lr_model = linear_model.LogisticRegression(solver="liblinear", n_jobs=-1, max_iter=1000) # Enable all CPUs

In [37]:
lr_model.fit(X, Y)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='warn', n_jobs=-1, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

## Random Forest

In [322]:
from sklearn import ensemble
est_count = 100
rf_model = ensemble.RandomForestClassifier(n_estimators = est_count, criterion="entropy", max_depth=7, n_jobs=-1) # Enable all CPUs

In [323]:
rf_model.fit(X, Y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=7, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

## AdaBoost

In [80]:
from sklearn import ensemble
from sklearn import linear_model
ab_model = ensemble.AdaBoostClassifier(base_estimator=linear_model.LogisticRegression(solver="saga", n_jobs=-1, max_iter=1000))

In [341]:
ab_model.fit(X, Y)

AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=LogisticRegression(C=1.0, class_weight=None,
                                                     dual=False,
                                                     fit_intercept=True,
                                                     intercept_scaling=1,
                                                     l1_ratio=None,
                                                     max_iter=1000,
                                                     multi_class='warn',
                                                     n_jobs=-1, penalty='l2',
                                                     random_state=None,
                                                     solver='saga', tol=0.0001,
                                                     verbose=0,
                                                     warm_start=False),
                   learning_rate=1.0, n_estimators=50, random_state=None)

# Error Testing

In [39]:
# Root Mean Squared Error
# Here, we consider using numpy as a powerful
# utility to solve the RMSE
def RMSE(P, Y):
    P = P[~np.isnan(Y)]
    Y = Y[~np.isnan(Y)]
    return np.sqrt(np.sum(np.square(P - Y)) / len(Y))

In [40]:
# Logistic Classification

P = lr_model.predict_proba(X)[:, 1]
P_ = lr_model.predict_proba(X_)[:, 1]

print("Train Error:", RMSE(P, Y))
print("Test Error:", RMSE(P_, Y_))

Train Error: 0.22914622805505377
Test Error: 0.3443998310806608


In [54]:
# Generate submission
RES = P_[np.isnan(Y_)]
testdf.loc[np.isnan(testdf["Correct First Attempt"]), "Correct First Attempt"] = RES
testdf.to_csv("data/predict_result.csv", sep='\t', index=False)

In [40]:
# Save
pd.Series(P).to_csv("data/sparse_train.csv", sep='\t', header=["sparse_res"])
pd.Series(P_).to_csv("data/sparse_test.csv", sep='\t', header=["sparse_res"])

In [324]:
# Random Forest

P = rf_model.predict_proba(X)[:, 1]
P_ = rf_model.predict_proba(X_)[:, 1]

print("Train Error:", RMSE(P, Y))
print("Test Error:", RMSE(P_, Y_))

Train Error: 0.4114884197701772
Test Error: 0.3910843073433951


In [342]:
# AdaBoost

P = ab_model.predict_proba(X)[:, 1]
P_ = ab_model.predict_proba(X_)[:, 1]

print("Train Error:", RMSE(P, Y))
print("Test Error:", RMSE(P_, Y_))

Train Error: 0.49575808621765616
Test Error: 0.4954684100106574
