# Machine Learning

In [5]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from utils import *

In [311]:
CFAR_features = [ nameOfCFAR(v) for v in [
    ["Anon Student Id"], ["Step Name"], ["Problem Name"], ["KC(Default)"]
    , ["Anon Student Id", "Problem Name"], ["Anon Student Id", "KC(Default)"]
]]
CFAREX_features = [ nameOfCFAR(v) for v in [
    ["Problem Name", "Step Name"]
    , ["Anon Student Id", "Step Name"]
]]
ability_features = ["ability from KC and Frequency", "ability from KC and Hints"]
numerical_features = ["Problem View(Norm)", "Opportunity(Norm)"]
features = CFAR_features + CFAREX_features + numerical_features
labels = ["Correct First Attempt"]

## Data Loading

In [312]:
# Get the basic data
train_filepath = 'data/train.csv'
traindf = pd.read_table(train_filepath)

  This is separate from the ipykernel package so we can avoid doing imports until


In [313]:
# CFAR features
train_filepath = 'data/CFAR_features_train.csv'
train_cfar = pd.read_table(train_filepath)[CFAR_features]

  This is separate from the ipykernel package so we can avoid doing imports until


In [314]:
# CFAREX features
train_filepath = 'data/cfarex_features_train.csv'
train_cfarex = pd.read_table(train_filepath)[CFAREX_features]

  This is separate from the ipykernel package so we can avoid doing imports until


In [315]:
# Ability features
train_filepath = 'data/ability_features_train.csv'
train_abt = pd.read_table(train_filepath)[ability_features]

  This is separate from the ipykernel package so we can avoid doing imports until


In [316]:
def f_norm(row, col):
    if isinstance(row[col], int) or isinstance(row[col], float):
        return row[col] / (row[col] + 1)
    else:
        return 0.0

traindf["Problem View(Norm)"] = traindf.apply(f_norm, axis="columns", args=("Problem View",))
traindf["Opportunity(Norm)"] = traindf.apply(f_norm, axis="columns", args=("Opportunity(Default)",))

In [317]:
# Concat
traindf = pd.concat([traindf, train_cfar, train_cfarex, train_abt], axis="columns")

In [318]:
traindf.head(100)

Unnamed: 0,Row,Anon Student Id,Problem Hierarchy,Problem Name,Problem View,Step Name,Step Start Time,First Transaction Time,Correct Transaction Time,Step End Time,...,CFA | Anon Student Id,CFA | Step Name,CFA | Problem Name,CFA | KC(Default),"CFA | Anon Student Id,Problem Name","CFA | Anon Student Id,KC(Default)","CFA | Problem Name,Step Name","CFA | Anon Student Id,Step Name",ability from KC and Frequency,ability from KC and Hints
0,9938,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",PROP04,1,R1C1,2005-09-09 12:23:34.0,2005-09-09 12:24:07.0,2005-09-09 12:24:07.0,2005-09-09 12:24:07.0,...,0.748749,0.840631,0.710197,0.000000,0.666667,0.000000,0.875000,0.949153,0.049317,0.773840
1,9939,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",PROP04,1,R1C2,2005-09-09 12:24:07.0,2005-09-09 12:24:22.0,2005-09-09 12:24:22.0,2005-09-09 12:24:22.0,...,0.748749,0.830699,0.710197,0.000000,0.666667,0.000000,0.638298,0.931034,0.049317,0.773840
2,9940,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",PROP04,1,R3C1,2005-09-09 12:24:22.0,2005-09-09 12:25:16.0,2005-09-09 12:25:40.0,2005-09-09 12:25:40.0,...,0.748749,0.966979,0.710197,0.966979,0.666667,0.965517,0.888889,0.965517,0.031347,0.950820
3,9941,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",PROP04,1,R3C2,2005-09-09 12:25:40.0,2005-09-09 12:25:51.0,2005-09-09 12:27:24.0,2005-09-09 12:27:24.0,...,0.748749,0.404477,0.710197,0.443541,0.666667,0.413793,0.382979,0.377049,0.016603,0.285898
4,9942,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",PROP04,1,R4C1,2005-09-09 12:27:24.0,2005-09-09 12:27:30.0,2005-09-09 12:27:30.0,2005-09-09 12:27:30.0,...,0.748749,0.760107,0.710197,0.817953,0.666667,0.815287,0.739130,0.697674,0.007059,0.592248
5,9943,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",PROP04,1,R5C1,2005-09-09 12:27:30.0,2005-09-09 12:27:41.0,2005-09-09 12:27:41.0,2005-09-09 12:27:41.0,...,0.748749,0.755559,0.710197,0.817953,0.666667,0.815287,0.955556,0.642857,0.007059,0.592248
6,9944,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",PROP04,1,R6C1,2005-09-09 12:27:41.0,2005-09-09 12:28:41.0,2005-09-09 12:28:41.0,2005-09-09 12:28:41.0,...,0.748749,0.694107,0.710197,0.817953,0.666667,0.815287,0.822222,0.738095,0.007059,0.592248
7,9945,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",PROP04,1,R7C2,2005-09-09 12:28:50.0,2005-09-09 12:28:58.0,2005-09-09 12:28:58.0,2005-09-09 12:28:58.0,...,0.748749,0.757615,0.710197,0.817953,0.666667,0.815287,0.888889,0.814815,0.007059,0.592248
8,9946,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",PROP04,1,ValidEquations,2005-09-09 12:31:39.0,2005-09-09 12:31:52.0,2005-09-09 12:31:52.0,2005-09-09 12:31:52.0,...,0.748749,0.880437,0.710197,0.000000,0.666667,0.000000,0.880952,0.763158,0.049317,0.773840
9,9947,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",PROP04,1,7/10*X = 1400,2005-09-09 12:31:52.0,2005-09-09 12:32:28.0,2005-09-09 12:32:35.0,2005-09-09 12:32:35.0,...,0.748749,0.571429,0.710197,0.000000,0.666667,0.000000,0.571429,0.000000,0.049317,0.773840


In [319]:
traindf = traindf.dropna(axis="index", subset=features+labels)
X = np.array(traindf[features])
Y = np.array(traindf[labels]).astype(float).ravel()

## Decision Tree

In [233]:
from sklearn import tree
dt_model = tree.DecisionTreeClassifier(max_depth=3)

In [234]:
dt_model = dt_model.fit(X, Y)

## Random Forest

In [338]:
from sklearn import ensemble
est_count = 100
rf_model = ensemble.RandomForestClassifier(n_estimators = est_count, criterion="entropy", max_depth=200)

In [339]:
rf_model = rf_model.fit(X, Y)

## Logistic Regression

In [264]:
from sklearn import linear_model
lr_model = linear_model.LogisticRegression()

In [265]:
lr_model = lr_model.fit(X, Y)



# Testing

In [31]:
# Root Mean Squared Error
# Here, we consider using numpy as a powerful
# utility to solve the RMSE
def RMSE(P, Y):
    return np.sqrt(np.sum(np.square(P - Y)) / len(Y))

# Data Loading

In [322]:
# Get the basic test data
test_filepath = 'data/test.csv'
testdf = pd.read_table(test_filepath)

  This is separate from the ipykernel package so we can avoid doing imports until


In [323]:
# CFAR features
test_filepath = 'data/CFAR_features_test.csv'
test_cfar = pd.read_table(test_filepath)[CFAR_features]

  This is separate from the ipykernel package so we can avoid doing imports until


In [324]:
# CFAREX features
test_filepath = 'data/cfarex_features_train.csv'
test_cfarex = pd.read_table(test_filepath)[CFAREX_features]

  This is separate from the ipykernel package so we can avoid doing imports until


In [325]:
# Ability features
test_filepath = 'data/ability_features_test.csv'
test_abt = pd.read_table(test_filepath)[ability_features]

  This is separate from the ipykernel package so we can avoid doing imports until


In [326]:
testdf["Problem View(Norm)"] = testdf.apply(f_norm, axis="columns", args=("Problem View",))
testdf["Opportunity(Norm)"] = testdf.apply(f_norm, axis="columns", args=("Opportunity(Default)",))

In [327]:
# Concat
testdf = pd.concat([testdf, test_cfar, test_cfarex, test_abt], axis="columns")

In [329]:
testdf.head(100)

Unnamed: 0,Row,Anon Student Id,Problem Hierarchy,Problem Name,Problem View,Step Name,Step Start Time,First Transaction Time,Correct Transaction Time,Step End Time,...,CFA | Anon Student Id,CFA | Step Name,CFA | Problem Name,CFA | KC(Default),"CFA | Anon Student Id,Problem Name","CFA | Anon Student Id,KC(Default)","CFA | Problem Name,Step Name","CFA | Anon Student Id,Step Name",ability from KC and Frequency,ability from KC and Hints
0,10039.0,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",PROP01,1.0,2/300*X = 10,,,,,...,0.820012,0.755559,0.777679,0.686542,0.750000,0.631579,0.875000,0.949153,0.049317,0.773840
1,10482.0,52vEY7f17k,"Unit CTA1_06, Section CTA1_06-3",FOR05,1.0,xScale,,,,,...,0.820012,0.870237,0.885030,0.000000,0.809524,0.000000,0.638298,0.931034,0.029439,1.000000
2,11024.0,52vEY7f17k,"Unit ES_03, Section ES_03-6",EG52,1.0,2.9 = -2x+3.7+x,,,,,...,0.820012,0.818575,0.763982,0.775749,0.833333,0.904762,0.888889,0.965517,0.028058,0.640375
3,11476.0,52vEY7f17k,"Unit CTA1_08, Section CTA1_08-3",REAL10,1.0,yScale,,,,,...,0.820012,0.761905,0.741007,0.865864,0.000000,0.875000,0.382979,0.377049,0.029439,1.000000
4,11948.0,52vEY7f17k,"Unit ES_04, Section ES_04-12",EG58,7.0,xR2,,,,,...,0.820012,0.404477,0.806701,0.460606,1.000000,0.125000,0.739130,0.697674,0.049317,0.773840
5,12532.0,52vEY7f17k,"Unit CTA1_10, Section CTA1_10-5",DIST05_SP,1.0,R1C2,,,,,...,0.820012,0.769231,0.821018,0.787275,0.000000,0.827586,0.955556,0.642857,0.049317,0.773840
6,12955.0,52vEY7f17k,"Unit ES_07, Section ES_07-4",LIT69A,4.0,z*u-n*j+z*g = 16n,,,,,...,0.820012,0.445225,0.667472,0.518980,0.625000,0.666667,0.822222,0.738095,0.021182,0.355055
7,19200.0,6W08a98ZQV,"Unit ES_02, Section ES_02-8",EG41,2.0,FinalAnswer,,,,,...,0.707545,0.787045,0.823810,0.787045,1.000000,0.684211,0.888889,0.814815,0.000000,0.000000
8,19384.0,6W08a98ZQV,"Unit CTA1_13, Section CTA1_13-1",PROP10,1.0,R1C2,,,,,...,0.707545,0.000000,0.784946,0.912086,1.000000,0.982759,0.880952,0.763158,0.051550,0.709413
9,19835.0,6W08a98ZQV,"Unit CTA1_06, Section CTA1_06-3",FOR05,2.0,XLabel,,,,,...,0.707545,0.880437,0.829431,0.000000,0.777778,0.000000,0.571429,0.000000,0.023543,0.707836


In [330]:
# Drop unknown values
testdf = testdf.dropna(axis="index", subset=features+labels)
# Extract used columns(features)
X_ = np.array(testdf[features])
Y_ = np.array(testdf[labels]).astype(float).ravel()

In [340]:
P = rf_model.predict_proba(X_)
P = P[:, 1]
print(RMSE(P, Y_))

importances = rf_model.feature_importances_
n_feats = len(features)
feat_std = np.std([tree.feature_importances_ for tree in rf_model.estimators_], axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("\nFeature ranking:")

for f in range(n_feats):
    print('{0:2} - {1:20}: {2:5.4f} (std: {3:5.4f})'.format(
        f+1
        , features[indices[f]]
        , importances[indices[f]]
        , feat_std[indices[f]]
    ))

0.5297234430911449

Feature ranking:
 1 - CFA | Problem Name,Step Name: 0.3072 (std: 0.1163)
 2 - CFA | Anon Student Id,Step Name: 0.2335 (std: 0.1112)
 3 - CFA | Anon Student Id,Problem Name: 0.1055 (std: 0.0161)
 4 - CFA | Step Name     : 0.0999 (std: 0.0880)
 5 - CFA | Anon Student Id,KC(Default): 0.0949 (std: 0.0397)
 6 - CFA | Problem Name  : 0.0517 (std: 0.0041)
 7 - CFA | Anon Student Id: 0.0432 (std: 0.0016)
 8 - CFA | KC(Default)   : 0.0368 (std: 0.0219)
 9 - Problem View(Norm)  : 0.0274 (std: 0.0014)
10 - Opportunity(Norm)   : 0.0000 (std: 0.0000)
