# Machine Learning

In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from utils import *

## Data Loading

### For separate data

In [None]:
# Load the basic training table.
# read_csv with sep='\t' is the exact equivalent of read_table (tab-separated
# input) and avoids the FutureWarning that read_table raises on pandas 0.24/0.25.
train_filepath = 'data/train.csv'
traindf = pd.read_csv(train_filepath, sep='\t')

In [None]:
# CFAR features for the training split: read the tab-separated feature file and
# keep only the columns listed in CFAR_features.
# NOTE(review): CFAR_features is assigned in a later cell of this notebook; run
# that cell first unless `utils` already provides the name.
# read_csv(sep='\t') replaces read_table, which is deprecated on pandas 0.24/0.25.
train_filepath = 'data/CFAR_features_train.csv'
train_cfar = pd.read_csv(train_filepath, sep='\t')[CFAR_features]

In [None]:
# CFAREX features for the training split: read the tab-separated feature file
# and keep only the columns listed in CFAREX_features.
# NOTE(review): CFAREX_features is assigned in a later cell of this notebook; run
# that cell first unless `utils` already provides the name.
# read_csv(sep='\t') replaces read_table, which is deprecated on pandas 0.24/0.25.
train_filepath = 'data/cfarex_features_train.csv'
train_cfarex = pd.read_csv(train_filepath, sep='\t')[CFAREX_features]

In [None]:
# Ability features for the training split: read the tab-separated feature file
# and keep only the columns listed in ability_features.
# NOTE(review): ability_features is assigned in a later cell of this notebook;
# run that cell first unless `utils` already provides the name.
# read_csv(sep='\t') replaces read_table, which is deprecated on pandas 0.24/0.25.
train_filepath = 'data/ability_features_train.csv'
train_abt = pd.read_csv(train_filepath, sep='\t')[ability_features]

### for aggregate data

In [205]:
# Load the pre-aggregated training table (base columns plus engineered features).
# This rebinds traindf, replacing whatever the "separate data" cells loaded.
# read_csv(sep='\t') replaces read_table, which is deprecated on pandas 0.24/0.25.
train_filepath = 'data/agg_train.csv'
traindf = pd.read_csv(train_filepath, sep='\t')

  


In [5]:
def f_norm(row, col):
    """Squash the value stored at ``row[col]`` with ``x / (x + 1)``.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series for one DataFrame row)
        Object indexable by ``col``.
    col : hashable
        Key / column name of the value to normalise.

    Returns
    -------
    float
        ``x / (x + 1)`` when the value is a Python or NumPy integer/float,
        otherwise ``0.0`` (strings, ``None``, ...).
    """
    value = row[col]
    # NumPy integer scalars (np.int64, ...) are not subclasses of Python's int,
    # so they have to be listed explicitly; otherwise integer values coming
    # from a numeric-only row would silently be mapped to 0.0.
    if isinstance(value, (int, float, np.integer, np.floating)):
        return value / (value + 1)
    return 0.0

# Derive the normalised columns row by row with f_norm:
# (new column name, source column name)
for norm_col, raw_col in (
    ("Problem View(Norm)", "Problem View"),
    ("Opportunity(Norm)", "Opportunity(Default)"),
):
    traindf[norm_col] = traindf.apply(f_norm, axis=1, args=(raw_col,))

In [118]:
# Attach the separately loaded feature frames to the base table, side by side.
# Re-running this cell appends the same feature columns again, since traindf is
# rebound to the concatenated result.
train_parts = [traindf, train_cfar, train_cfarex, train_abt]
traindf = pd.concat(train_parts, axis=1)

In [206]:
# Preview the first row to check which columns the loaded table contains.
traindf.head(1)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Row,Anon Student Id,Problem Hierarchy,Problem Name,Problem View,Step Name,Step Start Time,...,"CFA | Anon Student Id,Step Name",ability from KC and Frequency,ability from KC and Hints,Date and Time,Date,KC History Today,KC History Yesterday,KC History Week,Unit,"CFA | Anon Student Id,Unit"
0,0,0,0,9938,52vEY7f17k,"Unit CTA1_13, Section CTA1_13-1",PROP04,1,R1C1,2005-09-09 12:23:34.0,...,0.949153,0.049317,0.77384,2005-09-09 12:23:34,2005-09-09 00:00:00,19,0.0,0.0,Unit CTA1_13,0.762376


In [278]:
# Feature groups. nameOfCFAR turns a list of grouping columns into the name of
# the matching feature column (e.g. "CFA | Anon Student Id,Unit").
#
# Groupings tried in earlier experiments, kept for reference:
#   CFAR   : ["Anon Student Id"], ["Step Name"], ["Problem Name"], ["KC(Default)"],
#            ["Anon Student Id", "Problem Name"]
#   CFAREX : ["Problem Name", "Step Name"], ["Anon Student Id", "Step Name"]
CFAR_features = list(map(nameOfCFAR, [
    ["Anon Student Id"],
    ["Anon Student Id", "Unit"],
]))
CFAREX_features = list(map(nameOfCFAR, [
    ["Anon Student Id", "Step Name"],
]))
ability_features = ["ability from KC and Frequency", "ability from KC and Hints"]
temporal_features = ['KC History Today', 'KC History Yesterday']
numerical_features = ["Problem View(Norm)", "Opportunity(Norm)"]

# Columns actually fed to the models, in this order. The CFAREX and ability
# groups are defined above but currently left out of the model input.
features = CFAR_features + numerical_features + temporal_features
labels = ["Correct First Attempt"]

In [279]:
# Keep only the rows where every model input and the target are present,
# then build the feature matrix X and the flat float label vector Y.
required_cols = features + labels
traindf = traindf.dropna(axis=0, subset=required_cols)
X = np.array(traindf[features])
Y = np.array(traindf[labels]).astype(float).reshape(-1)

## Decision Tree

In [201]:
from sklearn import tree
# Depth-limited decision tree. scikit-learn permutes the candidate features at
# every split, so equally good splits can be resolved differently from run to
# run; a fixed random_state makes the fitted tree reproducible.
dt_model = tree.DecisionTreeClassifier(max_depth=10, random_state=42)

In [202]:
# Fit the decision tree on the training feature matrix X and label vector Y.
dt_model = dt_model.fit(X, Y)

## Random Forest

In [280]:
from sklearn import ensemble
est_count = 100
# Random forest of est_count entropy-criterion trees, each limited to depth 12.
# The forest draws bootstrap samples and feature subsets at random, so a fixed
# random_state is required for the scores and feature importances reported
# below to be reproducible across runs.
rf_model = ensemble.RandomForestClassifier(
    n_estimators=est_count,
    criterion="entropy",
    max_depth=12,
    random_state=42,
)

In [281]:
# Fit the random forest on the training feature matrix X and label vector Y.
rf_model = rf_model.fit(X, Y)

## Logistic Regression

In [294]:
from sklearn import linear_model
# Logistic regression with scikit-learn's default settings (no arguments are
# overridden here).
lr_model = linear_model.LogisticRegression()

In [295]:
# Fit the logistic regression on the training feature matrix X and label vector Y.
lr_model = lr_model.fit(X, Y)



# Testing

In [298]:
# Root Mean Squared Error, computed with vectorised NumPy operations.
def RMSE(P, Y):
    """Return the root mean squared error between predictions P and targets Y.

    The squared differences ``P - Y`` are summed and divided by ``len(Y)``
    before taking the square root.
    """
    residuals = P - Y
    total_squared_error = np.sum(np.square(residuals))
    return np.sqrt(total_squared_error / len(Y))

## Data Loading

In [209]:
# Load the pre-aggregated test table.
# read_csv with sep='\t' is the exact equivalent of read_table (tab-separated
# input) and avoids the FutureWarning that read_table raises on pandas 0.24/0.25.
test_filepath = 'data/agg_test.csv'
testdf = pd.read_csv(test_filepath, sep='\t')

  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
# CFAR features for the test split: read the tab-separated feature file and
# keep only the columns listed in CFAR_features.
# read_csv(sep='\t') replaces read_table, which is deprecated on pandas 0.24/0.25.
test_filepath = 'data/CFAR_features_test.csv'
test_cfar = pd.read_csv(test_filepath, sep='\t')[CFAR_features]

In [None]:
# CFAREX features for the test split: read the tab-separated feature file and
# keep only the columns listed in CFAREX_features.
# Fix: this cell previously read 'data/cfarex_features_train.csv', so the test
# frame received training features. The test file name follows the naming of
# the other *_train / *_test feature files -- confirm it exists under data/.
# read_csv(sep='\t') replaces read_table, which is deprecated on pandas 0.24/0.25.
test_filepath = 'data/cfarex_features_test.csv'
test_cfarex = pd.read_csv(test_filepath, sep='\t')[CFAREX_features]

In [None]:
# Ability features for the test split: read the tab-separated feature file and
# keep only the columns listed in ability_features.
# read_csv(sep='\t') replaces read_table, which is deprecated on pandas 0.24/0.25.
test_filepath = 'data/ability_features_test.csv'
test_abt = pd.read_csv(test_filepath, sep='\t')[ability_features]

In [None]:
# Derive the same normalised columns for the test table as for the training
# table, row by row with f_norm: (new column name, source column name)
for norm_col, raw_col in (
    ("Problem View(Norm)", "Problem View"),
    ("Opportunity(Norm)", "Opportunity(Default)"),
):
    testdf[norm_col] = testdf.apply(f_norm, axis=1, args=(raw_col,))

In [None]:
# Attach the separately loaded test feature frames to the base test table,
# side by side. Re-running this cell appends the same feature columns again,
# since testdf is rebound to the concatenated result.
test_parts = [testdf, test_cfar, test_cfarex, test_abt]
testdf = pd.concat(test_parts, axis=1)

In [299]:
# Discard test rows that lack a value in any model input or in the target.
needed_cols = features + labels
testdf = testdf.dropna(axis=0, subset=needed_cols)
# Feature matrix X_ and flat float label vector Y_ used for evaluation.
X_ = np.array(testdf[features])
Y_ = np.array(testdf[labels]).astype(float).reshape(-1)

In [300]:
# Score the test rows with the logistic regression; rf_model or dt_model can be
# substituted here, as both are fitted on the same X, Y above.
# Column 1 of predict_proba is the probability of the model's second class
# (label 1.0 when both labels occur in the training data).
P = lr_model.predict_proba(X_)[:, 1]
print(RMSE(P, Y_))

# Random-forest feature importances, plus their spread across the individual
# trees of the forest. Note that the RMSE above is for lr_model, while this
# ranking always describes rf_model.
importances = rf_model.feature_importances_
n_feats = len(features)
per_tree_importances = [est.feature_importances_ for est in rf_model.estimators_]
feat_std = np.std(per_tree_importances, axis=0)
indices = np.argsort(importances)[::-1]  # most important feature first

# Print the feature ranking
print("\nFeature ranking:")

row_fmt = '{0:2} - {1:20}: {2:5.4f} (std: {3:5.4f})'
for f in range(n_feats):
    idx = indices[f]
    print(row_fmt.format(f + 1, features[idx], importances[idx], feat_std[idx]))

0.39716456811411466

Feature ranking:
 1 - CFA | Anon Student Id,Unit: 0.3573 (std: 0.0482)
 2 - KC History Today    : 0.2617 (std: 0.0171)
 3 - Opportunity(Norm)   : 0.1990 (std: 0.0125)
 4 - CFA | Anon Student Id: 0.1362 (std: 0.0379)
 5 - KC History Yesterday: 0.0458 (std: 0.0041)


In [284]:
# Mean accuracy of the random forest on the test feature matrix and labels.
rf_model.score(X_,Y_)

0.7936507936507936

In [285]:
# Mean accuracy of the random forest on the data it was trained on, for
# comparison with the test accuracy above.
rf_model.score(X,Y)

0.7979969408448768