# Machine Learning

In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from utils import *

## Data Loading

In [460]:
# Locations of the aggregated feature tables.
train_filepath = 'data/agg_train.csv'
test_filepath = 'data/agg_test_666.csv'

# Both files are parsed as tab-separated text.
traindf = pd.read_csv(train_filepath, sep='\t')
testdf = pd.read_csv(test_filepath, sep='\t')


  
  """


## For the ensemble, we load the results from the sparse model

In [504]:
# Locations of the sparse-model outputs used as an extra ensemble feature.
sparse_train_filepath = 'data/sparse_train.csv'
sparse_test_filepath = 'data/sparse_test.csv'

# Both files are parsed as tab-separated text.
sparse_train = pd.read_csv(sparse_train_filepath, sep='\t')
sparse_test = pd.read_csv(sparse_test_filepath, sep='\t')

  
  """


In [505]:
# Copy the sparse model's output column onto the matching aggregated frame
# (pandas aligns the assigned Series on the row index).
for frame, sparse_frame in ((traindf, sparse_train), (testdf, sparse_test)):
    frame['sparse_res'] = sparse_frame['sparse_res']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [541]:
# Column groups fed to the models. An earlier experiment (previously kept
# here as commented-out code) also built CFAR columns for "Step Name",
# "Problem Name", "KC(Default)" and several student/problem/step pairings;
# only the two student-level groupings below are currently used.
CFAR_features = [
    nameOfCFAR(group)
    for group in (
        ["Anon Student Id"],
        ["Anon Student Id", "Unit"],
    )
]
ability_features = ["ability from KC and Frequency", "ability from KC and Hints"]
temporal_features = ['KC History Today', 'KC History Yesterday']
numerical_features = ["Problem View(Norm)", "Opportunity(Norm)"]
sparse_features = ['sparse_res']

# Final column order of the feature matrix.
features = (
    CFAR_features
    + numerical_features
    + temporal_features
    + ability_features
    + sparse_features
)

# Target column.
labels = ["Correct First Attempt"]

In [542]:
# Discard training rows with a missing value in any model input or the target.
traindf = traindf.dropna(axis=0, subset=features + labels)

# Feature matrix, plus the target flattened to a 1-D float vector.
X = np.array(traindf.loc[:, features])
Y = np.ravel(np.array(traindf.loc[:, labels]).astype(float))

# Different Models

## Decision Tree

In [560]:
from sklearn import tree

# Depth-1 tree: a single split.
# random_state is pinned because scikit-learn permutes the candidate features
# at every split, so equally good splits could otherwise be chosen differently
# from run to run.
dt_model = tree.DecisionTreeClassifier(max_depth=1, random_state=0)

In [561]:
# Train the decision tree on the training matrix; the result of fit() is
# bound back to dt_model, which is later used for predict_proba.
dt_model = dt_model.fit(X, Y)

## Random Forest

In [550]:
from sklearn import ensemble

# Number of trees in the forest.
est_count = 100

# Shallow entropy-based forest. random_state is pinned so the bootstrap
# samples and per-split feature subsets, and therefore the scores and feature
# importances reported further down, are reproducible across kernel restarts.
rf_model = ensemble.RandomForestClassifier(
    n_estimators=est_count,
    criterion="entropy",
    max_depth=3,
    random_state=0,
)

In [551]:
# Train the random forest on the training matrix; the result of fit() is
# bound back to rf_model, which is later used for scoring and importances.
rf_model = rf_model.fit(X, Y)

## Logistic Regression

In [514]:
from sklearn import linear_model
# Logistic regression built with the library's default settings (no arguments passed).
lr_model = linear_model.LogisticRegression()

In [515]:
# Train the logistic regression on the training matrix; the result of fit()
# is bound back to lr_model.
lr_model = lr_model.fit(X, Y)



# Testing

In [298]:
# Root Mean Squared Error (RMSE).
# numpy's vectorized operations compute it over the
# whole prediction array in one pass.
def RMSE(P, Y):
    """Return the root mean squared error between predictions and targets.

    Parameters
    ----------
    P : array-like
        Predicted values.
    Y : array-like
        True values; must hold the same number of elements as ``P``.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((P - Y) ** 2))`` over all elements.

    Raises
    ------
    ValueError
        If ``P`` and ``Y`` do not contain the same number of elements.
    """
    # Flatten both inputs first: subtracting an (n,) array from an (n, 1)
    # array would otherwise broadcast to an (n, n) matrix and silently
    # produce a wrong error value. asarray also lets plain lists through.
    predicted = np.asarray(P, dtype=float).ravel()
    actual = np.asarray(Y, dtype=float).ravel()
    if predicted.shape != actual.shape:
        raise ValueError(
            "RMSE needs the same number of predictions and targets, got "
            "{0} and {1}".format(predicted.size, actual.size)
        )
    return np.sqrt(np.mean(np.square(predicted - actual)))

# Data Loading

In [371]:
# Get the basic test data
test_filepath = 'data/agg_test.csv'
testdf = pd.read_table(test_filepath)

# Reloading replaces the frame that had the ensemble column attached, while
# `features` still lists 'sparse_res'. Without the column, the dropna and
# feature extraction that follow fail when the notebook is run top to bottom,
# so attach it again here.
# NOTE(review): this path differs from the 'data/agg_test_666.csv' loaded at
# the top of the notebook. Confirm which file the sparse predictions in
# `sparse_test` were generated for, since rows are aligned on the index.
if 'sparse_res' not in testdf.columns:
    testdf['sparse_res'] = sparse_test['sparse_res']

  This is separate from the ipykernel package so we can avoid doing imports until


In [552]:
# Discard test rows with a missing value in any model input or the target.
testdf = testdf.dropna(axis=0, subset=features + labels)

# Feature matrix, plus the target flattened to a 1-D float vector,
# built the same way as for the training data.
X_ = np.array(testdf.loc[:, features])
Y_ = np.ravel(np.array(testdf.loc[:, labels]).astype(float))

In [562]:
# Score the test set with the decision tree. To evaluate another model,
# replace dt_model with rf_model or lr_model on the line below.
# (An earlier variant took np.min(P, axis=1) instead of column 1.)
P = dt_model.predict_proba(X_)[:, 1]
print(RMSE(P, Y_))

# The feature ranking below comes from the random forest, not from the
# model scored above.
importances = rf_model.feature_importances_
n_feats = len(features)
# Spread of each feature's importance across the individual trees.
feat_std = np.std(
    [estimator.feature_importances_ for estimator in rf_model.estimators_],
    axis=0,
)
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("\nFeature ranking:")

row_template = '{0:2} - {1:20}: {2:5.4f} (std: {3:5.4f})'
for rank in range(1, n_feats + 1):
    idx = indices[rank - 1]
    print(row_template.format(rank, features[idx], importances[idx], feat_std[idx]))

0.3912513808354022

Feature ranking:
 1 - sparse_res          : 0.7862 (std: 0.2875)
 2 - ability from KC and Hints: 0.1575 (std: 0.2238)
 3 - KC History Today    : 0.0217 (std: 0.0525)
 4 - CFA | Anon Student Id,Unit: 0.0209 (std: 0.0421)
 5 - ability from KC and Frequency: 0.0083 (std: 0.0208)
 6 - CFA | Anon Student Id: 0.0051 (std: 0.0195)
 7 - Opportunity(Norm)   : 0.0004 (std: 0.0012)
 8 - Problem View(Norm)  : 0.0000 (std: 0.0001)
 9 - KC History Yesterday: 0.0000 (std: 0.0000)


In [416]:
# Score of the random forest on the test split (X_, Y_).
rf_model.score(X_,Y_)

0.8063492063492064

In [405]:
# Score of the random forest on the training split (X, Y), for comparison
# with the test-split score.
rf_model.score(X,Y)

0.8333477172693536