In [91]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.metrics import precision_score, recall_score, classification_report, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler

In [92]:
events_data = pd.read_csv("event_data_train.csv")
submissions_data = pd.read_csv("submissions_data_train.csv")

In [93]:
events_data['discovered'] = events_data['action'].apply(lambda action: 1 if (action == 'discovered') else 0)
events_data['passed'] = events_data['action'].apply(lambda action: 1 if (action == 'passed') else 0)
events_data['viewed'] = events_data['action'].apply(lambda action: 1 if (action == 'viewed') else 0)
events_data['started_attempt'] = events_data['action'].apply(lambda action: 1 if (action == 'started_attempt') else 0)
events_data.head()

Unnamed: 0,step_id,timestamp,action,user_id,discovered,passed,viewed,started_attempt
0,32815,1434340848,viewed,17632,0,0,1,0
1,32815,1434340848,passed,17632,0,1,0,0
2,32815,1434340848,discovered,17632,1,0,0,0
3,32811,1434340895,discovered,17632,1,0,0,0
4,32811,1434340895,viewed,17632,0,0,1,0


In [94]:
learning_time = 2*24*60*60
user_time = events_data.groupby('user_id', as_index=False).agg({'timestamp':'min'})
user_time['end_time'] = user_time['timestamp'] + learning_time
user_time=user_time.drop('timestamp', axis=1)
user_time.head()

Unnamed: 0,user_id,end_time
0,1,1473000264
1,2,1514556164
2,3,1434531276
3,5,1466329609
4,7,1521807460


In [95]:
events_data=events_data.merge(user_time[['user_id','end_time']], how='outer')
events_data = events_data[events_data['timestamp']<=events_data['end_time']]

In [96]:
data = events_data.groupby('user_id', as_index=False)\
.agg({'discovered':'sum','passed':'sum','viewed':'sum','started_attempt':'sum','timestamp':'min'})
data.head()

Unnamed: 0,user_id,discovered,passed,viewed,started_attempt,timestamp
0,1,1,0,1,0,1472827464
1,2,9,9,9,2,1514383364
2,3,15,15,20,4,1434358476
3,5,1,1,1,0,1466156809
4,7,1,1,1,0,1521634660


In [97]:
submissions_data['correct'] = submissions_data['submission_status']\
.apply(lambda action: 1 if (action == 'correct') else 0)
submissions_data['wrong'] = submissions_data['submission_status']\
.apply(lambda action: 1 if (action == 'wrong') else 0)
submissions_data.head()

Unnamed: 0,step_id,timestamp,submission_status,user_id,correct,wrong
0,31971,1434349275,correct,15853,1,0
1,31972,1434348300,correct,15853,1,0
2,31972,1478852149,wrong,15853,0,1
3,31972,1478852164,correct,15853,1,0
4,31976,1434348123,wrong,15853,0,1


In [98]:
learning_time = 2*24*60*60
user_sub_time = submissions_data.groupby('user_id', as_index=False).agg({'timestamp':'min'})
user_sub_time['end_time'] = user_sub_time['timestamp'] + learning_time
user_sub_time=user_sub_time.drop('timestamp', axis=1)
user_sub_time.head()

Unnamed: 0,user_id,end_time
0,2,1514556220
1,3,1434531333
2,5,1500032450
3,8,1480776232
4,14,1436541401


In [99]:
submissions_data=submissions_data.merge(user_sub_time[['user_id','end_time']], how='outer')
submissions_data = submissions_data[submissions_data['timestamp']<=submissions_data['end_time']]
submissions_data.head()

Unnamed: 0,step_id,timestamp,submission_status,user_id,correct,wrong,end_time
0,31971,1434349275,correct,15853,1,0,1434518856
1,31972,1434348300,correct,15853,1,0,1434518856
4,31976,1434348123,wrong,15853,0,1,1434518856
5,31976,1434348188,correct,15853,1,0,1434518856
7,31977,1434347371,correct,15853,1,0,1434518856


In [100]:
data_sub = submissions_data.groupby('user_id', as_index=False)\
.agg({'correct':'sum','wrong':'sum','timestamp':'min'})
data_sub.head()

Unnamed: 0,user_id,correct,wrong,timestamp
0,2,2,0,1514383420
1,3,4,4,1434358533
2,5,2,2,1499859650
3,8,9,21,1480603432
4,14,0,1,1436368601


In [106]:
data_sub = data_sub.drop('timestamp', axis=1)
data = data.merge(data_sub[['user_id','correct', 'wrong']], how='outer')

In [109]:
data = data.fillna(0)
data.head(10)

Unnamed: 0,user_id,discovered,passed,viewed,started_attempt,timestamp,correct,wrong
0,1,1,0,1,0,1472827464,0.0,0.0
1,2,9,9,9,2,1514383364,2.0,0.0
2,3,15,15,20,4,1434358476,4.0,4.0
3,5,1,1,1,0,1466156809,2.0,2.0
4,7,1,1,1,0,1521634660,0.0,0.0
5,8,109,84,154,37,1480603427,9.0,21.0
6,9,3,3,4,0,1484999434,0.0,0.0
7,11,1,0,1,1,1526140624,0.0,0.0
8,14,4,3,9,1,1436366155,0.0,1.0
9,16,50,49,117,21,1435674765,18.0,23.0


In [110]:
data['target'] = data['passed'].apply(lambda passed: 1 if (passed > 40) else 0)
data.head(10)

Unnamed: 0,user_id,discovered,passed,viewed,started_attempt,timestamp,correct,wrong,target
0,1,1,0,1,0,1472827464,0.0,0.0,0
1,2,9,9,9,2,1514383364,2.0,0.0,0
2,3,15,15,20,4,1434358476,4.0,4.0,0
3,5,1,1,1,0,1466156809,2.0,2.0,0
4,7,1,1,1,0,1521634660,0.0,0.0,0
5,8,109,84,154,37,1480603427,9.0,21.0,1
6,9,3,3,4,0,1484999434,0.0,0.0,0
7,11,1,0,1,1,1526140624,0.0,0.0,0
8,14,4,3,9,1,1436366155,0.0,1.0,0
9,16,50,49,117,21,1435674765,18.0,23.0,1


In [111]:
y = data['target']
data = data.drop('target', axis=1)
X = data.drop('user_id', axis=1)

In [114]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
tree = DecisionTreeClassifier()
param_grid = [{'max_depth':[1,2,3,4,5,6,7,8,9,10],
              'min_samples_split':[2,3,4,5,6,7,8,9,10],
              'min_samples_leaf':[1,2,3,4,5,6,7,8,9,10]}]
search = GridSearchCV(estimator=tree, param_grid=param_grid, n_jobs=-1, verbose=1)
search.fit(x_train,y_train)
best_tree = search.best_estimator_



Fitting 3 folds for each of 900 candidates, totalling 2700 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  44 tasks      | elapsed:   32.8s
[Parallel(n_jobs=-1)]: Done 2280 tasks      | elapsed:   42.0s
[Parallel(n_jobs=-1)]: Done 2700 out of 2700 | elapsed:   43.7s finished


In [115]:
best_tree = search.best_estimator_
y_pred = best_tree.predict(x_test)
search.best_score_

1.0

In [117]:
best_tree

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [121]:
x_test

Unnamed: 0,discovered,passed,viewed,started_attempt,timestamp,correct,wrong
200,118,108,206,0,1468666610,0.0,0.0
9760,7,6,40,1,1508433315,0.0,0.0
2255,26,26,111,9,1520690718,9.0,0.0
7066,1,1,1,0,1495710905,0.0,0.0
10001,1,0,1,1,1522925933,0.0,0.0
6110,47,46,60,20,1494928750,15.0,24.0
10880,23,23,41,8,1436200826,8.0,6.0
6335,1,0,1,0,1478950654,0.0,0.0
17067,10,6,12,0,1455188992,0.0,0.0
2140,1,0,1,1,1524891246,0.0,0.0
