In [25]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.metrics import precision_score, recall_score, classification_report, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler

In [26]:
events_data = pd.read_csv("event_data_train.csv")
submissions_data = pd.read_csv("submissions_data_train.csv")

In [27]:
events_data['discovered'] = events_data['action'].apply(lambda action: 1 if (action == 'discovered') else 0)
events_data['passed'] = events_data['action'].apply(lambda action: 1 if (action == 'passed') else 0)
events_data['viewed'] = events_data['action'].apply(lambda action: 1 if (action == 'viewed') else 0)
events_data['started_attempt'] = events_data['action'].apply(lambda action: 1 if (action == 'started_attempt') else 0)
events_data.head()

Unnamed: 0,step_id,timestamp,action,user_id,discovered,passed,viewed,started_attempt
0,32815,1434340848,viewed,17632,0,0,1,0
1,32815,1434340848,passed,17632,0,1,0,0
2,32815,1434340848,discovered,17632,1,0,0,0
3,32811,1434340895,discovered,17632,1,0,0,0
4,32811,1434340895,viewed,17632,0,0,1,0


In [28]:
learning_time = 2*24*60*60
user_time = events_data.groupby('user_id', as_index=False).agg({'timestamp':'min'})
user_time['end_time'] = user_time['timestamp'] + learning_time
user_time=user_time.drop('timestamp', axis=1)
user_time.head()

Unnamed: 0,user_id,end_time
0,1,1473000264
1,2,1514556164
2,3,1434531276
3,5,1466329609
4,7,1521807460


In [29]:
data_y = events_data.groupby('user_id', as_index=False)\
.agg({'passed':'sum'})
data_y['target'] = data_y['passed'].apply(lambda passed: 1 if (passed > 40) else 0)
y = data_y['target']

In [30]:
events_data=events_data.merge(user_time[['user_id','end_time']], how='outer')
events_data = events_data[events_data['timestamp']<=events_data['end_time']]

In [31]:
data = events_data.groupby('user_id', as_index=False)\
.agg({'discovered':'sum','passed':'sum','viewed':'sum','started_attempt':'sum','timestamp':'min'})
data.head()

Unnamed: 0,user_id,discovered,passed,viewed,started_attempt,timestamp
0,1,1,0,1,0,1472827464
1,2,9,9,9,2,1514383364
2,3,15,15,20,4,1434358476
3,5,1,1,1,0,1466156809
4,7,1,1,1,0,1521634660


In [32]:
submissions_data['correct'] = submissions_data['submission_status']\
.apply(lambda action: 1 if (action == 'correct') else 0)
submissions_data['wrong'] = submissions_data['submission_status']\
.apply(lambda action: 1 if (action == 'wrong') else 0)
submissions_data.head()

Unnamed: 0,step_id,timestamp,submission_status,user_id,correct,wrong
0,31971,1434349275,correct,15853,1,0
1,31972,1434348300,correct,15853,1,0
2,31972,1478852149,wrong,15853,0,1
3,31972,1478852164,correct,15853,1,0
4,31976,1434348123,wrong,15853,0,1


In [33]:
learning_time = 2*24*60*60
user_sub_time = submissions_data.groupby('user_id', as_index=False).agg({'timestamp':'min'})
user_sub_time['end_time'] = user_sub_time['timestamp'] + learning_time
user_sub_time=user_sub_time.drop('timestamp', axis=1)
user_sub_time.head()

Unnamed: 0,user_id,end_time
0,2,1514556220
1,3,1434531333
2,5,1500032450
3,8,1480776232
4,14,1436541401


In [34]:
submissions_data=submissions_data.merge(user_sub_time[['user_id','end_time']], how='outer')
submissions_data = submissions_data[submissions_data['timestamp']<=submissions_data['end_time']]
submissions_data.head()

Unnamed: 0,step_id,timestamp,submission_status,user_id,correct,wrong,end_time
0,31971,1434349275,correct,15853,1,0,1434518856
1,31972,1434348300,correct,15853,1,0,1434518856
4,31976,1434348123,wrong,15853,0,1,1434518856
5,31976,1434348188,correct,15853,1,0,1434518856
7,31977,1434347371,correct,15853,1,0,1434518856


In [36]:
data_sub = submissions_data.groupby('user_id', as_index=False)\
.agg({'correct':'sum','wrong':'sum','timestamp':'min'})
data_sub.head()

Unnamed: 0,user_id,correct,wrong,timestamp
0,2,2,0,1514383420
1,3,4,4,1434358533
2,5,2,2,1499859650
3,8,9,21,1480603432
4,14,0,1,1436368601


In [37]:
data_sub = data_sub.drop('timestamp', axis=1)
data = data.merge(data_sub[['user_id','correct', 'wrong']], how='outer')

In [38]:
data = data.fillna(data.mean())
data.head(10)

Unnamed: 0,user_id,discovered,passed,viewed,started_attempt,timestamp,correct,wrong
0,1,1,0,1,0,1472827464,7.060865,6.48159
1,2,9,9,9,2,1514383364,2.0,0.0
2,3,15,15,20,4,1434358476,4.0,4.0
3,5,1,1,1,0,1466156809,2.0,2.0
4,7,1,1,1,0,1521634660,7.060865,6.48159
5,8,109,84,154,37,1480603427,9.0,21.0
6,9,3,3,4,0,1484999434,7.060865,6.48159
7,11,1,0,1,1,1526140624,7.060865,6.48159
8,14,4,3,9,1,1436366155,0.0,1.0
9,16,50,49,117,21,1435674765,18.0,23.0


In [39]:
X = data.drop('user_id', axis=1)
X.head()

Unnamed: 0,discovered,passed,viewed,started_attempt,timestamp,correct,wrong
0,1,0,1,0,1472827464,7.060865,6.48159
1,9,9,9,2,1514383364,2.0,0.0
2,15,15,20,4,1434358476,4.0,4.0
3,1,1,1,0,1466156809,2.0,2.0
4,1,1,1,0,1521634660,7.060865,6.48159


In [40]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
tree = DecisionTreeClassifier()
param_grid = [{'max_depth':[1,2,3,4,5,6,7,8,9,10],
              'min_samples_split':[2,3,4,5,6,7,8,9,10],
              'min_samples_leaf':[1,2,3,4,5,6,7,8,9,10]}]
search = GridSearchCV(estimator=tree, param_grid=param_grid, n_jobs=-1, verbose=1)
search.fit(x_train,y_train)
best_tree = search.best_estimator_

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 900 candidates, totalling 2700 fits


[Parallel(n_jobs=-1)]: Done  47 tasks      | elapsed:   16.2s
[Parallel(n_jobs=-1)]: Done 1523 tasks      | elapsed:   25.3s
[Parallel(n_jobs=-1)]: Done 2700 out of 2700 | elapsed:   35.4s finished


In [41]:
best_tree = search.best_estimator_
y_pred = best_tree.predict(x_test)
search.best_score_

0.8659965337954939

In [42]:
roc_auc_score(y_test,y_pred)

0.775678243799101

In [53]:
rf = RandomForestClassifier(random_state=0)
parametrs_rf = {'criterion': ['gini', 'entropy'],
            'max_features' : [5, 6, 7],
            'n_estimators': [65, 70, 75],
            'max_depth': [5, 6, 7],
            'min_samples_leaf': [1, 2],
            'min_samples_split': [11, 12]}
grid_search_cv_clf = GridSearchCV(estimator=rf, param_grid=parametrs_rf, cv=5, n_jobs=-1, verbose=1, scoring = 'roc_auc')
grid_search_cv_clf.fit(x_train, y_train)

Fitting 5 folds for each of 324 candidates, totalling 1620 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   22.8s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done 1620 out of 1620 | elapsed:  8.9min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'criterion': ['gini', 'entropy'], 'max_features': [5, 6, 7], 'n_estimators': [50, 60, 70], 'max_depth': [5, 6, 7], 'min_samples_leaf': [1, 2, 3], 'min_samples_split': [12, 14]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [54]:
best_rf = grid_search_cv_clf.best_estimator_
y_pred = best_rf.predict(x_test)
roc_auc_score(y_test,y_pred)

0.7706473235171927

In [55]:
best_rf

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=7, max_features=5, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=12,
            min_weight_fraction_leaf=0.0, n_estimators=70, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)