In [28]:
import numpy as np
import pandas as pd
import pickle
import dill
import time
from collections import Counter

from database.query import DataAccess
from bson.objectid import ObjectId
from feature_extraction.transformers import *
from database.utils import get_train_test_data 

from sklearn.metrics import roc_auc_score

%reload_ext autoreload
%autoreload 2

In [2]:
# Names of the serialized ensemble classifiers; each one is read from
# ./model/<name>.pkl.
ensemble_indicators = [
    'sleep_ensemble_latest',
    'physical_activity_ensemble_latest',
    'sedentary_behaviour_ensemble_latest',
]
ensemble_models = []

for indicator in ensemble_indicators:
    model_path = './model/%s.pkl' % indicator
    # NOTE: dill.load can execute arbitrary code while deserializing, so only
    # point this at model files you trust.
    with open(model_path, 'rb') as f:
        clf = dill.load(f)
    # Kept as (model, name) pairs, in the same order as ensemble_indicators.
    ensemble_models.append((clf, indicator))

In [5]:
# Fetch the evaluation data through the project helper get_train_test_data().
# NOTE(review): the return shape is not visible in this notebook. The evaluation
# loop unpacks every item as a 5-tuple ending in (X_test, y_test, indicator);
# the first two elements are presumably the training split — confirm against
# database.utils.
train_test_data = get_train_test_data()

In [60]:
# Score every ensemble on its test split and keep the per-sample predictions so
# that misclassified rows can be inspected afterwards.
#
# results maps indicator -> {'y_proba', 'y_pred', 'y_true', 'text'}.
results = {}

# Build the name -> model lookup once, instead of rebuilding a list of names and
# scanning it on every iteration (the old comprehension also reused the loop
# variable name `indicator` as its own target, which was easy to misread).
# Iterating in reverse keeps the FIRST entry for a duplicated name, which is
# what list.index() returned.
models_by_name = {name: model for model, name in reversed(ensemble_models)}

for _, _, X_test, y_test, indicator in train_test_data:
    model_name = indicator + "_ensemble_latest"
    if model_name not in models_by_name:
        # ValueError matches what the list.index() lookup raised on a miss.
        raise ValueError("no ensemble model loaded for %r" % model_name)
    clf = models_by_name[model_name]

    y_proba = clf.predict_proba(X_test)
    y_pred = clf.predict(X_test)
    # Column 1 of predict_proba is used as the score for label 1
    # (assumes the classifier's classes are ordered [0, 1] — TODO confirm).
    auc = roc_auc_score(y_test, y_proba[:, 1])
    print(indicator, auc)

    results[indicator] = {
        'y_proba': y_proba[:, 1],
        'y_pred': y_pred,
        'y_true': y_test,
        'text': X_test,
    }

sleep 0.831136363636
sedentary_behaviour 0.804333695526
physical_activity 0.871143375681


In [62]:
# Per-sample frame for the physical-activity model, plus boolean masks for the
# two kinds of misclassification.
pa = pd.DataFrame(results['physical_activity'])
false_pos = pa['y_pred'].eq(1) & pa['y_true'].eq(0)  # predicted 1, labelled 0
false_neg = pa['y_pred'].eq(0) & pa['y_true'].eq(1)  # predicted 0, labelled 1

# Display the false negatives.
pa.loc[false_neg]

Unnamed: 0,text,y_pred,y_proba,y_true
20,sunset hike @ dog mountain,0.0,0.211699,1
27,hillsprints workout 6weekchallenge myfitnessce...,0.0,0.388819,1
32,p90x workout - sample workouts,0.0,0.13028,1
43,well deserved soak after yoga � � bath bathtim...,0.0,0.202382,1
53,9:30 flow & 12:00 restorative . � � yoga abbot...,0.0,0.14229,1
58,sample glutes workout ❤ ️ ❤ ️ fit fitness good...,0.0,0.432646,1
99,mixing it up this morning ! ashtanga yoga with...,0.0,0.2198,1
104,big wave maeve shredding in jiquilillo surferg...,0.0,0.277297,1
125,weekend 4 of 4 yoga as a healing modality oste...,0.0,0.428576,1
126,getting hyped about tonight's workout chestday...,0.0,0.414178,1


In [63]:
# Display the false positives, using `pa` and the `false_pos` mask built in the
# preceding cell.
pa.loc[false_pos]

Unnamed: 0,text,y_pred,y_proba,y_true
202,because the journey is the destination . gymti...,1.0,0.950208,0
204,leg day sample exercise video � � � � fitness ...,1.0,0.860784,0
206,problems hit the gym they all workout,1.0,0.979198,0
207,the life of a basketball mom . basketball ball...,1.0,0.781284,0
208,"cardio , core , chest and back ! homegym cross...",1.0,0.974566,0
209,very difficult to do yoga around here . catyoga,1.0,0.965548,0
210,high good morning ! ! ! mornigwalk morning mor...,1.0,0.963018,0
211,look whats back at redlineconditioning sleds f...,1.0,0.742479,0
212,end result ... . . . . . fitness cardio cardio...,1.0,0.982456,0
213,its so beautiful out today ! i had to drop my ...,1.0,0.986426,0


In [56]:
# Per-sample frame for the sedentary-behaviour model, plus boolean masks for the
# two kinds of misclassification.
# NOTE(review): the name `pa` is reused here although the data is
# sedentary_behaviour; it is kept because the following cell reads `pa`.
# NOTE(review): this cell's saved execution count (56) is lower than that of
# the cell that builds `results` (60), so the stored output may predate the
# latest `results` — re-run to confirm.
pa = pd.DataFrame(results['sedentary_behaviour'])
false_pos = pa['y_pred'].eq(1) & pa['y_true'].eq(0)  # predicted 1, labelled 0
false_neg = pa['y_pred'].eq(0) & pa['y_true'].eq(1)  # predicted 0, labelled 1

# Display the false negatives.
pa.loc[false_neg]

Unnamed: 0,text,y_pred,y_proba,y_true
2,holy fuck we pushed it to a game 7 unreal . th...,0.0,0.442229,1.0
3,leafs have haven't played well most of the game,0.0,0.367587,1.0
4,raptors on to the second round ! wethenorth � �,0.0,0.430499,1.0
7,"leafs on one tv , ninja on the other",0.0,0.241074,1.0
8,as a fan i can honestly say i will not miss re...,0.0,0.236194,1.0
10,leafs played a good game too ! sorry toronto c...,0.0,0.417711,1.0
12,... bruins need to get a lead for a change ......,0.0,0.292988,1.0
14,it's like the raptors enjoy playing catch up,0.0,0.383631,1.0
18,is lebron going off tn,0.0,0.273651,1.0
25,unless fvv is a god ... the raptors are in a l...,0.0,0.250287,1.0


In [57]:
# Display the false positives, using `pa` and the `false_pos` mask built in the
# preceding cell (at this point `pa` holds the sedentary_behaviour results).
pa.loc[false_pos]

Unnamed: 0,text,y_pred,y_proba,y_true
287,the leafs were horrible at drawing penalties a...,1.0,0.566721,0.0
290,"fucking hell , leafs love making it hard on th...",1.0,0.594239,0.0
303,haha just checked the score of the leafs game ...,1.0,0.855699,0.0
312,maaarrlleeaauuu ! ! ! leafs lead 1-0 ! ! ! tml...,1.0,0.864750,0.0
316,"bruins are done . regroup for game 7 at home ,...",1.0,0.631482,0.0
321,i see a lot of leafs fans complaining about se...,1.0,0.565594,0.0
338,bharatanenenu watching,1.0,0.921048,0.0
339,your not watching the game the jets are the on...,1.0,0.848802,0.0
342,if pittsburgh wins the cup this year im never ...,1.0,0.876802,0.0
344,lol just overheard raptors highlights in the w...,1.0,0.908397,0.0


In [65]:
# Per-sample frame for the sleep model, plus boolean masks for the two kinds of
# misclassification.
sleep = pd.DataFrame(results['sleep'])
false_pos = sleep['y_pred'].eq(1) & sleep['y_true'].eq(0)  # predicted 1, labelled 0
false_neg = sleep['y_pred'].eq(0) & sleep['y_true'].eq(1)  # predicted 0, labelled 1

# Display the false negatives.
sleep.loc[false_neg]

Unnamed: 0,text,y_pred,y_proba,y_true
2,i would sleep 24 hours straight .,0.0,0.36473,1
19,almost 2am ... still awake ... need to be up i...,0.0,0.361036,1
28,only slept for 4 hours � �,0.0,0.406703,1
42,got like what 2 hours sleep,0.0,0.404514,1
53,shitty sleep,0.0,0.279658,1
87,love n ' hugs right back ! slept for about two...,0.0,0.3299,1
91,sleep deprived,0.0,0.316077,1
109,slept 4 hours yesterday . get 2 hours 45 mins ...,0.0,0.472065,1


In [68]:
# Display the false positives, using `sleep` and the `false_pos` mask built in
# the preceding cell.
sleep.loc[false_pos]

Unnamed: 0,text,y_pred,y_proba,y_true
1,"me , trying to be productive and useful today ...",1.0,0.75926,0
9,been getting 8 hours of sleep consistently fee...,1.0,0.75079,0
10,12 hours of sleep � � � �,1.0,0.790751,0
12,15 hours of sleep,1.0,0.525085,0
15,"23 hours of sleep , 1 hour of mania ?",1.0,0.796156,0
23,we both fell asleep at like 7pm � � � � 1am * ...,1.0,0.616816,0
25,lmaooo � � get sleep fr haha i'm the same when...,1.0,0.763369,0
32,i fully intended to nap for like 2 hours ... i...,1.0,0.568825,0
39,"ahhh , i finally got 8 hours sleep . it took m...",1.0,0.632465,0
40,10 hours of sleep � �,1.0,0.787086,0
