In [1]:
import os
import sys
# Put the parent directory on sys.path — presumably so the project packages
# imported below (`database`, `feature_extraction`) can be resolved; confirm
# against the repository layout.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:  # skip if already present (e.g. cell re-run)
    sys.path.append(module_path)

import numpy as np
import pandas as pd
import pickle
import dill
import time
from collections import Counter

from database.query import DataAccess
from bson.objectid import ObjectId
from feature_extraction.transformers import *

%reload_ext autoreload
%autoreload 2

In [4]:
# Names of the serialized ensemble classifiers; each one is read from
# ./model/<name>.pkl.
ensemble_indicators = [
    'sleep_ensemble_latest',
    'physical_activity_ensemble_latest',
    'sedentary_behaviour_ensemble_latest',
]


def _load_ensemble(indicator):
    """Deserialize with dill the object stored at ./model/<indicator>.pkl."""
    with open('./model/%s.pkl' % indicator, 'rb') as model_file:
        return dill.load(model_file)


# (classifier, indicator name) pairs, in the same order as ensemble_indicators.
ensemble_models = [(_load_ensemble(name), name) for name in ensemble_indicators]

In [6]:
# Sample tweets, clean their text and drop the ones that end up empty,
# timing the whole step.
tstart = time.time()

# lower/upper are passed straight through to DataAccess.sample_control;
# presumably they bound the sampled slice — TODO confirm against DataAccess.
tweets_df = DataAccess.sample_control(lower=0.0, upper=0.05)

cleaner = TextCleanExtractor()
tweets_df['clean_text'] = cleaner.transform(tweets_df.text)

# Vectorised comparison instead of a per-row Python lambda.
empty = tweets_df['clean_text'] == ''

# Take an explicit copy of the filtered rows: assigning new columns to a
# frame produced by boolean filtering otherwise raises pandas'
# SettingWithCopyWarning.
tweets_df = tweets_df.loc[~empty].copy()
tend = time.time()

print("Sampled %d tweets in %.2f secs" % (len(tweets_df), tend - tstart))
tweets_df.head()

Sampled 323424 tweets in 346.54 secs


Unnamed: 0_level_0,created_at,lang,retweeted,text,place.full_name,place.country,entities.hashtags,user.created_at,user.favourites_count,user.followers_count,user.friends_count,user.statuses_count,user.verified,latitude,longitude,clean_text
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
5a26f3d2b13879ecbb2efcb1,Mon Oct 30 11:21:11 +0000 2017,en,False,Who’s more hyped for WW2 more then me? The ans...,"Toronto, Ontario",Canada,[],Thu Dec 22 03:49:12 +0000 2016,16257,1852,2821,3971,False,,,who ’ s more hyped for ww2 more then me ? the ...
5a26f3d2b13879ecbb2efcd0,Mon Oct 30 11:21:23 +0000 2017,en,False,Someone talk me out of the Barrelman Bike/Run ...,"Toronto, Ontario",Canada,[],Mon May 30 02:43:24 +0000 2011,31067,878,1254,9702,False,,,someone talk me out of the barrelman bike / ru...
5a26f3d2b13879ecbb2efcfd,Mon Oct 30 11:21:56 +0000 2017,en,False,Is it really Monday already,"Mississauga, Ontario",Canada,[],Mon Apr 11 18:23:36 +0000 2011,858,165,534,10008,False,,,is it really monday already
5a26f3d2b13879ecbb2efd4b,Mon Oct 30 11:22:45 +0000 2017,en,False,The world's first smart railway opens in China...,"Fort Saskatchewan, Alberta",Canada,[],Thu Jan 16 05:20:47 +0000 2014,537,242,811,352,False,,,the world's first smart railway opens in china...
5a26f3d2b13879ecbb2efd58,Mon Oct 30 11:22:51 +0000 2017,en,False,Many trees have lost their leaves. Others are ...,"Halifax, Nova Scotia",Canada,[],Fri Apr 14 11:47:51 +0000 2017,3286,133,164,781,False,,,many trees have lost their leaves . others are...


In [7]:
# Score the cleaned text with every loaded ensemble model and time the pass.
tstart = time.time()

clean_texts = tweets_df.clean_text
for clf, model_name in ensemble_models:
    # Keep column 1 of the predict_proba output (presumably the
    # positive-class probability — confirm against the model's class order).
    positive_proba = clf.predict_proba(clean_texts)[:, 1]
    tweets_df['%s_predict' % model_name] = positive_proba

tend = time.time()
elapsed = tend - tstart

print("Completed %d predictions in %.3f secs" % (len(tweets_df), elapsed))

tweets_df.head()

Completed 323424 predictions in 971.280 secs


Unnamed: 0_level_0,created_at,lang,retweeted,text,place.full_name,place.country,entities.hashtags,user.created_at,user.favourites_count,user.followers_count,user.friends_count,user.statuses_count,user.verified,latitude,longitude,clean_text,sleep_ensemble_latest_predict,physical_activity_ensemble_latest_predict,sedentary_behaviour_ensemble_latest_predict
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
5a26f3d2b13879ecbb2efcb1,Mon Oct 30 11:21:11 +0000 2017,en,False,Who’s more hyped for WW2 more then me? The ans...,"Toronto, Ontario",Canada,[],Thu Dec 22 03:49:12 +0000 2016,16257,1852,2821,3971,False,,,who ’ s more hyped for ww2 more then me ? the ...,0.212887,0.028967,0.106441
5a26f3d2b13879ecbb2efcd0,Mon Oct 30 11:21:23 +0000 2017,en,False,Someone talk me out of the Barrelman Bike/Run ...,"Toronto, Ontario",Canada,[],Mon May 30 02:43:24 +0000 2011,31067,878,1254,9702,False,,,someone talk me out of the barrelman bike / ru...,0.217633,0.144254,0.107647
5a26f3d2b13879ecbb2efcfd,Mon Oct 30 11:21:56 +0000 2017,en,False,Is it really Monday already,"Mississauga, Ontario",Canada,[],Mon Apr 11 18:23:36 +0000 2011,858,165,534,10008,False,,,is it really monday already,0.218436,0.028698,0.10833
5a26f3d2b13879ecbb2efd4b,Mon Oct 30 11:22:45 +0000 2017,en,False,The world's first smart railway opens in China...,"Fort Saskatchewan, Alberta",Canada,[],Thu Jan 16 05:20:47 +0000 2014,537,242,811,352,False,,,the world's first smart railway opens in china...,0.220941,0.035722,0.109419
5a26f3d2b13879ecbb2efd58,Mon Oct 30 11:22:51 +0000 2017,en,False,Many trees have lost their leaves. Others are ...,"Halifax, Nova Scotia",Canada,[],Fri Apr 14 11:47:51 +0000 2017,3286,133,164,781,False,,,many trees have lost their leaves . others are...,0.215815,0.029233,0.106242


In [8]:
# Cut-off on the model score above which a tweet is counted for a class.
# Defined once so the three masks cannot drift apart.
PREDICT_THRESHOLD = 0.3

# Boolean masks over tweets_df, one per behaviour.
sleep = tweets_df['sleep_ensemble_latest_predict'] > PREDICT_THRESHOLD
sb = tweets_df['sedentary_behaviour_ensemble_latest_predict'] > PREDICT_THRESHOLD
pa = tweets_df['physical_activity_ensemble_latest_predict'] > PREDICT_THRESHOLD

# Series.sum() counts the True entries in a vectorised way; the builtin sum()
# would iterate over every row in Python.
print("# sleep tweets: %d" % sleep.sum())
print("# sedentary behavior tweets: %d" % sb.sum())
print("# physical activity tweets: %d" % pa.sum())

# Keep every tweet flagged by at least one of the three models.
predicted = tweets_df.loc[sleep | sb | pa]

# sleep tweets: 394
# sedentary behavior tweets: 2713
# physical activity tweets: 723


In [66]:
# Preview only the first rows: `predicted` holds every tweet flagged by at
# least one model, which is too many rows to render usefully in full.
predicted.head(10)

Unnamed: 0_level_0,created_at,lang,retweeted,text,place.full_name,place.country,entities.hashtags,user.created_at,user.favourites_count,user.followers_count,user.friends_count,user.statuses_count,user.verified,latitude,longitude,clean_text,sleep_ensemble_predict,physical_activity_ensemble_predict,sedentary_behaviour_ensemble_predict
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
5a26f3d4b13879ecbb2f0b6f,Mon Oct 30 16:19:29 +0000 2017,en,False,After reading Lois Elhert's book Ss thought of...,"LaSalle, Ontario",Canada,[],Wed Oct 21 18:29:24 +0000 2009,703,426,170,3828,False,,,after reading lois elhert's book ss thought of...,0,0,1
5a26f3d6b13879ecbb2f1ae2,Mon Oct 30 20:15:47 +0000 2017,en,False,@Stranger_Things Im finishing the show as we s...,Canada,Canada,[],Thu Jun 04 05:01:40 +0000 2009,4851,672,644,72841,False,,,im finishing the show as we speak . 20 minutes...,0,0,1
5a26f3dbb13879ecbb2f4bba,Wed Nov 15 15:35:50 +0000 2017,en,False,I need to chill.,"Montréal, Québec",Canada,[],Thu Apr 26 04:25:12 +0000 2012,11896,893,889,22593,False,,,i need to chill .,0,0,1
5a26f3dbb13879ecbb2f4cdc,Wed Nov 15 15:37:06 +0000 2017,en,False,Good Morning #NAIT. What a win last night - #G...,"Edmonton, Alberta",Canada,"[{'indices': [13, 18], 'text': 'NAIT'}, {'indi...",Wed Sep 07 17:32:48 +0000 2011,1853,1010,502,3329,False,,,good morning nait . what a win last night - go...,0,0,1
5a26f3dbb13879ecbb2f4ecf,Wed Nov 15 15:39:13 +0000 2017,en,False,@MUCCPride Day 3 curling with #wellness10 Time...,"Melfort, Saskatchewan",Canada,"[{'indices': [30, 41], 'text': 'wellness10'}, ...",Wed Dec 21 20:59:50 +0000 2011,1075,213,248,1721,False,,,day 3 curling with wellness 10 time to play so...,0,1,0
5a26f3dbb13879ecbb2f4f5f,Wed Nov 15 15:39:56 +0000 2017,en,False,"@thehill Honestly now, #Pompeo been silent sin...","Sackville, New Brunswick",Canada,"[{'indices': [23, 30], 'text': 'Pompeo'}, {'in...",Tue May 17 13:09:03 +0000 2011,7996,60,1829,15202,False,,,"honestly now , pompeo been silent since confir...",0,0,1
5a26f3deb13879ecbb2f66fd,Wed Nov 15 16:04:47 +0000 2017,en,False,@Addleben @Kidsdoc1Rick There are days I take ...,"Edmonton, Alberta",Canada,[],Sat Jul 28 21:35:43 +0000 2007,10677,1448,1378,16394,False,,,there are days i take the fat bike out just so...,0,1,0
5a26f3e1b13879ecbb2f80c5,Wed Nov 15 16:32:31 +0000 2017,en,False,"Workout, backyard cleanup, amazing breakfast a...","Georgina, Ontario",Canada,[],Wed Nov 30 20:17:07 +0000 2011,5857,337,328,3564,False,,,"workout , backyard cleanup , amazing breakfast...",0,1,0
5a26f3e2b13879ecbb2f864b,Wed Nov 15 16:38:07 +0000 2017,en,False,Back at the gym it’s leg day workin these @zim...,"Aurora, Ontario",Canada,"[{'indices': [70, 100], 'text': 'Bilateraltota...",Tue Apr 29 21:50:33 +0000 2014,2258,130,814,2074,False,,,back at the gym it ’ s leg day workin these pe...,0,1,0
5a26f3e5b13879ecbb2fa3db,Wed Nov 15 17:08:51 +0000 2017,en,False,If you read the Talmud you will see why Israel...,"Greater Vancouver, British Columbia",Canada,[],Tue Mar 09 00:33:39 +0000 2010,41,901,990,6311,False,,,if you read the talmud you will see why israel...,0,0,1


In [9]:
# Columns written to the tab-separated export: the tweet timestamp, the
# cleaned text and the three per-model prediction columns.
export_columns = [
    'created_at',
    'clean_text',
    'sleep_ensemble_latest_predict',
    'physical_activity_ensemble_latest_predict',
    'sedentary_behaviour_ensemble_latest_predict',
]

predicted.to_csv(
    './data/sampled_ensemble_proba_0-05perc.csv',
    sep='\t',
    columns=export_columns,
)

In [76]:
# Show the first rows of the flagged tweets.
# NOTE(review): the saved output below lists `*_ensemble_predict` columns,
# whereas cell In [7] adds `*_ensemble_latest_predict` columns — the output
# appears to come from an earlier run; re-run to refresh.
predicted.head()

Unnamed: 0_level_0,created_at,lang,retweeted,text,place.full_name,place.country,entities.hashtags,user.created_at,user.favourites_count,user.followers_count,user.friends_count,user.statuses_count,user.verified,latitude,longitude,clean_text,sleep_ensemble_predict,physical_activity_ensemble_predict,sedentary_behaviour_ensemble_predict
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
5a26f3d6b13879ecbb2f2351,Mon Oct 30 21:19:24 +0000 2017,en,False,I got a seat home on the 1630 LSW train tonigh...,"Burlington, Ontario",Canada,[],Thu Jul 12 14:33:49 +0000 2012,13704,660,838,25856,False,,,i got a seat home on the 1630 lsw train tonigh...,0,0,1
5a26f3dfb13879ecbb2f7112,Wed Nov 15 16:16:04 +0000 2017,en,False,"@BrianBiscieglia wow, just read up more about ...","Halifax, Nova Scotia",Canada,[],Mon May 18 03:57:57 +0000 2009,32488,2442,1712,82906,False,,,"wow , just read up more about him , one of the...",0,0,1
5a26f3e7b13879ecbb2fb220,Wed Nov 15 17:23:07 +0000 2017,en,False,@Daniel_DeSanto @Raptors @PopeyesCA worked for...,"Toronto, Ontario",Canada,"[{'indices': [93, 101], 'text': 'toronto'}]",Wed Jun 16 15:18:37 +0000 2010,5265,768,2660,10483,False,,,worked for me with no issues at 273 yonge st l...,0,0,1
5a26f43bb13879ecbb328d49,Thu Nov 16 06:55:46 +0000 2017,en,False,Exhausted all day\n\nMake a point of going to ...,"Calgary, Alberta",Canada,[],Thu Dec 30 22:14:16 +0000 2010,10469,1400,1743,17133,False,,,exhausted all day make a point of going to bed...,1,0,0
5a26f440b13879ecbb32a6c1,Thu Nov 16 09:07:09 +0000 2017,en,False,Night Windsor. Thanks for the fun. Yes I'm goi...,"London, Ontario",Canada,[],Thu Dec 31 07:09:33 +0000 2009,9739,34756,1985,6869,True,,,night windsor . thanks for the fun . yes i'm g...,1,0,0


In [68]:
%%time

sleep = tweets_df['sleep_ensemble_predict']
sb = tweets_df['sedentary_behaviour_ensemble_predict']
pa = tweets_df['physical_activity_ensemble_predict']

DataAccess.write_labels(sleep, 'sleep_ensemble_predict')
DataAccess.write_labels(sb, 'sedentary_behaviour_ensemble_predict')
DataAccess.write_labels(pa, 'physical_activity_ensemble_predict')

CPU times: user 6min 59s, sys: 9min 42s, total: 16min 42s
Wall time: 1h 50min 8s


In [24]:
# Scratch check of the per-slice CSV filename pattern.
# NOTE(review): `i` is not assigned in any cell visible here, so this only
# works with leftover kernel state — define it or remove the cell.
'./data/sampled_ensemble_proba_%d-%dperc.csv' % (i, i+5)

'./data/sampled_ensemble_proba_100-105perc.csv'

In [20]:
%%time

y_preds = tweets_df.loc[:, ['sleep_ensemble_predict',
                  'sedentary_behaviour_ensemble_predict',
                  'physical_activity_ensemble_predict']]

DataAccess.write_labels_batch(y_preds)

CPU times: user 4min 37s, sys: 4min 3s, total: 8min 40s
Wall time: 1h 17min 43s


In [11]:
# Preview the first rows of tweets_df.
# NOTE(review): the saved output below has no prediction columns and shows
# different rows from the In [6] preview — presumably it was produced by a
# separate sampling run; re-run to refresh.
tweets_df.head()

Unnamed: 0_level_0,created_at,lang,retweeted,text,user.created_at,user.favourites_count,user.followers_count,user.friends_count,user.statuses_count,user.verified,place.full_name,place.country,entities.hashtags,latitude,longitude,clean_text
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
5a26f3d2b13879ecbb2efd0b,Mon Oct 30 11:22:04 +0000 2017,en,False,"""Pastor Doug Batchelor and FRANC0IS (pope)"" \n...",Wed May 18 17:33:55 +0000 2011,48991,523,2130,51510,False,Canada,Canada,[],,,""" pastor doug batchelor and franc 0is ( pope )..."
5a26f3d2b13879ecbb2efd51,Mon Oct 30 11:22:51 +0000 2017,en,False,Relaxing Sunday afternoon while overlooking La...,Wed Dec 01 16:34:48 +0000 2010,790,32,17,1157,False,"North Bay, Ontario",Canada,[],,,relaxing sunday afternoon while overlooking la...
5a26f3d2b13879ecbb2efd93,Mon Oct 30 11:23:23 +0000 2017,en,False,"Gay A State Of Disgrace""...?\nPastor Doug Bat...",Wed May 18 17:33:55 +0000 2011,48993,523,2130,51512,False,Canada,Canada,[],,,"gay a state of disgrace "" ... ? pastor doug ba..."
5a26f3d3b13879ecbb2efe3a,Mon Oct 30 14:15:01 +0000 2017,en,False,#Carpool #StCatharinesNiagaraRegionalMunicipal...,Tue Nov 03 13:22:28 +0000 2009,11,1711,664,223736,False,"Niagara-on-the-Lake, Ontario",Canada,"[{'text': 'Carpool', 'indices': [0, 8]}, {'tex...",43.290392,-79.137854,carpool stcatharinesniagararegionalmunicipalit...
5a26f3d3b13879ecbb2eff20,Mon Oct 30 14:16:03 +0000 2017,en,False,Squeeee! I spent a lot of my childhood at Onta...,Sat Aug 09 22:23:52 +0000 2008,8934,2106,1513,17707,False,"Toronto, Ontario",Canada,"[{'text': 'TOhistory', 'indices': [89, 99]}]",,,squeee ! i spent a lot of my childhood at onta...
