In [29]:
import os
import time

import dill
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.svm import LinearSVC, SVC

from feature_extraction.features import *
from feature_extraction.transformers import *
from pipelines.models import *

%reload_ext autoreload

In [2]:
# Stage 1: read the April sedentary-behaviour labels and run the tweet text
# through TextCleanExtractor, timing the whole step.
ts_load_data = time.time()

cleaner = TextCleanExtractor()
sb_tweets_df = pd.read_csv('data/sedentary_labeled_april.csv').loc[
    :, ['text', 'hashtags', 'placename', 'first_person_sedentary_behavior']
]
sb_tweets_df['clean_text'] = cleaner.transform(sb_tweets_df.text)

te_load_data = time.time()
print("Cleaned data in %.2f" % (te_load_data - ts_load_data))

# Report how many rows carry a positive label out of the total.
n_samples = len(sb_tweets_df)
n_positives = sum(sb_tweets_df.first_person_sedentary_behavior == True)
print("[Sedentary Behavior] True labels: %d/%d (%.3f%%)"
      % (n_positives, n_samples, n_positives / n_samples * 100.0))

# Model inputs: cleaned text as features, the first-person flag as target.
X_sb, y_sb = sb_tweets_df.clean_text, sb_tweets_df.first_person_sedentary_behavior

# Stage 2: obtain the classifier from get_ensemble_model(), timed.
ts_load_model = time.time()
clf = get_ensemble_model()
te_load_model = time.time()
print("Loaded model in %.2f" % (te_load_model - ts_load_model))

# Stage 3: 5-fold cross-validation scored with ROC AUC, averaged over the folds.
ts_evaluate = time.time()
score = cross_val_score(clf, X_sb, y_sb, scoring='roc_auc', cv=5).mean()
te_evaluate = time.time()
print("Evaluated model in %.2f" % (te_evaluate - ts_evaluate))

print("Sedentary behavior AUC:", score)

Cleaned data in 1.12
[Sedentary Behavior] True labels: 215/2513 (8.556%)


In [3]:
%%time

# Two-step pipeline: the extractor returned by get_features(), followed by
# SelectFpr using f_classif as its scoring function.
feature_extractor = Pipeline(steps=[
    ("feature_extraction", get_features()),
    ('feature_selection', SelectFpr(score_func=f_classif)),
])

# Fit on the sedentary data and keep the transformed matrix for the grid searches.
X_feats = feature_extractor.fit_transform(X_sb, y_sb)

Found 400000 word vectors.
CPU times: user 40.5 s, sys: 3.75 s, total: 44.3 s
Wall time: 42.8 s


In [261]:
%%time

# Grid-search C for an L2-penalised logistic regression on the pre-extracted
# features, scored by ROC AUC over five shuffled, stratified folds.
lr = LogisticRegression(solver='lbfgs', penalty='l2')
parameters = {"C": [0.01, 0.1, 1.0, 10.0, 100.0]}

cv = GridSearchCV(
    estimator=lr,
    param_grid=parameters,
    scoring='roc_auc',
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    refit=True,
    n_jobs=-1,
)
cv.fit(X_feats, y_sb)

# Winning parameter combination and its cross-validated score.
print(cv.best_params_, cv.best_score_)

SyntaxError: invalid syntax (<unknown>, line 6)

In [260]:
%%time

# Grid-search a random forest on the pre-extracted features, scored by ROC AUC
# over five shuffled, stratified folds.
parameters = {"n_estimators": [100, 200, 300, 400],
              "min_samples_split": [5, 10, 15],
              "max_depth": [7, 8, 9, 10]}

# The estimator is built with defaults; every tuned hyper-parameter comes from
# the grid above. (The previous call passed an undefined name as the first
# positional argument, which raises NameError on a fresh kernel.)
# NOTE(review): no random_state is set on the forest, so scores can vary
# between runs even though the fold split is seeded.
rf = RandomForestClassifier()

cv = GridSearchCV(rf,
                  param_grid=parameters,
                  cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
                  refit=True,
                  scoring='roc_auc',
                  n_jobs=-1)

cv.fit(X_feats, y_sb)

# Winning parameter combination and its cross-validated score.
print(cv.best_params_, cv.best_score_)

{'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 400} 0.887459047069
CPU times: user 19.3 s, sys: 1.51 s, total: 20.8 s
Wall time: 3min 27s


In [265]:
%%time

# Grid-search an extra-trees classifier on the pre-extracted features, scored
# by ROC AUC over five shuffled, stratified folds.
et = ExtraTreesClassifier()

parameters = {
    "n_estimators": [100, 200, 300, 400],
    "min_samples_split": [5, 10, 15],
    "max_depth": [7, 8, 9, 10],
}

cv = GridSearchCV(
    estimator=et,
    param_grid=parameters,
    scoring='roc_auc',
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    refit=True,
    n_jobs=-1,
)
cv.fit(X_feats, y_sb)

# Winning parameter combination and its cross-validated score.
print(cv.best_params_, cv.best_score_)

{'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 300} 0.891395605412
CPU times: user 8.49 s, sys: 976 ms, total: 9.46 s
Wall time: 55.6 s


In [27]:
%%time

# Grid-search an XGBoost classifier on the pre-extracted features, scored by
# ROC AUC over five shuffled, stratified folds.
parameters = {"n_estimators": [100, 200, 300, 400],
              "max_depth": [7, 8, 9, 10]}

xgb = XGBClassifier()

# The splitter must be constructed explicitly: a bare parenthesised
# "(n_splits=5, ...)" is not valid Python. StratifiedKFold with the same seed
# matches the splitter used by the other grid searches in this notebook.
cv = GridSearchCV(xgb,
                  param_grid=parameters,
                  cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
                  refit=True,
                  scoring='roc_auc',
                  n_jobs=-1)

cv.fit(X_feats, y_sb)

# Winning parameter combination and its cross-validated score.
print(cv.best_params_, cv.best_score_)

{'max_depth': 8, 'n_estimators': 300} 0.900403792413
CPU times: user 27 s, sys: 1.97 s, total: 29 s
Wall time: 3min 50s


In [263]:
%%time

# Grid-search alpha and the binarize threshold of a Bernoulli naive Bayes
# model on the pre-extracted features, scored by ROC AUC over five shuffled,
# stratified folds.
bnb = BernoulliNB()

parameters = {
    "alpha": [0.1, 0.25, 0.5, 0.75, 1.0, 2.0, 5.0, 10.0, 20.0],
    "binarize": [0.0, 0.25, 0.50, 0.75, 1.0],
}

cv = GridSearchCV(
    estimator=bnb,
    param_grid=parameters,
    scoring='roc_auc',
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    refit=True,
    n_jobs=-1,
)
cv.fit(X_feats, y_sb)

# Winning parameter combination and its cross-validated score.
print(cv.best_params_, cv.best_score_)

{'binarize': 0.0, 'alpha': 5.0} 0.936748911366
CPU times: user 2.99 s, sys: 507 ms, total: 3.5 s
Wall time: 3.62 s


In [40]:
%%time

# Grid-search C and gamma for an RBF-kernel SVC on the pre-extracted features,
# scored by ROC AUC over five shuffled, stratified folds.
#
# The gamma grid previously listed 1e-2 twice, which only repeated identical
# fits; the duplicate is dropped.
# NOTE(review): a further value (e.g. 1e-1) may have been intended in place of
# the duplicate -- confirm with the author before widening the grid.
parameters = {"C": [0.01, 0.1, 1, 10, 100, 1000],
              "gamma": [1e-4, 1e-3, 1e-2]}

# Only the kernel is fixed here; the stray positional argument that followed
# the keyword (a syntax error, and an undefined name) has been removed.
svc = SVC(kernel='rbf')

# The splitter must be constructed explicitly: a bare parenthesised
# "(n_splits=5, ...)" is not valid Python. StratifiedKFold with the same seed
# matches the splitter used by the other grid searches in this notebook.
cv = GridSearchCV(svc,
                  param_grid=parameters,
                  cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
                  refit=True,
                  scoring='roc_auc',
                  n_jobs=-1)

cv.fit(X_feats, y_sb)

# Winning parameter combination and its cross-validated score.
print(cv.best_params_, cv.best_score_)

{'gamma': 0.0001, 'C': 100} 0.856179345821
CPU times: user 5.86 s, sys: 886 ms, total: 6.75 s
Wall time: 1min 26s


In [2]:
def _load_labeled_tweets(csv_path, label_column, title, cleaner):
    """Read one labelled tweet CSV, add ``clean_text``, and print its class balance.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.
    label_column : str
        Name of the label column to keep and to count positives in.
    title : str
        Tag printed inside the square brackets of the summary line.
    cleaner : object
        Object whose ``transform`` method is applied to the ``text`` column.

    Returns
    -------
    pandas.DataFrame
        The ``text``, ``hashtags``, ``placename`` and label columns, plus a
        ``clean_text`` column holding the output of ``cleaner.transform``.
    """
    frame = pd.read_csv(csv_path).loc[:, ['text', 'hashtags', 'placename', label_column]]
    frame['clean_text'] = cleaner.transform(frame.text)

    n_samples = len(frame)
    n_positives = (frame[label_column] == True).sum()
    # Guard the percentage so an empty file reports 0% instead of dividing by zero.
    pct_positive = n_positives / n_samples * 100.0 if n_samples else 0.0
    print("[%s] True labels: %d/%d (%.3f%%)" % (title, n_positives, n_samples, pct_positive))
    return frame


cleaner = TextCleanExtractor()

sleep_tweets_df = _load_labeled_tweets(
    'data/sleep_labeled_dec22.csv', 'first_person_sleep_problem', 'Sleep', cleaner)

# NOTE(review): this reads sedentary_labeled_jan11.csv into sb_tweets_df, while
# the evaluation cell earlier in the notebook reads sedentary_labeled_april.csv
# into the same name -- confirm which file is the intended one.
sb_tweets_df = _load_labeled_tweets(
    'data/sedentary_labeled_jan11.csv', 'first_person_sedentary_behavior',
    'Sedentary Behavior', cleaner)

pa_tweets_df = _load_labeled_tweets(
    'data/pa_labeled_dec22.csv', 'first_person_physical_activity',
    'Physical Activity', cleaner)

# Stack the three indicator frames into one corpus; each frame only has its own
# label column, so the other label columns are NaN for its rows.
tweets_df = pd.concat((pa_tweets_df, sleep_tweets_df, sb_tweets_df), ignore_index=True)
tweets_df.head()

[Sleep] True labels: 93/1204 (7.724%)
[Sedentary Behavior] True labels: 97/2403 (4.037%)
[Physical Activity] True labels: 91/1810 (5.028%)


Unnamed: 0,clean_text,first_person_physical_activity,first_person_sedentary_behavior,first_person_sleep_problem,hashtags,placename,text
0,1 month out from my surgery as of yesterday & ...,1.0,,,,,1 month out from my surgery as of yesterday &a...
1,morning . here \ \ u2019s your windswept skies...,0.0,,,,"Toronto, Ontario",Morning. Here\\u2019s your windswept skies Tor...
2,someone talk me out of the barrelman bike / ru...,0.0,,,,"Toronto, Ontario",Someone talk me out of the Barrelman Bike/Run ...
3,successfully surprised bri ( and 10 other danc...,0.0,,,missdancesomuch,"Kingston, Ontario",Successfully surprised Bri (and 10 other dance...
4,a train with no tracks ? the world's first sma...,0.0,,,,"Fort Saskatchewan, Alberta",A train with no tracks? The world's first smar...


In [4]:
# Pooled corpus: cleaned text plus the three label columns side by side.
X, y = tweets_df.clean_text, tweets_df[[
    'first_person_sleep_problem',
    'first_person_sedentary_behavior',
    'first_person_physical_activity',
]]

# Per-indicator feature/target pairs, taken from each indicator's own frame.
X_sleep, y_sleep = sleep_tweets_df.clean_text, sleep_tweets_df.first_person_sleep_problem
X_sb, y_sb = sb_tweets_df.clean_text, sb_tweets_df.first_person_sedentary_behavior
X_pa, y_pa = pa_tweets_df.clean_text, pa_tweets_df.first_person_physical_activity

# (features, target, short name) triples consumed by the evaluation loops.
indicators = [
    (X_sleep, y_sleep, 'sleep'),
    (X_sb, y_sb, 'sedentary_behaviour'),
    (X_pa, y_pa, 'physical_activity'),
]

clf = get_ensemble_model()

In [7]:
%%time
# Score the shared classifier on every indicator with 5-fold cross-validated
# ROC AUC, printing the mean across folds.
# The loop targets are deliberately not named X / y: those names hold the
# pooled corpus at notebook level and would otherwise be overwritten with the
# last indicator's data.
for X_ind, y_ind, name in indicators:
    score = cross_val_score(clf, X_ind, y_ind, cv=5, scoring='roc_auc').mean()
    print(name, score)

sleep 0.885564074814
sedentary_behaviour 0.528384266721
physical_activity 0.817835667519


In [5]:
# Narrow the indicator list to sedentary behaviour only.
indicators = [
    (X_sb, y_sb, 'sedentary_behaviour'),
]

In [16]:
%%time
# Fit a fresh ensemble per indicator and serialise it under ./model.
# Create the output directory up front so the first write cannot fail on a
# clean checkout.
os.makedirs('./model', exist_ok=True)

# The loop targets are deliberately not named X / y, so the notebook-level
# X / y are not overwritten by the last indicator's data.
for X_ind, y_ind, name in indicators:
    clf = get_ensemble_model()
    clf.fit(X_ind, y_ind)
    filename = name + '_ensemble.pkl'
    path = os.path.join('./model', filename)
    # Context manager guarantees the file is flushed and closed even if
    # serialisation raises.
    with open(path, 'wb') as model_file:
        dill.dump(clf, model_file)
    print("...wrote to", path)

Found 400000 word vectors.
...wrote to ./model/sedentary_behaviour_ensemble.pkl
CPU times: user 2min 27s, sys: 17 s, total: 2min 44s
Wall time: 2min 43s
