In [51]:
import os, sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.figure import Figure
import matplotlib.axes._axes as axes
# Apply seaborn's default theme to the matplotlib figures drawn from here on.
sns.set()

In [52]:
# import scoring and utility functions from sklearn
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve, ShuffleSplit
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score, make_scorer
from sklearn.preprocessing import MinMaxScaler
from sklearn.datasets import make_blobs
from sklearn.impute import SimpleImputer
#
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

In [53]:
# Working directory of the kernel. No cell visible in this part of the notebook
# reads it; kept in case cells further down rely on it.
cwd = os.getcwd()

# Load the dataset (path is relative to the working directory), drop the
# Patient_ID and week_num columns, and turn +/-inf into NaN so the
# missing-value handling that follows treats them as missing.
# Built as one chained expression so no frame is mutated in place.
mood_data = (
    pd.read_csv("moodring_data_v2.csv")
    .drop(columns=['Patient_ID', 'week_num'])
    .replace([np.inf, -np.inf], np.nan)
)
mood_data.head()

Unnamed: 0,ar_daily_mostcommonactivity,ar_daily_sumstationary,ar_daily_sumvehicle,ar_daily_summobile,ar_daily_activitychangecount,ar_daily_count,ar_daily_countuniqueactivities,call_incoming_daily_count,call_incoming_daily_distinctcontacts,call_incoming_daily_meanduration,...,screen_daily_avgdurationunlock,screen_daily_episodepersensedminutesunlock,screen_daily_firstuseafter00unlock,screen_daily_stddurationunlock,screen_daily_maxdurationunlock,screen_daily_sumdurationunlock,screen_daily_mindurationunlock,wifi_daily_countscans,wifi_daily_uniquedevices,wifi_daily_countscansmostuniquedevice
0,,,,,,,,,,,...,,,,,,,,1266.571429,2.571429,825.142857
1,,,,,,,,,,,...,,,,,,,,1161.428571,3.142857,661.571429
2,,,,,,,,,,,...,,,,,,,,1233.285714,2.285714,599.571429
3,,,,,,,,,,,...,,,,,,,,859.0,2.428571,221.857143
4,,,,,,,,,,,...,,,,,,,,1389.857143,2.857143,1109.857143


In [54]:
# Print a summary of the loaded frame (index range, columns, dtypes, memory usage).
mood_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 462 entries, 0 to 461
Columns: 111 entries, ar_daily_mostcommonactivity to wifi_daily_countscansmostuniquedevice
dtypes: float64(111)
memory usage: 400.8 KB


In [55]:
# Flag every column in which at least 60% of the rows are missing.
# The comparison is inclusive: a column with exactly 60% missing is flagged too.

total_rows = mood_data.shape[0]
# Number of NaN entries per column, computed in one vectorized pass.
missing_per_column = mood_data.isna().sum()
# Column names (in original column order) that reach the threshold.
cols_to_drop = missing_per_column[missing_per_column >= 0.6 * total_rows].index.tolist()

In [56]:
# Build a new frame without the columns listed in cols_to_drop;
# mood_data itself is not modified.
new_df = mood_data.drop(cols_to_drop, axis="columns")
new_df.shape

(462, 67)

In [57]:
# Show the rows that have a missing value in at least one remaining column.
rows_with_missing = new_df.isna().any(axis="columns")
new_df[rows_with_missing]

Unnamed: 0,ar_daily_mostcommonactivity,ar_daily_sumstationary,ar_daily_sumvehicle,ar_daily_summobile,ar_daily_activitychangecount,ar_daily_count,ar_daily_countuniqueactivities,call_incoming_daily_count,call_incoming_daily_distinctcontacts,call_incoming_daily_meanduration,...,screen_daily_avgdurationunlock,screen_daily_episodepersensedminutesunlock,screen_daily_firstuseafter00unlock,screen_daily_stddurationunlock,screen_daily_maxdurationunlock,screen_daily_sumdurationunlock,screen_daily_mindurationunlock,wifi_daily_countscans,wifi_daily_uniquedevices,wifi_daily_countscansmostuniquedevice
0,,,,,,,,,,,...,,,,,,,,1266.571429,2.571429,825.142857
1,,,,,,,,,,,...,,,,,,,,1161.428571,3.142857,661.571429
2,,,,,,,,,,,...,,,,,,,,1233.285714,2.285714,599.571429
3,,,,,,,,,,,...,,,,,,,,859.000000,2.428571,221.857143
4,,,,,,,,,,,...,,,,,,,,1389.857143,2.857143,1109.857143
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
457,1389.714286,3.0,3.571429,153.000000,1114.833957,74.751526,125.183269,0.571429,0.571429,72.357143,...,,,,,,,,,,
458,1277.428571,3.0,3.571429,156.571429,1183.589995,76.669657,63.663386,,,,...,,,,,,,,,,
459,1367.142857,3.0,3.285714,169.857143,1163.327529,53.040324,103.707043,0.428571,0.428571,17.428571,...,,,,,,,,,,
460,1376.857143,3.0,3.714286,169.571429,1174.505752,61.477840,89.215538,0.285714,0.285714,5.571429,...,,,,,,,,,,


In [61]:
def impute_data(data, strategy):
    """Fill missing values in ``data`` using a newly created ``SimpleImputer``.

    The imputer is fitted on ``data`` and applied to that same ``data`` in a
    single ``fit_transform`` call; it is not kept for later reuse.

    Args:
        data: The values to impute; handed to ``fit_transform`` as-is.
        strategy: Forwarded unchanged as ``SimpleImputer(strategy=...)``.

    Returns:
        The object returned by ``SimpleImputer.fit_transform(data)``.
        NOTE(review): with default scikit-learn settings this is presumably a
        plain array without column labels -- confirm before relying on labels.
    """
    return SimpleImputer(strategy=strategy).fit_transform(data)

In [62]:
# Impute the remaining missing values in new_df using the 'median' strategy.
imputed_df = impute_data(data=new_df, strategy='median')

In [66]:
# Re-wrap the imputed values in a DataFrame, restoring both the column names
# and the row index of new_df so the result stays aligned with its source
# even if new_df's rows are ever filtered or re-indexed upstream.
imputed_df = pd.DataFrame(data=imputed_df, index=new_df.index, columns=new_df.columns)
imputed_df.head()

Unnamed: 0,ar_daily_mostcommonactivity,ar_daily_sumstationary,ar_daily_sumvehicle,ar_daily_summobile,ar_daily_activitychangecount,ar_daily_count,ar_daily_countuniqueactivities,call_incoming_daily_count,call_incoming_daily_distinctcontacts,call_incoming_daily_meanduration,...,screen_daily_avgdurationunlock,screen_daily_episodepersensedminutesunlock,screen_daily_firstuseafter00unlock,screen_daily_stddurationunlock,screen_daily_maxdurationunlock,screen_daily_sumdurationunlock,screen_daily_mindurationunlock,wifi_daily_countscans,wifi_daily_uniquedevices,wifi_daily_countscansmostuniquedevice
0,1502.928572,3.0,3.0,193.357143,1031.864848,87.484143,26.649598,0.857143,0.714286,106.021645,...,0.176618,363.254831,76.395068,0.039294,6.56352,13.603958,183.13006,1266.571429,2.571429,825.142857
1,1502.928572,3.0,3.0,193.357143,1031.864848,87.484143,26.649598,0.857143,0.714286,106.021645,...,0.176618,363.254831,76.395068,0.039294,6.56352,13.603958,183.13006,1161.428571,3.142857,661.571429
2,1502.928572,3.0,3.0,193.357143,1031.864848,87.484143,26.649598,0.857143,0.714286,106.021645,...,0.176618,363.254831,76.395068,0.039294,6.56352,13.603958,183.13006,1233.285714,2.285714,599.571429
3,1502.928572,3.0,3.0,193.357143,1031.864848,87.484143,26.649598,0.857143,0.714286,106.021645,...,0.176618,363.254831,76.395068,0.039294,6.56352,13.603958,183.13006,859.0,2.428571,221.857143
4,1502.928572,3.0,3.0,193.357143,1031.864848,87.484143,26.649598,0.857143,0.714286,106.021645,...,0.176618,363.254831,76.395068,0.039294,6.56352,13.603958,183.13006,1389.857143,2.857143,1109.857143


In [67]:
# Sanity check: list any rows that still contain a missing value after imputation.
still_missing = imputed_df.isna().any(axis="columns")
imputed_df[still_missing]

Unnamed: 0,ar_daily_mostcommonactivity,ar_daily_sumstationary,ar_daily_sumvehicle,ar_daily_summobile,ar_daily_activitychangecount,ar_daily_count,ar_daily_countuniqueactivities,call_incoming_daily_count,call_incoming_daily_distinctcontacts,call_incoming_daily_meanduration,...,screen_daily_avgdurationunlock,screen_daily_episodepersensedminutesunlock,screen_daily_firstuseafter00unlock,screen_daily_stddurationunlock,screen_daily_maxdurationunlock,screen_daily_sumdurationunlock,screen_daily_mindurationunlock,wifi_daily_countscans,wifi_daily_uniquedevices,wifi_daily_countscansmostuniquedevice


In [68]:
from pandas import set_option 
from pandas.plotting import scatter_matrix 
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import KFold 
from sklearn.model_selection import cross_val_score 
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline 

from sklearn.linear_model import LinearRegression 
from sklearn.linear_model import Lasso 
from sklearn.linear_model import ElasticNet 
from sklearn.tree import DecisionTreeRegressor 
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.svm import SVR 
from sklearn.ensemble import RandomForestRegressor 
from sklearn.ensemble import GradientBoostingRegressor 
from sklearn.ensemble import ExtraTreesRegressor 
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error

In [71]:
# Print the per-column non-null counts and dtypes of the imputed frame.
imputed_df.info();

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 462 entries, 0 to 461
Data columns (total 67 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   ar_daily_mostcommonactivity                 462 non-null    float64
 1   ar_daily_sumstationary                      462 non-null    float64
 2   ar_daily_sumvehicle                         462 non-null    float64
 3   ar_daily_summobile                          462 non-null    float64
 4   ar_daily_activitychangecount                462 non-null    float64
 5   ar_daily_count                              462 non-null    float64
 6   ar_daily_countuniqueactivities              462 non-null    float64
 7   call_incoming_daily_count                   462 non-null    float64
 8   call_incoming_daily_distinctcontacts        462 non-null    float64
 9   call_incoming_daily_meanduration            462 non-null    float64
 10  call_incoming_

In [None]:
# Pairwise correlation matrix over all columns of the imputed frame.
imputed_df.corr()

In [None]:
# Test options and evaluation metric.
# NOTE(review): num_folds and seed are presumably the cross-validation fold
# count and random seed; their use is not visible in this part of the notebook.
num_folds = 10
seed = 7
# scikit-learn resolves scorers by exact name, so the string must not carry
# any leading or trailing whitespace.
scoring = 'neg_mean_squared_error'

Unnamed: 0,ar_daily_mostcommonactivity,ar_daily_sumstationary,ar_daily_sumvehicle,ar_daily_summobile,ar_daily_activitychangecount,ar_daily_count,ar_daily_countuniqueactivities,call_incoming_daily_count,call_incoming_daily_distinctcontacts,call_incoming_daily_meanduration,...,screen_daily_avgdurationunlock,screen_daily_episodepersensedminutesunlock,screen_daily_firstuseafter00unlock,screen_daily_stddurationunlock,screen_daily_maxdurationunlock,screen_daily_sumdurationunlock,screen_daily_mindurationunlock,wifi_daily_countscans,wifi_daily_uniquedevices,wifi_daily_countscansmostuniquedevice
ar_daily_mostcommonactivity,1.000000,0.353754,-0.194288,0.358689,0.627336,0.211386,0.182435,0.281326,0.308286,0.042150,...,0.399980,0.219426,0.115753,-0.115875,-0.032892,0.097721,0.025267,0.132697,-0.097125,0.201007
ar_daily_sumstationary,0.353754,1.000000,0.464088,0.480165,0.645186,-0.065327,-0.112227,0.022148,0.031445,0.065058,...,0.229913,0.281434,0.091766,-0.067575,0.011264,0.083788,-0.054626,0.293795,0.049480,0.183380
ar_daily_sumvehicle,-0.194288,0.464088,1.000000,0.288923,0.049131,-0.061753,0.110530,-0.075844,-0.090073,-0.011093,...,-0.001143,0.200821,0.099258,-0.043767,0.029579,0.078100,-0.080269,0.191310,0.086317,-0.019784
ar_daily_summobile,0.358689,0.480165,0.288923,1.000000,0.328581,0.183963,-0.043237,0.010620,0.023260,-0.005283,...,0.168106,0.353471,0.197387,-0.082092,0.051852,0.182256,-0.169883,0.083497,0.038016,0.049266
ar_daily_activitychangecount,0.627336,0.645186,0.049131,0.328581,1.000000,-0.211587,-0.011367,0.087084,0.111768,0.116073,...,0.274875,0.255500,0.041712,-0.037775,0.026704,0.057589,0.012810,0.292867,0.025941,0.263669
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
screen_daily_sumdurationunlock,0.097721,0.083788,0.078100,0.182256,0.057589,0.127384,-0.018784,-0.006936,-0.001345,0.029512,...,0.077924,0.504801,0.887197,0.195459,0.657022,1.000000,-0.050778,-0.136309,-0.050451,-0.047304
screen_daily_mindurationunlock,0.025267,-0.054626,-0.080269,-0.169883,0.012810,0.196208,0.113983,-0.037799,-0.037965,-0.153204,...,-0.102567,-0.359385,-0.106729,0.079534,0.017945,-0.050778,1.000000,-0.117802,-0.029350,0.021326
wifi_daily_countscans,0.132697,0.293795,0.191310,0.083497,0.292867,-0.297899,-0.072784,0.174833,0.175330,0.088776,...,0.499259,0.502053,-0.049956,-0.070350,-0.091637,-0.136309,-0.117802,1.000000,0.291070,0.587152
wifi_daily_uniquedevices,-0.097125,0.049480,0.086317,0.038016,0.025941,-0.074485,-0.076851,0.073885,0.039859,-0.011775,...,0.144238,0.047839,-0.028668,-0.013286,-0.039268,-0.050451,-0.029350,0.291070,1.000000,-0.033622


In [73]:
# Test options and evaluation metric.
# NOTE(review): num_folds and seed are presumably the cross-validation fold
# count and random seed; their use is not visible in this part of the notebook.
num_folds = 10
seed = 7
# scikit-learn resolves scorers by exact name, so the string must not carry
# any leading or trailing whitespace.
scoring = 'neg_mean_squared_error'