### classifier model for predicting mental health indicators using screen and calling features

In [29]:
import pandas as pd
import os
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

pd.set_option('display.max_rows', None)
import matplotlib.pyplot as plt
import plotly.express as px

In [30]:
datapath = "/home/mmh/flaskDashboard/data/processedData"
screenFeaturesFilename = "ios_screen_features"
callingFeaturesFilename = "ios_calling_features"
labelsFilename = "labels"

In [31]:
screenFeatures = pd.read_csv(os.path.join(datapath, screenFeaturesFilename))
callingFeatures = pd.read_csv(os.path.join(datapath, callingFeaturesFilename))
labels = pd.read_csv(os.path.join(datapath, labelsFilename))

In [62]:
labels.dass.value_counts()

3.0    899
2.0    703
1.0    645
0.0    246
Name: dass, dtype: int64

In [32]:
# screenFeatures.info(), callingFeatures.info(), labels.info()

In [109]:
screenFeaturesAgg = screenFeatures.groupby(['participant', 'device']).mean().reset_index()
callingFeaturesAgg = callingFeatures.groupby(['participant', 'device']).mean().reset_index()

combinedFeatures = screenFeaturesAgg.merge(callingFeaturesAgg, left_on=['participant', 'device'], right_on=['participant', 'device'], how="outer")

In [92]:
data

Unnamed: 0,participant,device,dailysection,date_num_x,recorded_instances,no_of_unlocks,max_screen_on_time,max_screen_off_time,total_screen_on_time,total_screen_off_time,...,con,agree,extra,neuro,open,age,gender,phone,dass,current_diagnosis
0,PROSITC1005,ios,1.0,16.0,93.225806,47.548387,254.322581,374.870968,438.806452,997.870968,...,5.0,8.0,5.0,9.0,6.0,19.0,1.0,1.0,1.0,0.0
1,PROSITC1014,ios,1.0,15.0,115.965517,58.448276,197.241379,456.482759,319.310345,1115.586207,...,7.0,8.0,4.0,9.0,8.0,19.0,0.0,1.0,1.0,0.0
2,PROSITC1016,ios,1.0,4.0,34.142857,18.0,704.571429,317.428571,970.0,469.857143,...,9.0,9.0,9.0,8.0,6.0,21.0,0.0,1.0,2.0,0.0
3,PROSITC1018,ios,1.0,15.0,115.724138,58.551724,309.551724,440.310345,614.931034,822.724138,...,9.0,9.0,6.0,5.0,9.0,25.0,0.0,1.0,3.0,0.0
4,PROSITC1020,ios,1.0,6.3,49.2,25.3,729.9,257.5,910.5,528.4,...,9.0,6.0,5.0,6.0,6.0,24.0,0.0,1.0,2.0,0.0


In [95]:
df = combinedFeatures
df.total_screen_on_time = df.total_screen_on_time/60
df[df.total_screen_on_time > 8]

Unnamed: 0,participant,device,dailysection,date_num_x,recorded_instances,no_of_unlocks,max_screen_on_time,max_screen_off_time,total_screen_on_time,total_screen_off_time,...,no_of_incoming_calls,no_of_outgoing_calls,total_no_of_calls,min_duration_of_single_incoming_call,min_duration_of_single_outgoing_call,max_duration_of_single_incoming_call,max_duration_of_single_outgoing_call,total_duration_of_incoming_calls,total_duration_of_outgoing_calls,total_duration_of_all_calls
6,DND0004,ios,1.0,28.576923,48.096154,24.5,359.865385,490.115385,8.203526,948.269231,...,2.357143,1.142857,4.142857,2.285714,2.321429,9.107143,11.857143,12.071429,13.285714,25.357143
7,DND0005,ios,1.0,20.027027,52.0,26.432432,549.378378,414.216216,11.145946,771.351351,...,2.470588,4.058824,8.941176,1.647059,0.705882,7.529412,10.941176,10.764706,13.823529,24.588235
11,DND1012,ios,1.0,12.304348,17.73913,9.434783,515.695652,572.347826,11.304348,762.130435,...,1.666667,8.333333,13.0,7.666667,0.0,8.333333,15.666667,9.333333,49.666667,59.0
12,DND1013,ios,1.0,34.015152,35.818182,18.484848,408.121212,386.454545,13.418687,634.80303,...,1.485714,1.571429,3.4,1.2,4.428571,3.142857,11.657143,4.514286,12.828571,17.342857
13,DND1014,ios,1.0,19.09375,21.0,10.53125,526.71875,552.8125,10.448438,812.34375,...,0.727273,2.181818,3.181818,1.909091,6.909091,6.181818,15.818182,6.272727,21.272727,27.545455
17,DND1022,ios,1.0,33.0,19.815385,10.076923,443.230769,351.815385,12.470256,692.830769,...,0.941176,1.235294,2.941176,2.823529,2.705882,3.294118,6.0,3.294118,6.470588,9.764706
20,DND1028,ios,1.0,29.5,54.758621,27.913793,254.844828,388.482759,8.519253,928.068966,...,3.8,3.88,10.1,0.42,0.24,6.34,3.14,10.26,4.76,15.02
23,DND1053,ios,1.0,31.822581,22.758065,12.080645,362.83871,394.741935,8.351882,939.983871,...,1.2,2.4,4.7,1.7,4.5,1.8,9.1,2.1,15.1,17.2
25,DND1058,ios,1.0,21.216216,24.351351,12.351351,471.324324,597.810811,9.817117,852.189189,...,0.666667,1.166667,2.333333,6.666667,2.0,7.666667,11.666667,7.666667,19.333333,27.0
26,DND1062,ios,1.0,22.515152,33.454545,16.636364,592.636364,491.545455,12.308081,702.515152,...,1.0,1.0,2.666667,12.0,2.0,12.0,6.333333,12.0,8.333333,20.333333


In [110]:
screenFeaturesAgg[screenFeaturesAgg.participant.str.contains("PROSITC")].reset_index().participant.nunique(), callingFeaturesAgg[callingFeaturesAgg.participant.str.contains("PROSITC")].reset_index().participant.nunique(), combinedFeatures[combinedFeatures.participant.str.contains("PROSITC")].reset_index().participant.nunique()
# df[df['ids'].str.contains("ball")]


(595, 626, 673)

In [70]:
data = combinedFeatures.merge(labels, left_on="participant", right_on="record_id", how="inner")
data = data[~data.dass.isnull()]

featureColumns = ['no_of_unlocks', 'max_screen_on_time',
       'max_screen_off_time', 'total_screen_on_time',
       'first_screen_unlock_time', 'last_screen_lock_time',
       'no_of_missed_calls', 'no_of_rejected_calls', 'no_of_incoming_calls',
       'no_of_outgoing_calls', 'total_no_of_calls',
       'min_duration_of_single_incoming_call',
       'min_duration_of_single_outgoing_call',
       'max_duration_of_single_incoming_call',
       'max_duration_of_single_outgoing_call',
       'total_duration_of_incoming_calls', 'total_duration_of_outgoing_calls',
       'total_duration_of_all_calls']
        #, 'age', 'gender']

# labelColumns = 'current_diagnosis'
labelColumns = 'dass'

X = data.loc[:, featureColumns].fillna(0)
y = data.loc[:, labelColumns].to_list()

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    stratify=y, 
                                                    test_size=0.2)

### value counts of train_test split

In [71]:
y_counts = np.unique(y, return_counts=True)[1]
y_train_counts = np.unique(y_train, return_counts=True)[1]
y_test_counts = np.unique(y_test, return_counts=True)[1]

print(f"y       value_counts: {y_counts} \ny_train value_counts: {y_train_counts} {round(y_train_counts[0]*100/y_counts[0])}% \ny_test  value_counts: {y_test_counts} {round(y_test_counts[0]*100/y_counts[0])}%")

y       value_counts: [24 65 80 82] 
y_train value_counts: [19 52 64 65] 79% 
y_test  value_counts: [ 5 13 16 17] 21%


### training and RF model

In [72]:
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)

print("[TP, FP]\n[FN, TN]")

# train prediction
y_pred_train = clf.predict(X_train)
y_train = list(map(int, y_train))
y_pred_train = list(map(int, y_pred_train))
print(f"\naccuracy_train:\n {accuracy_score(y_train, y_pred_train)}")
print(f"\nconfusion_matrix_train:\n {confusion_matrix(y_train, y_pred_train)}")

# test prediction
y_pred_test = clf.predict(X_test)
y_test = list(map(int, y_test))
y_pred_test = list(map(int, y_pred_test))
print(f"\naccuracy_test:\n {accuracy_score(y_test, y_pred_test)}")
print(f"\nconfusion_matrix_train:\n {confusion_matrix(y_test, y_pred_test)}")

[TP, FP]
[FN, TN]

accuracy_train:
 1.0

confusion_matrix_train:
 [[19  0  0  0]
 [ 0 52  0  0]
 [ 0  0 64  0]
 [ 0  0  0 65]]

accuracy_test:
 0.2549019607843137

confusion_matrix_train:
 [[0 0 3 2]
 [1 3 6 3]
 [1 2 4 9]
 [0 5 6 6]]


In [59]:
y_pred_test

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [73]:
feature_names = X_train.columns
importances = clf.feature_importances_
forest_importances = pd.DataFrame([importances], columns=feature_names)
forest_importances = forest_importances.sort_values(by=0, axis=1, ascending=True)

In [74]:
fig = px.bar(forest_importances)
fig.show()

In [55]:
df = callingFeatures.sort_values("total_duration_of_all_calls").reset_index()
fig = px.bar(df, x="participant", y="total_duration_of_all_calls")

for data in fig.data:
    data["width"] = 10
    
fig.show()

In [23]:
screenFeatures.columns

Index(['participant', 'device', 'date', 'dailysection', 'date_num',
       'recorded_instances', 'no_of_unlocks', 'max_screen_on_time',
       'max_screen_off_time', 'total_screen_on_time', 'total_screen_off_time',
       'first_screen_unlock_time', 'last_screen_lock_time'],
      dtype='object')

In [24]:
fig = px.bar(screenFeatures, x="participant", y="no_of_unlocks")

for data in fig.data:
    data["width"] = 10
    
fig.show()

In [31]:
ind = labels[labels.current_diagnosis == 1].index

In [36]:
df = screenFeatures[screenFeatures.index.isin(ind)]

In [41]:
df = df.groupby("participant").mean().reset_index()
# df.shape

In [43]:
# df = screenFeatures[screenFeatures.index in ind].groupby("participant").mean().reset_index()
fig = px.bar(df, x="participant", y="total_screen_on_time")

# for data in fig.data:
#     data["width"] = 10
    
fig.show()

In [27]:
data.columns

Index(['participant', 'device', 'date', 'dailysection', 'date_num',
       'recorded_instances', 'no_of_unlocks', 'max_screen_on_time',
       'max_screen_off_time', 'total_screen_on_time', 'total_screen_off_time',
       'first_screen_unlock_time', 'last_screen_lock_time',
       'no_of_missed_calls', 'no_of_rejected_calls', 'no_of_incoming_calls',
       'no_of_outgoing_calls', 'total_no_of_calls',
       'min_duration_of_single_incoming_call',
       'min_duration_of_single_outgoing_call',
       'max_duration_of_single_incoming_call',
       'max_duration_of_single_outgoing_call',
       'total_duration_of_incoming_calls', 'total_duration_of_outgoing_calls',
       'total_duration_of_all_calls', 'record_id', 'con', 'agree', 'extra',
       'neuro', 'open', 'age', 'gender', 'phone', 'dass', 'current_diagnosis'],
      dtype='object')