In [1]:
#Author: ML Tlachac
#Paper: Depression Screening from Text Message Reply Latency
#year: 2020
#github.com/mltlachac/EMBC2020

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import nltk

from datetime import datetime, timedelta
from dateutil import parser

In [2]:
# Configuration for this run: `modality` and `ndays` are reused by later cells
# (window length in days and the name of the exported CSV).
modality = "Texts"
ndays = 14 #14 #28, #42, #364


def _phq9_total(content):
    """Return the PHQ-9 total encoded in one raw `content` string.

    The first and last characters of `content` are stripped, the remainder is
    split on commas, and the second-to-last character of each of the first
    nine fields is read as a single-digit answer and summed.
    NOTE(review): this relies on every answer being one digit followed by
    exactly one closing character (presumably a quote) - confirm against the
    PHQ export format.
    """
    items = content[1:-1].split(",")
    return sum(int(items[j][-2]) for j in range(9))


def _highest_score_per_id(phq):
    """Return one row per `id` holding that id's highest value of `scores`."""
    return phq.groupby("id", as_index=False)["scores"].max()


#load Moodable data
dft1 = pd.read_csv('dataTextMoodableCommaQuoteWindows.csv', encoding = "utf-8")
dft1 = dft1.dropna().reset_index()
print("Moodable: " + str(dft1.shape))

#attach date data collected
dfids1 = pd.read_csv("idsMoodableCommaQuoteWindows.csv")
dft1 = pd.merge(dfids1, dft1, on = "id")
# reset_index() without drop is intentional: it adds the "level_0" column
# that the combining cell removes again.
dft1 = dft1.reset_index()

#attach PHQ-9 scores
dft19 = pd.read_csv('phqsMoodableCommaQuoteWindows.csv')
print("PHQ: " + str(dft19.shape))
dft19["scores"] = [_phq9_total(raw) for raw in dft19["content"]]

#limit to unique users, highest score is used
unique19 = _highest_score_per_id(dft19)
print("unique PHQ: " + str(unique19.shape))

# inner merge: messages from ids without a PHQ-9 record are dropped here
dft1 = pd.merge(dft1, unique19, on = "id")
print("Moodable with PHQ: " + str(dft1.shape))

#make ids represent dataset
dft1["id"] = ["m" + str(raw_id) for raw_id in dft1["id"]]

#Load EMU data
dft2 = pd.read_csv('dataTextEMUCommaQuoteWindows.csv', encoding = "utf-8")
dft2 = dft2.dropna().reset_index()
print("EMU: " + str(dft2.shape))

#attach date data collected, limit EMU participants to last session with each phone
dfids2start = pd.read_csv('idsEMUCommaQuoteWindows.csv', encoding = "utf-8")
dfids2 = pd.DataFrame()
dfids2["id"] = dfids2start.sessionid
dfids2["date"] = dfids2start.date
dfids2["paid"] = dfids2start.paid
dft2 = pd.merge(dfids2, dft2, on = "id")
# The original assigned the reset frame to a misspelled name (`df2t`), so the
# reset never took effect. drop=True keeps the column set unchanged
# (id, date, paid, index, type, content) for the cells that follow.
dft2 = dft2[dft2.paid == 2].reset_index(drop = True)
print("EMU last session: " + str(dft2.shape))

#attach PHQ-9 scores
dft29 = pd.read_csv('phqsEMUCommaQuoteWindows.csv')
print("PHQ: " + str(dft29.shape))
dft29["scores"] = [_phq9_total(raw) for raw in dft29["content"]]

#limit to unique users, highest score is used
unique29 = _highest_score_per_id(dft29)
print("unique PHQ: " + str(unique29.shape))

dft2 = pd.merge(dft2, unique29, on = "id")
print("EMU with PHQ: " + str(dft2.shape))

#make ids represent dataset
dft2["id"] = ["e" + str(raw_id) for raw_id in dft2["id"]]

Moodable: (308498, 4)
PHQ: (510, 3)
unique PHQ: (501, 2)
Moodable with PHQ: (261278, 7)
EMU: (13438, 4)
EMU last session: (12961, 6)
PHQ: (126, 3)
unique PHQ: (115, 2)
EMU with PHQ: (12961, 7)


In [3]:
# Align the two datasets on the same columns before stacking them.
# errors="ignore" lets this cell be re-run after the columns are already gone.
dft1 = dft1.drop(["level_0"], axis = 1, errors = "ignore")
dft2 = dft2.drop(["paid"], axis = 1, errors = "ignore")

#Combine datasets
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
dft = pd.concat([dft1, dft2])
dft = dft.drop(["index"], axis = 1)
print("Combined: " + str(dft.shape))
#remove duplicated data instances
dft = dft.drop_duplicates()
# reset_index() without drop keeps the old row labels in an "index" column,
# which later cells expect to be present.
dft = dft.reset_index()
print(dft.shape)

#extract information from data instance metadata
# Each `content` value is parsed as a JSON object.
jsonExtract = [json.loads(str(raw)) for raw in dft.content]
# Only the keys of the FIRST record become columns; keys that appear solely in
# later records are not extracted.
names = list(jsonExtract[0])
jsonDF = pd.DataFrame()
for n in names:
    # "-100" is the placeholder for records that lack this key; the "2" suffix
    # keeps the new columns from clashing with existing ones (date, type, ...).
    jsonDF[n+"2"] = [record[n] if n in record else "-100" for record in jsonExtract]
dft = pd.concat([dft, jsonDF], axis = 1)

Combined: (274239, 5)
(267735, 6)


In [4]:
# Show the combined frame's schema: the original columns followed by the
# "2"-suffixed fields extracted from the JSON metadata.
combined_columns = dft.columns
print(combined_columns)

Index(['index', 'id', 'date', 'type', 'content', 'scores', 'person2',
       'safe_message2', 'from_address2', 'group_type2', 'seen2',
       'service_center2', 'announcements_scenario_id2', 'svc_cmd2', 'date2',
       'thread_id2', 'subject2', 'd_rpt_cnt2', 'protocol2', 'address2',
       'hidden2', 'type2', 'error_code2', 'reserved2', 'read2',
       'reply_path_present2', 'locked2', 'sim_slot2', 'date_sent2',
       'announcements_subtype2', '_id2', 'pri2', 'sim_imsi2', 'favorite2',
       'device_name2', 'app_id2', 'secret_mode2', 'link_url2', 'spam_report2',
       'svc_cmd_content2', 'status2', 'msg_id2', 'delivery_date2', 'sub_id2',
       'body2', 'callback_number2', 'teleservice_id2', 'group_id2',
       'deletable2', 'using_mode2', 'creator2', 'roam_pending2'],
      dtype='object')


In [5]:
#participant-address combinations
# Builds `newdf`: one row per (participant, contact address) holding the
# time-ordered lists of message dates, types and bodies plus the participant's
# PHQ-9 score. `datetime` and `timedelta` come from the notebook's import cell.

#remove data instances with no date
dft = dft[dft.date2 != "-1"]
print(dft.shape)
# "index" already exists, so this reset adds a "level_0" column (dropped below)
dft = dft.reset_index()

#Limit data to ndays
# `date` is the collection time and `date2` the message time; both are divided
# by 1000 before conversion, so they are presumably epoch milliseconds.
# NOTE(review): `.days > 0` only becomes true once a message is at least one
# full day older than the window start, so the effective window is up to
# ndays + 1 days - confirm this slack is intended.
window = timedelta(days=ndays)
indexes = []
for i in range(0, dft.shape[0]):
    timeEnd = datetime.fromtimestamp(dft.date[i]/1000)
    timeStart = timeEnd - window
    timeCurrent = datetime.fromtimestamp(int(dft["date2"][i])/1000)
    if (timeStart-timeCurrent).days > 0: #will be dropped
        indexes.append(i)

print(dft.shape)
dft = dft.drop(indexes)
dft = dft.drop(["level_0"], axis = 1)
dft = dft.reset_index()
print(dft.shape)

print("Number of Messages: " + str(dft.shape[0]))
print("Number of Participants: " + str(len(set(dft["id"]))))

#sort df by date
# NOTE(review): date2 is compared with the string "-1" and passed through
# int() above, so it appears to be stored as text; this sort is then
# lexicographic, which equals chronological order only while all timestamps
# have the same number of digits - verify.
dft = dft.sort_values(by = 'date2')

#extract relevant data
idlist = []
addresslist = []
datelist = []
typelist = []
bodylist = []
scorelist = []
# sorted(...) / unique() give a reproducible row order and contact numbering;
# iterating a set of strings changes order from one kernel start to the next.
for p in sorted(set(dft["id"])):
    tempdf = dft[dft["id"]==p]
    participant_score = tempdf.scores.tolist()[0]
    # unique() keeps addresses in order of first (earliest) appearance
    addresses = list(tempdf["address2"].unique())
    for i in range(0, len(addresses)):
        temptempdf = tempdf[tempdf["address2"]==addresses[i]]
        idlist.append(p)
        # the contact is stored as a per-participant index, not the raw address
        addresslist.append(i)
        datelist.append(temptempdf["date2"].tolist())
        typelist.append(temptempdf["type2"].tolist())
        bodylist.append(temptempdf["body2"].tolist())
        scorelist.append(participant_score)

newdf = pd.DataFrame()
newdf["id"] = idlist
newdf["address"] = addresslist
newdf["date"] = datelist
newdf["type"] = typelist
newdf["body"] = bodylist
newdf["score"] = scorelist

(267732, 52)
(267732, 53)
(44601, 53)
Number of Messages: 44601
Number of Participants: 319


In [6]:
#limit to conversations with responses
# A conversation is kept when its type list contains a "1" that is followed,
# anywhere later in the list, by a "2".
# NOTE(review): presumably "1" = received and "2" = sent (Android SMS `type`
# convention) - confirm.
print("All conversations: " + str(newdf.shape[0]))
ids = []
for i in range(0, newdf.shape[0]):
    message_types = newdf["type"][i]
    if "1" in message_types:
        after_first_received = message_types[message_types.index("1") + 1:]
        if "2" in after_first_received:
            ids.append(i)

# .loc replaces the deprecated .ix indexer (removed in pandas 1.0); `ids` holds
# row labels of the default integer index, so label-based selection is exact.
newdf = newdf.loc[ids]
newdf = newdf.reset_index()
print("Conversations with responses: " + str(newdf.shape[0]))

All conversations: 5606
Conversations with responses: 337


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


In [7]:
#extract latency of responses
# For every conversation, collect the delay in whole seconds between a
# type "1" message and the type "2" message that immediately follows it.

# `df` is an alias of `newdf`, not a copy: the "latency" column added below is
# visible through both names.
df = newdf
latency = []
count = 0

for i in range(0, df.shape[0]):
    message_dates = df['date'][i]
    message_types = df['type'][i]
    latlist = []
    for j in range(1, len(message_dates)):
        if message_types[j] == '2' and message_types[j-1] == "1":
            # Elapsed time is taken directly from the millisecond timestamps.
            # The previous `(second - first).seconds` returned only the
            # seconds component of the timedelta and silently discarded whole
            # days (a reply after 1 day 5 min counted as 300 s); subtracting
            # naive local datetimes was also sensitive to DST changes.
            elapsed_ms = int(message_dates[j]) - int(message_dates[j-1])
            latlist.append(elapsed_ms // 1000)
            count += 1
    latency.append(latlist)

df["latency"] = latency
print("Number of responses: " + str(count))

Number of responses: 3861


In [8]:
#group by participant
# Collapses the per-conversation rows of `df` into one row per participant:
# number of conversations ("contacts"), all reply latencies pooled into one
# list, and the participant's score.

idlist = []
parlist = []
latencylist = []
score = []

# sorted(...) fixes the row order of the resulting frame (and of the CSV
# written from it); iterating a bare set of string ids varies between runs.
for p in sorted(set(df['id'])):
    tempdf = df[df['id'] == p]
    idlist.append(p)
    # the score is identical on every row of a participant; take the first
    score.append(tempdf['score'].iloc[0])
    parlist.append(tempdf.shape[0])
    # flatten the per-conversation latency lists into a single list
    latencylist.append([value for conversation in tempdf['latency'] for value in conversation])

newdf = pd.DataFrame()
newdf['id'] = idlist
newdf['contacts'] = parlist
newdf['latency'] = latencylist
newdf['score'] = score
print("Participants: " + str(newdf.shape[0]))

Participants: 80


In [9]:
#extract latency features
# Adds summary statistics of each participant's pooled latency list to `newdf`.

# (column name, quantile level) pairs; 'quant50' is the median.
_LATENCY_QUANTILES = [
    ("quant10", 0.1),
    ("quant25", 0.25),
    ("quant50", 0.5),
    ("quant75", 0.75),
    ("quant90", 0.9),
]
_FEATURE_COLUMNS = (
    ["responses", "min"]
    + [name for name, _ in _LATENCY_QUANTILES]
    + ["max", "mean"]
)


def _latency_features(values):
    """Return the feature dict for one participant's list of latencies.

    'responses' is the number of latencies. For an empty list every statistic
    is NaN instead of raising: min()/max()/np.quantile fail on empty input,
    and a participant can end up without any adjacent received->sent pair.
    """
    features = {"responses": len(values)}
    if len(values) == 0:
        for name in _FEATURE_COLUMNS[1:]:
            features[name] = np.nan
        return features
    features["min"] = min(values)
    for name, level in _LATENCY_QUANTILES:
        features[name] = np.quantile(values, level)
    features["max"] = max(values)
    # arithmetic mean (the list that used to hold this was misleadingly
    # called `median`)
    features["mean"] = np.mean(values)
    return features


per_participant = [_latency_features(values) for values in newdf.latency]
for column in _FEATURE_COLUMNS:
    newdf[column] = [features[column] for features in per_participant]

In [10]:
# Write the per-participant latency features to disk; the file name encodes
# the modality and the window length configured at the top of the notebook.
output_name = "Latency" + modality + str(ndays) + ".csv"
newdf.to_csv(output_name)