In [49]:
#code author: ML Tlachac
#paper title: 'Screening for Depression with Retrospectively Harvested Private versus Public Text' 
#paper accessible at: https://ieeexplore.ieee.org/document/9049136
#github: github.com/mltlachac/IEEEjBHI2020
#https://ieeexplore.ieee.org/document/9049136

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from textblob import TextBlob
import textblob as tb
import json

In [50]:
modality = "Tweets"
ndays = 56

# Number of answers summed per questionnaire (the PHQ-9 has nine items).
PHQ_ITEM_COUNT = 9


def _phq_total(raw_answers):
    """Return the summed questionnaire score encoded in one raw ``content`` string.

    The first and last characters of the string are discarded, the remainder
    is split on commas, and the second-to-last character of each of the first
    nine pieces is read as a single-digit answer.
    NOTE(review): presumably each piece looks like a quoted key/value pair
    ending in ``<digit>"`` - confirm against the PHQ csv files.

    Raises IndexError if fewer than nine pieces are present and ValueError if
    a piece does not carry a digit in that position.
    """
    answers = raw_answers[1:-1].split(",")
    return sum(int(answers[item][-2]) for item in range(PHQ_ITEM_COUNT))


def _highest_score_per_user(phq_df):
    """Score every questionnaire and keep the highest score per participant.

    Adds a ``scores`` column to ``phq_df`` in place and returns a new frame
    with one row per ``id`` and the columns ``id`` and ``scores``.
    """
    phq_df["scores"] = [_phq_total(raw) for raw in phq_df["content"]]
    # groupby gives a deterministic (sorted) result; the row order is irrelevant
    # for the inner merges below, which follow the order of the left frame.
    return phq_df.groupby("id")["scores"].max().reset_index()


#load Moodable data
dft1 = pd.read_csv('dataTweetMoodableCommaQuoteWindows.csv', encoding = "utf-8")
dft1 = dft1.dropna().reset_index()
print("Moodable: " + str(dft1.shape))

#attach date data collected
dfids1 = pd.read_csv("idsMoodableCommaQuoteWindows.csv")
dft1 = pd.merge(dfids1, dft1, on = "id")
# An "index" column already exists from the reset above, so this second reset
# stores the labels in a "level_0" column, which a later cell drops again.
dft1 = dft1.reset_index()

#attach PHQ-9 scores
dft19 = pd.read_csv('phqsMoodableCommaQuoteWindows.csv')
print("PHQ: " + str(dft19.shape))

#limit to unique users, highest score is used
unique19 = _highest_score_per_user(dft19)
print("unique PHQ: " + str(unique19.shape))

dft1 = pd.merge(dft1, unique19, on = "id")
print("Moodable with PHQ: " + str(dft1.shape))

#make ids represent dataset
dft1["id"] = ["m" + str(raw_id) for raw_id in dft1["id"]]

#Load EMU data
dft2 = pd.read_csv('dataTweetEMUCommaQuoteWindows.csv', encoding = "utf-8")
dft2 = dft2.dropna().reset_index()
print("EMU: " + str(dft2.shape))

#attach date data collected, limit EMU participants to last session with each phone
dfids2start = pd.read_csv('idsEMUCommaQuoteWindows.csv', encoding = "utf-8")
dfids2 = dfids2start[["sessionid", "date", "paid"]].rename(columns = {"sessionid": "id"})
dft2 = pd.merge(dfids2, dft2, on = "id")
dft2 = dft2[dft2.paid == 2]
# The original assigned this reset to a mistyped name ("df2t"), so it had no
# effect. drop=True renumbers the rows without adding a "level_0" column,
# which the later cells do not expect on the EMU frame.
dft2 = dft2.reset_index(drop = True)
print("EMU last session: " + str(dft2.shape))

#attach PHQ-9 scores
dft29 = pd.read_csv('phqsEMUCommaQuoteWindows.csv')
print("PHQ: " + str(dft29.shape))

#limit to unique users, highest score is used
unique29 = _highest_score_per_user(dft29)
print("unique PHQ: " + str(unique29.shape))

dft2 = pd.merge(dft2, unique29, on = "id")
print("EMU with PHQ: " + str(dft2.shape))

#make ids represent dataset
dft2["id"] = ["e" + str(raw_id) for raw_id in dft2["id"]]

Moodable: (68328, 4)
PHQ: (510, 3)
unique PHQ: (501, 2)
Moodable with PHQ: (66322, 7)
EMU: (17058, 4)
EMU last session: (17058, 6)
PHQ: (126, 3)
unique PHQ: (115, 2)
EMU with PHQ: (17058, 7)


In [51]:
# Remove the bookkeeping columns that are not wanted in the combined frame.
dft1 = dft1.drop(["level_0"], axis = 1)
dft2 = dft2.drop(["paid"], axis = 1)

#Combine datasets
# pd.concat stacks the rows exactly like DataFrame.append did; append itself
# was deprecated in pandas 1.4 and removed in pandas 2.0.
dft = pd.concat([dft1, dft2])
dft = dft.drop(["index"], axis = 1)
print("Combined: " + str(dft.shape))
#remove duplicated data instances
dft = dft.drop_duplicates()
# Rows get fresh 0..n-1 labels; the previous labels are kept in a new "index" column.
dft = dft.reset_index()
print(dft.shape)

#extract information from data instance metadata
# Every "content" value is parsed as JSON once.
jsonExtract = [json.loads(str(raw)) for raw in dft["content"]]
# Only the fields present in the first record become columns.
names = list(jsonExtract[0])
jsonDF = pd.DataFrame()
for n in names:
    # Records lacking the field get the string placeholder "-100".
    # The "2" suffix keeps the new columns distinct from the existing ones.
    jsonDF[n + "2"] = [record[n] if n in record else "-100" for record in jsonExtract]
dft = pd.concat([dft, jsonDF], axis = 1)

Combined: (83380, 5)
(77309, 6)


In [52]:
#Limit data to ndays
from datetime import datetime, timedelta
from dateutil import parser


def _older_than_window(collected_ms, created_at, window_days):
    """Return True when a tweet should be dropped for being too old.

    collected_ms -- collection date as a millisecond timestamp
    created_at   -- the tweet's creation time as a parseable string
    window_days  -- length of the retained window in days

    The window ends at the collection date and starts ``window_days`` earlier.
    Because ``timedelta.days`` is floored, a tweet is only reported as too old
    when it was created at least one full day before the window start.
    """
    # NOTE(review): the collection year is forced to 2017 - presumably a
    # correction for the recorded dates; confirm. This raises ValueError for a
    # collection date of Feb 29.
    window_end = datetime.fromtimestamp(collected_ms / 1000).replace(year = 2017)
    window_start = window_end - timedelta(days = window_days)
    # NOTE(review): the tweet's timezone is discarded rather than converted,
    # while fromtimestamp() uses the machine's local time - confirm the two
    # clocks are meant to be compared directly.
    tweeted = parser.parse(created_at).replace(tzinfo = None)
    return (window_start - tweeted).days > 0


# Row labels of every tweet that falls before the window.
indexes = [
    row for row in range(0, dft.shape[0])
    if _older_than_window(dft.date[row], dft.created_at2[row], ndays)
]

print(dft.shape)
dft = dft.drop(indexes)
print(dft.shape)

print("Number of Messages: " + str(dft.shape[0]))
print("Number of Participants: " + str(len(set(dft["id"]))))

(77309, 31)
(36455, 31)
Number of Messages: 36455
Number of Participants: 134


In [53]:
#Tweet Participant by ID

# One row per participant: id, list of tweet texts, tweet count and PHQ score.
pID = []
pContent = []
nTweets = []
score = []
# groupby(sort=False) walks the participants in order of first appearance.
# Iterating over set(dft["id"]) instead gave a row order that changed between
# kernel sessions (string hashing is randomised) and re-filtered the whole
# frame once per participant.
for participant, tweets in dft.groupby("id", sort = False):
    texts = tweets["full_text2"].tolist()
    pID.append(participant)
    # Every row of one participant carries the same merged score; take the first.
    score.append(tweets["scores"].iloc[0])
    pContent.append(texts)
    nTweets.append(len(texts))

pDFt = pd.DataFrame()
pDFt["ID"] = pID
pDFt["Content"] = pContent
pDFt["Messages"] = nTweets
pDFt["Score"] = score

In [54]:
#tweet POS tags and sentiment

# For every participant: one polarity and one subjectivity value per tweet,
# plus a single flat list with the POS tag of every tagged token.
polarity = []
subjectivity = []
tags = []
for position, messages in enumerate(pDFt["Content"]):
    print(position)  # progress: one line per participant
    participant_polarity = []
    participant_subjectivity = []
    participant_tags = []
    for message in messages:
        blob = TextBlob(str(message))
        # sentiment is indexed as (polarity, subjectivity)
        participant_polarity.append(blob.sentiment[0])
        participant_subjectivity.append(blob.sentiment[1])
        # blob.tags yields (word, tag) pairs; only the tag is kept
        participant_tags.extend(pos for _token, pos in blob.tags)
    tags.append(participant_tags)
    polarity.append(participant_polarity)
    subjectivity.append(participant_subjectivity)
pDFt["POStags"] = tags
pDFt["Polarity"] = polarity
pDFt["Subjectivity"] = subjectivity

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133


In [55]:
#volume features for tweets
# Per participant, one entry per tweet: the number of pieces obtained by
# splitting on a single space, and the number of characters.
words = [[len(tweet.split(" ")) for tweet in tweets] for tweets in pDFt["Content"]]
char = [[len(tweet) for tweet in tweets] for tweets in pDFt["Content"]]
pDFt["Words"] = words
pDFt["Characters"] = char

In [56]:
from empath import Empath
import re

#create list of all empath categories

# Analysing a throwaway string without a category filter returns one entry
# per category; the keys of that result are the category names.
lexicon = Empath()
emp = lexicon.analyze("Testing", normalize=True)
wordlist = list(emp.keys())
print(wordlist)

['real_estate', 'alcohol', 'anticipation', 'giving', 'violence', 'shopping', 'beauty', 'politeness', 'love', 'deception', 'musical', 'noise', 'kill', 'envy', 'valuable', 'vacation', 'breaking', 'hygiene', 'cold', 'anger', 'appearance', 'fear', 'nervousness', 'family', 'legend', 'fight', 'timidity', 'art', 'sadness', 'home', 'power', 'surprise', 'listen', 'technology', 'strength', 'computer', 'smell', 'fabric', 'anonymity', 'health', 'shame', 'leader', 'urban', 'college', 'play', 'cooking', 'swimming', 'plant', 'ship', 'phone', 'furniture', 'royalty', 'fun', 'monster', 'warmth', 'death', 'prison', 'worship', 'hipster', 'morning', 'poor', 'work', 'dispute', 'friends', 'water', 'neglect', 'hiking', 'disappointment', 'medieval', 'weapon', 'vehicle', 'help', 'ocean', 'independence', 'party', 'order', 'white_collar_job', 'sleep', 'messaging', 'traveling', 'ancient', 'sexual', 'youth', 'emotional', 'dance', 'school', 'torment', 'disgust', 'exercise', 'contentment', 'celebration', 'hate', 'pet

In [57]:
#Empath features for Tweets

# A single lexicon instance is reused; the original built a new Empath() for
# every category/participant pair although nothing changes between calls.
lexicon = Empath()

# Clean each participant's text once instead of once per category.
# NOTE(review): str(tweets) is the repr of the Python list of tweets, so the
# list's quoting and escape sequences are part of the text before punctuation
# is stripped - confirm this is intended before changing it, since it affects
# the normalised values.
cleaned_texts = [re.sub(r'[^\w\s]', '', str(tweets).lower()) for tweets in pDFt["Content"]]

# One analysis per participant covering every category. The normalised value
# of a category does not depend on which other categories are requested, so
# this matches analysing the categories one at a time.
analyses = [lexicon.analyze(text, categories=wordlist, normalize = True) for text in cleaned_texts]

for word in wordlist:
    # analyze() can return None instead of a dict; such participants get 0.
    pDFt[word] = [0 if result is None else result[word] for result in analyses]

In [58]:
#create new category
# Asks Empath to build a category named "text_abbreviations" from the three
# seed words below, using its "reddit" model.
# NOTE(review): presumably this contacts Empath's remote service and stores the
# category so that later Empath() instances can use it - confirm against the
# Empath documentation.
lexicon.create_category("text_abbreviations",["lol","ttyl","brb"], model="reddit")

["ttyl", "brb", "brb", "lol", "lol", "welp", "lolol", "lol", "lolz", "lol", "lololol", "lol", "cuz", "wth", "lawl", "jk", "loool", "srsly", "lol", "Lol", "imma", "lmfao", "imma", "Jk", "lolololol", "lmao", "ikr", "cya", "imma", "Lolol", "nah", "hahaha", "lolol", "cus", "brb", "hahah", "ahaha", "aaand", "rn", "lool", "jk", "sorry_guys", "-_-", "mkay", "loll", "guyz", "fml", "Lol", "lol", "dw", "jk", "btw", "Lmao", "omg", "brb", "imma", "-__-", "ahahaha", "wtf", "omg", "lol", "nvm", "Bro", "srry", ";p", "shiz", "hahahahaha", "aight", "naw", "ummm", "Ikr", "Brb", "lol", "sorry_bro", "Lololol", "ahha", "jk", "guyz", "aaaand", "smh", "bruh", "u", "Bruh", "hahaha", "ight", "cuz", "rofl", "Welp", "dw", ":p", "cuz", "ur", "aaaaand", "haha", "yea", "gtg", "cuz", ".now", "hey", "brb"]


In [59]:
#new category for tweets

# Build the lexicon once, after the custom category has been created, instead
# of once per participant.
lexicon = Empath()

pctt = []
empatht = []
for tweets in pDFt["Content"]:
    # Same cleaning as for the built-in categories: lower-case the string form
    # of the tweet list and strip everything that is not a word character or
    # whitespace.
    content = re.sub(r'[^\w\s]', '', str(tweets).lower())
    # Number of space-separated pieces in the cleaned text.
    empatht.append(len(content.split(" ")))
    emp = lexicon.analyze(content, categories=["text_abbreviations"], normalize = True)
    # analyze() can return None instead of a dict; such participants get 0.
    pctt.append(0 if emp is None else emp["text_abbreviations"])
pDFt["text_abbreviations"] = pctt
pDFt["WordsEmpath"] = empatht

In [60]:
#get set of POS tags
# Flatten every participant's tag list, then reduce to the distinct tags.
posTags = [tag for tag_list in pDFt["POStags"] for tag in tag_list]
posSet = set(posTags)
print(posSet)

{'WP$', 'JJ', 'NNS', 'TO', 'JJS', 'RBR', 'LS', 'NN', 'PRP', 'IN', 'UH', 'CD', 'PDT', 'VBG', 'VBD', 'MD', 'SYM', 'NNP', 'JJR', 'POS', 'WRB', 'VB', 'VBP', 'FW', 'CC', 'VBZ', 'PRP$', 'RP', 'WP', 'RBS', 'WDT', 'DT', 'EX', 'VBN', 'RB', 'NNPS'}


In [61]:
#POS tag counting for tweets

# Total number of tagged tokens per participant.
poswordst = [len(posList) for posList in pDFt["POStags"]]
pDFt["WordsTags"] = poswordst

# One count column per POS tag. The tags are visited in sorted order so that
# the column order of the saved file is the same on every run; iterating the
# set directly gave an arbitrary order.
for tag in sorted(posSet):
    pDFt[tag] = [posList.count(tag) for posList in pDFt["POStags"]]

In [62]:
#sentiment features for tweets

def _count_avg_std(values):
    """Return (count, mean, standard deviation) of ``values``.

    The mean is sum/len and the deviation is np.std; for an empty list all
    three results are 0.
    """
    if len(values) > 0:
        return len(values), sum(values)/len(values), np.std(values)
    return 0, 0, 0


pcount, ncount, scount = [], [], []
pavg, navg, savg = [], [], []
pstd, nstd, sstd = [], [], []
for polarities, subjectivities in zip(pDFt["Polarity"], pDFt["Subjectivity"]):
    # Tweets with a value of exactly 0 are left out of every group.
    positive = [value for value in polarities if value > 0]
    negative = [value for value in polarities if value < 0]
    subjective = [value for value in subjectivities if value > 0]
    for values, counts, averages, deviations in (
        (positive, pcount, pavg, pstd),
        (negative, ncount, navg, nstd),
        (subjective, scount, savg, sstd),
    ):
        count, average, deviation = _count_avg_std(values)
        counts.append(count)
        averages.append(average)
        deviations.append(deviation)

pDFt["PositiveCnt"] = pcount
pDFt["NegativeCnt"] = ncount
pDFt["PositiveStd"] = pstd
pDFt["NegativeStd"] = nstd
# NOTE(review): "PostitiveAvg" is misspelled but kept as is, because the name
# ends up as a column header in the saved file.
pDFt["PostitiveAvg"] = pavg
pDFt["NegativeAvg"] = navg
pDFt["SubjectiveCnt"] = scount
pDFt["SubjectiveStd"] = sstd
pDFt["SubjectiveAvg"] = savg

In [63]:
#Volume features for Tweets
# Per participant: total, mean and standard deviation of the per-tweet word
# and character counts, plus the number of distinct tweet texts.
wsum = [sum(counts) for counts in pDFt["Words"]]
wavg = [sum(counts)/len(counts) for counts in pDFt["Words"]]
wstd = [np.std(counts) for counts in pDFt["Words"]]
csum = [sum(counts) for counts in pDFt["Characters"]]
cavg = [sum(counts)/len(counts) for counts in pDFt["Characters"]]
cstd = [np.std(counts) for counts in pDFt["Characters"]]
unique = [len(set(tweets)) for tweets in pDFt["Content"]]

pDFt["WordSum"] = wsum
pDFt["WordAvg"] = wavg
pDFt["WordStd"] = wstd
pDFt["CharacterSum"] = csum
pDFt["CharacterAvg"] = cavg
pDFt["CharacterStd"] = cstd
pDFt["UniqueCnt"] = unique

In [64]:
# The list-valued helper columns are left out of the saved file; the file
# name is built from the modality and the window length.
saveDFtv = pDFt.drop(
    columns = ["Content", "POStags", "Polarity", "Subjectivity", "Words", "Characters"]
)
saveDFtv.to_csv(
    "preprocessed" + modality + str(ndays) + "days.csv",
    encoding = "utf-8",
)