In [None]:
#Authors: Tlachac, et al
#Paper: "Automated Construction of Lexicons to Improve Depression Screening with Text Messages"

from empath import Empath
from empath import helpers as util
import pandas as pd
import numpy as np
import re
import string

In [None]:
# IDs of the participants whose messages are analysed; rows belonging to any
# other id are filtered out when the message table is loaded.
participants = [
    "e1526", "e2806", "e3702", "e4894", "e5006", "e5199", "e9137", "e9237", "e9513", "e9766",
    "m1087", "m12", "m1206", "m1226", "m1281", "m1399", "m1487", "m1506", "m1762", "m1836",
    "m1957", "m1960", "m2109", "m2132", "m2173", "m2185", "m2216", "m2331", "m2374", "m2402",
    "m2561", "m2564", "m2892", "m2993", "m3234", "m3552", "m3616", "m3810", "m4088", "m4238",
    "m4368", "m4435", "m4506", "m4580", "m473", "m4814", "m4937", "m4947", "m5223", "m5464",
    "m5487", "m5666", "m5882", "m5926", "m5989", "m6499", "m676", "m6951", "m7101", "m7123",
    "m7124", "m7237", "m7340", "m746", "m7490", "m7856", "m7861", "m7974", "m8075", "m8080",
    "m8131", "m8338", "m8555", "m8595", "m8676", "m8833", "m8921", "m8979", "m9014", "m9019",
    "m9074", "m9301", "m9505", "m9525", "m9751", "m9886", "m9928", "m9984",
]

In [None]:
# Read the raw message table.
dataFrame = pd.read_csv("sent14days.csv")

# Keep the participant ID, message text, scores and address2 columns.
# NOTE(review): address2 is selected here but not used by the cells shown below.
dataFrame = dataFrame[['id', 'body2', 'scores', 'address2']]

# Restrict to the study participants. drop=True discards the old row labels
# instead of carrying them along as an extra "index" column.
dataFrame = dataFrame[dataFrame['id'].isin(participants)].reset_index(drop=True)

# Blank out missing message bodies before casting to str: casting a missing
# value directly yields the literal text "nan", which would later be joined
# into the participant's text and counted as a word.
dataFrame['body2'] = dataFrame['body2'].fillna("").astype(str)

# Helper functions used when grouping the data frame by participant ID
def sortList(inList):
    """Concatenate the strings in ``inList`` into one space-separated string.

    Despite the name, nothing is sorted: the items keep the order in which
    they are supplied.
    """
    separator = " "
    return separator.join(inList)

def scoreList(inList):
    """Return the first element of ``inList``, or ``None`` when it is empty."""
    if len(inList) > 0:
        # Only the leading value is used; any further elements are ignored.
        return next(iter(inList))
    return None

def cleanText(text):
    """Return ``text`` lower-cased with every ``string.punctuation`` character removed.

    Punctuation is deleted, not replaced by a space, so "don't" becomes
    "dont" and whitespace around removed punctuation is left as it was.
    """
    # A translation table that maps each punctuation character to "deleted".
    dropPunctuation = str.maketrans("", "", string.punctuation)
    return text.translate(dropPunctuation).lower()

# Number of message rows per participant, indexed by id.
newCol = dataFrame[['id', 'body2']].groupby("id").agg(len)

# Collapse to one row per participant: all message bodies joined into a single
# string (sortList) and the first score of the group (scoreList).
dataFrame = (
    dataFrame
    .sort_values(by="id")
    .groupby("id")
    .agg({"body2": sortList, "scores": scoreList})
)

# Both frames are indexed by id at this point, so the counts line up by participant.
dataFrame['NumTexts'] = newCol

# Turn the id index back into an ordinary column.
dataFrame = dataFrame.reset_index()

# Strip punctuation from the joined text and lower-case it.
dataFrame["body2"] = dataFrame["body2"].map(cleanText)

print(dataFrame.shape)
dataFrame.head()

In [None]:
# Build the list of all Empath category names.
# analyze() returns one entry per category, so scoring a throwaway string is
# enough to enumerate them; the scores themselves are not used.
lexicon = Empath()
emp = lexicon.analyze("Testing", normalize=True)
wordlist = list(emp.keys())
print(wordlist)

In [None]:
# Suffixes of the lexicon files to process: each of 1, 3, 5 combined with each
# of c, f, r, giving "1c", "1f", "1r", "3c", ... "5r".
# NOTE(review): what the number and the letter stand for is not visible in
# this notebook - confirm against the code that wrote the lexicon files.
paras = [str(number) + letter for number in (1, 3, 5) for letter in "cfr"]

In [None]:
def _parseWordList(listRepr):
    """Recover the words from one ``words`` cell of a lexicon CSV.

    The cell is expected to hold the text of a printed list of quoted words,
    e.g. "['sad', 'cry']": the surrounding brackets are stripped, the rest is
    split on ", " and the first and last character (the quotes) of every item
    are dropped.
    """
    return [item[1:-1] for item in listRepr[1:-1].split(", ")]


def _spaceOutTokens(text):
    """Return ``(number of tokens, searchable copy of text)``.

    The copy has its tokens separated by two spaces and is padded with one
    space at each end. Searching it for " word " then counts every whole-token
    occurrence, including the first and last token and immediate repeats:
    str.count only finds non-overlapping matches, so with single spaces two
    adjacent hits would compete for the space between them.
    """
    tokens = text.split()
    return len(tokens), " " + "  ".join(tokens) + " "


# The participant texts do not depend on the lexicon, so tokenise them once
# instead of once per category and parameter setting.
preparedBodies = [_spaceOutTokens(body) for body in dataFrame.body2]

for par in paras:
    print(par)
    lexF = pd.read_csv("wordsFiction" + par + ".csv")
    lexR = pd.read_csv("wordsReddit" + par + ".csv")
    lexN = pd.read_csv("wordsNews" + par + ".csv")

    # Walk through each category.
    # NOTE(review): this assumes the three lexicon files have the same number
    # of rows and that row j describes the category wordlist[j] - confirm
    # against the code that wrote the lexicon files.
    for j in range(0, len(lexF.words)):
        # Union of the words proposed for this category by the three sources.
        savelist = list(set(
            _parseWordList(lexF.words[j])
            + _parseWordList(lexR.words[j])
            + _parseWordList(lexN.words[j])
        ))

        print(savelist)

        # Bring each lexicon entry into the same spaced-out form as the texts.
        # Entries that are empty (they used to be punctuation) are skipped,
        # and the set keeps two entries that normalise to the same search
        # string from being counted twice.
        needles = {" " + "  ".join(w.split()) + " " for w in savelist if w.split()}

        frequencies = []
        # Walk through each participant.
        for bodylen, spacedBody in preparedBodies:
            c = sum(spacedBody.count(needle) for needle in needles)
            # Fraction of the participant's tokens that belong to the
            # category; a participant with no tokens scores 0.
            frequencies.append(c / bodylen if bodylen else 0.0)
        dataFrame[wordlist[j]] = frequencies
    dataFrame.to_csv("featuresCombined" + par + ".csv")