In [None]:
#Authors: Tlachac, et al
#Paper: "Automated Construction of Lexicons to Improve Depression Screening with Text Messages"

from empath import Empath
from empath import helpers as util
import pandas as pd
import numpy as np
import re
import string

In [None]:
# importing libraries and empath class 
from collections import defaultdict
import os
import sys
from collections import defaultdict
import requests
import json
class Empath:
    """Notebook-local variant of the Empath lexicon client.

    This definition shadows the ``Empath`` class imported from the ``empath``
    package. It keeps the category loading and text analysis logic, but
    ``create_category`` returns the backend's raw response text instead of
    registering the category, which is what the lexicon-building cell parses.

    Attributes are documented where they are assigned in ``__init__``.
    """

    def __init__(self, backend_url="http://54.148.189.209:8000"):
        """Load the default and user-written categories shipped with empath.

        Args:
            backend_url: Base URL of the service queried by ``create_category``.
        """
        # category name -> list of terms
        self.cats = defaultdict(list)
        self.staging = {}
        self.backend_url = backend_url
        # directory of the installed empath package (holds the data files)
        self.base_dir = os.path.dirname(util.__file__)
        # sorted category tuple -> {term: [category, ...]} inverse index
        self.inv_cache = {}
        # loads the default empath categories
        self.load(self.base_dir+"/data/categories.tsv")
        # loads the user-written categories
        for f in os.listdir(self.base_dir+"/data/user/"):
            if len(f.split(".")) > 1 and f.split(".")[1] == "empath":
                self.load(self.base_dir+"/data/user/"+f)

    def load(self,file):
        """Add the categories from a tab-separated file to ``self.cats``.

        Each line is ``name<TAB>term<TAB>term...``. Terms already known for a
        category are not appended again, so loading a file twice (or loading
        two files that define the same category) cannot make ``analyze``
        count a term more than once.

        Args:
            file: Path of the TSV file to read (decoded as UTF-8).
        """
        with open(file,"r",encoding="utf-8") as f:
            for line in f:
                cols = line.strip().split("\t")
                name = cols[0]
                if not name:
                    # blank line: nothing to register
                    continue
                # .get() avoids creating an entry for a category without terms
                known = set(self.cats.get(name, ()))
                # dict.fromkeys de-duplicates while keeping the file order
                for t in dict.fromkeys(cols[1:]):
                    if t not in known:
                        self.cats[name].append(t)
                        known.add(t)
        # the cached inverse indexes were built from the previous contents
        self.inv_cache.clear()

    def analyze_term_window(self,doc,targets,categories=None,window_size=10,normalize=False):
        """Run ``analyze`` using the window tokenizer from ``empath.helpers``.

        The tokenizer is built from ``window_size`` and ``targets``; all other
        arguments are forwarded to ``analyze`` unchanged.
        """
        tokenizer = util.window_tokenizer(window_size,targets)
        return self.analyze(doc,categories,tokenizer,normalize)

    def analyze(self,doc,categories=None,tokenizer="default",normalize=False):
        """Count, per category, how many tokens of ``doc`` are category terms.

        Args:
            doc: Text to analyse; a list of strings is joined with newlines.
            categories: Category names to score. Falsy means every loaded
                category.
            tokenizer: ``"default"``, ``"bigrams"`` or a callable that takes
                the document and yields tokens.
            normalize: If true, divide each count by the number of tokens.

        Returns:
            Dict mapping category name to a float count (or proportion when
            ``normalize`` is set). Returns ``None`` when ``normalize`` is set,
            at least one category is scored and the document has no tokens.

        Raises:
            Exception: If ``tokenizer`` is neither a known name nor callable.
        """
        if isinstance(doc,list):
            doc = "\n".join(doc)
        if tokenizer == "default":
            tokenizer = util.default_tokenizer
        elif tokenizer == "bigrams":
            tokenizer = util.bigram_tokenizer
        if not callable(tokenizer):
            raise Exception("invalid tokenizer")
        if not categories:
            categories = self.cats.keys()
        # snapshot: the names are iterated several times below
        categories = list(categories)
        key = tuple(sorted(categories))
        invcats = self.inv_cache.get(key)
        if invcats is None:
            # plain dict so that token lookups below can never add keys
            invcats = {}
            for k in categories:
                # .get() keeps unknown names from being inserted into self.cats
                for t in self.cats.get(k, ()):
                    invcats.setdefault(t, []).append(k)
            self.inv_cache[key] = invcats
        count = {cat: 0.0 for cat in categories}
        tokens = 0.0
        for tk in tokenizer(doc):
            tokens += 1.0
            for cat in invcats.get(tk, ()):
                count[cat]+=1.0
        if normalize and count:
            if tokens == 0:
                return None
            for cat in count:
                count[cat] = count[cat] / tokens
        return count

    def create_category(self,name,seeds,model="fiction",size=100,write=True):
        """Request a generated category from the backend.

        Posts the seed terms, size and model to ``<backend_url>/create_category``
        and returns the response body as text. The category is NOT added to
        ``self.cats`` and nothing is written to disk; ``name`` and ``write``
        are accepted only to keep the call signature unchanged.

        Returns:
            The raw response text from the backend.
        """
        resp = requests.post(self.backend_url + "/create_category", json={"terms":seeds,"size":size,"model":model})
        return resp.text

    def delete_category(self,name):
        """Forget a category and remove its ``.empath`` file if one exists."""
        if name in self.cats: del self.cats[name]
        # cached inverse indexes may still map terms to the deleted category
        self.inv_cache.clear()
        filename = self.base_dir+"/data/user/"+name+".empath"
        if os.path.isfile(filename):
            os.remove(filename)

In [None]:
# creates "wordsToUse" which are seed words for each category 

# Parallel lists with one entry per category line of the file
wordList = []    # category names
wordsToUse = []  # the first (up to) three words listed for the category
nWords = []      # number of tab-separated fields on the line (name included)

# Open the category file; the context manager closes it even if parsing fails
with open("categories.tsv", "r", encoding="utf-8") as catFile:
    # loops through each line in the file 
    for line in catFile:
        categoryList = line.strip("\n").split("\t")

        # a blank line would otherwise register a nameless category with no seeds
        if not categoryList[0]:
            continue

        nWords.append(len(categoryList))
        categoryName = categoryList[0]
        wordList.append(categoryName)
        wordsToUse.append(categoryList[1:4])

In [None]:
lexicon = Empath()
# one word list per category: category name, seed words, then generated words
savelist = []
# Create own categories (using the "fiction" model)
for wlist, wordsused, nwords in zip(wordList, wordsToUse, nWords):
    print(wlist)
    # create_category returns the backend response as raw text
    save = lexicon.create_category(wlist, wordsused, model = "fiction", size = nwords)
    tempsave = []
    # drop the first and last character of the response, then split on commas
    for indword in save[1:-1].split(","):
        temp = indword.replace('"', "").strip()
        #remove punctuation
        tempsave.append(re.sub(r'[^\w\s]', '', str(temp).lower()))
    # make sure every seed word is part of the lexicon
    for indseed in wordsused:
        if indseed not in tempsave:
            tempsave.insert(0, indseed)
    tempsave.insert(0, wlist)
    # de-duplicate while keeping the order built above; a set would shuffle the
    # words differently on every run and make the saved lexicon irreproducible
    savelist.append(list(dict.fromkeys(tempsave)))

In [None]:
# Display the generated word lists (one list per category) for inspection
savelist

In [None]:
# Load the message data and keep only the ID, texts, scores, and address 2
# columns, in that order
dataFrame = pd.read_csv("sent14days.csv").loc[:, ['id', 'body2', 'scores', 'address2']]
dataFrame.head()

In [None]:
# Group data frame by participant ID 
def sortList(inList):
    """Join the given strings into one string separated by single spaces.

    Despite the name, the items are not sorted; they are concatenated in the
    order in which they are supplied.
    """
    separator = " "
    return separator.join(inList)

def scoreList(inList): 
    """Return the first item of ``inList``, or ``None`` when it is empty."""
    values = list(inList)
    return values[0] if values else None

#function to clean out punctuation and make text lowercase
def cleanText(text):
    """Delete every ``string.punctuation`` character and lowercase the rest."""
    # a translation table that maps each punctuation character to "deleted"
    dropPunctuation = str.maketrans("", "", string.punctuation)
    return text.translate(dropPunctuation).lower()

# number of rows (messages) recorded for each participant id
newCol = dataFrame.groupby(by = "id")[['body2']].agg(len)

# one row per participant: all texts joined into a single string, and the
# first score listed for that participant
dataFrame = (
    dataFrame
    .sort_values(by="id")
    .groupby(by = "id")
    .agg({"body2":sortList, "scores": scoreList})
)
dataFrame['NumTexts'] = newCol

#only 2+ messages
dataFrame = dataFrame.loc[dataFrame['NumTexts'] >= 2]

dataFrame.head()

In [None]:
# move the participant id from the index back into a regular column
dataFrame = dataFrame.reset_index()

#actively remove punctuation and make lower case for text messages
dataFrame["body2"] = dataFrame["body2"].map(cleanText)

dataFrame.head()

In [None]:
# Report the (rows, columns) shape of the per-participant frame
print(dataFrame.shape)

In [None]:
# For each category, compute every participant's lexicon frequency: the number
# of lexicon-term occurrences in their texts divided by their number of words.

def _countTerm(termTokens, tokens, tokenCounts):
    """Return how often one lexicon term occurs in a tokenised message body.

    Args:
        termTokens: The term split on whitespace (one item for a single word).
        tokens: The message body split on whitespace.
        tokenCounts: Dict mapping each token of the body to its count.
    """
    if len(termTokens) == 1:
        return tokenCounts.get(termTokens[0], 0)
    # multi-word term: slide a window over the body and compare token runs
    width = len(termTokens)
    return sum(
        1
        for start in range(len(tokens) - width + 1)
        if tokens[start:start + width] == termTokens
    )

# Tokenise every body once. split() without an argument ignores repeated
# spaces, so blanks left behind by removed punctuation are not counted as words.
bodyTokens = [body.split() for body in dataFrame["body2"]]
bodyTokenCounts = []
for tokens in bodyTokens:
    counts = {}
    for tk in tokens:
        counts[tk] = counts.get(tk, 0) + 1
    bodyTokenCounts.append(counts)

#walk through each category
for j in range(0, len(savelist)):

    # split() yields [] for empty strings that used to be punctuation; skip those
    termTokenLists = [w.split() for w in savelist[j]]
    termTokenLists = [termTokens for termTokens in termTokenLists if termTokens]

    frequencies = []

    #walk through each participant
    for tokens, counts in zip(bodyTokens, bodyTokenCounts):
        # Whole-token matching also counts terms at the very start or end of
        # the body and immediately repeated words, which a search for
        # " " + w + " " misses.
        c = sum(_countTerm(termTokens, tokens, counts) for termTokens in termTokenLists)
        #n words in body (guard against a body with no words at all)
        bodylen = len(tokens)
        frequencies.append(c/bodylen if bodylen else 0.0)
    dataFrame[wordList[j]] = frequencies

In [None]:
# Preview the first rows, now including one frequency column per category
dataFrame.head()

In [None]:
# Save the per-participant feature table to disk
dataFrame.to_csv("featuresFiction3f.csv")

In [None]:
# One row per category: its name, its seed words and its full word list
newDF = pd.DataFrame({
    "categories": wordList,
    "seedwords": wordsToUse,
    "words": savelist,
})

In [None]:
# Display the category / seed word / word list table
newDF

In [None]:
# Save the category / seed word / word list table to disk
newDF.to_csv("wordsFiction3f.csv")