In [1]:
# importing empath 
from empath import Empath
from empath import helpers as util

In [2]:
# importing libraries and empath class 
from collections import defaultdict
import os
import sys
from collections import defaultdict
import requests
import json
class Empath:
    """Lexicon of named word categories with simple token-count analysis.

    NOTE(review): this cell redefines ``Empath`` and so shadows the class
    imported from the ``empath`` package in the first cell. ``util`` must
    already be bound to ``empath.helpers`` (imported in that first cell).

    Attributes:
        cats: maps a category name to the list of its terms.
        staging: empty dict; not used by any method in this class.
        backend_url: base URL of the service used by ``create_category``.
        base_dir: directory that contains the ``util`` module; the category
            files are read from / written to ``<base_dir>/data``.
        inv_cache: maps a sorted tuple of category names to the inverse
            index (term -> category names) built for that selection.
    """

    def __init__(self, backend_url="http://54.148.189.209:8000"):
        """Load the default categories and every user-written ``*.empath`` file."""
        self.cats = defaultdict(list)
        self.staging = {}
        self.backend_url = backend_url
        self.base_dir = os.path.dirname(util.__file__)
        self.inv_cache = {}
        # Default Empath categories.
        self.load(self.base_dir+"/data/categories.tsv")
        # User-written categories: files whose second dot-separated part is "empath".
        for f in os.listdir(self.base_dir+"/data/user/"):
            parts = f.split(".")
            if len(parts) > 1 and parts[1] == "empath":
                self.load(self.base_dir+"/data/user/"+f)

    def load(self,file):
        """Add the categories found in a tab-separated file.

        Each line is ``name<TAB>term<TAB>term...``; duplicate terms on a line
        are collapsed before being appended to ``self.cats[name]``.
        """
        with open(file,"r") as f:
            for line in f:
                cols = line.strip().split("\t")
                name, terms = cols[0], cols[1:]
                # Lines without terms must not create an empty category.
                if terms:
                    self.cats[name].extend(set(terms))
        # Category contents changed: cached inverse indexes are now stale.
        self.inv_cache.clear()

    def analyze_term_window(self,doc,targets,categories=None,window_size=10,normalize=False):
        """Run ``analyze`` with the tokenizer returned by ``util.window_tokenizer(window_size, targets)``."""
        tokenizer = util.window_tokenizer(window_size,targets)
        return self.analyze(doc,categories,tokenizer,normalize)

    def analyze(self,doc,categories=None,tokenizer="default",normalize=False):
        """Count how many tokens of ``doc`` fall into each category.

        Args:
            doc: a string, or a list of strings that is joined with newlines.
            categories: iterable of category names; all loaded categories
                when empty or None.
            tokenizer: ``"default"``, ``"bigrams"`` or a callable that takes
                the document and yields tokens.
            normalize: divide every count by the total number of tokens.

        Returns:
            A dict mapping category name to a float count (or ratio), or
            None when ``normalize`` is set and the document has no tokens.

        Raises:
            Exception: if ``tokenizer`` is not callable after name resolution.
        """
        if isinstance(doc,list):
            doc = "\n".join(doc)
        if tokenizer == "default":
            tokenizer = util.default_tokenizer
        elif tokenizer == "bigrams":
            tokenizer = util.bigram_tokenizer
        if not callable(tokenizer):
            raise Exception("invalid tokenizer")
        if not categories:
            categories = self.cats.keys()
        # Materialise once so that one-shot iterables survive the passes below.
        categories = list(categories)
        key = tuple(sorted(categories))
        invcats = self.inv_cache.get(key)
        if invcats is None:
            invcats = defaultdict(list)
            for k in categories:
                # .get(): an unknown name must not be inserted into self.cats.
                for t in self.cats.get(k, ()):
                    invcats[t].append(k)
            self.inv_cache[key] = invcats
        count = {cat: 0.0 for cat in categories}
        tokens = 0.0
        for tk in tokenizer(doc):
            tokens += 1.0
            # .get(): indexing the defaultdict would store every unseen token
            # in the cached inverse index and grow it without bound.
            for cat in invcats.get(tk, ()):
                count[cat] += 1.0
        if normalize and count:
            if tokens == 0:
                return None
            for cat in count:
                count[cat] = count[cat] / tokens
        return count

    def create_category(self,name,seeds,model="fiction",size=100,write=True):
        """Ask the backend for terms related to ``seeds`` and store them as ``name``.

        Args:
            name: category name; replaces an existing category of that name.
            seeds: seed terms posted to the backend.
            model: backend model name.
            size: number of terms requested.
            write: also save the category to ``<base_dir>/data/user/<name>.empath``.

        Raises:
            requests.HTTPError: if the backend answers with an error status.

        NOTE(review): the request has no timeout, so an unresponsive backend
        blocks until the call is interrupted.
        """
        resp = requests.post(self.backend_url + "/create_category", json={"terms":seeds,"size":size,"model":model})
        # An error body must not be parsed as if it were a list of terms.
        resp.raise_for_status()
        results = json.loads(resp.text)
        self.cats[name] = list(set(results))
        self.inv_cache.clear()
        if write:
            with open(self.base_dir+"/data/user/"+name+".empath","w") as f:
                f.write("\t".join([name]+results))

    def delete_category(self,name):
        """Remove ``name`` from memory and delete its user file if one exists."""
        if name in self.cats:
            del self.cats[name]
        self.inv_cache.clear()
        filename = self.base_dir+"/data/user/"+name+".empath"
        if os.path.isfile(filename):
            os.remove(filename)

In [22]:
#print(len(wordlist))

In [3]:
# Build `wordlist`: the names of the Empath categories to analyse.
lexicon = Empath()
wordlist = []
# Category names are read straight from the lexicon; no dummy document has to be
# analysed for that. list() takes a snapshot because delete_category() removes
# entries from lexicon.cats while we loop.
for word in list(lexicon.cats):
    if word.endswith('2'):
        # A trailing "2" marks the custom categories this notebook creates
        # further down; drop leftovers from an earlier run (memory and disk).
        lexicon.delete_category(word)
    else:
        wordlist.append(word)
print(wordlist)

['help', 'office', 'dance', 'money', 'wedding', 'domestic_work', 'sleep', 'medical_emergency', 'cold', 'hate', 'cheerfulness', 'aggression', 'occupation', 'envy', 'anticipation', 'family', 'vacation', 'crime', 'attractive', 'masculine', 'prison', 'health', 'pride', 'dispute', 'nervousness', 'government', 'weakness', 'horror', 'swearing_terms', 'leisure', 'suffering', 'royalty', 'wealthy', 'tourism', 'furniture', 'school', 'magic', 'beach', 'journalism', 'morning', 'banking', 'social_media', 'exercise', 'night', 'kill', 'blue_collar_job', 'art', 'ridicule', 'play', 'computer', 'college', 'optimism', 'stealing', 'real_estate', 'home', 'divine', 'sexual', 'fear', 'irritability', 'superhero', 'business', 'driving', 'pet', 'childish', 'cooking', 'exasperation', 'religion', 'hipster', 'internet', 'surprise', 'reading', 'worship', 'leader', 'independence', 'movement', 'body', 'noise', 'eating', 'medieval', 'zest', 'confusion', 'water', 'sports', 'death', 'healing', 'legend', 'heroic', 'celebr

In [10]:
# Display the names of every category currently held by the lexicon.
lexicon.cats.keys()

dict_keys(['help', 'office', 'dance', 'money', 'wedding', 'domestic_work', 'sleep', 'medical_emergency', 'cold', 'hate', 'cheerfulness', 'aggression', 'occupation', 'envy', 'anticipation', 'family', 'vacation', 'crime', 'attractive', 'masculine', 'prison', 'health', 'pride', 'dispute', 'nervousness', 'government', 'weakness', 'horror', 'swearing_terms', 'leisure', 'suffering', 'royalty', 'wealthy', 'tourism', 'furniture', 'school', 'magic', 'beach', 'journalism', 'morning', 'banking', 'social_media', 'exercise', 'night', 'kill', 'blue_collar_job', 'art', 'ridicule', 'play', 'computer', 'college', 'optimism', 'stealing', 'real_estate', 'home', 'divine', 'sexual', 'fear', 'irritability', 'superhero', 'business', 'driving', 'pet', 'childish', 'cooking', 'exasperation', 'religion', 'hipster', 'internet', 'surprise', 'reading', 'worship', 'leader', 'independence', 'movement', 'body', 'noise', 'eating', 'medieval', 'zest', 'confusion', 'water', 'sports', 'death', 'healing', 'legend', 'heroic

In [4]:
# Builds "wordsToUse": up to five seed words for every category line of the file,
# and "myCategories": the original category names with a "2" suffix.

# For these categories a different word is searched for among the terms,
# in order to obtain better seed words.
SEED_NAME_OVERRIDES = {
    "children": "child",
    "social_media": "multimedia",
    "real_estate": "estate",
    "air_travel": "fly",
    "competing": "competition",
}


def select_seed_words(categoryList):
    """Pick up to five seed words from one category row.

    Args:
        categoryList: ``[category name, term, term, ...]`` as read from one
            tab-separated line.

    Returns:
        A slice of ``categoryList`` around the position where the (possibly
        overridden) category name occurs among the terms, or the first five
        terms when the name does not occur.
    """
    categoryName = SEED_NAME_OVERRIDES.get(categoryList[0], categoryList[0])
    # Position of the name among the terms; index 0 is the row header itself,
    # so 0 doubles as "not found". The last match wins.
    seedIndex = 0
    for i in range(1, len(categoryList)):
        if categoryList[i] == categoryName:
            seedIndex = i
    last = len(categoryList) - 1
    if seedIndex == 0:
        # Name is not among the terms: take the first five terms.
        return categoryList[1:6]
    if seedIndex in (last, last - 1):
        # Name sits at the end: take it and the four terms before it. The
        # start is clamped so that short rows neither wrap around through a
        # negative index nor pull in the header at index 0.
        return categoryList[max(1, seedIndex - 4): seedIndex + 1]
    if seedIndex in (1, 2):
        # Name sits at the start: take it and the four terms after it.
        return categoryList[seedIndex: seedIndex + 5]
    # General case: two terms on either side of the name.
    return categoryList[seedIndex - 2: seedIndex + 3]


wordsToUse = []
# The context manager closes the file once every line has been read.
with open("categories1.tsv", "r") as catFile:
    for line in catFile:
        wordsToUse.append(select_seed_words(line.strip("\n").split("\t")))

# Append "2" to each category name to tell the new categories from the originals.
# NOTE(review): wordsToUse follows the line order of categories1.tsv while
# myCategories follows wordlist; the next cell pairs them by position, so both
# must list the categories in the same order - TODO confirm.
myCategories = [str(x) + "2" for x in wordlist]
print(myCategories)

['help2', 'office2', 'dance2', 'money2', 'wedding2', 'domestic_work2', 'sleep2', 'medical_emergency2', 'cold2', 'hate2', 'cheerfulness2', 'aggression2', 'occupation2', 'envy2', 'anticipation2', 'family2', 'vacation2', 'crime2', 'attractive2', 'masculine2', 'prison2', 'health2', 'pride2', 'dispute2', 'nervousness2', 'government2', 'weakness2', 'horror2', 'swearing_terms2', 'leisure2', 'suffering2', 'royalty2', 'wealthy2', 'tourism2', 'furniture2', 'school2', 'magic2', 'beach2', 'journalism2', 'morning2', 'banking2', 'social_media2', 'exercise2', 'night2', 'kill2', 'blue_collar_job2', 'art2', 'ridicule2', 'play2', 'computer2', 'college2', 'optimism2', 'stealing2', 'real_estate2', 'home2', 'divine2', 'sexual2', 'fear2', 'irritability2', 'superhero2', 'business2', 'driving2', 'pet2', 'childish2', 'cooking2', 'exasperation2', 'religion2', 'hipster2', 'internet2', 'surprise2', 'reading2', 'worship2', 'leader2', 'independence2', 'movement2', 'body2', 'noise2', 'eating2', 'medieval2', 'zest2',

In [5]:
# Create the custom categories with the reddit model: each name in
# myCategories is paired, by position, with its seed words in wordsToUse.
myZip_object = zip(myCategories, wordsToUse)
for category_name, seed_words in myZip_object:
    lexicon.create_category(category_name, seed_words, model="reddit", size=100, write=True)

KeyboardInterrupt: 

In [16]:
# Show the original category names collected in wordlist.
print(wordlist)

['help', 'office', 'dance', 'money', 'wedding', 'domestic_work', 'sleep', 'medical_emergency', 'cold', 'hate', 'cheerfulness', 'aggression', 'occupation', 'envy', 'anticipation', 'family', 'vacation', 'crime', 'attractive', 'masculine', 'prison', 'health', 'pride', 'dispute', 'nervousness', 'government', 'weakness', 'horror', 'swearing_terms', 'leisure', 'suffering', 'royalty', 'wealthy', 'tourism', 'furniture', 'school', 'magic', 'beach', 'journalism', 'morning', 'banking', 'social_media', 'exercise', 'night', 'kill', 'blue_collar_job', 'art', 'ridicule', 'play', 'computer', 'college', 'optimism', 'stealing', 'real_estate', 'home', 'divine', 'sexual', 'fear', 'irritability', 'superhero', 'business', 'driving', 'pet', 'childish', 'cooking', 'exasperation', 'religion', 'hipster', 'internet', 'surprise', 'reading', 'worship', 'leader', 'independence', 'movement', 'body', 'noise', 'eating', 'medieval', 'zest', 'confusion', 'water', 'sports', 'death', 'healing', 'legend', 'heroic', 'celebr

In [17]:
# Show the custom category names (original names with a "2" suffix).
print(myCategories)

['help2', 'office2', 'dance2', 'money2', 'wedding2', 'domestic_work2', 'sleep2', 'medical_emergency2', 'cold2', 'hate2', 'cheerfulness2', 'aggression2', 'occupation2', 'envy2', 'anticipation2', 'family2', 'vacation2', 'crime2', 'attractive2', 'masculine2', 'prison2', 'health2', 'pride2', 'dispute2', 'nervousness2', 'government2', 'weakness2', 'horror2', 'swearing_terms2', 'leisure2', 'suffering2', 'royalty2', 'wealthy2', 'tourism2', 'furniture2', 'school2', 'magic2', 'beach2', 'journalism2', 'morning2', 'banking2', 'social_media2', 'exercise2', 'night2', 'kill2', 'blue_collar_job2', 'art2', 'ridicule2', 'play2', 'computer2', 'college2', 'optimism2', 'stealing2', 'real_estate2', 'home2', 'divine2', 'sexual2', 'fear2', 'irritability2', 'superhero2', 'business2', 'driving2', 'pet2', 'childish2', 'cooking2', 'exasperation2', 'religion2', 'hipster2', 'internet2', 'surprise2', 'reading2', 'worship2', 'leader2', 'independence2', 'movement2', 'body2', 'noise2', 'eating2', 'medieval2', 'zest2',

In [18]:
# Sanity check: print the first custom category name next to its seed words.
print(myCategories[0], wordsToUse[0])

help2 ['chore', 'responsible', 'help', 'grateful', 'maid']


In [20]:
# Display the full category -> terms mapping held by the lexicon.
lexicon.cats 

defaultdict(list,
            {'help': ['encourage',
              'obligation',
              'protect',
              'help',
              'aide',
              'nurse',
              'maid',
              'responsible',
              'counsel',
              'stabilize',
              'duty',
              'rely',
              'generously',
              'financial',
              'guide',
              'patient',
              'tend',
              'advisor',
              'prepare',
              'kindly',
              'benefit',
              'helpful',
              'assist',
              'servant',
              'volunteer',
              'favor',
              'crutch',
              'friend',
              'advise',
              'wheelchair',
              'provide',
              'support',
              'oversee',
              'carry',
              'cooperate',
              'supportive',
              'entrust',
              'chore',
              'assistance',
   

In [47]:
#lexicon.create_category("help",["chore","responsible","help", "grateful", "maid"], model="reddit")

In [None]:
# Reading the text message data frame 
# implement only the participants with at least 2 messages (or more) 
# sort by conversation 
import pandas as pd
import re

# Read the message export and keep only the four columns used below:
# id, message text (body2), scores and address2.
dataFrame = pd.read_csv("sent14days.csv").loc[:, ['id', 'body2', 'scores', 'address2']]

def firstVal(inList): 
    """Return the first element of ``inList`` by position.

    Going through ``list()`` makes this positional for a pandas Series as
    well: ``inList[0]`` would be a label lookup there and fail whenever the
    group's index has no label 0. Raises IndexError on empty input.
    """
    return list(inList)[0]

def concat(inList): 
    """Join the messages in ``inList`` into one string separated by ". ".

    Missing values (None / NaN, as produced for empty CSV cells) are skipped
    and any other non-string value is converted with ``str``; previously
    either case made ``str.join`` raise TypeError.
    """
    parts = []
    for item in inList:
        if isinstance(item, str):
            parts.append(item)
        elif not pd.isna(item):
            parts.append(str(item))
    return ". ".join(parts)

def scoreList(inList): 
    """Return the first value of ``inList`` by position, or None when it is empty."""
    if len(inList) > 0:
        return list(inList)[0]
    return None

# Group the messages by participant and conversation partner.
conversationsWFolds = dataFrame.copy()
# Conversation key: "<id>|TO|<address2>".
# NOTE(review): the concatenation assumes both columns hold strings - TODO confirm.
conversationsWFolds["conversation"] = conversationsWFolds["id"] + "|TO|" +  conversationsWFolds["address2"]
# One row per conversation: every message body as a list, plus the first score.
groupedConvsWFolds = conversationsWFolds.groupby(by="conversation").agg({"body2": list, "scores": scoreList})
# Number of messages in the conversation, taken before the bodies are joined.
groupedConvsWFolds["Length Body"] = groupedConvsWFolds["body2"].apply(len)
groupedConvsWFolds["body2"] = groupedConvsWFolds["body2"].apply(concat)
# Keep only conversations with at least two messages. .copy() makes the result
# an independent frame, so the columns added to it in later cells do not
# trigger pandas' SettingWithCopyWarning.
groupedConvsWFolds = groupedConvsWFolds[groupedConvsWFolds['Length Body'] >= 2].copy()

In [6]:
# Score every conversation against each original Empath category and save the result.
# NOTE: pDFt is the same object as groupedConvsWFolds, so the columns added
# here also appear on groupedConvsWFolds.
pDFt = groupedConvsWFolds
# Build the lexicon once; constructing it per row re-read every category file.
lexicon = Empath()
# Lower-case each conversation and strip punctuation. Iterating the column
# walks the rows by position (the index holds the conversation keys).
contents = [re.sub(r'[^\w\s]', '', str(body).lower()) for body in pDFt["body2"]]
# One analyze() call per conversation covers all categories at once; it
# returns None for a conversation without tokens.
emps = [lexicon.analyze(content, categories = wordlist, normalize = True) for content in contents]
for word in wordlist:
    # A conversation without tokens scores 0 in every category.
    pDFt[word] = [emp[word] if emp is not None else 0 for emp in emps]
pDFt.to_csv("ogEmpathConvo.csv")

NameError: name 'groupedConvsWFolds' is not defined

In [None]:
# Show the conversation table together with the category columns added so far.
print(pDFt)

In [7]:
# Score every conversation against the custom "<name>2" categories and save the result.

# The message column is renamed to "text" so that it cannot collide with the
# custom category "body2". rename() returns a new frame: groupedConvsWFolds
# keeps its "body2" column, so this cell and the previous analysis cell can be
# re-run in any order.
pDFt = groupedConvsWFolds.rename(columns = {'body2':'text'})
print(pDFt.columns.tolist())

# Build the lexicon once; constructing it per row re-read every category file.
lexicon = Empath()
# Only names carrying the "2" suffix are custom categories.
customCategories = [word for word in myCategories if word.endswith("2")]
# Lower-case each conversation and strip punctuation. Iterating the column
# walks the rows by position (the index holds the conversation keys).
contents = [re.sub(r'[^\w\s]', '', str(body).lower()) for body in pDFt["text"]]
# One analyze() call per conversation covers all custom categories at once; it
# returns None for a conversation without tokens.
emps = [lexicon.analyze(content, categories = customCategories, normalize = True) for content in contents]
for word in customCategories:
    # A conversation without tokens scores 0 in every category.
    pDFt[word] = [emp[word] if emp is not None else 0 for emp in emps]
pDFt.to_csv("myEmpathConvo.csv")

NameError: name 'groupedConvsWFolds' is not defined