In [1]:
# importing empath 
from empath import Empath
from empath import helpers as util

In [2]:
# importing libraries and empath class 
from collections import defaultdict
import os
import sys
from collections import defaultdict
import requests
import json
class Empath:
    def __init__(self, backend_url="http://54.148.189.209:8000"):
        self.cats = defaultdict(list)
        self.staging = {}
        self.backend_url = backend_url
        self.base_dir = os.path.dirname(util.__file__)
        self.inv_cache = {}
        #loads the default empath categories 
        self.load(self.base_dir+"/data/categories.tsv")
        # loads the user-written categories 
        for f in os.listdir(self.base_dir+"/data/user/"):
            if len(f.split(".")) > 1 and f.split(".")[1] == "empath":
                self.load(self.base_dir+"/data/user/"+f)
    def load(self,file):
        with open(file,"r") as f:
            for line in f:
                cols = line.strip().split("\t")
                name = cols[0]
                terms = cols[1:]
                for t in set(terms):
                    self.cats[name].append(t)
                    #self.invcats[t].append(name)
    def analyze_term_window(self,doc,targets,categories=None,window_size=10,normalize=False):
        tokenizer = util.window_tokenizer(window_size,targets)
        return self.analyze(doc,categories,tokenizer,normalize)
    def analyze(self,doc,categories=None,tokenizer="default",normalize=False):
        if isinstance(doc,list):
            doc = "\n".join(doc)
        if tokenizer == "default":
            tokenizer = util.default_tokenizer
        elif tokenizer == "bigrams":
            tokenizer = util.bigram_tokenizer
        if not hasattr(tokenizer,"__call__"):
            raise Exception("invalid tokenizer")
        if not categories:
            categories = self.cats.keys()
        invcats = defaultdict(list)
        key = tuple(sorted(categories))
        if key in self.inv_cache:
            invcats = self.inv_cache[key]
        else:
            for k in categories:
                for t in self.cats[k]: invcats[t].append(k)
            self.inv_cache[key] = invcats
        count = {}
        tokens = 0.0
        for cat in categories: count[cat] = 0.0
        for tk in tokenizer(doc):
            tokens += 1.0
            for cat in invcats[tk]:
                count[cat]+=1.0
        if normalize:
            for cat in count.keys():
                if tokens == 0:
                    return None
                else:
                    count[cat] = count[cat] / tokens
        return count
    def create_category(self,name,seeds,model="fiction",size=100,write=True):
        resp = requests.post(self.backend_url + "/create_category", json={"terms":seeds,"size":size,"model":model})
        #return(resp.text)
        print(resp.text)
        results = json.loads(resp.text)
        self.cats[name] = list(set(results))
        if write:
            with open(self.base_dir+"/data/user/"+name+".empath","w") as f:
                f.write("\t".join([name]+results))
    def delete_category(self,name):
        if name in self.cats: del self.cats[name]
        filename = self.base_dir+"/data/user/"+name+".empath"
        if os.path.isfile(filename):
            os.remove(filename)

In [3]:
#print(word)

In [3]:
# wordlist is created with the empath categories 
lexicon = Empath()
#print(lexicon.base_dir)
emp = lexicon.analyze("Testing", normalize=True)
#print(emp)
#print(emp.items)
wordlist = []
# Loop that deletes the extra 2's that overwrote the original file 
for word, value in emp.items():
    if word[-1] != '2': 
        wordlist.append(word)
    else: 
        #print("okay")
        lexicon.delete_category(word)
#text_abb = lexicon.create_category("text_abbreviations",["lol","ttyl","brb"], model="reddit")
#wordlist.append(text_abb)
print(wordlist)

['help', 'office', 'dance', 'money', 'wedding', 'domestic_work', 'sleep', 'medical_emergency', 'cold', 'hate', 'cheerfulness', 'aggression', 'occupation', 'envy', 'anticipation', 'family', 'vacation', 'crime', 'attractive', 'masculine', 'prison', 'health', 'pride', 'dispute', 'nervousness', 'government', 'weakness', 'horror', 'swearing_terms', 'leisure', 'suffering', 'royalty', 'wealthy', 'tourism', 'furniture', 'school', 'magic', 'beach', 'journalism', 'morning', 'banking', 'social_media', 'exercise', 'night', 'kill', 'blue_collar_job', 'art', 'ridicule', 'play', 'computer', 'college', 'optimism', 'stealing', 'real_estate', 'home', 'divine', 'sexual', 'fear', 'irritability', 'superhero', 'business', 'driving', 'pet', 'childish', 'cooking', 'exasperation', 'religion', 'hipster', 'internet', 'surprise', 'reading', 'worship', 'leader', 'independence', 'movement', 'body', 'noise', 'eating', 'medieval', 'zest', 'confusion', 'water', 'sports', 'death', 'healing', 'legend', 'heroic', 'celebr

In [5]:
#print(lexicon.cats["help"])

In [6]:
#print(len(lexicon.cats["help2"]))

In [7]:
lexicon.cats.keys()

dict_keys(['help', 'office', 'dance', 'money', 'wedding', 'domestic_work', 'sleep', 'medical_emergency', 'cold', 'hate', 'cheerfulness', 'aggression', 'occupation', 'envy', 'anticipation', 'family', 'vacation', 'crime', 'attractive', 'masculine', 'prison', 'health', 'pride', 'dispute', 'nervousness', 'government', 'weakness', 'horror', 'swearing_terms', 'leisure', 'suffering', 'royalty', 'wealthy', 'tourism', 'furniture', 'school', 'magic', 'beach', 'journalism', 'morning', 'banking', 'social_media', 'exercise', 'night', 'kill', 'blue_collar_job', 'art', 'ridicule', 'play', 'computer', 'college', 'optimism', 'stealing', 'real_estate', 'home', 'divine', 'sexual', 'fear', 'irritability', 'superhero', 'business', 'driving', 'pet', 'childish', 'cooking', 'exasperation', 'religion', 'hipster', 'internet', 'surprise', 'reading', 'worship', 'leader', 'independence', 'movement', 'body', 'noise', 'eating', 'medieval', 'zest', 'confusion', 'water', 'sports', 'death', 'healing', 'legend', 'heroic

In [5]:
# creates "wordsToUse" which are seed words for each category 

# Open the category file 
#catFile = open("/Users/mahumshah/Downloads/categories1.tsv", "r")
catFile = open("categories1.tsv", "r")
# Outer for loop
# variable to store the seed words that I created 
wordsToUse = []
j = 0 
#print(wordlist)
#categoryList = wordlist 
#print(categoryList)
# loops through each line in the file 
for line in catFile: 
    categoryList = line.strip("\n").split("\t")
    categoryName = categoryList[0]
    # Inner for loop
    i = 0
    seedIndex = 0
    # loops through each item in the categoryList
    for categoryItem in categoryList: 
        # if categoryname is the same as the category item, then the index is stored in seedIndex
        if categoryName == categoryItem: 
            seedIndex = i
        # Edited categories to obtain better seed words 
        if categoryName == "children": 
            #categoryItem == "children"
            categoryName = "child"
            #categoryItem = "child"
        if categoryName == "social_media": 
            categoryName = "multimedia"
        if categoryName == "real_estate": 
            categoryName = "estate"
        if categoryName == "air_travel": 
            categoryName = "fly"
        if categoryName == "competing": 
            categoryName = "competition" 
        i = i + 1
# Access the surrounding words of the category name (5 seed words)
    # when category name doesn't exist 
    if seedIndex == 0: 
        wordsToUse.append(categoryList[seedIndex + 1: seedIndex + 6])
        #print(categoryName)
    # when categoryname is at the end of the list 
    elif seedIndex == len(categoryList) - 1 or seedIndex == len(categoryList) - 2: 
        wordsToUse.append(categoryList[seedIndex - 4: seedIndex + 1])
        #print(categoryName)
    # when categoryname is in position1 or position2 of list 
    elif seedIndex == 1 or seedIndex == 2: 
        wordsToUse.append(categoryList[seedIndex: seedIndex + 5])
        #print(categoryName)
    else: 
        wordsToUse.append(categoryList[seedIndex-2 : seedIndex + 3])
            #print(categoryName)
    #print(wordsToUse)
    #print(categoryName)
    #print(categoryList)
# appends the number 2 to each element in the category list (to differentiate from original wordlist)
myCategories = [str(x) + "2" for x in wordlist]
print(myCategories)
#print(categoryList)
# testing 
#print(wordsToUse[0:5])
#print(wordlist)

['help2', 'office2', 'dance2', 'money2', 'wedding2', 'domestic_work2', 'sleep2', 'medical_emergency2', 'cold2', 'hate2', 'cheerfulness2', 'aggression2', 'occupation2', 'envy2', 'anticipation2', 'family2', 'vacation2', 'crime2', 'attractive2', 'masculine2', 'prison2', 'health2', 'pride2', 'dispute2', 'nervousness2', 'government2', 'weakness2', 'horror2', 'swearing_terms2', 'leisure2', 'suffering2', 'royalty2', 'wealthy2', 'tourism2', 'furniture2', 'school2', 'magic2', 'beach2', 'journalism2', 'morning2', 'banking2', 'social_media2', 'exercise2', 'night2', 'kill2', 'blue_collar_job2', 'art2', 'ridicule2', 'play2', 'computer2', 'college2', 'optimism2', 'stealing2', 'real_estate2', 'home2', 'divine2', 'sexual2', 'fear2', 'irritability2', 'superhero2', 'business2', 'driving2', 'pet2', 'childish2', 'cooking2', 'exasperation2', 'religion2', 'hipster2', 'internet2', 'surprise2', 'reading2', 'worship2', 'leader2', 'independence2', 'movement2', 'body2', 'noise2', 'eating2', 'medieval2', 'zest2',

In [9]:
#print(len(lexicon.cats["party2"]))

In [10]:
#print(len(lexicon.cats["help2"]))

In [11]:
#print(len(lexicon.cats["party"]))

In [12]:
#print(myCategories[0])

In [6]:
# Create own categories (using reddit model)
# make sure that myCategories links to the correct words
myZip_object = zip(myCategories, wordsToUse)
#print(list(myZip_object))
for wlist, wordsused in list(myZip_object):
    #if len(lexicon.cats[wlist]) > 0: 
        #lexicon.delete_category(wlist)
    # create the categories 
    #print("testing")
    lexicon.create_category(wlist, wordsused, model = "reddit", size = 100, write = True)
    #print(wlist)
    #print(myCategories)
    #print(wordsused)
#print(list((myZip_object))
#call help2, and make sure it prints out the proper words (help 2 may be empty)

["thankful", "grateful", "helping", "resent", "appreciative", "SMOKE_WEED_EVERYDAY", "family", "loved", "knowing", "nagged", "last_thing", "unhappy", "family/friends", "happy", "it-", "caring", "hired_help", "new_mother", "I/we", "comforting", "obligated", "alone", "huge_relief", "do-", "fortunate", "greatful", "constant_reminder", "stressed", "caretaker", "huge_sacrifice", "special_effort", "just_the_right_thing", "selfishly", "husband", "neglected", "right_thing", "okay", "very_last_thing", "her/him", "that-", "pampered", "alone", "wife", "kind_thing", "nice_gesture", "pamper", "good_wife", "nagging", "own_family", "decent_thing", "wished", "selflessly", "good_thing", "miserable", "kind_person", "resentful", "good_husband", "son/daughter", "involved", "can't/won't", "responsibility", "obliged", "doing", "him-", "neglecting", "it--", "burdened", "extra_mile", "dread", "friends/family", "way-", "wanting", "pampering", "good_reminder", "better", "just_you", "stressful", "house_chores", 

In [10]:
#list1 = lexicon.create_category("sad", ["sad"], model = "nytimes")

In [15]:
print(len(lexicon.cats["help2"]))

89


In [16]:
#print(len(lexicon.cats["help2"]))

In [17]:
print(lexicon.cats["children2"])

['real_parents', 'mothers', 'innocent_baby', 'real_father', 'unborn', 'daughter', 'childs', 'baby_sister', 'granddaughter', 'poor_mother', 'orphan', 'sibling', 'biological_mother', 'grandchild', 'sick_fuck', 'own_son', 'infant', 'parent', 'nine_year_old', 'own_wife', 'newborn_infant', 'horrible_thing', 'innocent_child', 'own_kid', 'child', 'own_child', 'rape_baby', 'unborn_baby', 'children', 'little_girl', 'loving_father', 'actual_child', '_child', 'dead_child', 'stillborn', 'other_children', 'poor_child', 'aborting', 'unborn_children', 'newborn_baby', 'molesting', 'mother/father', 'pregnant_mother', 'shitty_parent', 'rapist', 'child*.', 'own_baby', 'own_children', 'good_mother', 'own_mother', 'own_sister', 'other_child', 'abusive_husband', 'sperm_donor', 'sicko', 'father', "else's_child", 'real_mother', 'unfit_mother', 'poor_boy', 'horrible_parent', 'loving_mother', 'unborn_child', 'crazy_woman', 'unwanted_child', 'infant_child', 'own_daughter', 'baby', 'little_angel', 'custody', 'bad

In [18]:
#print(len(lexicon.cats["help2"]))

In [7]:
print(lexicon.cats["help2"])

['right_thing', 'obliged', 'good_reminder', 'good_thing', 'son/daughter', 'predicament', 'nice_gesture', 'wished', 'wanting', 'extra_mile', 'family/friends', 'thankful', 'pampered', 'dread', 'huge_sacrifice', 'appreciative', 'pampering', 'bother', 'committed', 'knowing', 'fortunate', 'alone', 'good_husband', 'responsibilty', 'just_you', 'decent_thing', 'happy', 'selflessly', 'nagging', 'do-', 'other_people', 'resent', 'grateful', 'better', 'stressed', 'loved', 'husband', 'way-', 'that-', 'SMOKE_WEED_EVERYDAY', 'special_effort', 'him-', 'pamper', 'comforting', 'her/him', 'unhappy', 'friends/family', 'necessary_things', 'selfishly', 'resentful', 'I/we', 'new_mother', 'very_last_thing', 'it--', 'burdened', 'own', 'kind_thing', 'daily_chores', "can't/won't", 'greatful', 'doing', 'good_wife', 'assuring', 'decent_person', 'well', 'nagged', 'helping', 'neglecting', 'stressful', 'wife', 'own_family', 'involved', 'last_thing', 'neglected', 'responsibility', 'kind_person', 'okay', 'obligated', '

In [20]:
print(lexicon.cats["help"])

['tend', 'helper', 'trust', 'assistance', 'protection', 'stabilize', 'cooperate', 'aide', 'encouragement', 'guide', 'prepare', 'housekeeping', 'crutch', 'aid', 'volunteer', 'servant', 'rely', 'serve', 'supportive', 'advisor', 'rescue', 'hospitality', 'assist', 'oversee', 'generously', 'counsel', 'entrust', 'honor', 'escort', 'advise', 'offer', 'provide', 'protect', 'promote', 'kindly', 'request', 'helpful', 'temporary', 'benefit', 'wheelchair', 'help', 'encourage', 'patient', 'obligation', 'duty', 'carry', 'friend', 'treat', 'nurse', 'support', 'housework', 'grateful', 'nursing', 'maid', 'financial', 'chore', 'favor', 'responsible', 'him-', 'bother', 'obliged', 'happy', 'pamper', 'husband', 'thankful', 'good_wife', 'resent', 'knowing', 'well', 'selflessly', 'okay', 'loved', 'special_effort', 'do-', 'huge_relief', 'house_chores', 'predicament', 'constant_reminder', 'family/friends', 'committed', 'just_you', 'her/him', 'kind_person', 'selfishly', 'other_people', 'miserable', 'family', 't

In [21]:
#lexicon.create_category("help",["chore", "responsible","help","grateful", "maid"], model = "reddit")

In [22]:
#print(set(myZip_object))

In [23]:
#print(myCategories)

In [24]:
#print(wordsToUse)

In [25]:
#print(word)

In [27]:
print(wordlist)

['help', 'office', 'dance', 'money', 'wedding', 'domestic_work', 'sleep', 'medical_emergency', 'cold', 'hate', 'cheerfulness', 'aggression', 'occupation', 'envy', 'anticipation', 'family', 'vacation', 'crime', 'attractive', 'masculine', 'prison', 'health', 'pride', 'dispute', 'nervousness', 'government', 'weakness', 'horror', 'swearing_terms', 'leisure', 'suffering', 'royalty', 'wealthy', 'tourism', 'furniture', 'school', 'magic', 'beach', 'journalism', 'morning', 'banking', 'social_media', 'exercise', 'night', 'kill', 'blue_collar_job', 'art', 'ridicule', 'play', 'computer', 'college', 'optimism', 'stealing', 'real_estate', 'home', 'divine', 'sexual', 'fear', 'irritability', 'superhero', 'business', 'driving', 'pet', 'childish', 'cooking', 'exasperation', 'religion', 'hipster', 'internet', 'surprise', 'reading', 'worship', 'leader', 'independence', 'movement', 'body', 'noise', 'eating', 'medieval', 'zest', 'confusion', 'water', 'sports', 'death', 'healing', 'legend', 'heroic', 'celebr

In [8]:
print(myCategories)

['help2', 'office2', 'dance2', 'money2', 'wedding2', 'domestic_work2', 'sleep2', 'medical_emergency2', 'cold2', 'hate2', 'cheerfulness2', 'aggression2', 'occupation2', 'envy2', 'anticipation2', 'family2', 'vacation2', 'crime2', 'attractive2', 'masculine2', 'prison2', 'health2', 'pride2', 'dispute2', 'nervousness2', 'government2', 'weakness2', 'horror2', 'swearing_terms2', 'leisure2', 'suffering2', 'royalty2', 'wealthy2', 'tourism2', 'furniture2', 'school2', 'magic2', 'beach2', 'journalism2', 'morning2', 'banking2', 'social_media2', 'exercise2', 'night2', 'kill2', 'blue_collar_job2', 'art2', 'ridicule2', 'play2', 'computer2', 'college2', 'optimism2', 'stealing2', 'real_estate2', 'home2', 'divine2', 'sexual2', 'fear2', 'irritability2', 'superhero2', 'business2', 'driving2', 'pet2', 'childish2', 'cooking2', 'exasperation2', 'religion2', 'hipster2', 'internet2', 'surprise2', 'reading2', 'worship2', 'leader2', 'independence2', 'movement2', 'body2', 'noise2', 'eating2', 'medieval2', 'zest2',

In [30]:
# test run
print(myCategories[0], wordsToUse[0])

help2 ['chore', 'responsible', 'help', 'grateful', 'maid']


In [36]:
lexicon.cats 

defaultdict(list,
            {'help': ['tend',
              'helper',
              'trust',
              'assistance',
              'protection',
              'stabilize',
              'cooperate',
              'aide',
              'encouragement',
              'guide',
              'prepare',
              'housekeeping',
              'crutch',
              'aid',
              'volunteer',
              'servant',
              'rely',
              'serve',
              'supportive',
              'advisor',
              'rescue',
              'hospitality',
              'assist',
              'oversee',
              'generously',
              'counsel',
              'entrust',
              'honor',
              'escort',
              'advise',
              'offer',
              'provide',
              'protect',
              'promote',
              'kindly',
              'request',
              'helpful',
              'temporary',
              'bene

In [37]:
# testing my categories 
#lexicon.analyze("I need your help", categories = myCategories, normalize = True)

In [48]:
x = lexicon.cats["help"]
print(x)

['tend', 'helper', 'trust', 'assistance', 'protection', 'stabilize', 'cooperate', 'aide', 'encouragement', 'guide', 'prepare', 'housekeeping', 'crutch', 'aid', 'volunteer', 'servant', 'rely', 'serve', 'supportive', 'advisor', 'rescue', 'hospitality', 'assist', 'oversee', 'generously', 'counsel', 'entrust', 'honor', 'escort', 'advise', 'offer', 'provide', 'protect', 'promote', 'kindly', 'request', 'helpful', 'temporary', 'benefit', 'wheelchair', 'help', 'encourage', 'patient', 'obligation', 'duty', 'carry', 'friend', 'treat', 'nurse', 'support', 'housework', 'grateful', 'nursing', 'maid', 'financial', 'chore', 'favor', 'responsible', 'him-', 'bother', 'obliged', 'happy', 'pamper', 'husband', 'thankful', 'good_wife', 'resent', 'knowing', 'well', 'selflessly', 'okay', 'loved', 'special_effort', 'do-', 'huge_relief', 'house_chores', 'predicament', 'constant_reminder', 'family/friends', 'committed', 'just_you', 'her/him', 'kind_person', 'selfishly', 'other_people', 'miserable', 'family', 't

In [49]:
lexicon.cats.keys()

dict_keys(['help', 'office', 'dance', 'money', 'wedding', 'domestic_work', 'sleep', 'medical_emergency', 'cold', 'hate', 'cheerfulness', 'aggression', 'occupation', 'envy', 'anticipation', 'family', 'vacation', 'crime', 'attractive', 'masculine', 'prison', 'health', 'pride', 'dispute', 'nervousness', 'government', 'weakness', 'horror', 'swearing_terms', 'leisure', 'suffering', 'royalty', 'wealthy', 'tourism', 'furniture', 'school', 'magic', 'beach', 'journalism', 'morning', 'banking', 'social_media', 'exercise', 'night', 'kill', 'blue_collar_job', 'art', 'ridicule', 'play', 'computer', 'college', 'optimism', 'stealing', 'real_estate', 'home', 'divine', 'sexual', 'fear', 'irritability', 'superhero', 'business', 'driving', 'pet', 'childish', 'cooking', 'exasperation', 'religion', 'hipster', 'internet', 'surprise', 'reading', 'worship', 'leader', 'independence', 'movement', 'body', 'noise', 'eating', 'medieval', 'zest', 'confusion', 'water', 'sports', 'death', 'healing', 'legend', 'heroic

In [9]:
# Reading the text message data frame 
import pandas as pd
import re
# read the data frame 
dataFrame = pd.read_csv("sent14days.csv")


# obtain ID, texts, and scores from dataframe
dataFrame = dataFrame[['id', 'body2', 'scores']]

# Group data frame by participant ID 
def sortList(inList):
    newList = "".join(inList)
    return newList

def scoreList(inList): 
    #print(list(inList))
    if len(inList) == 0: 
        return None
    else: 
        return list(inList)[0]
    
# create a new column to count # of text messages 
newCol = dataFrame[['id', 'body2']].groupby(by = "id").agg(len)
print(newCol)
# Group by ID
dataFrame = dataFrame.groupby(by = "id").agg({"body2":sortList, "scores": scoreList})
dataFrame['NumTexts'] = newCol
# only displays participants with 2 or more text messages
dataFrame = dataFrame[dataFrame['NumTexts'] >= 2]
#dataFrame.head()

       body2
id          
e122       2
e1526      8
e2806    142
e3702     58
e433       3
...      ...
m9751      5
m9886     60
m9928      5
m9968      4
m9984    244

[129 rows x 1 columns]


In [10]:
#generate csv files for my features for empath's categories 
# Analyzing empath categories
pDFt = dataFrame
for word in wordlist:
    #print(word)
    pctt = []
    for i in range(0, pDFt.shape[0]):
        #print("good")
        content = re.sub(r'[^\w\s]', '', str(pDFt.body2[i]).lower())
        lexicon = Empath()
        #print(content)
        emp = lexicon.analyze(content, categories = [word], normalize = True)
        #print(emp)
        if emp != None:
            for key, value in emp.items(): 
                pctt.append(value)
        else:
            pctt.append(0)
    pDFt[word] = pctt
pDFt.head()
pDFt.to_csv("ogEmpath.csv")

KeyboardInterrupt: 

In [None]:
# Analyze my created categories

pDFt = dataFrame
# renaming body2 to text (so that the category body2 doesn't get confused)
pDFt.rename(columns = {'body2':'text'}, inplace = True)
print(pDFt.rename)
#print(pDFt.head())

# loops through my created categories 
for word in myCategories:
    #print(word)
    pctt = []
    for i in range(0, pDFt.shape[0]):
        #print("good")
        content = re.sub(r'[^\w\s]', '', str(pDFt.text[i]).lower())
        lexicon = Empath()
        #print(content)
        emp = lexicon.analyze(content, categories = [word], normalize = True)
        #print(emp)
        if emp != None:
            for key, value in emp.items():
                pctt.append(value)
        else:
            pctt.append(0)
    pDFt[word] = pctt
#pDFt.head()
pDFt.to_csv("myEmpath.csv")

In [12]:
lexicon.analyze("The loneliness I've been feeling is heartbreaking", categories= ["sadness"])

{'sadness': 5.0}

In [13]:
lexicon.analyze("The loneliness I've been feeling is heartbreaking", categories= ["sadness2"])

{'sadness2': 2.0}