In [2]:
import spacy
import numpy as np
import matplotlib.pyplot as plt
import glob
import re
from collections import Counter
from collections import defaultdict
import numpy as np
import matplotlib.pyplot as plt

import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from analysis_code.analyze_corpus import *

In [3]:
Directory = '/home/dashi/lulu/corpus/NRC-Emotion-Lexicon-v0.92/'
# Each line has the following format:
# <term>--<NearSynonyms><tab><AffectCategory><tab><AssociationFlag>
# <term> is a word for which emotion associations are provided;
# <NearSynonyms> is a set of one to three comma-separated words that indicate the sense of the <term>. The affect annotations are for this sense of the term.
# <AffectCategory> is one of eight emotions (anger, fear, anticipation, trust, surprise, sadness, joy, or disgust) or one of two polarities (negative or positive);
# <AssociationFlag> has one of two possible values: 0 or 1. 0 indicates that the target word has no association with affect category, whereas 1 indicates an association.
WordSenseFile = 'NRC-Emotion-Lexicon-Senselevel-v0.92.txt'
# Each line has the following format:
# <term><tab><AffectCategory><tab><AssociationFlag>
# <term> is a word for which emotion associations are provided;
# <AffectCategory> is one of eight emotions (anger, fear, anticipation, trust, surprise, sadness, joy, or disgust) or one of two polarities (negative or positive);
# <AssociationFlag> has one of two possible values: 0 or 1. 0 indicates that the target word has no association with affect category, whereas 1 indicates an association.
WordFile = 'NRC-Emotion-Lexicon-Wordlevel-v0.92.txt'

outFile ='Generated-EmotionWords.txt'

In [34]:
def processSenseFile(filename):
    '''
    Create a list of emotion words which has non-zero polarity
    input : sense file
    return : set of emotion words
    '''
    emotionWords = set([])
    with open(filename, 'r', errors='replace') as f:
        line = f.readline()
        while line:
            words = line.strip().split()
            if len(words) == 5: 
                category = words[3]
                sentiment = words[4]
                word = re.split(r'(\W+)', words[0])[0]
                if (word == 'paraphrase'): 
                    print(words)
                if ('positive' == category or 'negative' == category) and ('1' == sentiment):
                    emotionWords.add(word)
            line = f.readline()            
    return emotionWords

def processMoreSenseFile(filename):
    '''
    Create a list of emotion words which has non-zero polarity
    input : sense file
    return : dictionary of emotion word : emotions, scores
    '''
    emotionScores = {}
    with open(filename, 'r', errors='replace') as f:
        i = 1
        line = f.readline()
        count = 0
        while line: 
            words = line.strip().split()
            word = re.split(r'(\W+)', words[0])[0]
            sentiment = words[-1]
            category = words[-2]
            
            if ('positive' == category or 'negative' == category) and ('1' == sentiment):
                emotionScores[word] = 0
            elif ('positive' != category and 'negative' != category) and ('1' == sentiment):
                count += 1
            if i % 8 == 0: 
                if word in emotionScores: 
                    emotionScores[word] = count
                count = 0
            line = f.readline()
            i += 1 
    return emotionScores

def processWordFile(filename):
    '''
    input : word file 
    return : set of emotion words
    '''
    emotionWords = set([])
    with open(filename, 'r', errors='replace') as f:
        line = f.readline()
        while line:
            words = line.strip().split()
            if len(words) == 3: 
                word = words[0]
                category = words[1]
                sentiment = words[2]
                if ('positive' == category or 'negative' == category) and ('1' == sentiment):
                    emotionWords.add(word)
            line = f.readline()            
    return emotionWords


def generateSentimentScore(filename):
    '''
    input : sense file 
    return : set of emotion words
    '''
    emotionWords = {}
    with open(filename, 'r', errors='replace') as f:
        line = f.readline()
        i = 1 
        numcat = 0
        polarity = 0
        word = ''
        while line:
            # new word
            if (i % 10 == 0): 
                emotionWords = (word, numcat, polarity)
                numcat = 0
                polarity = 0
                    
            words = line.strip().split()                
            if len(words) == 5: 
                category = words[3]
                sentiment = words[4]
                word = re.split(r'(\W+)', words[0])[0]
                if ('1' == sentiment):
                    if ('positive' == category or 'negative' == category):
                        polarity = 1
                    else: 
                        numcat += 1 
            line = f.readline()  
            i += 1
    return emotionWords

# def filterScore(emotionWords):
#     result = {}
#     for e in emotionWords: 
#         word = e[0] 
#         numcat = e[1]
#         polarity = e[2]
#         if (polarity != 0): 
#             result.add(word)
#     return {k:v for k, v in emotionWords.items() if v[1] == 1}

In [33]:
'''
Create emotion word set, no associations, feelings, or sentiment categories
'''
# file1 = Directory + WordSenseFile
words = processSenseFile(Directory + 'NRC-example_1.txt')
print(words)

{'guide', 'harm'}


In [7]:
'''
Create emotion word set, no associations, feelings, or sentiment categories
'''
words = processWordFile(Directory + 'NRC-example_2.txt')
print(words)

{'abandon'}


In [35]:
senseWords = processSenseFile(Directory + WordSenseFile)
print('number of emotion words : ' + str(len(senseWords)))
print('first 10 words : ' + str(list(senseWords)[0:10]))

['paraphrase--copy,', 'burlesque,', 'travesty', 'fear', '0']
['paraphrase--copy,', 'burlesque,', 'travesty', 'anger', '0']
['paraphrase--copy,', 'burlesque,', 'travesty', 'anticip', '0']
['paraphrase--copy,', 'burlesque,', 'travesty', 'trust', '0']
['paraphrase--copy,', 'burlesque,', 'travesty', 'surprise', '0']
['paraphrase--copy,', 'burlesque,', 'travesty', 'positive', '0']
['paraphrase--copy,', 'burlesque,', 'travesty', 'negative', '0']
['paraphrase--copy,', 'burlesque,', 'travesty', 'sadness', '0']
['paraphrase--copy,', 'burlesque,', 'travesty', 'disgust', '0']
['paraphrase--copy,', 'burlesque,', 'travesty', 'joy', '0']
['paraphrase--interpretation,', 'equivalent,', 'synonym', 'fear', '0']
['paraphrase--interpretation,', 'equivalent,', 'synonym', 'anger', '0']
['paraphrase--interpretation,', 'equivalent,', 'synonym', 'anticip', '0']
['paraphrase--interpretation,', 'equivalent,', 'synonym', 'trust', '0']
['paraphrase--interpretation,', 'equivalent,', 'synonym', 'surprise', '0']
['pa

In [9]:
wordWords = processWordFile(Directory + WordFile)
print('number of emotion words : ' + str(len(wordWords)))
print('first 10 words : ' + str(list(wordWords)[0:10]))

number of emotion words : 5555
first 10 words : ['unique', 'vengeful', 'amusing', 'pay', 'informer', 'plight', 'overpaid', 'income', 'eagerness', 'knotted']


In [10]:
len(senseWords & wordWords)

1894

In [11]:
'''
Use sense words only
'''
with open(Directory + outFile, "w") as outfile:
    for w in senseWords:
        outfile.write(w + '\n') 
outfile.close()
    

In [12]:
senseScore = generateSentimentScore(Directory + WordSenseFile)


In [13]:
senseScore

{'paraphrase': (0, 0),
 'porosity': (0, 0),
 'invaluable': (0, 0),
 'virulence': (2, 1),
 'complexity': (0, 1),
 'chapter': (0, 0),
 'untranslated': (0, 0),
 'swell': (0, 0),
 'presumption': (1, 0),
 'hood': (0, 0),
 'tolerate': (2, 1),
 'prove': (0, 1),
 'saving': (0, 0),
 'aggressor': (2, 1),
 'theism': (1, 1),
 'sphere': (0, 0),
 'delayed': (0, 1),
 'backfire': (0, 0),
 'curtain': (0, 0),
 'weird': (0, 0),
 'innovate': (0, 1),
 'act': (0, 0),
 'resort': (0, 0),
 'void': (0, 0),
 'disregarded': (1, 1),
 'fair': (0, 1),
 'press': (0, 0),
 'pronounce': (0, 0),
 'hate': (3, 1),
 'gob': (1, 0),
 'horticultural': (0, 0),
 'organization': (0, 1),
 'foolish': (0, 0),
 'vegetarianism': (0, 0),
 'association': (0, 0),
 'phony': (0, 1),
 'port': (0, 0),
 'vista': (0, 0),
 'dermatology': (0, 0),
 'tribulation': (2, 1),
 'spill': (0, 0),
 'sway': (0, 0),
 'failing': (3, 1),
 'economics': (0, 0),
 'gills': (0, 0),
 'politics': (0, 0),
 'forearm': (2, 0),
 'overtaken': (0, 0),
 'estuary': (0, 0),


In [25]:
len(senseScore)

4819

In [32]:
senseScore

{'paraphrase': (0, 0),
 'porosity': (0, 0),
 'invaluable': (0, 0),
 'virulence': (2, 1),
 'complexity': (0, 1),
 'chapter': (0, 0),
 'untranslated': (0, 0),
 'swell': (0, 0),
 'presumption': (1, 0),
 'hood': (0, 0),
 'tolerate': (2, 1),
 'prove': (0, 1),
 'saving': (0, 0),
 'aggressor': (2, 1),
 'theism': (1, 1),
 'sphere': (0, 0),
 'delayed': (0, 1),
 'backfire': (0, 0),
 'curtain': (0, 0),
 'weird': (0, 0),
 'innovate': (0, 1),
 'act': (0, 0),
 'resort': (0, 0),
 'void': (0, 0),
 'disregarded': (1, 1),
 'fair': (0, 1),
 'press': (0, 0),
 'pronounce': (0, 0),
 'hate': (3, 1),
 'gob': (1, 0),
 'horticultural': (0, 0),
 'organization': (0, 1),
 'foolish': (0, 0),
 'vegetarianism': (0, 0),
 'association': (0, 0),
 'phony': (0, 1),
 'port': (0, 0),
 'vista': (0, 0),
 'dermatology': (0, 0),
 'tribulation': (2, 1),
 'spill': (0, 0),
 'sway': (0, 0),
 'failing': (3, 1),
 'economics': (0, 0),
 'gills': (0, 0),
 'politics': (0, 0),
 'forearm': (2, 0),
 'overtaken': (0, 0),
 'estuary': (0, 0),


In [26]:
words = filterScore(senseScore)

In [27]:
len(words)

1172

In [28]:
words

{'receiving': (2, 1),
 'tribute': (0, 1),
 'revive': (0, 1),
 'aggressor': (2, 1),
 'complexity': (0, 1),
 'absentee': (1, 1),
 'providing': (0, 1),
 'dexterity': (0, 1),
 'staring': (0, 1),
 'banish': (0, 1),
 'tolerate': (2, 1),
 'prove': (0, 1),
 'anaconda': (2, 1),
 'virulence': (2, 1),
 'theism': (1, 1),
 'heroism': (3, 1),
 'expel': (4, 1),
 'equilibrium': (0, 1),
 'delayed': (0, 1),
 'dust': (0, 1),
 'innovate': (0, 1),
 'champion': (2, 1),
 'kitten': (1, 1),
 'acquiring': (1, 1),
 'lacking': (0, 1),
 'prohibition': (0, 1),
 'disregarded': (1, 1),
 'fair': (0, 1),
 'wot': (1, 1),
 'refusal': (0, 1),
 'vegetative': (2, 1),
 'renunciation': (0, 1),
 'graciously': (0, 1),
 'cute': (0, 1),
 'nutritious': (1, 1),
 'organization': (0, 1),
 'dispute': (1, 1),
 'foreclose': (2, 1),
 'expatriate': (0, 1),
 'phony': (0, 1),
 'acknowledgment': (0, 1),
 'vindicated': (0, 1),
 'minimize': (0, 1),
 'picketing': (1, 1),
 'tribulation': (2, 1),
 'chivalry': (0, 1),
 'accredited': (1, 1),
 'excl

In [31]:
senseWords

{'paraphrase',
 'virulence',
 'aggressor',
 'complexity',
 'tolerate',
 'prove',
 'erratum',
 'theism',
 'delayed',
 'innovate',
 'fasting',
 'champion',
 'disregarded',
 'fair',
 'intellectual',
 'organization',
 'foolish',
 'disorder',
 'phony',
 'resign',
 'tribulation',
 'failing',
 'ken',
 'homosexuality',
 'butt',
 'usual',
 'uninitiated',
 'relief',
 'compliance',
 'unimproved',
 'academy',
 'fell',
 'untoward',
 'dictatorial',
 'monster',
 'restriction',
 'danger',
 'critic',
 'irregularity',
 'apologize',
 'lurking',
 'opiate',
 'silly',
 'conscientious',
 'patience',
 'exile',
 'strengthening',
 'poke',
 'ancient',
 'absence',
 'suppress',
 'presentable',
 'distortion',
 'advise',
 'romp',
 'revise',
 'dogged',
 'recognizable',
 'trophy',
 'prestige',
 'doit',
 'reorganize',
 'contentious',
 'sacrifices',
 'eavesdropping',
 'treat',
 'sour',
 'helplessness',
 'thrift',
 'adjunct',
 'words',
 'undo',
 'admiration',
 'cyanide',
 'rail',
 'slaughtering',
 'prophylactic',
 'oblit