In [2]:
import pandas as pd
import ast
from EmotionAnalysis.SentSemanticModule import *
from EmotionAnalysis.SentTweetModule import *

## I. Training Word2Vec Model for Multiple Languages:

### 1. Loading Tokenized Lemmatized Representation before further cleaning:

In [8]:
# Loading Tokenized Lemmatized
tokenized_df = pd.read_csv('../../Results/Sample Affective Representation.csv',encoding="ISO-8859-1")

# Converting them into a list
tokenized_lemma = list(tokenized_df['Tokenized Lemmatized'])
tokenized_lemmatized_tweets = []
for i in range(0, len(tokenized_lemma)):
    result = ast.literal_eval(tokenized_lemma[i])
    tokenized_lemmatized_tweets.append(result)
tokenized_lemmatized_tweets[0]

[u'still', u'the', u'best', u'coffee', u'in', u'town', u'at', u'la', u'stanza']

### 2. Training and Fine-Tuning Word Embedding Model using gensim word2vec: 

In [10]:
# PARAMETERS TO BE TUNED:

# Word vector dimensionality                      
# Minimum word count                        
# Number of threads to run in parallel
# Context window size                                                                                    
# Downsample setting for frequent words

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO)

# Set values for various parameters
num_features = 300    # Word vector dimensionality
min_word_count = 4  # Minimum word count
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size
downsampling = 1e-3   # Downsample setting for frequent words

# Initialize and train the model (this will take some time)
from gensim.models import word2vec
print ("Training model...")
model = word2vec.Word2Vec(tokenized_lemmatized_tweets, workers=num_workers,
            size=num_features, min_count = min_word_count,
            window = context, sample = downsampling)

# If you don't plan to train the model any further, calling
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# It can be helpful to create a meaningful model name and
# save the model for later use. You can load it later using Word2Vec.load()
model_name = "../../Models/en_sample"
model.save(model_name)

Training model...


## II. Applying Rule Based Lexicon Approach Extended with Word2Vec Methodology:

### 1. Loading Tweets in their Affective Representation form:

In [14]:
nava_repr = list(tokenized_df['Nava Representation'])
# Convert nava_tweets 
nava_tweets = []
for i in range(0, len(nava_repr)):
    result = ast.literal_eval(nava_repr[i])
    nava_tweets.append(result)
nava_tweets[0]

[u'still', u'best', u'town']

### 2. Loading Language Lexicon (version containing set of representative words): 

In [16]:
lexicon_df = pd.read_csv('../../NRCLexicon/English/lexicon_nrc.csv',encoding='ISO-8859-1')
lexicon_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,cussed,conjure,foul,foul,tantalizing,fawn,conformance,scold,conjure,digit
1,foul,immature,scold,aggression,elegant,inadequacy,eligible,dissolution,originality,admiral
2,aggression,tantalizing,screaming,scold,buddy,foul,electricity,cytomegalovirus,dissolution,specialist
3,scold,buddy,hanging,dissolution,oasis,narcotic,originality,hanging,tantalizing,reporter
4,dissolution,oasis,loathing,screaming,symphony,conjuring,tantalizing,sterile,shriek,buddy


### 3. Loading Word2Vec Model (if it was already Pre-trained):

In [17]:
print ("Loading Word2Vec ....")
from gensim.models import word2vec
model = word2vec.Word2Vec.load('../../Models/en_sample')

Loading Word2Vec ....


In [19]:
def compute_matrix_sentences_list_word2vec(nava_words, nrc_lexicon,model):
    """

    :param word2vec model:
    :param nava_words: we can pass any version of the bag of words
    :param nrc_lexicon:
    :return:
    """

    sm_list = list_nrc_lexicon(nrc_lexicon)
    emotions = nrc_lexicon.columns.values
    matrix_sentences_list = []
    for i in range(0, len(nava_words)): # Iterate over all sentences
        " Initialize matrix for each sentence "
        w, h = len(nava_words[i]), 10
        matrix_sentence = [[0 for x in range(w)] for y in range(h)]
        k = 0
        for word in nava_words[i]: # Iterate over all words in the sentence
            j = 0
            for emotion in range(0, len(emotions)): # Iterate over all emotions => fill in the emotional vectors for all words
                total_similarity = 0
                for representative_word in sm_list[emotion]:
                    r = len(sm_list[emotion])
                    if word in model and representative_word in model:
                        total_similarity += model.similarity(word, representative_word)
                matrix_sentence[j][k] += total_similarity / r 
                j += 1 # increment index of representative words
            k += 1 # increment index of transcript words
        # append the matrix_sentence to the global list for all sentences
        matrix_sentences_list.append(matrix_sentence)
    return matrix_sentences_list

### 4. Computing Word Level Emotional Vectors:

In [20]:
###### STEP 4: Word Level
print ("Computing word level scores")
matrix_sentences_word2vec = compute_matrix_sentences_list_word2vec(nava_tweets,lexicon_df,model)

Computing word level scores


In [22]:
from __future__ import division

def compute_sentence_emotion_vectors(matrix_sentences_list):
    emotion_vector_list = []
    for i in range(0, len(matrix_sentences_list)):
        sum_sentence = []
        ids = [0,1,2,3,4,7,8,9]
        for j in ids: # for each emotion
            sum_words = 0
            for k in range(0, len(matrix_sentences_list[i][j])):
                sum_words += matrix_sentences_list[i][j][k]*1000
            r = len(matrix_sentences_list[i])
            if r != 0 :
                sum_words = sum_words / r # Arithmetic mean
            sum_sentence.append(sum_words)
        emotion_vector_list.append(sum_sentence)
    return emotion_vector_list

def compute_sentence_sentiment_vectors(matrix_sentences_list):
    emotion_vector_list = []
    for i in range(0, len(matrix_sentences_list)):
        sum_sentence = []
        ids = [5,6]
        for j in ids: # for each emotion
            sum_words = 0
            for k in range(0, len(matrix_sentences_list[i][j])):
                sum_words += matrix_sentences_list[i][j][k]*1000
            r = len(matrix_sentences_list[i])
            if r != 0 : 
                sum_words = sum_words / r
            sum_sentence.append(sum_words)
        emotion_vector_list.append(sum_sentence)
    return emotion_vector_list

def compute_emotionalities(sentence_vectors):
    emotionalities = []
    threshold = 0 # THRESHOLD PARAMETER TO BE FINE TUNED (0 for lexicon, 0.2 for pmi)
    for i in range(0,len(sentence_vectors)):
        sentence_vector = sentence_vectors[i]
        mylist = [0 if math.isnan(x) else x for x in sentence_vector]
        if (max(mylist) > threshold): #Threshold 
            emotionalities.append(sentence_vectors[i].index(max(mylist)))
        else: 
            emotionalities.append(8)
    return emotionalities

def compute_sentiments(sentence_vectors_sent,emotionalities):
    sentiments = []
    threshold = 0 # THRESHOLD PARAMETER TO BE FINE TUNED (0 for lexicon, 0.2 for pmi)
    for i in range(0,len(sentence_vectors_sent)):
        sentence_vector = sentence_vectors_sent[i]
        mylist = [0 if math.isnan(x) else x for x in sentence_vector]
        if (max(mylist) > threshold): #Threshold 
            sentiments.append(sentence_vectors_sent[i].index(max(mylist)))
        else:
            # To increase Recall, we also use emotionalities, in case a tweet is neutral
            if emotionalities[i] in [0,2,3,5]:
                sentiments.append(0) # Negative Emotion
            if emotionalities[i] in [1,4,6,7]:
                sentiments.append(1) # Positive Emotion
            if emotionalities[i] == 8:
                sentiments.append(2) # Otherwise, we just return Neutral
    return sentiments

### 5. Computing Tweet Level Emotionalities: 

In [31]:
print ("Computing Emotionalities ...")
# Computing vectors of emotional scores by averaging over the word emotion scores
sentence_vectors_word2vec = compute_sentence_emotion_vectors(matrix_sentences_word2vec)

print "\nEmotional Vectors >>>>"
print sentence_vectors_word2vec[0:5]
print "\n"
# Selecting dominant emotion using a specific threshold
emotionalities = compute_emotionalities(sentence_vectors_word2vec)

print "Dominant Emotions Ids >>>>"
print emotionalities[0:5]

Computing Emotionalities ...

Emotional Vectors >>>>
[[0.0, 0.0, 0.0, 0.0, 0.86966970083820316, 0.25169308940941049, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.57946677978576888, 0.16778532842800961, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.8695853898720538, 0.25174701455266491, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.20845536238516868, 0.058964488861077233, 0.0, 0.0]]


Dominant Emotions Ids >>>>
[4, 4, 8, 4, 4]


### 6. Computing Tweet Level Sentiments:

In [35]:
print ("Computing sentiments ...")

# Computing vectors of polarity scores by averaging over the word polarity scores
sentence_vectors_sent = compute_sentence_sentiment_vectors(matrix_sentences_word2vec)

print "\nSentiment Vectors >>>>"
print sentence_vectors_sent[0:2]
print "\n"

# Selecting dominant polarity (positive, negative) using a specific threshold
sentiments = compute_sentiments(sentence_vectors_sent,emotionalities)

print "Dominant Sentiments Ids >>>>"
print sentiments[0:2]

Computing sentiments ...

Sentiment Vectors >>>>
[[0.0, 0.0], [0.0, 0.0]]


Dominant Sentiments Ids >>>>
[1, 1]


In [36]:
###### Storing Emotion + Sentiment for each tweet in two lists to be used 
# for storing in dataframe:
emo_dict = {
    0: 'Anger',
    1: 'Anticipation',
    2: 'Disgust',
    3: 'Fear',
    4: 'Joy',
    5: 'Sadness',
    6: 'Surprise',
    7: 'Trust',
    8: 'Neutral'
}
sent_dict = {
    0: "Negative",
    1: "Positive",
    2: "Neutral"
}

emotions = []
senti = []
for i in range(0,len(emotionalities)):
    emotions.append(emo_dict[emotionalities[i]])
    senti.append(sent_dict[sentiments[i]])

### 7. Storing emotion, sentiment and their score vectors in dataframe:

In [37]:
print ("Storing in dataframe ... ")
word2vec_results_df = pd.DataFrame()

word2vec_results_df['Nava Tweet'] = nava_tweets

word2vec_results_df['Emotion Vectors'] = sentence_vectors_word2vec

word2vec_results_df['Emotion'] = emotions

word2vec_results_df['Sentiment Vectors'] = sentence_vectors_sent

word2vec_results_df['Sentiment'] = senti

word2vec_results_df.to_csv('../../Results/Sample Tweets Labelled Word2Vec.csv')
word2vec_results_df.head()

Storing in dataframe ... 


Unnamed: 0,Nava Tweet,Emotion Vectors,Emotion,Sentiment Vectors,Sentiment
0,"[still, best, town]","[0.0, 0.0, 0.0, 0.0, 0.869669700838, 0.2516930...",Joy,"[0.0, 0.0]",Positive
1,"[get, ready]","[0.0, 0.0, 0.0, 0.0, 0.579466779786, 0.1677853...",Joy,"[0.0, 0.0]",Positive
2,[],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",Neutral,"[0.0, 0.0]",Neutral
3,"[when, send, photo]","[0.0, 0.0, 0.0, 0.0, 0.869585389872, 0.2517470...",Joy,"[0.0, 0.0]",Positive
4,"[oust, mayor]","[0.0, 0.0, 0.0, 0.0, 0.208455362385, 0.0589644...",Joy,"[0.0, 0.0]",Positive
