In [1]:
import pandas as pd
from EmotionAnalysis.DataSchemaExtractionParsing import *
from EmotionAnalysis.DataPreProcessing import *
from EmotionAnalysis.SentSemanticModule import *
from EmotionAnalysis.SentTweetModule import *
from EmotionAnalysis.SentSyntacticModule import *
import math

## Purpose of this notebook:
In this notebook, we demonstrate the different steps followed in order to come up with a refined representation of each tweet by following two principles:
* Word Qualification: application of stop word removal, part of speech tagging and named entity recognition and term normalization to keep good refined emotional candidates. 
* Inter-Word Relationships: Application of syntactic analysis to study three kinds of dependencies:
    * Negation Dependency: e.g. I am not happy 
    * Adjectival Dependency: e.g. What a bad luck
    * Adverbial Dependency: e.g. I am struggling happily
    
#### NB:
This notebook makes direct calls to functions defined in EmotionAnalysis folder. Please refer to that in order to see details of implementation of different steps of the pipeline:
* Pre processing
* Syntactic Module
* Semantic Word Level Module
* Semantic Tweet Level Module

We have run the same code on the whole dataset, chunk by chunk, with several variations depending on the language and the libraries available for that specific language. For demonstration purposes, however, this notebook shows the process on a small subset.

## I. Loading English Data 

In [2]:
# Load the English sample of tweets from a UTF-8 encoded CSV into a DataFrame.
english_tweets = pd.read_csv("../../Data/Sample Data/en_sample.csv",encoding ="utf-8")
# Preview the first rows to check which columns were read.
english_tweets.head()

Unnamed: 0,id,userId,createdAt,text,longitude,latitude,placeId,inReplyTo,source,truncated,...,sourceUrl,userName,screenName,followersCount,friendsCount,statusesCount,userLocation,swiss,canton,language
0,9514846412,7198282.0,2010-02-23 06:22:40,Still the best coffee in town — at La Stanza h...,8.53781,47.3678,\N,\N,550.0,,...,http://gowalla.com/,Nico Luchsinger,halbluchs,1820.0,703.0,4687.0,"Zurich, Switzerland",yes,ZH,en
1,9516952605,14703863.0,2010-02-23 07:51:47,Getting ready.. http://twitpic.com/14v8gz,8.81749,47.2288,\N,\N,62.0,,...,http://stone.com/Twittelator,Urs,ugro,75.0,161.0,1390.0,"Zürich, Switzerland",yes,SG,en
2,9517916537,13535402.0,2010-02-23 08:35:39,I'm at Online PC Magazin in Adliswil http://go...,8.5301,47.3152,\N,\N,550.0,,...,http://gowalla.com/,Patrick Hediger,hediger,1511.0,682.0,12157.0,"Zurich, Switzerland",yes,ZH,en
3,9519149278,14260616.0,2010-02-23 09:32:09,@eyeem When and how can we send photos ? One p...,8.29953,47.4829,\N,9518986782,1.0,,...,http://twitter.com/#!/download/iphone,Roman Keller,RomanKeller,720.0,821.0,7337.0,Switzerland,yes,AG,en
4,9523488851,12391922.0,2010-02-23 12:30:04,I just ousted @keepthebyte as the mayor of Day...,7.59,47.555,\N,\N,3.0,,...,http://foursquare.com,Gabriel Walt,GabrielWalt,1445.0,1627.0,1507.0,"Basel, Switzerland",yes,BS,en


In [3]:
# Number of tweets (rows) in the loaded sample.
len(english_tweets)

10000

Example of a tweet before applying any processing (we will use this example and several others to show how our steps refine an intermediary representation before applying any emotion recognition methodology):

In [4]:
# Raw text of the tweet at position 160 (positional lookup via .iloc).
english_tweets['text'].iloc[160]

u'Wow so cool !!! http://twitpic.com/17ean4 - Golden Gate Bridge, San Francisco #hipstamatic #iphoneography #sanfrancisco /via @P_McBride'

In [5]:
# Raw text of the tweet whose index label is 9 (plain [] on a Series looks up by label, not position).
english_tweets['text'][9]

u"Read & Learn about scalability!!! A brief interview with me about how we're using @cassandra at @twitter: http://bit.ly/bBadzO /via @rk"

## II. Preliminary Pre-processing: 

### 1. Replacing Special Categories:
We first start by dealing with some string patterns that are particular to the case of Twitter Data:
* Urls: we detect and remove them as they don't carry any emotional importance
* Digits: we remove them for the same reason
* Usernames: we detect @ mentions (`@username`) and remove them
* Hashtags: we remove the # sign and keep the word that follows it

In [6]:
# Apply the Twitter-specific clean-up listed above (URLs, digits, @usernames, '#' signs).
# The helper comes from the EmotionAnalysis package via the star imports in the first cell.
replaced_categories = handle_special_categories(english_tweets)

In [7]:
# Same tweet (position 160) as the raw example, after the special categories were handled.
replaced_categories['text'].iloc[160]

u'Wow so cool !!!  - Golden Gate Bridge, San Francisco hipstamatic iphoneography sanfrancisco /via '

We notice here that url, username and hashtag sign have been removed

### 2. Replacing contractions (needed for more accurate tokenization)
e.g., "they're stunning" becomes "they are stunning". Otherwise, if we tokenize right away, the text is split into two words, they and 're, and 're cannot be removed since it is not part of the stopwords.

In [8]:
# Expand contractions (e.g. "we're" -> "we are") so the tokenizer does not produce fragments such as 're.
tweets_no_contractions = replace_contractions(replaced_categories)

In [9]:
# Tweet with index label 9 after contraction replacement.
tweets_no_contractions['text'][9]

u'Read & Learn about scalability!!! A brief interview with me about how we are using  at   /via '

Notice here "how we're" has been replaced by "how we are".

### 3. Tokenization of Tweets into words:

In [15]:
# Tokenize every tweet into a list of words (one list per tweet).
tokenized_list = bag_of_word_representation(tweets_no_contractions)
# Show the tokens of the tweet at position 10.
tokenized_list[10]

[u'Good',
 u'morning',
 u'Black',
 u'Eyed',
 u'Peas',
 u'in',
 u'my',
 u'ears',
 u'finalizing',
 u'new',
 u'partnership',
 u'and',
 u'planning',
 u'some',
 u'upcoming',
 u'shoots',
 u'Busy',
 u'morning']

## III. Syntactic Analysis:

### 1. Part of Speech Tagging:
We use part of speech tagging here in order to detect N.A.V.A words (Nouns, Adjectives, Verbs, Adverbs), which are good candidates to carry emotions.

In [16]:
# Tag every token with its part of speech; each tweet becomes a list of (word, tag) pairs.
tagged_tweets = pos_tagging(tokenized_list)
# Show the tagged tokens of the tweet at position 10.
tagged_tweets[10]

[(u'Good', 'JJ'),
 (u'morning', 'NN'),
 (u'Black', 'NNP'),
 (u'Eyed', 'NNP'),
 (u'Peas', 'NNP'),
 (u'in', 'IN'),
 (u'my', 'PRP$'),
 (u'ears', 'NNS'),
 (u'finalizing', 'VBG'),
 (u'new', 'JJ'),
 (u'partnership', 'NN'),
 (u'and', 'CC'),
 (u'planning', 'VBG'),
 (u'some', 'DT'),
 (u'upcoming', 'JJ'),
 (u'shoots', 'NNS'),
 (u'Busy', 'JJ'),
 (u'morning', 'NN')]

### 2. Dependency Parser:

In [36]:
# STANFORD VERSION : More accurate but is too slow:
import os

# NOTE(review): machine-specific absolute path to the Stanford parser install;
# change it (or read it from configuration) before running on another machine.
stanford_dir = "/home/meryem/Downloads/stanford-parser-full-2016-10-31"
os.environ["STANFORD_MODELS"] = stanford_dir
os.environ["STANFORD_PARSER"] = stanford_dir
from nltk.parse.stanford import StanfordDependencyParser
dep_parser = StanfordDependencyParser(
    model_path=stanford_dir + "/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
)

# Build one list of (head, relation, dependent) triples per tweet.
dependency_trees = []
tweets_list = tweets_no_contractions['text']
for tweet_text in tweets_list:
    # Parse each tweet exactly once: the parser is the slow part, and the
    # previous version parsed every tweet twice and threw the first result away.
    # next(...) takes the first parse and works on Python 2 and 3 alike.
    first_parse = next(dep_parser.raw_parse(tweet_text))
    dependency_trees.append(list(first_parse.triples()))

On the sample data, we could not directly find interesting examples to show how our dependencies of interest (negation, adjectival complement and adverbial complement) are detected. That's why we will give a few examples that don't exist in the sample dataset but could be found in the whole dataset. 

In [48]:
# Hand-written sentences illustrating the three dependencies of interest:
# negation, adjectival modifier and adverbial modifier.
dependency_trees_examples = []
trees_examples = []
examples = ['I am not happy','What a bad luck','I am struggling happily']
for example in examples:
    # Parse once and reuse the result for both the tree view and the triples
    # (the parser call is expensive and was previously issued twice per sentence).
    parses = list(dep_parser.raw_parse(example))
    trees_examples.append([parse.tree() for parse in parses])
    # Only the first parse is used for the dependency triples.
    dependency_trees_examples.append(list(parses[0].triples()))

In [51]:
# Dependency triples for the first example sentence, 'I am not happy'.
dependency_trees_examples[0]

[((u'happy', u'JJ'), u'nsubj', (u'I', u'PRP')),
 ((u'happy', u'JJ'), u'cop', (u'am', u'VBP')),
 ((u'happy', u'JJ'), u'neg', (u'not', u'RB'))]

So here it has detected that happy depends on word not which cancels its emotion (happiness) as it has a negation dependency.  

In [52]:
# Dependency triples for the second example sentence, 'What a bad luck'.
dependency_trees_examples[1]

[((u'luck', u'NN'), u'dep', (u'What', u'WP')),
 ((u'luck', u'NN'), u'det', (u'a', u'DT')),
 ((u'luck', u'NN'), u'amod', (u'bad', u'JJ'))]

So here it has detected that luck depends on word bad which cancels its emotion (positive) as it has an adjectival modifier (amod) dependency.

In [53]:
# Dependency triples for the third example sentence, 'I am struggling happily'.
dependency_trees_examples[2]

[((u'struggling', u'VBG'), u'nsubj', (u'I', u'PRP')),
 ((u'struggling', u'VBG'), u'aux', (u'am', u'VBP')),
 ((u'struggling', u'VBG'), u'advmod', (u'happily', u'RB'))]

So here it has detected that struggling depends on the word happily, which cancels its emotion as it has an adverbial modifier (advmod) dependency. After that, "struggling happily" will have the emotion of the dependent word "happily".

In [57]:
# TEMPORARY SOLUTION FOR DEPENDENCY PARSING:
# Load spaCy's English pipeline, used below in place of the (too slow) Stanford parser.
# NOTE(review): `spacy` is not imported explicitly in this notebook; presumably it is
# pulled in by one of the EmotionAnalysis star imports - confirm.
nlp = spacy.load('en')

In [58]:
# Joining text:
# Re-join each tweet's tokens with single spaces so spaCy receives plain text.
tweets_text = [u" ".join(tokens) for tokens in tokenized_list]
# Run the spaCy pipeline on every tweet; `docs` holds one parsed document per tweet.
# (A stray `tweets_text[0].encode("utf-8")` whose result was discarded has been
# removed: it had no effect and failed on an empty tweet list.)
docs = [nlp(text) for text in tweets_text]

In [59]:
# Parsed document for the first tweet (displays as its text).
docs[0]

Still the best coffee in town at La Stanza

In [60]:
# For every parsed tweet, pair each token's text with its spaCy `pos_` tag,
# giving one list of (word, tag) tuples per tweet.
new_samples = [
    [(unicode(token), token.pos_) for token in parsed_tweet]
    for parsed_tweet in docs
]

In [61]:
# (word, tag) pairs of the first tweet.
new_samples[0]

[(u'Still', u'ADV'),
 (u'the', u'DET'),
 (u'best', u'ADJ'),
 (u'coffee', u'NOUN'),
 (u'in', u'ADP'),
 (u'town', u'NOUN'),
 (u'at', u'ADP'),
 (u'La', u'PROPN'),
 (u'Stanza', u'PROPN')]

### Application of Syntactic Rules:

In [62]:
# Apply the syntactic rules to the parsed tweets. The call returns the rewritten
# (word, tag) samples and the dependency triples; note that it overwrites
# `new_samples`, so re-running this cell feeds it its own previous output.
# NOTE(review): the recorded run of this cell failed with UnicodeEncodeError
# ('ascii' codec, character u'\xfc'); this looks like an implicit ASCII
# conversion of non-ASCII text inside the helper - confirm and fix it there.
new_samples,triple_dependencies = apply_syntactic_rules(docs,new_samples)

UnicodeEncodeError: 'ascii' codec can't encode character u'\xfc' in position 3: ordinal not in range(128)

In [32]:
# Side-by-side report for one tweet: original text, detected dependencies,
# and the words that remain after the syntactic rules were applied.
i = 0
print("\n<<<< Original tweet text >>>\n")
print(tweets_text[i])
print("\n<<<< Syntactic dependencies >>>\n")
print(triple_dependencies[i])
print("\n<<<< Tweet after applying syntactic Rules >>>\n")
# Keep only the words, dropping the POS tags.
new_tweet = [word for (word, pos) in new_samples[i]]
print(new_tweet)


<<<< Original tweet text >>>

So if a photon is directed through a plane with two slits in it and either slit is observed it will not go through both slits If it is unobserved it will however if it is observed after it is left the plane but before it hits its target it will not have gone through both slits

<<<< Syntactic dependencies >>>

[(So, u'advmod', observed), (if, u'mark', directed), (a, u'det', photon), (photon, u'nsubjpass', directed), (is, u'auxpass', directed), (directed, u'advcl', observed), (through, u'prep', directed), (a, u'det', plane), (plane, u'pobj', through), (with, u'prep', directed), (two, u'nummod', slits), (slits, u'pobj', with), (in, u'prep', slits), (it, u'pobj', in), (and, u'cc', directed), (either, u'advmod', slit), (slit, u'conj', directed), (is, u'auxpass', observed), (observed, u'ROOT', observed), (it, u'nsubj', go), (will, u'aux', go), (not, u'neg', go), (go, u'ccomp', observed), (through, u'prep', go), (both, u'det', slits), (slits, u'pobj', through),

In [17]:
# Compare the first tweet before (POS-tagged tokens) and after the syntactic rules.
print(tagged_tweets[0])
print("\n")
print(new_samples[0])

[(u'Still', 'RB'), (u'the', 'DT'), (u'best', 'JJS'), (u'coffee', 'NN'), (u'in', 'IN'), (u'town', 'NN'), (u'at', 'IN'), (u'La', 'NNP'), (u'Stanza', 'NNP')]


[(u'the', u'DET'), (u'best', u'ADJ'), (u'in', u'ADP'), (u'town', u'NOUN'), (u'at', u'ADP'), (u'La', u'PROPN'), (u'Stanza', u'PROPN')]


### Named Entity Tagging:

In [18]:
# Named-entity step: pass the rule-processed samples through the named-entity remover.
# NOTE(review): presumably this drops tokens recognised as named entities - confirm in the helper.
tweet_without_ne = remove_named_entities(new_samples)

### Normalizing POS tag

In [19]:
# Normalize the POS tags of the remaining (word, tag) pairs.
normalized_tags = normalize_pos_tags_words(tweet_without_ne)
# Show the result for the tweet at position 1.
normalized_tags[1]

[(u'ready', u'ADJ')]

### Removal of Punctuation and Stop words and Converting to Lower Case and Removal of Other special categories: url, number, username:

In [20]:
# Clean-up step announced in the heading above: punctuation and stop-word removal, lower-casing.
tagged_tweets_without = eliminate_stop_words_punct(normalized_tags)

### Lemmatization:

In [21]:
# Lemmatize the cleaned tweets, keeping the (word, tag) pairs.
lemmatized_tweets = lemmatizer(tagged_tweets_without)

# Second lemmatized variant produced by the "untagged" helper.
# NOTE(review): `lemmatized_tweets_untag` is not used again in the visible cells.
lemmatized_tweets_untag = lemmatizer_untagged(tagged_tweets_without)
# Show the lemmatized (word, tag) pairs of the first tweet.
lemmatized_tweets[0]

[(u'best', u'ADJ'), (u'town', 'n'), (u'la', u'PROPN'), (u'stanza', u'PROPN')]

### Keeping only NAVA words

In [22]:
# Keep only the N.A.V.A words (nouns, adjectives, verbs, adverbs) of each tweet.
nava_tweets = keep_only_nava_words(lemmatized_tweets)

In [24]:
# NAVA words kept for the first tweet.
nava_tweets[0]

[u'best', u'town']

In [25]:
# Persist the NAVA representation: one row per tweet in a single 'Nava Tweets' column.
nava_tweets_df = pd.DataFrame()
nava_tweets_df['Nava Tweets'] = nava_tweets
# Written relative to the notebook's working directory; the 'Results' folder must already exist.
nava_tweets_df.to_csv('Results/nava representation.csv',index=False)

### Extracting NRC Lexicon:

In [24]:
# EXECUTE THIS TO DIRECTLY LOAD THE PROCESSED LEXICON
# Shortcut: read the lexicon CSV instead of rebuilding it with the extraction cells below.
# NOTE(review): the file is the one written by the `to_csv` cell further down, so on a
# fresh checkout without that CSV the extraction cells have to be run first.
lexicon_df = pd.read_csv('NRCLexicon/lexicon_nrc.csv',encoding='utf-8')

In [18]:
# Build a mapping word -> list of its lexicon scores (one score per emotion row).
lexicon = extractLexicon()
# Collect every word's scores in a single grouped pass instead of re-filtering
# the whole lexicon once per word, which was quadratic in the lexicon size.
# Scores keep the row order they have in the lexicon.
# NOTE(review): groupby skips rows whose 'Word' is missing (NaN); the per-word
# filter used before mapped such a key to an empty list - confirm the lexicon
# has no missing words.
word_emotional_vectors_dict = {
    word: list(scores)
    for word, scores in lexicon.groupby('Word', sort=False)['Score']
}
# Kept so that any cell still referring to these names keeps working.
word_set = list(word_emotional_vectors_dict)
word_emotional_vectors = list(word_emotional_vectors_dict.items())

In [19]:
# NOTE(review): the result of head(1) is not shown - only a cell's last expression is displayed.
lexicon.head(1)
# Score vector stored for the word 'happy' (one entry per emotion/sentiment category).
word_emotional_vectors_dict['happy']

['0', '1', '0', '0', '1', '0', '1', '0', '0', '1']

In [20]:
# Category names in lexicon order; the position in this list is the category id.
emotion_names = [
    'Anger', 'Anticipation', 'Disgust', 'Fear', 'Joy',
    'Negative', 'Positive', 'Sadness', 'Surprise', 'Trust',
    'Neutral',
]
emo_dict = dict(enumerate(emotion_names))

# Every (word, category id) pair for which the word's score at that position is '1'.
emotion_ids = [
    (word, position)
    for word, scores in word_emotional_vectors_dict.items()
    for position, score in enumerate(scores)
    if score == '1'
]

In [21]:
# Invert the (word, category id) pairs: one list of words for each of the
# ten category ids 0..9, in id order.
emotion_repres_words_list = [
    [word for (word, emotion) in emotion_ids if emotion == emotion_index]
    for emotion_index in range(0, 10)
]
# Number of words listed under category id 5.
len(emotion_repres_words_list[5])

3324

In [22]:
# Lay the ten per-category word lists out as columns 0..9 of one DataFrame.
# The lists have different lengths, so shorter columns are padded with NaN.
# A single concat replaces growing the frame one column at a time in a loop.
lexicon_df = pd.concat(
    [pd.Series(words) for words in emotion_repres_words_list[:10]],
    axis=1,
    ignore_index=True,
)

In [23]:
# Save the column-per-category lexicon so later sessions can load it directly.
lexicon_df.to_csv('NRCLexicon/lexicon_nrc.csv',index=False)

In [8]:
# Reload the saved lexicon from disk.
# NOTE(review): pandas is already imported in the first cell; this re-import only
# matters when the notebook is resumed from this cell in a fresh kernel.
import pandas as pd
lexicon_df = pd.read_csv('NRCLexicon/lexicon_nrc.csv')
# After the CSV round trip the column labels are the strings '0'..'9'.
lexicon_df.columns

Index([u'0', u'1', u'2', u'3', u'4', u'5', u'6', u'7', u'8', u'9'], dtype='object')

In [25]:
# Derive the unique-word lexicon used by the PMI computation below.
# NOTE(review): presumably the de-duplicated set of lexicon words - confirm in make_unique_lexicon.
unique_lexicon = make_unique_lexicon(lexicon_df)

### Calculating Semantic Similarity using PMI:

In [26]:
# Flatten the per-tweet (word, tag) lists into one flat list of words.
# NOTE(review): the recorded run raised NameError because `lemmatized_tweets` was not
# defined in that kernel session; the lemmatization cell has to be run first.
flatten_list = [word for sublist in lemmatized_tweets for (word, tag) in sublist]

NameError: name 'lemmatized_tweets' is not defined

In [None]:
# Compute PMI values from the corpus words and the unique lexicon words.
clean_pmi_dict = calculate_pmi(flatten_list,unique_lexicon)

In [None]:
# Score every tweet's NAVA words against the lexicon categories using the PMI values.
emotion_pmi_based = compute_matrix_sentences_list(nava_tweets,lexicon_df, clean_pmi_dict)

In [None]:
# Emotion id -> label for the fine-grained output. 'Negative' and 'Positive'
# are deliberately left out here (they are handled by sent_dict), so the ids
# run 0..8 with 'Sadness' at 5.
emo_dict = dict(enumerate([
    'Anger',
    'Anticipation',
    'Disgust',
    'Fear',
    'Joy',
    'Sadness',
    'Surprise',
    'Trust',
    'Neutral',
]))
# Sentiment id -> label.
sent_dict = dict(enumerate(["Positive", "Negative", "Neutral"]))

In [None]:
# Emotion Recognition
# Per-tweet emotion vectors derived from the PMI-based scores.
sentence_vectors_pmi = compute_sentence_emotion_vectors(emotion_pmi_based)

# One emotion id per tweet (later mapped to a label through emo_dict).
emotionalities = compute_emotionalities(sentence_vectors_pmi)


# Sentiment Analysis
# Per-tweet sentiment vectors derived from the same PMI-based scores.
sentence_vectors_sent_pmi = compute_sentence_sentiment_vectors(emotion_pmi_based)

# One sentiment id per tweet (later mapped to a label through sent_dict).
sentiments = compute_sentiments(sentence_vectors_sent_pmi)

### Visualizing the results

In [None]:
# Attach the PMI-based results to the tweets frame (adds columns in place).
english_tweets['Affective Feature Representation'] = lemmatized_tweets
english_tweets['Emotion Ids'] = sentence_vectors_pmi
# Translate the numeric emotion / sentiment ids into their labels, tweet by tweet.
emotions, senti = [], []
for idx, emotion_id in enumerate(emotionalities):
    emotions.append(emo_dict[emotion_id])
    senti.append(sent_dict[sentiments[idx]])
english_tweets['Emotionalities'] = emotions
english_tweets['Sentiments'] = senti

In [None]:
# Keep only the text and result columns, then export them as a UTF-8 CSV.
english_tweets_col = english_tweets[['text','Affective Feature Representation','Emotion Ids','Emotionalities','Sentiments']]
english_tweets_col.to_csv('PMI_Lexicon_ResultsSampleEnglishData.csv',encoding ="utf-8",index=False)

In [None]:
%matplotlib inline
from collections import Counter
# Count how many tweets fall under each emotion label.
frequency = Counter(english_tweets_col['Emotionalities'])
# One row per label with a single count column; `df` is reused by the pie-chart cell below.
df = pd.DataFrame.from_dict(frequency, orient='index')
df.plot(kind='bar')

In [None]:
import numpy as np
from matplotlib import cm
import matplotlib.pyplot as plt

# Pie chart of the label counts held in `df` (one row per label, one count column).
# pie() needs 1-D data, so use the count column rather than the whole frame.
counts = df.iloc[:, 0]
labels = counts.index
# Nine evenly spaced samples of the Set1 colormap.
cs = cm.Set1(np.arange(9)/9.)
f = plt.figure()
ax = f.add_subplot(111, aspect='equal')
# Draw the pie once, with labels. The previous version drew an unlabelled pie
# and then a labelled one on top of it, and created an unused random array.
patches, texts = ax.pie(counts, colors=cs, labels=labels)
ax.axis('equal')
plt.tight_layout()
plt.title("PMI- Rule Based Fine-grained Emotion Distribution")
plt.show()

### Visualizing Sentiment Results

In [None]:
%matplotlib inline
from collections import Counter
# Count how many tweets fall under each sentiment label.
frequency = Counter(english_tweets_col['Sentiments'])
# One row per label with a single count column; `df` is reused by the pie-chart cell below.
df = pd.DataFrame.from_dict(frequency, orient='index')
df.plot(kind='bar')

In [None]:
import numpy as np
from matplotlib import cm
import matplotlib.pyplot as plt

# Pie chart of the sentiment counts held in `df` (one row per label, one count column).
# pie() needs 1-D data, so use the count column rather than the whole frame.
counts = df.iloc[:, 0]
labels = counts.index
# Three evenly spaced samples of the Set1 colormap.
cs = cm.Set1(np.arange(3)/3.)
f = plt.figure()
ax = f.add_subplot(111, aspect='equal')
# Draw the pie once, with labels. The previous version drew an unlabelled pie
# and then a labelled one on top of it, and created an unused random array.
patches, texts = ax.pie(counts, colors=cs, labels=labels)
ax.axis('equal')
plt.tight_layout()
plt.title("PMI- Rule Based Sentiment Emotion Distribution")
plt.show()

# II. Same approach using word2vec similarity

### 1. Training Word2Vec Model on the whole tweets:

In [None]:
# Lemmatize the POS-tagged tweets to build the Word2Vec training corpus.
raw_tokenized_lemma = lemmatizer_raw(tagged_tweets)
# NOTE(review): this length is not displayed - only a cell's last expression is shown.
len(raw_tokenized_lemma)
# Show the lemmatized tokens of the first tweet.
raw_tokenized_lemma[0]

In [None]:
# Train a Word2Vec model on the lemmatized tweets (the stray trailing ':' that
# made this cell a SyntaxError has been removed).
# NOTE(review): the meaning of the positional hyper-parameters (300, 40, 4, 10, 1e-3)
# is not visible here - check train_word2Vec_model's signature. The last argument
# looks like the name the trained model is saved under - confirm.
model = train_word2Vec_model(300, 40, 4, 10, 1e-3 , raw_tokenized_lemma, "geo_tweets_word2vec_model")

In [None]:
# Quick sanity check of the trained model: similarity between 'happy' and 'so'.
model.similarity('happy','so')

In [None]:
# Score every lemmatized tweet against the lexicon categories using Word2Vec similarity.
# NOTE(review): the PMI variant is fed `nava_tweets`, while this one is fed
# `lemmatized_tweets` ((word, tag) pairs) - confirm this difference is intended.
emotion_word2vec_based = compute_matrix_sentences_list_word2vec(lemmatized_tweets,lexicon_df,model)

In [None]:
# Displays the full result for every tweet; consider showing only a slice on large inputs.
emotion_word2vec_based

In [None]:
# NOTE(review): `math` is already imported in the first cell and is not used in this cell.
import math

# Emotion Recognition
# Per-tweet emotion vectors derived from the Word2Vec-based scores.
sentence_vectors_word2vec = compute_sentence_emotion_vectors(emotion_word2vec_based)

# One emotion id per tweet; this overwrites the PMI-based `emotionalities`.
emotionalities = compute_emotionalities(sentence_vectors_word2vec)



# Sentiment Analysis
# Per-tweet sentiment vectors derived from the same Word2Vec-based scores.
sentence_vectors_sent_word2vec = compute_sentence_sentiment_vectors(emotion_word2vec_based)

# One sentiment id per tweet; this overwrites the PMI-based `sentiments`.
sentiments = compute_sentiments(sentence_vectors_sent_word2vec)

### Storing Results

In [None]:
# Attach the Word2Vec-based results to the tweets frame (adds/overwrites columns in place).
english_tweets['Affective Feature Representation'] = lemmatized_tweets
# Store the Word2Vec emotion vectors. The previous version stored
# `sentence_vectors_pmi` here, so the exported Word2Vec results carried the
# PMI vectors next to Word2Vec labels.
english_tweets['Emotion Ids'] = sentence_vectors_word2vec
# Translate the numeric emotion / sentiment ids into their labels, tweet by tweet.
emotions = []
senti = []
for i in range(0,len(emotionalities)):
    emotions.append(emo_dict[emotionalities[i]])
    senti.append(sent_dict[sentiments[i]])
english_tweets['Emotionalities'] = emotions
english_tweets['Sentiments'] = senti

In [None]:
# Keep only the text and result columns, then export them as a UTF-8 CSV.
english_tweets_col = english_tweets[['text','Affective Feature Representation','Emotion Ids','Emotionalities','Sentiments']]
english_tweets_col.to_csv('Word2Vec_Lexicon_ResultsSampleEnglishData.csv',encoding ="utf-8",index=False)

### Visualizing the distribution

In [None]:
%matplotlib inline
from collections import Counter
# Count how many tweets fall under each emotion label.
frequency = Counter(english_tweets_col['Emotionalities'])
# One row per label with a single count column; `df` is reused by the pie-chart cell below.
df = pd.DataFrame.from_dict(frequency, orient='index')
df.plot(kind='bar')

In [None]:
import numpy as np
from matplotlib import cm
import matplotlib.pyplot as plt

# Pie chart of the label counts held in `df` (one row per label, one count column).
# pie() needs 1-D data, so use the count column rather than the whole frame.
counts = df.iloc[:, 0]
labels = counts.index
# Nine evenly spaced samples of the Set1 colormap.
cs = cm.Set1(np.arange(9)/9.)
f = plt.figure()
ax = f.add_subplot(111, aspect='equal')
# Draw the pie once, with labels. The previous version drew an unlabelled pie
# and then a labelled one on top of it, and created an unused random array.
patches, texts = ax.pie(counts, colors=cs, labels=labels)
ax.axis('equal')
plt.tight_layout()
# This figure belongs to the Word2Vec section; the title used to say "PMI".
plt.title("Word2Vec- Rule Based Fine-grained Emotion Distribution")
plt.show()

### Visualizing Sentiment Results

In [None]:
%matplotlib inline
from collections import Counter
# Count how many tweets fall under each sentiment label.
frequency = Counter(english_tweets_col['Sentiments'])
# One row per label with a single count column; `df` is reused by the pie-chart cell below.
df = pd.DataFrame.from_dict(frequency, orient='index')
df.plot(kind='bar')

In [None]:
import numpy as np
from matplotlib import cm
import matplotlib.pyplot as plt

# Pie chart of the sentiment counts held in `df` (one row per label, one count column).
# pie() needs 1-D data, so use the count column rather than the whole frame.
counts = df.iloc[:, 0]
labels = counts.index
# Three evenly spaced samples of the Set1 colormap.
cs = cm.Set1(np.arange(3)/3.)
f = plt.figure()
ax = f.add_subplot(111, aspect='equal')
# Draw the pie once, with labels. The previous version drew an unlabelled pie
# and then a labelled one on top of it, and created an unused random array.
patches, texts = ax.pie(counts, colors=cs, labels=labels)
ax.axis('equal')
plt.tight_layout()
# This figure belongs to the Word2Vec section; the title used to say "PMI".
plt.title("Word2Vec- Rule Based Sentiment Emotion Distribution")
plt.show()