In [21]:
import pandas as pd
from EmotionAnalysis.DataSchemaExtractionParsing import *
from EmotionAnalysis.DataPreProcessing import *
from EmotionAnalysis.SentSemanticModule import *
from EmotionAnalysis.SentTweetModule import *
from EmotionAnalysis.SentSyntacticModule import *
import math

## Purpose of this notebook:
In this notebook, we demonstrate the different steps followed in order to come up with a refined representation of each tweet by following two principles:
* Word Qualification: application of stop word removal, part of speech tagging and named entity recognition and term normalization to keep good refined emotional candidates. 
* Inter-Word Relationships: Application of syntactic analysis to study three kinds of dependencies:
    * Negation Dependency: e.g. I am not happy
    * Adjectival Dependency: e.g. What a bad luck
    * Adverbial Dependency: e.g. I am struggling happily
    
#### NB:
This notebook makes direct calls to functions defined in EmotionAnalysis folder. Please refer to that in order to see details of implementation of different steps of the pipeline:
* Pre processing
* Syntactic Module
* Semantic Word Level Module
* Semantic Tweet Level Module

We ran the same code on the whole dataset, chunk by chunk, with several variations depending on the language and on the libraries available for that specific language. For demonstration purposes, this notebook shows the process on a small subset only.

## I. Loading English Data 

In [3]:
# Load the first 8000 rows of the English sample and preview the first five.
# NOTE(review): the preview below contains sequences such as "ZÃ¼rich", which usually means
# UTF-8 bytes were decoded as ISO-8859-1 — confirm the file's real encoding before changing it.
english_tweets = pd.read_csv("../../Data/Sample Data/en_sample.csv",encoding = "ISO-8859-1",nrows=8000)
english_tweets.head()

Unnamed: 0,id,userId,createdAt,text,longitude,latitude,placeId,inReplyTo,source,truncated,...,sourceUrl,userName,screenName,followersCount,friendsCount,statusesCount,userLocation,swiss,canton,language
0,9514846412,7198282.0,2010-02-23 06:22:40,Still the best coffee in town â at La Stanza...,8.53781,47.3678,\N,\N,550.0,,...,http://gowalla.com/,Nico Luchsinger,halbluchs,1820.0,703.0,4687.0,"Zurich, Switzerland",yes,ZH,en
1,9516952605,14703863.0,2010-02-23 07:51:47,Getting ready.. http://twitpic.com/14v8gz,8.81749,47.2288,\N,\N,62.0,,...,http://stone.com/Twittelator,Urs,ugro,75.0,161.0,1390.0,"ZÃ¼rich, Switzerland",yes,SG,en
2,9517916537,13535402.0,2010-02-23 08:35:39,I'm at Online PC Magazin in Adliswil http://go...,8.5301,47.3152,\N,\N,550.0,,...,http://gowalla.com/,Patrick Hediger,hediger,1511.0,682.0,12157.0,"Zurich, Switzerland",yes,ZH,en
3,9519149278,14260616.0,2010-02-23 09:32:09,@eyeem When and how can we send photos ? One p...,8.29953,47.4829,\N,9518986782,1.0,,...,http://twitter.com/#!/download/iphone,Roman Keller,RomanKeller,720.0,821.0,7337.0,Switzerland,yes,AG,en
4,9523488851,12391922.0,2010-02-23 12:30:04,I just ousted @keepthebyte as the mayor of Day...,7.59,47.555,\N,\N,3.0,,...,http://foursquare.com,Gabriel Walt,GabrielWalt,1445.0,1627.0,1507.0,"Basel, Switzerland",yes,BS,en


Example of a tweet before applying any processing (we will use this example and several others to show how our steps refine an intermediary representation before applying any emotion recognition methodology):

In [4]:
# Raw text of the tweet at position 160, used as a running example
english_tweets['text'].iloc[160]

u'Wow so cool !!! http://twitpic.com/17ean4 - Golden Gate Bridge, San Francisco #hipstamatic #iphoneography #sanfrancisco /via @P_McBride'

In [5]:
# Second running example: raw text of the tweet at position 9.
# Positional .iloc access matches the example at position 160 and does not depend on index labels.
english_tweets['text'].iloc[9]

u"Read & Learn about scalability!!! A brief interview with me about how we're using @cassandra at @twitter: http://bit.ly/bBadzO /via @rk"

## II. Preliminary Pre-processing: 

### 1. Replacing Special Categories:
We first start by dealing with some string patterns that are particular to the case of Twitter Data:
* Urls: we detect and remove them, as they don't carry any emotional importance
* Digits: we remove them for the same reason
* Usernames: we detect `@username` mentions and remove them
* Hashtags: we remove the # sign and keep the word that follows it

In [6]:
# Handle the Twitter-specific patterns (URLs, digits, @mentions, '#' signs) in the tweets
replaced_categories = handle_special_categories(english_tweets)

In [7]:
# Tweet at position 160 after special-category handling
replaced_categories['text'].iloc[160]

u'Wow so cool !!!  - Golden Gate Bridge, San Francisco hipstamatic iphoneography sanfrancisco /via '

We notice here that url, username and hashtag sign have been removed

### 2. Replacing contractions (needed for more accurate tokenization)
e.g., "they're stunning" becomes "they are stunning". Otherwise, if we tokenize right away, the text is split into two tokens, they and 're, and 're cannot be removed later since it is not part of the stop words.

In [8]:
# Expand contractions (e.g. "we're" -> "we are") before tokenization
tweets_no_contractions = replace_contractions(replaced_categories)

In [9]:
# Tweet labelled 9 after contraction replacement
tweets_no_contractions['text'][9]

u'Read & Learn about scalability!!! A brief interview with me about how we are using  at   /via '

Notice here "how we're" has been replaced by "how we are".

### 3. Tokenization of Tweets into words:

In [10]:
# Tokenize every tweet into a list of words, then show the tokens of tweet 10
tokenized_list = bag_of_word_representation(tweets_no_contractions)
tokenized_list[10]

[u'Good',
 u'morning',
 u'Black',
 u'Eyed',
 u'Peas',
 u'in',
 u'my',
 u'ears',
 u'finalizing',
 u'new',
 u'partnership',
 u'and',
 u'planning',
 u'some',
 u'upcoming',
 u'shoots',
 u'Busy',
 u'morning']

## III. Syntactic Analysis:

### 1. Part of Speech Tagging:
We use part of speech tagging here in order to detect N.A.V.A. words (Nouns, Adjectives, Verbs, Adverbs), which are good candidates to carry emotions.

In [11]:
# Tag every token with its part of speech, then show the (word, tag) pairs of tweet 10
tagged_tweets = pos_tagging(tokenized_list)
tagged_tweets[10]

[(u'Good', 'JJ'),
 (u'morning', 'NN'),
 (u'Black', 'NNP'),
 (u'Eyed', 'NNP'),
 (u'Peas', 'NNP'),
 (u'in', 'IN'),
 (u'my', 'PRP$'),
 (u'ears', 'NNS'),
 (u'finalizing', 'VBG'),
 (u'new', 'JJ'),
 (u'partnership', 'NN'),
 (u'and', 'CC'),
 (u'planning', 'VBG'),
 (u'some', 'DT'),
 (u'upcoming', 'JJ'),
 (u'shoots', 'NNS'),
 (u'Busy', 'JJ'),
 (u'morning', 'NN')]

### 2. Dependency Parser:

In [22]:
# STANFORD VERSION : More accurate but is too slow:
import os

# Local Stanford parser installation; adjust this single path to match your machine.
STANFORD_PARSER_DIR = "/home/meryem/Downloads/stanford-parser-full-2016-10-31"
os.environ["STANFORD_MODELS"] = STANFORD_PARSER_DIR
os.environ["STANFORD_PARSER"] = STANFORD_PARSER_DIR
from nltk.parse.stanford import StanfordDependencyParser
dep_parser = StanfordDependencyParser(
    model_path=STANFORD_PARSER_DIR + "/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")

# Only the first 10 tweets are parsed here.
tweets_list = tweets_no_contractions['text'][0:10]
dependency_trees = []
# Iterate over the values directly so the loop does not depend on the index labels of the slice.
for tweet in tweets_list:
    # raw_parse yields an iterator of parses; call it once per tweet and keep the first parse.
    # next(...) works on both Python 2 and Python 3, unlike the .next() method.
    dep = next(dep_parser.raw_parse(tweet))
    # Store the parse as a list of (head, relation, dependent) triples.
    dependency_trees.append(list(dep.triples()))

On the sample data, we could not directly find interesting examples to show how our dependencies of interest (negation, adjectival complement and adverbial complement) are detected. That's why we will give a few examples that don't exist in the sample dataset but could be found in the whole dataset. 

In [23]:
# Hand-written sentences illustrating the three dependencies of interest
# (negation, adjectival modifier, adverbial modifier).
examples = ['I am not happy','What a bad luck','I am struggling happily']
dependency_trees_examples = []
trees_examples = []
for sentence in examples:
    # Materialise the parses once so the parser is not run twice per sentence.
    parses = list(dep_parser.raw_parse(sentence))
    # One tree per parse of the sentence.
    trees_examples.append([parse.tree() for parse in parses])
    # Triples (head, relation, dependent) of the first parse only.
    dependency_trees_examples.append(list(parses[0].triples()))

In [24]:
# Dependency triples for 'I am not happy'
dependency_trees_examples[0]

[((u'happy', u'JJ'), u'nsubj', (u'I', u'PRP')),
 ((u'happy', u'JJ'), u'cop', (u'am', u'VBP')),
 ((u'happy', u'JJ'), u'neg', (u'not', u'RB'))]

Here the parser has detected a negation (neg) dependency between happy and not: the word not cancels the emotion (happiness) carried by happy.

In [25]:
# Dependency triples for 'What a bad luck'
dependency_trees_examples[1]

[((u'luck', u'NN'), u'dep', (u'What', u'WP')),
 ((u'luck', u'NN'), u'det', (u'a', u'DT')),
 ((u'luck', u'NN'), u'amod', (u'bad', u'JJ'))]

So here it has detected that luck depends on word bad which cancels its emotion (positive) as it has an adjectival modifier (amod) dependency.

In [26]:
# Dependency triples for 'I am struggling happily'
dependency_trees_examples[2]

[((u'struggling', u'VBG'), u'nsubj', (u'I', u'PRP')),
 ((u'struggling', u'VBG'), u'aux', (u'am', u'VBP')),
 ((u'struggling', u'VBG'), u'advmod', (u'happily', u'RB'))]

Here the parser has detected that struggling is modified by happily through an adverbial modifier (advmod) dependency, which cancels the emotion of struggling. After that, struggling happily will carry the emotion of the dependent word "happily".

In [13]:
# TEMPORARY SOLUTION FOR DEPENDENCY PARSING:
# NOTE(review): `spacy` is not imported explicitly in this notebook; presumably it is
# re-exported by one of the EmotionAnalysis star imports — confirm.
nlp = spacy.load('en') # Loading nlp pipeline

In [14]:
# Joining text: rebuild one space-separated unicode string per tokenized tweet.
tweets_text = [u" ".join(tokens) for tokens in tokenized_list]

# Creating docs: run the nlp pipeline once on each joined tweet.
# (The former bare `tweets_text[0].encode("utf-8")` was dropped: its result was discarded
# and it raised IndexError when there were no tweets.)
docs = [nlp(text) for text in tweets_text]

In [15]:
# For every parsed doc, build the list of (token text, coarse POS tag) pairs.
new_samples = [
    [(unicode(token), token.pos_) for token in parsed_doc]
    for parsed_doc in docs
]

### 3. Application of Syntactic Rules:

In [16]:
# Apply the syntactic rules to the parsed docs and their (word, POS) lists.
# NOTE(review): the outputs printed further down suggest new_samples itself is modified by this call — confirm.
new_samples_syn,triple_dependencies_syn = apply_syntactic_rules(docs,new_samples)

In [17]:
i = 10
print "\n<<<< Original tweet text >>>\n"
print tweets_text[i]
print "\n<<<< Tweet after applying syntactic Rules >>>\n"
new_tweet = []
for (word,pos) in new_samples[i]:
    new_tweet.append(word)
print new_tweet


<<<< Original tweet text >>>

Good morning Black Eyed Peas in my ears finalizing new partnership and planning some upcoming shoots Busy morning

<<<< Tweet after applying syntactic Rules >>>

[u'Good', u'Black', u'Eyed', u'Peas', u'in', u'my', u'ears', u'finalizing', u'new', u'and', u'planning', u'some', u'upcoming', u'shoots', u'Busy']


In [18]:
# Compare the POS-tagged tokens of tweet 0 with its (word, POS) list in new_samples
print tagged_tweets[0]
print "\n"
print new_samples[0]

[(u'Still', 'RB'), (u'the', 'DT'), (u'best', 'JJS'), (u'coffee', 'NN'), (u'in', 'IN'), (u'town', 'NN'), (u'at', 'IN'), (u'La', 'NNP'), (u'Stanza', 'NNP')]


[(u'Still', u'ADV'), (u'the', u'DET'), (u'best', u'ADJ'), (u'in', u'ADP'), (u'town', u'NOUN'), (u'at', u'ADP'), (u'La', u'PROPN'), (u'Stanza', u'PROPN')]


## IV. Further Cleaning:

### 1. Named Entity Tagging:

In [27]:
import sys
# Python 2 only: reloading sys makes setdefaultencoding callable again, so the
# default string encoding can be switched to UTF-8.
# NOTE(review): presumably required by remove_named_entities for non-ASCII text — confirm.
reload(sys)
sys.setdefaultencoding("utf-8")
# Remove named entities from the (word, POS) lists
tweet_without_ne = remove_named_entities(new_samples)

### 2. Normalizing POS tag:

In [28]:
# Normalize the POS tags, then show the result for tweet 1
normalized_tags = normalize_pos_tags_words1(tweet_without_ne)
normalized_tags[1]

[(u'Getting', 'v'), (u'ready', u'ADJ')]

### 3. Removal of Punctuation and Stop words and Converting to Lower Case:

In [29]:
# Remove punctuation and stop words and convert the remaining tokens to lower case
tagged_tweets_without = eliminate_stop_words_punct(normalized_tags)

### 4. Lemmatization:
To normalize terms to one common version

In [30]:
# Lemmatize the cleaned tweets, keeping the (word, tag) pairs
lemmatized_tweets = lemmatizer(tagged_tweets_without)

# Second lemmatized variant from the same input.
# NOTE(review): lemmatized_tweets_untag is not used again in this notebook — confirm it is still needed.
lemmatized_tweets_untag = lemmatizer_untagged(tagged_tweets_without)

[(u'still', u'ADV'),
 (u'best', u'ADJ'),
 (u'town', 'n'),
 (u'la', u'PROPN'),
 (u'stanza', u'PROPN')]

In [33]:
# Lemmatized (word, tag) pairs of tweet 1
lemmatized_tweets[1]

[(u'get', 'v'), (u'ready', u'ADJ')]

### 5.  Keeping only NAVA words

In [35]:
# Keep only the NAVA words (nouns, adjectives, verbs, adverbs), then show tweet 0
nava_tweets = keep_only_nava_words(lemmatized_tweets)
nava_tweets[0]

[u'still', u'best', u'town']

### 6. Lemmatizing Pre-cleaned Tokenized Tweets before any pre-processing:
We need this in order to train word2vec model as it will be impacted by the relative distance between words and we need the same lemmatized lower case version in order to calculate similarity scores for words that exist in nava tweets.

In [37]:
# Lemmatize the full POS-tagged tweets (before any word removal), then show tweet 0
tokenized_lemmatized = lemmatizer_raw(normalize_pos_tags_words1(tagged_tweets))
tokenized_lemmatized[0]

[u'still', u'the', u'best', u'coffee', u'in', u'town', u'at', u'la', u'stanza']

### 7. Saving the lemmatized tokenized version and nava version of Tweets in a dataframe for later use:

In [40]:
# Collect both representations in one dataframe, one row per tweet, and write it to CSV.
nava_tweets_df = pd.DataFrame()
# Full lemmatized token list of each tweet
nava_tweets_df['Tokenized Lemmatized'] = tokenized_lemmatized
# NAVA-only word list of each tweet
nava_tweets_df['Nava Representation'] = nava_tweets
# index=False leaves the row index out of the file
nava_tweets_df.to_csv('../../Results/Sample Affective Representation.csv',index=False)

******************************* END *********************************************************