In [1]:
import pandas as pd
import sys
sys.path.insert(0, "/media/diskD/EPFL/Fall 2016/ADA/Project/GMR_ADA_Project/EmotionAnalysis")
from DataSchemaExtractionParsing import *
from DataPreProcessing import *
from SentSemanticModule import *
from SentTweetModule import *
from SentSyntacticModule import *
import math

### Loading English Data 

In [2]:
# Load the English tweet sample (UTF-8 encoded CSV, path relative to the
# notebook's working directory) and preview the first rows.
english_tweets = pd.read_csv("Data/en_sample.csv",encoding ="utf-8")
english_tweets.head()

Unnamed: 0,id,userId,createdAt,text,longitude,latitude,placeId,inReplyTo,source,truncated,...,sourceUrl,userName,screenName,followersCount,friendsCount,statusesCount,userLocation,swiss,canton,language
0,9514846412,7198282.0,2010-02-23 06:22:40,Still the best coffee in town — at La Stanza h...,8.53781,47.3678,\N,\N,550.0,,...,http://gowalla.com/,Nico Luchsinger,halbluchs,1820.0,703.0,4687.0,"Zurich, Switzerland",yes,ZH,en
1,9516952605,14703863.0,2010-02-23 07:51:47,Getting ready.. http://twitpic.com/14v8gz,8.81749,47.2288,\N,\N,62.0,,...,http://stone.com/Twittelator,Urs,ugro,75.0,161.0,1390.0,"Zürich, Switzerland",yes,SG,en
2,9517916537,13535402.0,2010-02-23 08:35:39,I'm at Online PC Magazin in Adliswil http://go...,8.5301,47.3152,\N,\N,550.0,,...,http://gowalla.com/,Patrick Hediger,hediger,1511.0,682.0,12157.0,"Zurich, Switzerland",yes,ZH,en
3,9519149278,14260616.0,2010-02-23 09:32:09,@eyeem When and how can we send photos ? One p...,8.29953,47.4829,\N,9518986782,1.0,,...,http://twitter.com/#!/download/iphone,Roman Keller,RomanKeller,720.0,821.0,7337.0,Switzerland,yes,AG,en
4,9523488851,12391922.0,2010-02-23 12:30:04,I just ousted @keepthebyte as the mayor of Day...,7.59,47.555,\N,\N,3.0,,...,http://foursquare.com,Gabriel Walt,GabrielWalt,1445.0,1627.0,1507.0,"Basel, Switzerland",yes,BS,en


In [3]:
# Number of tweets (rows) in the loaded sample.
len(english_tweets)

10000

In [4]:
# Column labels available for each tweet.
english_tweets.columns

Index([u'id', u'userId', u'createdAt', u'text', u'longitude', u'latitude',
       u'placeId', u'inReplyTo', u'source', u'truncated', u'placeLatitude',
       u'placeLongitude', u'sourceName', u'sourceUrl', u'userName',
       u'screenName', u'followersCount', u'friendsCount', u'statusesCount',
       u'userLocation', u'swiss', u'canton', u'language'],
      dtype='object')

### Replacing Special Categories

In [5]:
# handle_special_categories is a project helper (not defined in this
# notebook). The length is displayed so it can be compared with the input
# row count.
replaced_categories = handle_special_categories(english_tweets)
len(replaced_categories)

10000

In [6]:
# Inspect the text of the first tweet (by position) after the replacement step.
replaced_categories['text'].iloc[0]

u'Still the best coffee in town \u2014 at La Stanza '

### Replacing contractions (needed for more accurate tokenization)

In [7]:
# Expand contractions via the project helper replace_contractions.
tweets_no_contractions = replace_contractions(replaced_categories)

In [8]:
# Inspect the tweet stored under index label 0 after contraction replacement.
# NOTE(review): this is a label-based lookup, unlike the positional .iloc[0]
# used earlier; the two agree only while the index starts at 0.
tweets_no_contractions['text'][0]

u'Still the best coffee in town \u2014 at La Stanza '

### Tokenization of Tweets into words

In [9]:
# Split every tweet into its words; the result is indexed per tweet below.
tokenized_list = bag_of_word_representation(tweets_no_contractions)

In [10]:
# Tokens of the first tweet.
tokenized_list[0]

[u'Still', u'the', u'best', u'coffee', u'in', u'town', u'at', u'La', u'Stanza']

### Part of Speech Tagging:

In [11]:
# Tag every token with its part of speech via the project helper pos_tagging,
# then show the (word, tag) pairs of the first tweet.
tagged_tweets = pos_tagging(tokenized_list)
tagged_tweets[0]

[(u'Still', 'RB'),
 (u'the', 'DT'),
 (u'best', 'JJS'),
 (u'coffee', 'NN'),
 (u'in', 'IN'),
 (u'town', 'NN'),
 (u'at', 'IN'),
 (u'La', 'NNP'),
 (u'Stanza', 'NNP')]

### Dependency Parser

In [13]:
# STANFORD VERSION : More accurate but is too slow (kept disabled):
#import os
#os.environ["STANFORD_MODELS"] = "/home/meryem/Downloads/stanford-parser-full-2016-10-31"
#os.environ["STANFORD_PARSER"] = "/home/meryem/Downloads/stanford-parser-full-2016-10-31"
#from nltk.parse.stanford import StanfordDependencyParser
#dep_parser=StanfordDependencyParser(model_path="/home/meryem/Downloads/stanford-parser-full-2016-10-31/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
#dependency_trees = []
#for tweet in tweets_no_contractions['text']:
    # NOTE(review): the next line parses a fixed sentence instead of `tweet`;
    # fix this before re-enabling the block.
    #result = dep_parser.raw_parse("No rest is detrimental")
    #dep = result.next()
    #dependency_trees.append(list(dep.triples()))

In [14]:
# TEMPORARY SOLUTION FOR DEPENDENCY PARSING: use spaCy.
# To install, run the following:
# pip install -U spacy
# python -m spacy.en.download all # for English
# python -m spacy.de.download all # for German
# Imported explicitly so this cell does not rely on `spacy` leaking into the
# namespace through one of the star imports.
import spacy

# English pipeline used to parse the tweets.
nlp = spacy.load('en')

In [15]:
# Re-join each tokenized tweet into one space-separated string, which is the
# form handed to the spaCy pipeline.
tweets_text = [" ".join(tokens) for tokens in tokenized_list]

# Parse every tweet. The former stray `tweets_text[0].encode("utf-8")`
# statement was dropped: its result was discarded and it raised IndexError
# when there were no tweets.
docs = [nlp(text) for text in tweets_text]

In [16]:
# For every parsed tweet, pair each token's text with its spaCy `pos_` tag.
new_samples = [
    [(unicode(token), token.pos_) for token in parsed_tweet]
    for parsed_tweet in docs
]

### Application of Syntactic Rules:

In [17]:
# Apply the project's syntactic rules to the parsed tweets.
# NOTE: new_samples is overwritten by its own transformation, so re-running
# this cell alone applies the rules a second time.
new_samples = apply_syntactic_rules(docs,new_samples)

In [23]:
# Show one example tweet before and after the syntactic rules.
i = 18
print("\n<<<< Original tweet text >>>\n")
print(tweets_text[i])
print("\n<<<< Tweet after applying syntactic Rules >>>\n")
# Keep only the words, dropping the POS tags.
new_tweet = [word for (word, pos) in new_samples[i]]
print(new_tweet)


<<<< Original tweet text >>>

massive miles more in april snowboarding swiss alps usertesting in cali ux intensive in amsterdam excited

<<<< Tweet after applying syntactic Rules >>>

[u'massive', u'miles', u'more', u'in', u'april', u'snowboarding', u'swiss', u'alps', u'usertesting', u'in', u'cali', u'ux', u'intensive', u'in', u'amsterdam', u'excited']


In [24]:
# First tweet: output of pos_tagging versus the samples left after the
# syntactic rules, separated by blank lines.
print(tagged_tweets[0])
print("\n")
print(new_samples[0])

[(u'Still', 'RB'), (u'the', 'DT'), (u'best', 'JJS'), (u'coffee', 'NN'), (u'in', 'IN'), (u'town', 'NN'), (u'at', 'IN'), (u'La', 'NNP'), (u'Stanza', 'NNP')]


[(u'the', u'DET'), (u'best', u'ADJ'), (u'in', u'ADP'), (u'town', u'NOUN'), (u'at', u'ADP'), (u'La', u'PROPN'), (u'Stanza', u'PROPN')]


### Named Entity Removal:

In [26]:
# Strip named entities from the rule-processed samples (project helper).
tweet_without_ne = remove_named_entities(new_samples)

### Normalizing POS tag

In [27]:
# Normalize the POS tags via the project helper, then inspect the second tweet.
normalized_tags = normalize_pos_tags_words(tweet_without_ne)
normalized_tags[1]

[(u'Getting', 'v'), (u'ready', u'ADJ')]

### Removing punctuation, stop words and other special categories (URLs, numbers, usernames), and converting to lower case:

In [28]:
# Drop stop words and punctuation from the tagged tweets (project helper).
tagged_tweets_without = eliminate_stop_words_punct(normalized_tags)

### Lemmatization:

In [29]:
# Lemmatize the cleaned tweets; this variant keeps (word, tag) pairs, as the
# displayed first tweet shows.
lemmatized_tweets = lemmatizer(tagged_tweets_without)

# Second variant produced by lemmatizer_untagged (presumably without tags —
# confirm against the helper).
lemmatized_tweets_untag = lemmatizer_untagged(tagged_tweets_without)
lemmatized_tweets[0]

[(u'best', u'ADJ'), (u'town', 'n'), (u'la', u'PROPN'), (u'stanza', u'PROPN')]

### Keeping only NAVA words

In [30]:
# Keep only the NAVA words of each lemmatized tweet (project helper).
nava_tweets = keep_only_nava_words(lemmatized_tweets)

In [31]:
# First tweet before and after the NAVA filter.
print(lemmatized_tweets[0])
print(nava_tweets[0])

[(u'best', u'ADJ'), (u'town', 'n'), (u'la', u'PROPN'), (u'stanza', u'PROPN')]
[u'best', u'town']


### Extracting NRC Lexicon:

In [32]:
# Load the NRC lexicon (project helper). It is used here as a table with
# 'Word' and 'Score' columns, holding several rows per word.
lexicon = extractLexicon()

# Map each word to the list of its scores, in row order. A single groupby
# pass replaces the previous per-word boolean mask over the whole table,
# which rescanned every row once for every unique word.
# NOTE: groupby skips rows whose 'Word' is missing (NaN); such a key used to
# map to an empty score list.
word_emotional_vectors_dict = {
    word: list(scores)
    for word, scores in lexicon.groupby('Word', sort=False)['Score']
}

# Kept for any code that still expects these intermediate names.
word_set = list(word_emotional_vectors_dict)
word_emotional_vectors = list(word_emotional_vectors_dict.items())

In [33]:
# Only the last expression of a cell is displayed, so the head(1) result is
# not shown; the visible output is the score vector stored for 'happy'.
lexicon.head(1)
word_emotional_vectors_dict['happy']

['0', '1', '0', '0', '1', '0', '1', '0', '0', '1']

In [34]:
# Position -> label table (presumably the column order of a word's score
# vector in the lexicon — confirm against extractLexicon).
emo_dict = dict(enumerate([
    'Anger',
    'Anticipation',
    'Disgust',
    'Fear',
    'Joy',
    'Negative',
    'Positive',
    'Sadness',
    'Surprise',
    'Trust',
    'Neutral',
]))

# Every (word, position) pair whose score at that position is the string '1'.
emotion_ids = [
    (word, position)
    for word in word_emotional_vectors_dict.keys()
    for position, flag in enumerate(word_emotional_vectors_dict[word])
    if flag == '1'
]

In [35]:
# For each of the ten lexicon positions, gather the words flagged for it.
emotion_repres_words_list = [
    [word for (word, emotion) in emotion_ids if emotion == position]
    for position in range(0, 10)
]
# Size of the word list at position 5.
len(emotion_repres_words_list[5])

3324

In [36]:
# One single-column frame per lexicon position, joined side by side in one
# concat. Shorter word lists are padded with NaN and the columns are
# renumbered 0..9.
per_emotion_frames = [
    pd.DataFrame({position: emotion_repres_words_list[position]})
    for position in range(0, 10)
]
lexicon_df = pd.concat(per_emotion_frames, ignore_index=True, axis=1)

In [None]:
# Load a previously saved lexicon table from disk.
# NOTE(review): this replaces the lexicon_df built in the preceding cell and
# requires lexicon_nrc.csv to exist already — confirm the intended cell order.
lexicon_df = pd.read_csv('lexicon_nrc.csv',encoding='utf-8')

In [None]:
# Persist the lexicon table (without the row index) for later reuse.
lexicon_df.to_csv('lexicon_nrc.csv',encoding='utf-8',index=False)

In [37]:
# Derive the unique lexicon from the lexicon table (project helper).
unique_lexicon = make_unique_lexicon(lexicon_df)

### Calculating Semantic Similarity using PMI:

In [None]:
# Flatten the per-tweet (word, tag) pairs into one list of words, dropping tags.
flatten_list = [word for sublist in lemmatized_tweets for (word, tag) in sublist]

In [None]:
# Compute PMI values from the flattened tweet words and the unique lexicon
# (project helper; the result's structure is defined there).
clean_pmi_dict = calculate_pmi(flatten_list,unique_lexicon)

In [None]:
# Build the PMI-based emotion representation of the NAVA tweets (project helper).
emotion_pmi_based = compute_matrix_sentences_list(nava_tweets,lexicon_df, clean_pmi_dict)

In [None]:
# Emotion id -> label used to decode the ids from compute_emotionalities.
# This redefines the earlier emo_dict: 'Negative' and 'Positive' are left
# out (they are handled by sent_dict below) and the ids after 'Joy' are
# renumbered accordingly.
emo_dict = {
    0: 'Anger',
    1: 'Anticipation',
    2: 'Disgust',
    3: 'Fear',
    4: 'Joy',
    #5: 'Negative',
    #6: 'Positive',
    5: 'Sadness',
    6: 'Surprise',
    7: 'Trust',
    8: 'Neutral'
}
# Sentiment id -> label used to decode the ids from compute_sentiments.
sent_dict = {
    0: "Positive",
    1: "Negative",
    2: "Neutral"
}

In [None]:
# Emotion Recognition: per-tweet emotion vectors, then one emotion id per tweet
# (the ids are decoded with emo_dict further down).
sentence_vectors_pmi = compute_sentence_emotion_vectors(emotion_pmi_based)

emotionalities = compute_emotionalities(sentence_vectors_pmi)


# Sentiment Analysis: per-tweet sentiment vectors, then one sentiment id per
# tweet (decoded with sent_dict further down).
sentence_vectors_sent_pmi = compute_sentence_sentiment_vectors(emotion_pmi_based)

sentiments = compute_sentiments(sentence_vectors_sent_pmi)

### Visualizing the results

In [None]:
# Attach the PMI-based results to the tweet table (columns are added in place).
english_tweets['Affective Feature Representation'] = lemmatized_tweets
english_tweets['Emotion Ids'] = sentence_vectors_pmi

# Decode the numeric ids into readable labels, tweet by tweet.
tweet_positions = range(0, len(emotionalities))
emotions = [emo_dict[emotionalities[k]] for k in tweet_positions]
senti = [sent_dict[sentiments[k]] for k in tweet_positions]

english_tweets['Emotionalities'] = emotions
english_tweets['Sentiments'] = senti

In [None]:
# Keep only the text and result columns and export them as UTF-8 CSV.
english_tweets_col = english_tweets[['text','Affective Feature Representation','Emotion Ids','Emotionalities','Sentiments']]
english_tweets_col.to_csv('PMI_Lexicon_ResultsSampleEnglishData.csv',encoding ="utf-8",index=False)

In [None]:
%matplotlib inline
from collections import Counter
# Count how often each emotion label occurs and plot the counts as bars.
# `df` (labels as index, a single count column) is reused by the pie-chart cell.
frequency = Counter(english_tweets_col['Emotionalities'])
df = pd.DataFrame.from_dict(frequency, orient='index')
df.plot(kind='bar')

In [None]:
import numpy as np
from matplotlib import cm
import matplotlib.pyplot as plt

# Nine colours sampled from the Set1 colormap.
cs = cm.Set1(np.arange(9) / 9.)
f = plt.figure()
ax = f.add_subplot(111, aspect='equal')
labels = df.index
# Draw the pie exactly once. The cell used to draw an unlabelled pie and then
# a second, differently rotated labelled pie on top of it. iloc[:, 0] passes
# the single count column as 1-D data, which plt.pie expects.
patches, texts = plt.pie(df.iloc[:, 0], colors=cs, labels=labels, startangle=90)
plt.axis('equal')
plt.tight_layout()
plt.title("PMI- Rule Based Fine-grained Emotion Distribution")
plt.show()

### Visualizing Sentiment Results

In [None]:
%matplotlib inline
from collections import Counter
# Count how often each sentiment label occurs and plot the counts as bars.
# `df` (labels as index, a single count column) is reused by the pie-chart cell.
frequency = Counter(english_tweets_col['Sentiments'])
df = pd.DataFrame.from_dict(frequency, orient='index')
df.plot(kind='bar')

In [None]:
import numpy as np
from matplotlib import cm
import matplotlib.pyplot as plt

# Three colours sampled from the Set1 colormap.
cs = cm.Set1(np.arange(3) / 3.)
f = plt.figure()
ax = f.add_subplot(111, aspect='equal')
labels = df.index
# Draw the pie exactly once. The cell used to draw an unlabelled pie and then
# a second, differently rotated labelled pie on top of it. iloc[:, 0] passes
# the single count column as 1-D data, which plt.pie expects.
patches, texts = plt.pie(df.iloc[:, 0], colors=cs, labels=labels, startangle=90)
plt.axis('equal')
plt.tight_layout()
plt.title("PMI- Rule Based Sentiment Emotion Distribution")
plt.show()

# II. Same approach using word2vec similarity

### 1. Training Word2Vec Model on the whole tweets:

In [None]:
# Lemmatize the POS-tagged tweets as Word2Vec training input (project helper).
# Only the last expression is displayed, so the len(...) result is not shown.
raw_tokenized_lemma = lemmatizer_raw(tagged_tweets)
len(raw_tokenized_lemma)
raw_tokenized_lemma[0]

In [None]:
# Train a Word2Vec model on the lemmatized tweets. The positional arguments
# and the "geo_tweets_word2vec_model" name are passed through unchanged; their
# meaning is defined by train_word2Vec_model (not shown in this notebook).
# The stray trailing ':' that made this cell a SyntaxError has been removed.
# NOTE(review): later cells use `model`, which nothing in this notebook
# assigned; this assumes train_word2Vec_model returns the trained model —
# confirm against its definition.
model = train_word2Vec_model(300, 40, 4, 10, 1e-3 , raw_tokenized_lemma, "geo_tweets_word2vec_model")

In [None]:
# Sanity check: similarity the trained model assigns to one word pair.
model.similarity('happy','so')

In [None]:
# Build the Word2Vec-based emotion representation (project helper).
# NOTE(review): this uses lemmatized_tweets, whereas the PMI variant was fed
# nava_tweets — confirm the difference is intended.
emotion_word2vec_based = compute_matrix_sentences_list_word2vec(lemmatized_tweets,lexicon_df,model)

In [None]:
# Display the full result. NOTE(review): this may be very large for the whole
# sample; consider showing only a slice.
emotion_word2vec_based

In [None]:
# math is already imported in the first cell; this re-import is harmless.
import math

# Emotion Recognition: per-tweet emotion vectors, then one emotion id per tweet.
sentence_vectors_word2vec = compute_sentence_emotion_vectors(emotion_word2vec_based)

# NOTE: overwrites the `emotionalities` computed for the PMI approach.
emotionalities = compute_emotionalities(sentence_vectors_word2vec)



# Sentiment Analysis: per-tweet sentiment vectors, then one sentiment id per tweet.
sentence_vectors_sent_word2vec = compute_sentence_sentiment_vectors(emotion_word2vec_based)

# NOTE: overwrites the `sentiments` computed for the PMI approach.
sentiments = compute_sentiments(sentence_vectors_sent_word2vec)

### Storing Results

In [None]:
# Attach the Word2Vec-based results to the tweet table (columns are
# overwritten in place, replacing the PMI-based values).
english_tweets['Affective Feature Representation'] = lemmatized_tweets
# Fixed copy-paste bug: this section used to store the PMI vectors
# (sentence_vectors_pmi) next to the Word2Vec labels.
english_tweets['Emotion Ids'] = sentence_vectors_word2vec

# Decode the numeric ids into readable labels, tweet by tweet.
emotions = []
senti = []
for i in range(0,len(emotionalities)):
    emotions.append(emo_dict[emotionalities[i]])
    senti.append(sent_dict[sentiments[i]])
english_tweets['Emotionalities'] = emotions
english_tweets['Sentiments'] = senti

In [None]:
# Keep only the text and result columns and export them as UTF-8 CSV.
english_tweets_col = english_tweets[['text','Affective Feature Representation','Emotion Ids','Emotionalities','Sentiments']]
english_tweets_col.to_csv('Word2Vec_Lexicon_ResultsSampleEnglishData.csv',encoding ="utf-8",index=False)

### Visualizing the distribution

In [None]:
%matplotlib inline
from collections import Counter
# Count how often each emotion label occurs and plot the counts as bars.
# `df` (labels as index, a single count column) is reused by the pie-chart cell.
frequency = Counter(english_tweets_col['Emotionalities'])
df = pd.DataFrame.from_dict(frequency, orient='index')
df.plot(kind='bar')

In [None]:
import numpy as np
from matplotlib import cm
import matplotlib.pyplot as plt

# Nine colours sampled from the Set1 colormap.
cs = cm.Set1(np.arange(9) / 9.)
f = plt.figure()
ax = f.add_subplot(111, aspect='equal')
labels = df.index
# Draw the pie exactly once. The cell used to draw an unlabelled pie and then
# a second, differently rotated labelled pie on top of it. iloc[:, 0] passes
# the single count column as 1-D data, which plt.pie expects.
patches, texts = plt.pie(df.iloc[:, 0], colors=cs, labels=labels, startangle=90)
plt.axis('equal')
plt.tight_layout()
# Title corrected: this figure shows the Word2Vec results, not the PMI ones.
plt.title("Word2Vec- Rule Based Fine-grained Emotion Distribution")
plt.show()

### Visualizing Sentiment Results

In [None]:
%matplotlib inline
from collections import Counter
# Count how often each sentiment label occurs and plot the counts as bars.
# `df` (labels as index, a single count column) is reused by the pie-chart cell.
frequency = Counter(english_tweets_col['Sentiments'])
df = pd.DataFrame.from_dict(frequency, orient='index')
df.plot(kind='bar')

In [None]:
import numpy as np
from matplotlib import cm
import matplotlib.pyplot as plt

# Three colours sampled from the Set1 colormap.
cs = cm.Set1(np.arange(3) / 3.)
f = plt.figure()
ax = f.add_subplot(111, aspect='equal')
labels = df.index
# Draw the pie exactly once. The cell used to draw an unlabelled pie and then
# a second, differently rotated labelled pie on top of it. iloc[:, 0] passes
# the single count column as 1-D data, which plt.pie expects.
patches, texts = plt.pie(df.iloc[:, 0], colors=cs, labels=labels, startangle=90)
plt.axis('equal')
plt.tight_layout()
# Title corrected: this figure shows the Word2Vec results, not the PMI ones.
plt.title("Word2Vec- Rule Based Sentiment Emotion Distribution")
plt.show()