#### Simple Approach for Emotion Analysis

In this notebook, we apply a simple lexicon-based approach to capture the emotions expressed in tweets.
First, we clean each tweet by removing http(s) links, hashtags, and mentions; then we remove stop words.
After that, we use the NRC Emotion Lexicon to look up the emotion mapping of each word in a tweet and sum these mappings into one vector per tweet. The vector has 10 entries: positive and negative sentiment plus the eight emotions anger, anticipation, disgust, fear, joy, sadness, surprise, and trust.

In [1]:
import pandas as pd
from textblob import TextBlob
import numpy as np
import  csv
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline 
import re
from nltk.corpus import stopwords
import nltk

In [14]:
## text cleaning

def clean_up_text(text):
    '''
    Remove http(s) links, hashtags and @-mentions from a tweet.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        ``text`` with every link, ``#hashtag`` and ``@mention`` replaced by
        the empty string. Whitespace around the removed tokens is kept, so
        the result may contain consecutive spaces.
    '''
    # A link runs up to the next whitespace character. The former character
    # class ``[a-zA-z0-9/#%.]`` stopped at '-', '?', '=', '&', ... and so left
    # URL fragments behind (it also contained the accidental range ``A-z``).
    # Note that punctuation glued to the end of a link is removed with it.
    https = re.compile(r'https?://\S+')
    hashtag = re.compile(r'#\w*')
    mention = re.compile(r'@\w*')
    # Links go first so that a '#' or '@' inside a URL cannot split it.
    text = https.sub('', text)
    text = hashtag.sub('', text)
    text = mention.sub('', text)
    return text


In [15]:
def remove_stop_words(text_list,stop_word_language):
    '''
    Clean a list of tweets and strip stop words from them.

    Each text is passed through ``clean_up_text``, tokenized with
    ``nltk.tokenize.word_tokenize``, filtered against the NLTK stop-word list
    of ``stop_word_language`` and re-joined with single spaces in lower case.

    Parameters
    ----------
    text_list : iterable of str
        Raw tweet texts.
    stop_word_language : str
        Name of the NLTK stop-word list to use (e.g. ``'english'``).

    Returns
    -------
    list of str
        One cleaned, lower-cased string per input text, in input order.

    TODO: do more text cleaning.
    '''
    print("remove stop words start")
    # Load the stop words once, outside the loop, and keep them in a set so
    # that each membership test is O(1) instead of a scan over a fresh list.
    stop_set = set(stopwords.words(stop_word_language))

    nostopwords_lower_list = []
    for i, text in enumerate(text_list):
        words_list = nltk.tokenize.word_tokenize(clean_up_text(text))
        # Compare in lower case: the tokens are only lower-cased after the
        # filter, so a capitalised stop word ("Un", "The") used to slip through.
        filtered_words = [word for word in words_list if word.lower() not in stop_set]
        nostopwords_lower_list.append(' '.join(filtered_words).lower())
        if i % 10000 == 0:
            print(i)  # coarse progress indicator
    return nostopwords_lower_list
    
    
    
    

In [138]:
def emotion_analysis(tweet_file_path, language, stop_word_language,
                     lexicon_path='NRC-Emotion-Lexicon-v0.92-InManyLanguages-web.xlsx'):
    '''
    Score every tweet of a CSV file against the NRC emotion lexicon.

    For each tweet the space-separated tokens of its ``cleaned_text`` are
    looked up in the lexicon column ``language``; the lexicon rows of all
    matching tokens are summed column-wise.

    Parameters
    ----------
    tweet_file_path : str
        CSV file with the tweets. It should contain a ``cleaned_text``
        column; if it does not, the column is built from ``text`` with
        ``remove_stop_words``.
    language : str
        Name of the lexicon column holding the words of the target language
        (e.g. ``'English Word'``).
    stop_word_language : str
        NLTK stop-word list name; only used when ``cleaned_text`` has to be
        built.
    lexicon_path : str, optional
        Excel file with the lexicon. Defaults to the file name the notebook
        has always used.

    Returns
    -------
    pandas.DataFrame
        The tweets with ten extra columns (``Positive``, ``Negative`` and the
        eight emotions). Tweets without any lexicon match, or without text,
        get zeros.
    '''
    headers=['Positive','Negative','Anger','Anticipation','Disgust','Fear','Joy','Sadness','Surprise','Trust']

    tweet=pd.read_csv(tweet_file_path)
    print("number of weets: ",len(tweet))

    if 'cleaned_text' not in tweet.columns:
        # Fallback for input that has not been pre-cleaned yet.
        raw_texts=tweet['text'].fillna('').astype(str).tolist()
        tweet['cleaned_text']=remove_stop_words(raw_texts,stop_word_language)
    nostopwords_lower_list=tweet['cleaned_text'].tolist()

    lec=pd.read_excel(lexicon_path)
    # .copy() so the assignment below works on an independent frame and not
    # on a slice of ``lec`` (avoids SettingWithCopyWarning).
    lec_language=lec[[language]+headers].copy()
    lec_language[language]=lec_language[language].str.lower()
    # Keep the first row per word; rows without a word can never match.
    lec_language=lec_language.dropna(subset=[language]).drop_duplicates(language)
    # A plain dict lookup per token replaces a DataFrame ``.loc`` call per
    # token and the positional ``row[0]..row[9]`` access on a labelled Series.
    word_scores=dict(zip(lec_language[language].tolist(),
                         lec_language[headers].to_numpy().tolist()))

    print("start emotion analysis")
    emotion_final_list=[]
    for ee,text in enumerate(nostopwords_lower_list, start=1):
        # A missing text is read back as NaN; str(NaN) would yield the token
        # 'nan', so treat it as "no tokens" instead.
        tokens=[] if pd.isna(text) else str(text).split(' ')
        matched=[word_scores[tok] for tok in tokens if tok in word_scores]
        if matched:
            emotion_final_list.append([sum(col) for col in zip(*matched)])
        else:
            emotion_final_list.append([0]*len(headers))
        if ee%100000 == 0:
            print(ee)  # coarse progress indicator

    df=pd.DataFrame(emotion_final_list, columns=headers)
    # Both frames carry the default RangeIndex, so rows line up one to one.
    return pd.concat([tweet,df], axis=1)
    
    

# English Tweets

In [None]:
# Score the English tweets against the 'English Word' lexicon column.
tweets_with_emptions = emotion_analysis(
    tweet_file_path='intermediate_data/en_cleaned.csv',
    language='English Word',
    stop_word_language='english',
)

In [None]:
# Save the English scores alongside the other languages: the concatenation
# cell at the end of the notebook reads every CSV in emotions_data/ and its
# recorded output lists en_final_emotion, which no other cell writes.
tweets_with_emptions.to_csv('emotions_data/en_final_emotion.csv', index=False)

# French Tweets

In [None]:
# Score the French tweets against the French lexicon column.
tweets_with_emptions = emotion_analysis(
    tweet_file_path='intermediate_data/fr_cleaned.csv',
    language='French Translation (Google Translate)',
    stop_word_language='french',
)

In [95]:
# Preview the first five scored tweets.
tweets_with_emptions.head(n=5)

Unnamed: 0,id,userId,createdAt,text,canton,language,cleaned_text,Positive,Negative,Anger,Anticipation,Disgust,Fear,Joy,Sadness,Surprise,Trust
0,9517199000.0,14393717.0,2010-02-23 08:02:57,Un peu de réconfort liquide en take away après...,VD,fr,un peu réconfort liquide take away après début...,1,0,0,0,0,0,0,0,0,1
1,9518015000.0,14393717.0,2010-02-23 08:40:13,Au charbon! (@ BCV St-François) http://4sq.com...,VD,fr,au charbon ! ( bcv st-françois ),0,0,0,0,0,0,0,0,0,0
2,9525122000.0,14465180.0,2010-02-23 13:20:45,C'est quoi un laptop geek? Un lapsus! :),VD,fr,c'est quoi laptop geek ? un lapsus ! : ),0,0,0,0,0,0,0,0,0,0
3,9567369000.0,6589882.0,2010-02-24 08:10:43,Dans le train pour Genève,VD,fr,dans train genève,0,0,0,0,0,0,0,0,0,0
4,9571348000.0,14393717.0,2010-02-24 11:06:29,"Argh, pas de phó! (@ Goûts d'Asie) http://4sq....",VD,fr,"argh , phó ! ( goûts d'asie )",0,0,0,0,0,0,0,0,0,0


In [97]:
# Write the French scores to disk; the row index is not saved.
tweets_with_emptions.to_csv('emotions_data/fr_final_emotion.csv', index=False)

# German Tweets

In [None]:
# Score the German tweets against the German lexicon column.
tweets_with_emotions = emotion_analysis(
    tweet_file_path='intermediate_data/de_cleaned.csv',
    language='German Translation (Google Translate)',
    stop_word_language='german',
)

In [145]:
# Write the German scores to disk; the row index is not saved.
tweets_with_emotions.to_csv('emotions_data/de_final_emotion.csv', index=False)

# Italian Tweets

In [None]:
# Score the Italian tweets against the Italian lexicon column.
tweets_with_emotions = emotion_analysis(
    tweet_file_path='intermediate_data/it_cleaned.csv',
    language='Italian Translation (Google Translate)',
    stop_word_language='italian',
)

In [155]:
# Write the Italian scores to disk; the row index is not saved.
tweets_with_emotions.to_csv('emotions_data/it_final_emotion.csv', index=False)

# Spanish Tweets

In [None]:
# Score the Spanish tweets against the Spanish lexicon column.
tweets_with_emptions = emotion_analysis(
    tweet_file_path='intermediate_data/es_cleaned.csv',
    language='Spanish Translation (Google Translate)',
    stop_word_language='spanish',
)

In [159]:
# Write the Spanish scores to disk; the row index is not saved.
tweets_with_emptions.to_csv('emotions_data/es_final_emotion.csv', index=False)

# Portuguese Tweets

In [None]:
# Score the Portuguese tweets against the Portuguese lexicon column.
tweets_with_emptions = emotion_analysis(
    tweet_file_path='intermediate_data/pt_cleaned.csv',
    language='Portuguese Translation (Google Translate)',
    stop_word_language='portuguese',
)

In [165]:
# Write the Portuguese scores to disk; the row index is not saved.
tweets_with_emptions.to_csv('emotions_data/pt_final_emotion.csv', index=False)

# Turkish Tweets

In [None]:
# Score the Turkish tweets against the Turkish lexicon column.
tweets_with_emptions = emotion_analysis(
    tweet_file_path='intermediate_data/tr_cleaned.csv',
    language='Turkish Translation (Google Translate)',
    stop_word_language='turkish',
)

In [170]:
# Write the Turkish scores to disk; the row index is not saved.
tweets_with_emptions.to_csv('emotions_data/tr_final_emotion.csv', index=False)

# Dutch Tweets

In [None]:
# Score the Dutch tweets against the Dutch lexicon column.
tweets_with_emptions = emotion_analysis(
    tweet_file_path='intermediate_data/nl_cleaned.csv',
    language='Dutch Translation (Google Translate)',
    stop_word_language='dutch',
)

In [175]:
# Write the Dutch scores to disk; the row index is not saved.
tweets_with_emptions.to_csv('emotions_data/nl_final_emotion.csv', index=False)

# Arabic Tweets

In [None]:
# Score the Arabic tweets against the Arabic lexicon column.
tweets_with_emptions = emotion_analysis(
    tweet_file_path='intermediate_data/ar_cleaned.csv',
    language='Arabic Translation (Google Translate)',
    stop_word_language='arabic',
)

In [179]:
# Write the Arabic scores to disk; the row index is not saved.
tweets_with_emptions.to_csv('emotions_data/ar_final_emotion.csv', index=False)

In [None]:
# concatenate all emotion files

In [180]:
import glob
import os

# NOTE: this changes the kernel's working directory for every later cell and
# the path is relative, so running this cell a second time looks for
# emotions_data/emotions_data.
os.chdir("emotions_data")

In [181]:

# Collect the per-language result files from the current directory.
# Only "*_final_emotion.csv" is matched: the combined
# all_tweets_with_emotions.csv is written into this same directory, and a
# plain "*.csv" would read it back in (doubling the data) on a re-run.
# sorted() makes the order independent of the file system.
frames = []
for file in sorted(glob.glob("*_final_emotion.csv")):
    name = file.split(".")[0]
    print(name)
    df = pd.read_csv(file)
    print(len(df))
    frames.append(df)

# Concatenate once instead of growing the frame inside the loop, which
# re-copies all previously loaded rows on every iteration.
# ignore_index gives the combined frame a unique 0..n-1 index.
emotion_big_df = pd.concat(frames, axis=0, ignore_index=True) if frames else pd.DataFrame()
emotion_big_df.head()

ar_final_emotion
229191
de_final_emotion
1402170
en_final_emotion
2985994
es_final_emotion
443822
fr_final_emotion
3500272
it_final_emotion
484415
nl_final_emotion
197212
pt_final_emotion
476702
tr_final_emotion
331035


Unnamed: 0,id,userId,createdAt,text,canton,language,cleaned_text,Positive,Negative,Anger,Anticipation,Disgust,Fear,Joy,Sadness,Surprise,Trust
0,17566680000.0,14331452.0,2010-07-02 12:05:23,@alfarhan خطيبنا تحدث عن علاقة الآباء بالابناء...,BE,ar,خطيبنا تحدث علاقة الآباء بالابناء كيف الاسلام ...,0,0,0,0,0,0,0,0,0,0
1,20919660000.0,14331452.0,2010-08-11 21:52:55,للتو.. انتهينا من الراويح,BE,ar,للتو.. انتهينا الراويح,0,0,0,0,0,0,0,0,0,0
2,24080700000.0,14331452.0,2010-09-10 05:47:05,في الطريق الى صلاة الجمعة.. تقبل الله منا و من...,BE,ar,الطريق صلاة الجمعة.. تقبل الله منا منكم صالح ا...,2,0,0,1,0,1,1,0,0,1
3,24080730000.0,14331452.0,2010-09-10 05:47:46,عفوا قصدت صلاة العيد,BE,ar,عفوا قصدت صلاة العيد,1,0,0,1,0,0,1,0,1,1
4,26274650000.0,14331452.0,2010-10-03 15:12:59,افكار كثيرة لمشاريع تجول في خاطري لكن مع الاسف...,BE,ar,افكار كثيرة لمشاريع تجول خاطري الاسف الدراسة ا...,0,0,0,0,0,0,0,0,0,0


In [182]:
# Total number of tweets across all languages.
emotion_big_df.shape[0]

10050813

In [183]:
# Write the combined table into the current working directory; the row index
# is not saved.
emotion_big_df.to_csv('all_tweets_with_emotions.csv', index=False)