## Training Model for Multiple Languages:

In [73]:
# LOADING TOKENIZED REPRESENTATION
import pandas as pd
tokenized = pd.read_csv('../../Project_Backup/BigData/Unannotated_Representation/de/Unannotated_Representation1_sw.csv',encoding="ISO-8859-1")
tokenized1 = pd.read_csv('../../Project_Backup/BigData/Unannotated_Representation/de/Unannotated_Representation2_sw.csv',encoding="ISO-8859-1")
tokenized2 = pd.read_csv('../../Project_Backup/BigData/Unannotated_Representation/de/Unannotated_Representation3_sw.csv',encoding="ISO-8859-1")

from tqdm import tqdm
import ast
tokenized_lemma = tokenized['Tokenized Lemmatized']
tokenized_lemmatized_tweets = []
for i in tqdm(range(0, len(tokenized_lemma))):
    result = ast.literal_eval(tokenized_lemma[i])
    tokenized_lemmatized_tweets.append(result)

100%|███████████████████████████████| 331035/331035 [00:16<00:00, 19806.63it/s]


In [74]:
# PARAMETERS TO BE TUNED:

# Word vector dimensionality                      
# Minimum word count                        
# Number of threads to run in parallel
# Context window size                                                                                    
# Downsample setting for frequent words

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO)

# Set values for various parameters
num_features = 300    # Word vector dimensionality
min_word_count = 1  # Minimum word count
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size
downsampling = 1e-3   # Downsample setting for frequent words

# Initialize and train the model (this will take some time)
from gensim.models import word2vec
print ("Training model...")
model = word2vec.Word2Vec(tokenized_lemmatized_tweets, workers=num_workers,
            size=num_features, min_count = min_word_count,
            window = context, sample = downsampling)

# If you don't plan to train the model any further, calling
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# It can be helpful to create a meaningful model name and
# save the model for later use. You can load it later using Word2Vec.load()
model_name = "../../Project_Backup/BigData/Models/whole_tr_model"
model.save(model_name)

Training model...


2017-01-29 16:11:15,372 : INFO : collecting all words and their counts
2017-01-29 16:11:15,372 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-01-29 16:11:15,403 : INFO : PROGRESS: at sentence #10000, processed 84441 words, keeping 29071 word types
2017-01-29 16:11:15,434 : INFO : PROGRESS: at sentence #20000, processed 170345 words, keeping 49332 word types
2017-01-29 16:11:15,481 : INFO : PROGRESS: at sentence #30000, processed 255844 words, keeping 65974 word types
2017-01-29 16:11:15,528 : INFO : PROGRESS: at sentence #40000, processed 338568 words, keeping 80257 word types
2017-01-29 16:11:15,559 : INFO : PROGRESS: at sentence #50000, processed 413664 words, keeping 91880 word types
2017-01-29 16:11:15,590 : INFO : PROGRESS: at sentence #60000, processed 493851 words, keeping 104104 word types
2017-01-29 16:11:15,622 : INFO : PROGRESS: at sentence #70000, processed 569217 words, keeping 115007 word types
2017-01-29 16:11:15,668 : INFO : PROGRESS: at

In [1]:
from EmotionAnalysis.SentSemanticModule import *
from EmotionAnalysis.SentTweetModule import *

In [75]:
import pandas as pd
tokenized = pd.read_csv('../../Project_Backup/BigData/Unannotated_Representation/tr/Unannotated_Representation1_sw.csv',encoding="ISO-8859-1")

from tqdm import tqdm
import ast

nava_repr = list(tokenized['Nava without Stop Words'])
nava_repr[0]

"['hb']"

In [76]:
print ("Convert nava tweets")
# Convert nava_tweets 
nava_tweets = []
for i in tqdm(range(0, len(nava_repr))):
    result = ast.literal_eval(nava_repr[i])
    nava_tweets.append(result)
    #nava_tweets.append(nava_repr[i][1:len(nava_repr[i])-1].split(', '))
nava_tweets[0]

Convert nava tweets


100%|███████████████████████████████| 331035/331035 [00:09<00:00, 36335.72it/s]


['hb']

In [77]:
import pickle
print ("Loading Lexicon")
with open('NRCLexicon/turkish_lexicon.pickle', 'rb') as handle:
    lexicon_dict = pickle.load(handle)

Loading Lexicon


In [78]:
representative_set = []
for i in range(0,10):
    representative_set_sub = []
    for word in lexicon_dict.keys():
        if lexicon_dict[word][i] == 1: 
            representative_set_sub.append(word)
    representative_set.append(representative_set_sub)

In [79]:
lexicon_df = pd.DataFrame()
lexicon_df[0] = representative_set[0]
for i in range(1,10):
    df = pd.DataFrame()
    df[i] = representative_set[i]
    lexicon_df= pd.concat([lexicon_df,df],ignore_index=True,axis=1)

In [80]:
lexicon_df.to_csv('NRCLexicon/turkish_nrc.csv',index=False)
lexicon_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,kayıtsızlık,forewarned,kayıtsızlık,kayıtsızlık,faydalı,kayıtsızlık,üretici,kayıtsızlık,ortaya çıkarmak,haysiyet
1,aptallık,özgürlük,sapık,Alarm,yeşil,zoraki,haysiyet,aptallık,mistik,dayanak noktası
2,tütsü,macera,aptallık,hoşlanmama,özgürlük,sahipsiz,dayanak noktası,hariç,Alarm,faydalı
3,hoşlanmama,far,hoşlanmama,şanssızlık,muzaffer,meme,faydalı,körü körüne,özgürlük,yeşil
4,kovulma,tükenme,kovulma,kovulma,mezuniyet,sakınca,yeşil,apse,dava,sade
5,öfkeli,mezuniyet,kolera,pus,yürekli,sapık,şövalyelik,kasvetli,mezuniyet,kozmopolit
6,günah,piyango,çürüme,kolera,maaş,anayasaya aykırı,kozmopolit,kabul edilemez,hırsız,bayım
7,antitez,Komşuluk,saygısız,forewarned,tutkulu,yama,Bilişsel,şanssızlık,kararsız,sıkılık
8,dikkatsizlik,izlemek,dava,çürüme,çikolata,karartmak,ağırbaşlı,kovulma,kilitlenme,özgürlük
9,saygısız,hemen,kürtaj,saygısız,canlanma,aptallık,temiz,düşmek,pop,holding


In [81]:
lexicon_df = pd.read_csv('NRCLexicon/turkish_nrc.csv',encoding='ISO-8859-1')

In [9]:
###### STEP 3: Loading Word2Vec Model:
#print ("Loading Word2Vec")
#from gensim.models import word2vec
#model = word2vec.Word2Vec.load('../../Project_Backup/BigData/Models/whole_it_model')

Loading Word2Vec


In [82]:
model.similarity('insulto','pirata')

KeyError: 'insulto'

In [83]:
def compute_matrix_sentences_list_word2vec(nava_words, nrc_lexicon,model):
    """

    :param word2vec model:
    :param nava_words: we can pass any version of the bag of words
    :param nrc_lexicon:
    :return:
    """

    sm_list = list_nrc_lexicon(nrc_lexicon)
    emotions = nrc_lexicon.columns.values
    matrix_sentences_list = []
    for i in tqdm(range(0, len(nava_words))): # Iterate over all sentences
        " Initialize matrix for each sentence "
        w, h = len(nava_words[i]), 10
        matrix_sentence = [[0 for x in range(w)] for y in range(h)]
        k = 0
        for word in nava_words[i]: # Iterate over all words in the sentence
            j = 0
            for emotion in range(0, len(emotions)): # Iterate over all emotions => fill in the emotional vectors for all words
                total_similarity = 0
                for representative_word in sm_list[emotion][0:10]:
                    r = len(sm_list[emotion])
                    if word in model and representative_word in model:
                        total_similarity += model.similarity(word, representative_word)
                matrix_sentence[j][k] += total_similarity / r 
                j += 1 # increment index of representative words
            k += 1 # increment index of transcript words
        # append the matrix_sentence to the global list for all sentences
        matrix_sentences_list.append(matrix_sentence)
    return matrix_sentences_list


In [84]:
nava_tweets[0]

['hb']

In [85]:
###### STEP 4: Word Level
print ("Computing word level scores")
matrix_sentences_word2vec = compute_matrix_sentences_list_word2vec(nava_tweets,lexicon_df,model)

Computing word level scores


100%|█████████████████████████████████| 331035/331035 [44:36<00:00, 123.66it/s]


In [86]:
matrix_sentences_word2vec[1]

[[0.0057116875423981039, 0.0078047429297374085],
 [0.0060167928180368145, 0.01023575849751735],
 [0.0061844762299235389, 0.0093637660441997748],
 [0.0029643438294465266, 0.0043628413126492243],
 [0.010222436134512642, 0.016063174154950687],
 [0.0021053958348313797, 0.0035463607810704862],
 [0.0020871663320264386, 0.003241831617149262],
 [0.0039944016485211065, 0.0063264527756776373],
 [0.013639043814702283, 0.020052841149304629],
 [0.0058338785292868529, 0.0087626098500389483]]

In [87]:
nava_tweets[0]

['hb']

In [88]:
from __future__ import division

def compute_sentence_emotion_vectors(matrix_sentences_list):
    emotion_vector_list = []
    for i in tqdm(range(0, len(matrix_sentences_list))):
        sum_sentence = []
        ids = [0,1,2,3,4,7,8,9]
        for j in ids: # for each emotion
            sum_words = 0
            for k in range(0, len(matrix_sentences_list[i][j])):
                sum_words += matrix_sentences_list[i][j][k]*1000
            r = len(matrix_sentences_list[i])
            if r != 0 :
                sum_words = sum_words / r # Arithmetic mean
            sum_sentence.append(sum_words)
        emotion_vector_list.append(sum_sentence)
    return emotion_vector_list

def compute_sentence_sentiment_vectors(matrix_sentences_list):
    emotion_vector_list = []
    for i in tqdm(range(0, len(matrix_sentences_list))):
        sum_sentence = []
        ids = [5,6]
        for j in ids: # for each emotion
            sum_words = 0
            for k in range(0, len(matrix_sentences_list[i][j])):
                sum_words += matrix_sentences_list[i][j][k]*1000
            r = len(matrix_sentences_list[i])
            if r != 0 : 
                sum_words = sum_words / r
            sum_sentence.append(sum_words)
        emotion_vector_list.append(sum_sentence)
    return emotion_vector_list

def compute_emotionalities(sentence_vectors):
    emotionalities = []
    threshold = 0 # THRESHOLD PARAMETER TO BE FINE TUNED (0 for lexicon, 0.2 for pmi)
    for i in tqdm(range(0,len(sentence_vectors))):
        sentence_vector = sentence_vectors[i]
        mylist = [0 if math.isnan(x) else x for x in sentence_vector]
        if (max(mylist) > threshold): #Threshold 
            emotionalities.append(sentence_vectors[i].index(max(mylist)))
        else: 
            emotionalities.append(8)
    return emotionalities

def compute_sentiments(sentence_vectors_sent,emotionalities):
    sentiments = []
    threshold = 0 # THRESHOLD PARAMETER TO BE FINE TUNED (0 for lexicon, 0.2 for pmi)
    for i in tqdm(range(0,len(sentence_vectors_sent))):
        sentence_vector = sentence_vectors_sent[i]
        mylist = [0 if math.isnan(x) else x for x in sentence_vector]
        if (max(mylist) > threshold): #Threshold 
            sentiments.append(sentence_vectors_sent[i].index(max(mylist)))
        else:
            # To increase Recall, we also use emotionalities, in case a tweet is neutral
            if emotionalities[i] in [0,2,3,5]:
                sentiments.append(0) # Negative Emotion
            if emotionalities[i] in [1,4,6,7]:
                sentiments.append(1) # Positive Emotion
            if emotionalities[i] == 8:
                sentiments.append(2) # Otherwise, we just return Neutral
    return sentiments

In [89]:
###### STEP 5: Sentence Level:
print ("Computing Emotionalities")
# Emotion Recognition
sentence_vectors_word2vec = compute_sentence_emotion_vectors(matrix_sentences_word2vec)

emotionalities = compute_emotionalities(sentence_vectors_word2vec)


# Sentiment Analysis
print ("Computing sentiments")
sentence_vectors_sent = compute_sentence_sentiment_vectors(matrix_sentences_word2vec)

sentiments = compute_sentiments(sentence_vectors_sent,emotionalities)

###### FINAL STEP 6: Storing Emotion + Sentiment for each tweet

emo_dict = {
    0: 'Anger',
    1: 'Anticipation',
    2: 'Disgust',
    3: 'Fear',
    4: 'Joy',
    5: 'Sadness',
    6: 'Surprise',
    7: 'Trust',
    8: 'Neutral'
}
sent_dict = {
    0: "Negative",
    1: "Positive",
    2: "Neutral"
}

emotions = []
senti = []
for i in range(0,len(emotionalities)):
    emotions.append(emo_dict[emotionalities[i]])
    senti.append(sent_dict[sentiments[i]])

Computing Emotionalities


100%|███████████████████████████████| 331035/331035 [00:13<00:00, 24900.24it/s]
100%|██████████████████████████████| 331035/331035 [00:01<00:00, 293812.79it/s]


Computing sentiments


100%|██████████████████████████████| 331035/331035 [00:03<00:00, 101289.75it/s]
100%|██████████████████████████████| 331035/331035 [00:00<00:00, 484557.23it/s]


In [90]:
sentence_vectors_word2vec[4]

[1.4284738775261361,
 1.9759870158033888,
 1.7182788249045373,
 0.74582752934429064,
 3.1997340641994776,
 1.08907041512743,
 3.839270773118701,
 1.6812337177804835]

In [91]:
print ("Storing in dataframe")
word2vec_results_df = pd.DataFrame()

word2vec_results_df['Nava Tweet'] = nava_tweets

word2vec_results_df['Emotion Vectors'] = sentence_vectors_word2vec

word2vec_results_df['Emotion'] = emotions

word2vec_results_df['Sentiment Vectors'] = sentence_vectors_sent

word2vec_results_df['Sentiment'] = senti

word2vec_results_df.to_csv('../../Project_Backup/BigData/Word2VecBasedResults/tr/Tweets_Labelled_Word2Vec.csv')

Storing in dataframe


In [66]:
word2vec_results_df.head()

Unnamed: 0,Nava Tweet,Emotion Vectors,Emotion,Sentiment Vectors,Sentiment
0,"[iphoon, meten, nexus]","[0.884920438723, 2.30446919141, 1.0439055915, ...",Surprise,"[0.260638537939, 0.354363314745]",Negative
1,"[beginning, feel, home, eten, hangkant]","[1.23918403443, 2.73380914632, 1.25902353457, ...",Surprise,"[0.310734460599, 0.3274642242]",Negative
2,[ueeeeeeeeeeeeeooo],"[0.0674638401601, 0.135911464138, 0.0636923252...",Surprise,"[0.0186801057148, 0.0133107461327]",Positive
3,"[arzurich, developer, designer, geotaggers, al...","[3.67700226412, 8.89141266243, 4.24250307971, ...",Surprise,"[1.0491679129, 1.30976516478]",Negative
4,"[tapas, et, hoegaard, joli, apéro, chez, mathi...","[1.14908027863, 3.52168975666, 1.09623750354, ...",Joy,"[0.42267562281, 0.566264562012]",Negative
