## Training Model for Multiple Languages:

In [None]:
# Load the three German ('de') representation CSV parts and turn the stringified
# token lists stored in their 'Tokenized Lemmatized' column into real Python lists.
import ast

import pandas as pd
from tqdm import tqdm

tokenized = pd.read_csv('../../Project_Backup/BigData/Unannotated_Representation/de/Unannotated_Representation1_sw.csv', encoding="ISO-8859-1")
tokenized1 = pd.read_csv('../../Project_Backup/BigData/Unannotated_Representation/de/Unannotated_Representation2_sw.csv', encoding="ISO-8859-1")
tokenized2 = pd.read_csv('../../Project_Backup/BigData/Unannotated_Representation/de/Unannotated_Representation3_sw.csv', encoding="ISO-8859-1")

# Concatenate the raw column values of all parts, keeping file order.
tokenized_lemma = []
for part in (tokenized, tokenized1, tokenized2):
    tokenized_lemma.extend(part['Tokenized Lemmatized'])

# Each entry is the text form of a Python literal; literal_eval parses it
# without executing arbitrary code.
tokenized_lemmatized_tweets = [ast.literal_eval(raw_entry) for raw_entry in tqdm(tokenized_lemma)]

In [None]:
# Train a Word2Vec model on the lemmatized tweets and save it to disk.
#
# Tunable parameters:
#   - word vector dimensionality
#   - minimum word count
#   - number of threads to run in parallel
#   - context window size
#   - downsample setting for frequent words

import logging

# Configure logging at INFO level with timestamped messages.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

# Parameter values used for this run.
num_features = 300    # Word vector dimensionality
min_word_count = 1    # Minimum word count
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size
downsampling = 1e-3   # Downsample setting for frequent words

# Building the model trains it immediately on the given sentences (slow).
from gensim.models import word2vec
print("Training model...")
model = word2vec.Word2Vec(
    tokenized_lemmatized_tweets,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

# No further training is planned, so init_sims(replace=True) is called to make
# the model much more memory-efficient.
model.init_sims(replace=True)

# Persist the model under a meaningful name; it can be restored later with
# Word2Vec.load().
model_name = "../../Project_Backup/BigData/Models/whole_de_model"
model.save(model_name)

In [None]:
from EmotionAnalysis.SentSemanticModule import *
from EmotionAnalysis.SentTweetModule import *

In [17]:
# Reload representation parts 1 and 2 (part 3 is not read in this cell) and gather
# the raw 'Nava without Stop Words' entries of both into a single list.
import ast

import pandas as pd
from tqdm import tqdm

tokenized = pd.read_csv('../../Project_Backup/BigData/Unannotated_Representation/de/Unannotated_Representation1_sw.csv', encoding="ISO-8859-1")
tokenized1 = pd.read_csv('../../Project_Backup/BigData/Unannotated_Representation/de/Unannotated_Representation2_sw.csv', encoding="ISO-8859-1")

nava_repr = [
    raw_tweet
    for part in (tokenized, tokenized1)
    for raw_tweet in part['Nava without Stop Words']
]
# Show the first raw entry (still a string at this point).
nava_repr[0]

'[feierabend, vfb, at, home, sweet, home]'

In [18]:
print ("Convert nava tweets")


def _parse_unquoted_token_list(raw):
    """Split a string such as '[a, b, c]' into ['a', 'b', 'c'].

    The entries are not valid Python literals (the tokens are unquoted), so the
    first and last character are dropped and the rest is split on ', '.

    :param raw: string form of one tweet, wrapped in one leading and one
        trailing bracket character
    :return: list of token strings; an empty list for '[]'
    """
    inner = raw[1:-1]
    # ''.split(', ') returns [''], which would give an empty tweet one phantom
    # empty-string token; return a truly empty list instead.
    return inner.split(', ') if inner else []


# Convert every raw entry of nava_repr into a list of tokens.
nava_tweets = [_parse_unquoted_token_list(raw) for raw in tqdm(nava_repr)]
nava_tweets[0]

Convert nava tweets


100%|████████████████████████████| 1000000/1000000 [00:02<00:00, 343163.26it/s]


['feierabend', 'vfb', 'at', 'home', 'sweet', 'home']

In [None]:
# Read the pickled lexicon from the NRCLexicon folder into `lexicon_dict`.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
import pickle

print("Loading Lexicon")
with open('NRCLexicon/german_lexicon.pickle', mode='rb') as lexicon_file:
    lexicon_dict = pickle.load(lexicon_file)

In [None]:
# For each of the 10 lexicon positions, collect the words whose entry at that
# position equals 1. Result: a list of 10 word lists, in position order.
representative_set = [
    [word for word, flags in lexicon_dict.items() if flags[position] == 1]
    for position in range(10)
]

In [None]:
# Put the 10 word lists side by side as columns 0..9 of one table. The lists
# may differ in length; concatenating along axis=1 aligns them on the row index,
# so shorter columns are padded with missing values.
lexicon_columns = [
    pd.DataFrame({position: representative_set[position]})
    for position in range(10)
]
lexicon_df = pd.concat(lexicon_columns, axis=1, ignore_index=True)

In [None]:
# Write the lexicon table to CSV (index column omitted), then display the table.
lexicon_df.to_csv('NRCLexicon/german_nrc.csv',index=False)
lexicon_df

In [19]:
# Read the lexicon table back from CSV; this rebinds `lexicon_df`.
# NOTE(review): the file is decoded as ISO-8859-1 here, while the to_csv call that
# writes 'NRCLexicon/german_nrc.csv' in this notebook passes no encoding. Confirm
# both sides agree, otherwise non-ASCII words may come back garbled.
lexicon_df = pd.read_csv('NRCLexicon/german_nrc.csv',encoding='ISO-8859-1')

In [20]:
###### STEP 3: Loading Word2Vec Model:
# Restores the model from the same path the training cell of this notebook saves to,
# so training does not have to be repeated; this rebinds `model`.
print ("Loading Word2Vec")
from gensim.models import word2vec
model = word2vec.Word2Vec.load('../../Project_Backup/BigData/Models/whole_de_model')

Loading Word2Vec


In [None]:
# Ad-hoc sanity check: similarity between two probe words in the loaded model.
# NOTE(review): the model loaded in this notebook is 'whole_de_model', but these probe
# words do not look German; presumably left over from another language run.
# Confirm both words are in the vocabulary before relying on this cell.
model.similarity('insulto','pirata')

In [24]:
def compute_matrix_sentences_list_word2vec(nava_words, nrc_lexicon,model):
    """
    Score every word of every sentence against each lexicon category.

    For one word and one category the score is:

    * 10 when the word itself is listed in the category;
    * otherwise the sum of ``model.similarity`` between the word and the
      category's first 10 listed words (pairs with a member missing from the
      model contribute nothing), divided by the size of the category's full
      word list. An empty category scores 0.

    :param nava_words: sequence of sentences, each a list of word strings
        (we can pass any version of the bag of words)
    :param nrc_lexicon: lexicon table whose columns are the categories; it is
        turned into per-category word lists by ``list_nrc_lexicon``
    :param model: word2vec model supporting ``word in model`` and
        ``model.similarity(a, b)``
    :return: list with one matrix per sentence, indexed
        ``[category][word position]``; each matrix has at least 10 rows
    """

    sm_list = list_nrc_lexicon(nrc_lexicon)
    emotions = nrc_lexicon.columns.values
    n_emotions = len(emotions)
    # The matrix always had 10 rows; only grow it when the lexicon has more
    # columns, which previously ended in an IndexError.
    n_rows = max(10, n_emotions)

    # Loop-invariant per-category data, computed once instead of once per word.
    # NOTE(review): the divisor is the size of the whole category list although
    # only the first 10 words are compared. Kept as is; confirm it is intended.
    category_sizes = [len(sm_list[emotion]) for emotion in range(n_emotions)]
    representatives = [
        [rep for rep in sm_list[emotion][0:10] if rep in model]
        for emotion in range(n_emotions)
    ]

    def _scores_for(word):
        # One score per category for a single word (see the rules above).
        in_vocabulary = word in model
        scores = []
        for emotion in range(n_emotions):
            if word in sm_list[emotion]:
                scores.append(10)
            elif category_sizes[emotion] == 0:
                # Empty category: nothing to compare with, and no valid
                # divisor (this used to hit an unbound or stale `r`).
                scores.append(0)
            else:
                total_similarity = 0
                if in_vocabulary:
                    for representative_word in representatives[emotion]:
                        total_similarity += model.similarity(word, representative_word)
                scores.append(total_similarity / category_sizes[emotion])
        return scores

    # A word's scores depend only on the word, so repeated words are scored once.
    score_cache = {}

    matrix_sentences_list = []
    for i in tqdm(range(0, len(nava_words))): # Iterate over all sentences
        sentence = nava_words[i]
        # Initialize matrix for each sentence: one row per category, one column per word
        matrix_sentence = [[0 for _ in range(len(sentence))] for _ in range(n_rows)]
        for k, word in enumerate(sentence): # Iterate over all words in the sentence
            scores = score_cache.get(word)
            if scores is None:
                scores = score_cache[word] = _scores_for(word)
            for j, score in enumerate(scores):
                matrix_sentence[j][k] += score
        # append the matrix_sentence to the global list for all sentences
        matrix_sentences_list.append(matrix_sentence)
    return matrix_sentences_list


In [22]:
# Display the first converted tweet (a list of tokens).
nava_tweets[0]

['feierabend', 'vfb', 'at', 'home', 'sweet', 'home']

In [None]:
###### STEP 4: Word Level
# Scores every word of every tweet against the lexicon table using the loaded
# Word2Vec model; the result holds one matrix per tweet. Long-running: the
# progress bar recorded with this cell estimated more than an hour.
print ("Computing word level scores")
matrix_sentences_word2vec = compute_matrix_sentences_list_word2vec(nava_tweets,lexicon_df,model)

  1%|▎                               | 9441/1000000 [00:49<1:10:37, 233.78it/s]

In [None]:
# Display the score matrix computed for the tweet at index 1.
matrix_sentences_word2vec[1]

In [None]:
# Display the first converted tweet again, for comparison with the matrix output.
nava_tweets[0]

In [None]:
from __future__ import division

def _mean_word_scores(matrix_sentences_list, row_ids):
    """Average the per-word scores of the selected matrix rows, per sentence.

    :param matrix_sentences_list: list of sentence matrices indexed
        ``[row][word position]``
    :param row_ids: indices of the rows to aggregate, in output order
    :return: one list per sentence holding, for each requested row, the mean of
        ``score * 1000`` over the words of the sentence (0 when the sentence
        has no words)
    """
    vector_list = []
    for i in tqdm(range(0, len(matrix_sentences_list))):
        matrix_sentence = matrix_sentences_list[i]
        sentence_vector = []
        for j in row_ids:
            word_scores = matrix_sentence[j]
            sum_words = 0
            for score in word_scores:
                sum_words += score * 1000
            # Arithmetic mean over the words of the sentence. The divisor used
            # to be the number of matrix rows, which is the same for every
            # sentence, so neither the mean nor the zero guard worked.
            n_words = len(word_scores)
            if n_words != 0:
                sum_words = sum_words / n_words
            sentence_vector.append(sum_words)
        vector_list.append(sentence_vector)
    return vector_list


def compute_sentence_emotion_vectors(matrix_sentences_list):
    """Build the 8-value emotion vector of every sentence.

    Uses matrix rows 0-4 and 7-9; rows 5 and 6 are handled by
    ``compute_sentence_sentiment_vectors``.

    :param matrix_sentences_list: list of sentence matrices indexed
        ``[row][word position]`` with at least 10 rows
    :return: list of 8-element lists, one per sentence
    """
    return _mean_word_scores(matrix_sentences_list, [0,1,2,3,4,7,8,9])


def compute_sentence_sentiment_vectors(matrix_sentences_list):
    """Build the 2-value sentiment vector of every sentence.

    Uses matrix rows 5 and 6, in that order.

    :param matrix_sentences_list: list of sentence matrices indexed
        ``[row][word position]`` with at least 7 rows
    :return: list of 2-element lists, one per sentence
    """
    return _mean_word_scores(matrix_sentences_list, [5,6])

def compute_emotionalities(sentence_vectors):
    """Pick the dominant emotion index of every sentence.

    NaN entries are treated as 0. A sentence whose largest value does not
    exceed the threshold, or whose vector is empty, gets the label 8.

    :param sentence_vectors: list of numeric emotion vectors, one per sentence
    :return: list of integer labels: the position of the largest value, or 8
    """
    # Local import: no visible cell of this notebook imports math explicitly.
    import math

    emotionalities = []
    threshold = 0 # THRESHOLD PARAMETER TO BE FINE TUNED (0 for lexicon, 0.2 for pmi)
    for i in tqdm(range(0,len(sentence_vectors))):
        cleaned = [0 if math.isnan(x) else x for x in sentence_vectors[i]]
        # An empty vector has no maximum; treat it like "nothing above threshold".
        if cleaned and max(cleaned) > threshold: #Threshold
            # Search the NaN-free copy so the lookup always finds the maximum.
            emotionalities.append(cleaned.index(max(cleaned)))
        else:
            emotionalities.append(8)
    return emotionalities

def compute_sentiments(sentence_vectors_sent,emotionalities):
    """Pick the sentiment label of every sentence.

    NaN entries are treated as 0. When the largest sentiment value exceeds the
    threshold, its position is the label. Otherwise the label is derived from
    the sentence's emotion label: 0 for emotions 0, 2, 3, 5; 1 for emotions
    1, 4, 6, 7; 2 for anything else (including 8).

    :param sentence_vectors_sent: list of numeric sentiment vectors, one per
        sentence
    :param emotionalities: list of emotion labels aligned with
        ``sentence_vectors_sent``
    :return: list of integer labels, exactly one per sentence
    """
    # Local import: no visible cell of this notebook imports math explicitly.
    import math

    negative_emotions = (0, 2, 3, 5)
    positive_emotions = (1, 4, 6, 7)

    sentiments = []
    threshold = 0 # THRESHOLD PARAMETER TO BE FINE TUNED (0 for lexicon, 0.2 for pmi)
    for i in tqdm(range(0,len(sentence_vectors_sent))):
        cleaned = [0 if math.isnan(x) else x for x in sentence_vectors_sent[i]]
        if cleaned and max(cleaned) > threshold: #Threshold
            sentiments.append(cleaned.index(max(cleaned)))
        # To increase Recall, we also use emotionalities, in case a tweet is neutral.
        # The chain is exclusive with a final else so every sentence yields exactly
        # one label and the output stays aligned with the input.
        elif emotionalities[i] in negative_emotions:
            sentiments.append(0) # Negative Emotion
        elif emotionalities[i] in positive_emotions:
            sentiments.append(1) # Positive Emotion
        else:
            sentiments.append(2) # Otherwise, we just return Neutral
    return sentiments

In [None]:
###### STEP 5: Sentence Level:
# Emotion recognition: aggregate the word-level matrices into one emotion vector
# per tweet, then reduce each vector to a single emotion label.
print("Computing Emotionalities")
sentence_vectors_word2vec = compute_sentence_emotion_vectors(matrix_sentences_word2vec)
emotionalities = compute_emotionalities(sentence_vectors_word2vec)

# Sentiment analysis: same two steps for the sentiment rows; the emotion labels
# are passed along as a fallback.
print("Computing sentiments")
sentence_vectors_sent = compute_sentence_sentiment_vectors(matrix_sentences_word2vec)
sentiments = compute_sentiments(sentence_vectors_sent, emotionalities)

###### FINAL STEP 6: Storing Emotion + Sentiment for each tweet

# Human-readable names for the numeric emotion labels.
emo_dict = {
    0: 'Anger',
    1: 'Anticipation',
    2: 'Disgust',
    3: 'Fear',
    4: 'Joy',
    5: 'Sadness',
    6: 'Surprise',
    7: 'Trust',
    8: 'Neutral',
}
# Human-readable names for the numeric sentiment labels.
sent_dict = {
    0: "Negative",
    1: "Positive",
    2: "Neutral",
}

# Translate the numeric labels of every tweet into their names.
emotions = [emo_dict[label] for label in emotionalities]
senti = [sent_dict[sentiments[position]] for position in range(len(emotionalities))]

In [None]:
# Display the emotion vector of the tweet at index 4.
sentence_vectors_word2vec[4]

In [None]:
print ("Storing in dataframe")
# Assemble one row per tweet: the tokens, the emotion vector and label, and
# the sentiment vector and label.
word2vec_results_df = pd.DataFrame()

word2vec_results_df['Nava Tweet'] = nava_tweets

word2vec_results_df['Emotion Vectors'] = sentence_vectors_word2vec

word2vec_results_df['Emotion'] = emotions

word2vec_results_df['Sentiment Vectors'] = sentence_vectors_sent

word2vec_results_df['Sentiment'] = senti

# Every input of this notebook (tweet CSVs, model, lexicon) is the German ('de')
# one, so the labelled tweets go to the 'de' results folder. The path used to
# point at 'tr', which would have overwritten another language's results.
word2vec_results_df.to_csv('../../Project_Backup/BigData/Word2VecBasedResults/de/Tweets_Labelled_Word2Vec.csv')

In [None]:
# Preview the first rows of the results table.
word2vec_results_df.head()