# Features Explored

<h3><u>Note about the dataframes used when reading the dataset:</u></h3><br>
<b>`df[1]`</b> selects the tweet column throughout this notebook because the dataset files have no header row. If headers are added, these lookups must be replaced with the corresponding column names. The same applies to the positional indexing of other columns during feature extraction: each must be replaced with its column name.

## Hashtag Distribution

In [1]:
import pandas as pd
import os
import re
import numpy as np
from collections import Counter
import nltk
import math
import pickle
import gensim
from textblob import Sentence
import csv
import string
from nltk.util import ngrams

# Field separator shared by the tab-separated dataset and lexicon files.
delimiter = '\t'
# Root of the project's ``src`` directory; every data/resource path below is
# built relative to it. Set the KERAS_SAMPLES_SRC environment variable to run
# the notebook on another machine; the original author's path is the fallback.
dir_name = os.environ.get('KERAS_SAMPLES_SRC',
                          '/Users/nbarnaba/PycharmProjects/Keras_Samples/src')

In [2]:
def get_all_tweets_as_whole_text(df):
    """Concatenate every tweet in column 1 of ``df`` into one string.

    Each tweet is prefixed with a single space, so a non-empty result starts
    with a space; an empty frame yields ``''``.
    """
    return ''.join(' ' + tweet for tweet in df[1].values)

In [3]:
# Load the anger training split (headerless, tab-separated) and list its
# hashtags ordered from most to least frequent.
train_file_name = os.path.join(dir_name, '..', 'data', 'en_train', 'EI-reg-en_anger_train.txt')
df_anger = pd.read_csv(train_file_name, header=None, delimiter=delimiter)
anger_texts = get_all_tweets_as_whole_text(df_anger)
# Raw string: '\w' inside a plain literal is an invalid escape sequence.
anger_hashtags = [tag for tag, _ in Counter(re.findall(r'#\w+', anger_texts)).most_common()]

In [4]:
# Load the fear training split (headerless, tab-separated) and list its
# hashtags ordered from most to least frequent.
train_file_name = os.path.join(dir_name, '..', 'data', 'en_train',  'EI-reg-en_fear_train.txt')
df_fear = pd.read_csv(train_file_name, header=None, delimiter=delimiter)
fear_texts = get_all_tweets_as_whole_text(df_fear)
# Raw string: '\w' inside a plain literal is an invalid escape sequence.
fear_hashtags = [tag for tag, _ in Counter(re.findall(r'#\w+', fear_texts)).most_common()]

In [5]:
# Load the sadness training split (headerless, tab-separated) and list its
# hashtags ordered from most to least frequent.
train_file_name = os.path.join(dir_name, '..', 'data', 'en_train',  'EI-reg-en_sadness_train.txt')
df_sadness = pd.read_csv(train_file_name, header=None, delimiter=delimiter)
sadness_texts = get_all_tweets_as_whole_text(df_sadness)
# Raw string: '\w' inside a plain literal is an invalid escape sequence.
sadness_hashtags = [tag for tag, _ in Counter(re.findall(r'#\w+', sadness_texts)).most_common()]

In [6]:
# Load the joy training split (headerless, tab-separated) and list its
# hashtags ordered from most to least frequent.
train_file_name = os.path.join(dir_name, '..', 'data', 'en_train',  'EI-reg-en_joy_train.txt')
df_joy = pd.read_csv(train_file_name, header=None, delimiter=delimiter)
joy_texts = get_all_tweets_as_whole_text(df_joy)
# Raw string: '\w' inside a plain literal is an invalid escape sequence.
joy_hashtags = [tag for tag, _ in Counter(re.findall(r'#\w+', joy_texts)).most_common()]

In [7]:
# Pool the per-emotion hashtag lists. Each list holds a hashtag at most once,
# so a hashtag appears in the pooled list once per emotion that uses it.
hashtags = anger_hashtags + joy_hashtags + sadness_hashtags + fear_hashtags

In [8]:
# Keep only the hashtags that occur exactly once in the pooled list, i.e.
# those that show up in a single emotion's training set.
unique_hashtags = [tag for tag, occurrences in Counter(hashtags).most_common()
                   if occurrences == 1]

In [9]:
# Restrict each emotion's hashtags to the emotion-exclusive ones and report
# the counts. Membership is tested against a set: looking each hashtag up in
# the list made every comprehension quadratic.
_unique_hashtag_lookup = set(unique_hashtags)
anger_hashtags = [each for each in anger_hashtags if each in _unique_hashtag_lookup]
fear_hashtags = [each for each in fear_hashtags if each in _unique_hashtag_lookup]
joy_hashtags = [each for each in joy_hashtags if each in _unique_hashtag_lookup]
sadness_hashtags = [each for each in sadness_hashtags if each in _unique_hashtag_lookup]
print('Anger : '+str(len(anger_hashtags)))
print('fear : '+str(len(fear_hashtags)))
print('joy : '+str(len(joy_hashtags)))
print('sadness : '+str(len(sadness_hashtags)))
print('unique : '+str(len(unique_hashtags)))
print('total : '+str(len(hashtags)))

Anger : 483
fear : 963
joy : 529
sadness : 582
unique : 2557
total : 3363


## Unicode Emo Hashtag

In [10]:
# Read the NRC Hashtag Emotion Lexicon (columns: 0 = emotion, 1 = term,
# 2 = score) and keep only the rows for the four emotions used here.
file_name = os.path.join(dir_name, '..','resources', 'NRC-Hashtag-Emotion-Lexicon-v0.2', 'NRC-Hashtag-Emotion-Lexicon-v0.2.txt')
df_emo_hashtag = pd.read_csv(file_name, header=None, delimiter=delimiter)
df_emo_hashtag = df_emo_hashtag[df_emo_hashtag[0].isin(['sadness', 'anger', 'fear', 'joy'])]
# Sanity check: score of the term 'pissed' under 'anger'.
print(df_emo_hashtag.loc[(df_emo_hashtag[0] == 'anger') & (df_emo_hashtag[1] == 'pissed'), 2].values[0])
print(df_emo_hashtag)

1.1371909298
          0                 1         2
3908   fear         #westbank  1.951954
3909   fear     #apprehension  1.951954
3910   fear            #su4mh  1.951954
3911   fear          aaaaaaah  1.951954
3912   fear              #ied  1.951954
3913   fear        #coldsweat  1.951954
3914   fear             #isaf  1.951954
3915   fear           cryotek  1.951954
3916   fear          #rushing  1.951954
3917   fear              #shy  1.951954
3918   fear     #apprehensive  1.951954
3919   fear            #mosul  1.951954
3920   fear          #fearful  1.951954
3921   fear           #ashdod  1.951954
3922   fear    #socialanxiety  1.951954
3923   fear        #backtrack  1.951954
3924   fear         terrifies  1.951954
3925   fear   #claustrophobia  1.951954
3926   fear            qassam  1.951954
3927   fear        #hezbollah  1.951954
3928   fear          #hamas25  1.951954
3929   fear              #dfw  1.951954
3930   fear           #feared  1.951954
3931   fear          #talib

In [11]:
# ``DataFrame.as_matrix()`` was deprecated in pandas 0.23 and removed in 1.0;
# ``.values`` returns the same object-dtype ndarray on every pandas version.
emo_hashtag = df_emo_hashtag.values
emo_hashtag

array([['fear', '#westbank', 1.95195360556393],
       ['fear', '#apprehension', 1.95195360556393],
       ['fear', '#su4mh', 1.95195360556393],
       ..., 
       ['joy', 'doctor', 0.0020351827030041197],
       ['joy', 'grad', 0.00113671004597709],
       ['joy', '1000', 0.00113671004597709]], dtype=object)

In [12]:
def filter_emo_hashtag(emotion, hashtag):
    """Look up ``hashtag`` under ``emotion`` in the global ``emo_hashtag`` array.

    Returns ``None`` when no row matches, the matching row when exactly one
    does, and the matching rows concatenated end-to-end into one flat array
    when several do.
    """
    matching_rows = [row for row in emo_hashtag
                     if row[0] == emotion and row[1] == hashtag]
    if not matching_rows:
        return None
    if len(matching_rows) == 1:
        return matching_rows[0]
    return np.concatenate(matching_rows, axis=0)
filter_emo_hashtag('joy', 'grad')

array(['joy', 'grad', 0.00113671004597709], dtype=object)

In [13]:
def get_hashtags(tweet_content):
    """Return every ``#`` followed by ASCII letters found in ``tweet_content``.

    A hashtag is cut at the first character that is not an ASCII letter, so
    digits and underscores end the match.
    """
    hashtag_pattern = re.compile('#[a-zA-Z]+')
    return hashtag_pattern.findall(tweet_content)

In [14]:
def get_hashtag_emo_value(hashtag, emotion):
    """Return the lexicon score of ``hashtag`` for ``emotion``, or 0 if absent.

    The score is element 2 of whatever ``filter_emo_hashtag`` returns.
    """
    lexicon_row = filter_emo_hashtag(emotion, hashtag)
    if lexicon_row is None:
        return 0
    return lexicon_row[2]

In [15]:
def get_emot_value_from_hashtag(tweet_content, emotion):
    """Mean lexicon score for ``emotion`` over the hashtags in a tweet.

    Hashtags missing from the lexicon contribute 0 to the mean; a tweet with
    no hashtags scores 0.
    """
    found = get_hashtags(tweet_content)
    scores = [get_hashtag_emo_value(tag, emotion) for tag in found] if found else [0]
    return np.mean(scores)

## Unicode Emoticon Lexicon

In [16]:
# Unigram table of the NRC Emoticon Lexicon (headerless, tab-separated).
# Later cells use column 0 as the term and column 1 as its score.
file_name = os.path.join(dir_name, '..','resources', 'NRC-Emoticon-Lexicon-v1.0', 'Emoticon-unigrams.txt')
emoticon_lexicon_unigram_df = pd.read_csv(file_name, header=None, delimiter=delimiter)

In [17]:
# Bigram table of the NRC Emoticon Lexicon. QUOTE_NONE makes the parser treat
# quote characters inside terms as ordinary text.
file_name = os.path.join(dir_name, '..','resources', 'NRC-Emoticon-Lexicon-v1.0', 'Emoticon-bigrams.txt')
emoticon_lexicon_bigram_df = pd.read_csv(file_name, header=None, delimiter=delimiter, quoting=csv.QUOTE_NONE, encoding='utf-8')

In [18]:
def get_ngrams(each_value, n_gram_value=1):
    """Return the word n-grams of ``each_value`` as space-joined strings.

    Parameters
    ----------
    each_value : str or list of str
        Text to split on whitespace. A list of fragments is first joined with
        single spaces. (Joining with no separator, as before, fused the last
        token of one fragment with the first token of the next.)
    n_gram_value : int, default 1
        Size of the n-grams.

    Returns
    -------
    list of str
        One ``'tok1 tok2 ...'`` string per n-gram, in order of appearance.
    """
    if isinstance(each_value, list):
        each_value = ' '.join(each_value)

    return [' '.join(str(token) for token in gram)
            for gram in ngrams(each_value.split(), n_gram_value)]

In [19]:
def get_emoticon_ngram_value(df, df_emoticon_lexicon, n_gram_value=1):
    """Average lexicon score of the n-grams of each tweet.

    Parameters
    ----------
    df : DataFrame
        Dataset whose column 1 holds the tweet text.
    df_emoticon_lexicon : DataFrame
        Lexicon with the n-gram in column 0 and its score in column 1.
    n_gram_value : int, default 1
        Size of the n-grams to look up.

    Returns
    -------
    ndarray of shape (len(df), 1)
        Mean score of the tweet's n-grams found in the lexicon; 0 when none
        is found.
    """
    # Detach a run of punctuation (e.g. an emoticon) from the word it follows.
    # The punctuation is escaped so every character is a literal class member
    # (unescaped, the backslash was consumed as an escape for ']').
    # ``regex=True`` is explicit because pandas 2.0 made literal replacement
    # the default.
    pattern = r"([a-zA-Z]+)([" + re.escape(string.punctuation) + r"]+[^a-zA-Z])"
    tweets = (df[1]
              .str.replace(pattern, r'\1 \2', regex=True)
              .str.strip()
              .str.lower())

    # One pass over the lexicon instead of a DataFrame scan per n-gram.
    # ``setdefault`` keeps the first score of a duplicated term, matching the
    # former ``iloc[0]`` lookup.
    score_by_ngram = {}
    for term, score in zip(df_emoticon_lexicon[0], df_emoticon_lexicon[1]):
        score_by_ngram.setdefault(term, score)

    lexicon_scores_list = []
    for tweet in tweets:
        matched = [score_by_ngram[gram]
                   for gram in get_ngrams(tweet, n_gram_value)
                   if gram in score_by_ngram]
        lexicon_scores_list.append(sum(matched) / len(matched) if matched else 0)
    return np.array(lexicon_scores_list).reshape(len(lexicon_scores_list), 1)

## Phrase Vector (by averaging the constituent word vectors)

In [20]:
def load_word_vectors():
    """Load the binary GoogleNews word2vec vectors from the resources folder.

    The file is located relative to the module-level ``dir_name``. A message
    is printed before and after the load.
    """
    vectors_path = os.path.join(dir_name, '..', 'resources', 'word2vec','GoogleNews-vectors-negative300.bin')
    print("Loading the data file... Please wait...")
    loaded_vectors = gensim.models.KeyedVectors.load_word2vec_format(vectors_path, binary=True)
    print("Successfully loaded 3.6 G bin file!")
    return loaded_vectors

word_model = load_word_vectors()

Loading the data file... Please wait...
Successfully loaded 3.6 G bin file!


In [21]:
def get_phrase_vector_obj(value):
    """Wrap ``value`` in a ``PhraseVector``; a plain function so it can be
    passed to ``np.vectorize``."""
    return PhraseVector(value)

In [22]:
class PhraseVector:
    """A phrase together with its averaged word vector and grouped POS tags.

    Attributes
    ----------
    phrase : str
        The text as given.
    vector : ndarray or float
        Mean of the word vectors of the phrase's in-vocabulary words, or NaN
        when no word is found in the global ``word_model``.
    pos_tag : list of (str, str)
        Runs of consecutive words sharing a universal POS tag, as
        ``(lower-cased words joined by spaces, tag)`` pairs.
    """

    def __init__(self, phrase):
        self.phrase = phrase
        self.vector = self.phrase_to_vec(phrase)
        self.pos_tag = self.get_words_in_phrase(phrase)

    @staticmethod
    def convert_vector_set_to_average(vector_set, ignore=None):
        """Average a collection of vectors.

        With no ``ignore`` weights (``None`` or empty) this is the plain mean
        over axis 0; an empty ``vector_set`` yields NaN (returned directly so
        numpy does not emit "mean of empty slice" warnings). Otherwise
        ``ignore`` holds one weight per vector and the weighted mean is
        returned.
        """
        # ``None`` replaces the former mutable ``[]`` default.
        if ignore is None or len(ignore) == 0:
            if len(vector_set) == 0:
                return np.float64('nan')
            return np.mean(vector_set, axis=0)
        return np.dot(np.transpose(vector_set), ignore) / sum(ignore)

    @staticmethod
    def get_unique_token_tags(vector1, vector2):
        """Return the distinct ``pos_tag`` entries of two phrases, in order of
        first appearance."""
        tag_list = []
        for each_tag in vector1.pos_tag + vector2.pos_tag:
            if each_tag not in tag_list:
                tag_list.append(each_tag)
        return tag_list

    def phrase_to_vec(self, phrase):
        """Mean word vector of the lower-cased, whitespace-split ``phrase``.

        Words missing from ``word_model`` are skipped; if none is present the
        result is NaN.
        """
        vector_set = []
        for each_word in phrase.lower().split():
            try:
                vector_set.append(word_model[each_word])
            except KeyError:
                # Out-of-vocabulary word: leave it out of the average.
                continue
        return self.convert_vector_set_to_average(vector_set)

    def get_cosine_similarity(self, other_vector):
        """Cosine similarity between this phrase and ``other_vector``.

        Returns 0 when the similarity is undefined (NaN, e.g. a zero or
        missing vector) or is not a scalar.
        """
        cosine_similarity = np.dot(self.vector, other_vector.vector) / (
            np.linalg.norm(self.vector) * np.linalg.norm(other_vector.vector))
        try:
            if math.isnan(cosine_similarity):
                cosine_similarity = 0
        except (TypeError, ValueError):
            # ``math.isnan`` rejects non-scalar results.
            cosine_similarity = 0
        return cosine_similarity

    def get_words_in_phrase(self, phrase):
        """Group consecutive words of ``phrase`` that share a universal POS tag.

        Returns a list of ``(words, tag)`` pairs, or ``[]`` for a blank phrase.
        """
        if phrase.strip() == '':
            return []

        tagged_input = nltk.pos_tag(phrase.split(), tagset='universal')
        first_item, prev_tag = tagged_input[0]
        # Lower-case the first word too, like every following word.
        current_group = first_item.lower()
        revised_tag = []
        for cur_item, cur_tag in tagged_input[1:]:
            cur_item = cur_item.lower()
            # Tags are strings: compare by value. Identity (``is``) only
            # worked when the two strings happened to be the same object.
            if prev_tag == cur_tag:
                current_group += ' ' + cur_item
            else:
                revised_tag.append((current_group, prev_tag))
                prev_tag = cur_tag
                current_group = cur_item
        revised_tag.append((current_group, prev_tag))
        return revised_tag

In [23]:
def get_phrase_vectors(df, emotion, vector_size=300):
    """Build the averaged word-vector matrix and the label vector for ``df``.

    Parameters
    ----------
    df : DataFrame
        Dataset with the tweet text in column 1 and the label in column 3.
    emotion : str
        Unused; kept so existing callers keep working.
    vector_size : int, default 300
        Width of the zero vector used for tweets with no in-vocabulary word.
        Must equal the dimensionality of the word vectors.

    Returns
    -------
    (ndarray of shape (len(df), vector_size), ndarray)
        Float64 tweet vectors and the raw values of column 3.
    """
    rows = []
    for text in df[1].values:
        vector = get_phrase_vector_obj(text).vector
        if np.isnan(vector).any():
            # No word of the tweet has a vector: use an all-zero row.
            rows.append(np.zeros(vector_size))
        else:
            rows.append(np.asarray(vector).reshape(-1))

    # Stack once at the end; concatenating inside the loop re-copied the whole
    # matrix for every tweet.
    if rows:
        tweet_vectors = np.vstack(rows).astype(np.float64)
    else:
        tweet_vectors = np.zeros((0, vector_size))
    labels = df[3].values

    return tweet_vectors, labels

## Polarity and Subjectivity using Textblob

In [24]:
def get_polarity_and_subjectivity(df, emotion):
    """TextBlob polarity and subjectivity of every tweet in column 1.

    ``emotion`` is accepted but not used. Returns two column vectors of shape
    ``(len(df), 1)``: polarity first, subjectivity second.
    """
    tweets = df[1].values
    polarity_list = np.array([Sentence(tweet).polarity for tweet in tweets])
    subjectivity_list = np.array([Sentence(tweet).subjectivity for tweet in tweets])

    polarity_column = np.reshape(polarity_list, (len(polarity_list), 1))
    subjectivity_column = np.reshape(subjectivity_list, (len(subjectivity_list), 1))
    return polarity_column, subjectivity_column

## Concreteness Ratings

In [25]:
# Concreteness ratings table. Read with its header row; later code uses the
# 'Word' and 'Conc.M' columns.
df_concrete = pd.read_csv(os.path.join(dir_name, '..', 'resources', 'Concreteness_ratings_Brysbaert_et_al_BRM.txt')
                 , delimiter=delimiter)

In [26]:
def get_concreteness_scores(df, df_concrete):
    """Mean concreteness rating of the rated words in each tweet.

    Parameters
    ----------
    df : DataFrame
        Dataset whose column 1 holds the tweet text.
    df_concrete : DataFrame
        Ratings table with 'Word' and 'Conc.M' columns.

    Returns
    -------
    ndarray of shape (len(df), 1)
        Average 'Conc.M' over the tweet's lower-cased, whitespace-split words
        present in the table; 0 when no word is rated.
    """
    # One pass over the table instead of a DataFrame scan per word.
    # ``setdefault`` keeps the first rating of a duplicated word, matching the
    # former ``iloc[0]`` lookup, and the score is read by column label rather
    # than by position on a label-indexed Series.
    rating_by_word = {}
    for word, rating in zip(df_concrete['Word'], df_concrete['Conc.M']):
        rating_by_word.setdefault(word, rating)

    concreteness_values = []
    for tokens in df[1].str.lower().str.split():
        if not isinstance(tokens, list):
            # Missing tweet text (NaN) has no words.
            tokens = []
        ratings = [rating_by_word[token] for token in tokens if token in rating_by_word]
        concreteness_values.append(sum(ratings) / len(ratings) if ratings else 0)
    return np.array(concreteness_values).reshape(len(concreteness_values), 1)

## NRC-Emoticon-AffLexNegLex-v1.0 - Unigrams

In [27]:
# Unigram table of the NRC Emoticon AffLexNegLex lexicon (headerless,
# tab-separated).
resource_file_name = os.path.join(dir_name, '..', 'resources', 'NRC-Emoticon-AffLexNegLex-v1.0', 'Emoticon-AFFLEX-NEGLEX-unigrams.txt')
df_emoticon_affneg_uni = pd.read_csv(resource_file_name, header=None, delimiter=delimiter)

In [28]:
# Bigram table of the NRC Emoticon AffLexNegLex lexicon. QUOTE_NONE makes the
# parser treat quote characters inside terms as ordinary text.
resource_file_name = os.path.join(dir_name, '..', 'resources', 'NRC-Emoticon-AffLexNegLex-v1.0', 'Emoticon-AFFLEX-NEGLEX-bigrams.txt')
df_emoticon_affneg_bi = pd.read_csv(resource_file_name, header=None, delimiter=delimiter, quoting=csv.QUOTE_NONE, encoding='utf-8')

`get_emoticon_ngram_value()` function is re-used for calculating the emoticon-affLexNegLex features

## NRC-Hashtag-Sentiment-AffLexNegLex-v1.0 - Unigrams

In [29]:
# Unigram table of the NRC Hashtag Sentiment AffLexNegLex lexicon (headerless,
# tab-separated).
resource_file_name = os.path.join(dir_name, '..', 'resources', 'NRC-Hashtag-Sentiment-AffLexNegLex-v1.0', 'HS-AFFLEX-NEGLEX-unigrams.txt')
df_nrchashtag_afflexneglex_unigrams = pd.read_csv(resource_file_name, header=None, delimiter=delimiter)

In [30]:
def get_unigram_hashtag_afflexneglex(df, df_nrchashtag_afflexneglex_unigrams):
    """Mean lexicon score of the hashtags in each tweet.

    Parameters
    ----------
    df : DataFrame
        Dataset whose column 1 holds the tweet text.
    df_nrchashtag_afflexneglex_unigrams : DataFrame
        Lexicon with the term in column 0 and its score in column 1.

    Returns
    -------
    ndarray of shape (len(df), 1)
        Average score of the tweet's ``#word`` tokens found in the lexicon;
        0 when none is found.
    """
    # One pass over the lexicon instead of a DataFrame scan per hashtag; the
    # first score of a duplicated term wins, as with the former ``values[0]``.
    score_by_term = {}
    for term, score in zip(df_nrchashtag_afflexneglex_unigrams[0],
                           df_nrchashtag_afflexneglex_unigrams[1]):
        score_by_term.setdefault(term, score)

    hashtag_afflexneg_uni_features = []
    # Raw string: '\w' inside a plain literal is an invalid escape sequence.
    for tweet_hashtags in df[1].str.findall(r'#\w+').values:
        if not isinstance(tweet_hashtags, list):
            # Missing tweet text (NaN) has no hashtags.
            tweet_hashtags = []
        matched = [score_by_term[tag] for tag in tweet_hashtags if tag in score_by_term]
        hashtag_afflexneg_uni_features.append(sum(matched) / len(matched) if matched else 0)
    return np.array(hashtag_afflexneg_uni_features).reshape(len(hashtag_afflexneg_uni_features), 1)

## NRC-Hashtag-Sentiment-AffLexNegLex - Bigrams

`def get_emoticon_afflex_bigram_value(df, df_emoticon_lexicon)` may miss some matches (errors of omission): the regular expression it uses has not been rigorously tested.

In [31]:
# Bigram table of the NRC Hashtag Sentiment AffLexNegLex lexicon.
file_name = os.path.join(dir_name, '..','resources', 'NRC-Hashtag-Sentiment-AffLexNegLex-v1.0', 'HS-AFFLEX-NEGLEX-bigrams.txt')
df_hashtag_senti_afflex_bi = pd.read_csv(file_name, header=None, delimiter=delimiter, quoting=csv.QUOTE_NONE, encoding='utf-8')
# Normalise the terms (lower-case, no surrounding whitespace) for lookup.
df_hashtag_senti_afflex_bi[0]  = df_hashtag_senti_afflex_bi[0].str.lower().str.strip()

In [32]:
def get_emoticon_afflex_bigram_value(df, df_emoticon_lexicon):
    """Average lexicon score of the punctuation/hashtag bigrams of each tweet.

    Parameters
    ----------
    df : DataFrame
        Dataset whose column 1 holds the tweet text.
    df_emoticon_lexicon : DataFrame
        Lexicon with the bigram in column 0 and its score in column 1.

    Returns
    -------
    ndarray of shape (len(df), 1)
        Mean score of the extracted bigrams found in the lexicon; 0 when none
        is found.
    """
    # ``regex=True`` is explicit because pandas 2.0 made literal replacement
    # the default for ``Series.str.replace``.
    fragments = (
        df[1]
        # Put spaces around a run of symbols that directly follows a word.
        .str.replace(r"([a-zA-Z]+)([^\w\s]+)([^a-zA-Z])", r'\1 \2 \3', regex=True)
        .str.strip()
        .str.lower()
        # Symbols, then one or more hashtags, optionally followed by symbols.
        .str.findall(r'[^\w\s]+(?:\s*#[a-zA-Z]+)+(?:\s*[^\w\s]+)?')
    )

    # One pass over the lexicon instead of a DataFrame scan per bigram; the
    # first score of a duplicated term wins, as with the former ``iloc[0]``.
    score_by_bigram = {}
    for term, score in zip(df_emoticon_lexicon[0], df_emoticon_lexicon[1]):
        score_by_bigram.setdefault(term, score)

    lexicon_scores_list = []
    for tweet_fragments in fragments:
        matched = [score_by_bigram[gram]
                   for gram in get_ngrams(tweet_fragments, 2)
                   if gram in score_by_bigram]
        lexicon_scores_list.append(sum(matched) / len(matched) if matched else 0)
    return np.array(lexicon_scores_list).reshape(len(lexicon_scores_list), 1)

## NRC Sentiment Hashtag Sentiment Lexicon

`get_unigram_hashtag_afflexneglex` is used to calculate the unigram value for this feature. 

`get_emoticon_afflex_bigram_value` is used to calculate the bigram value for this feature.

In [33]:
# Unigram table of the NRC Hashtag Sentiment Lexicon.
file_name = os.path.join(dir_name, '..','resources', 'NRC-Hashtag-Sentiment-Lexicon-v1.0', 'HS-unigrams.txt')
df_hashtag_senti_uni = pd.read_csv(file_name, header=None, delimiter=delimiter, quoting=csv.QUOTE_NONE, encoding='utf-8')
# Normalise the terms (lower-case, no surrounding whitespace) for lookup.
df_hashtag_senti_uni[0]  = df_hashtag_senti_uni[0].str.lower().str.strip()

In [34]:
# Bigram table of the NRC Hashtag Sentiment Lexicon.
file_name = os.path.join(dir_name, '..','resources', 'NRC-Hashtag-Sentiment-Lexicon-v1.0', 'HS-bigrams.txt')
df_hashtag_senti_bi = pd.read_csv(file_name, header=None, delimiter=delimiter, quoting=csv.QUOTE_NONE, encoding='utf-8')
# Normalise the terms (lower-case, no surrounding whitespace) for lookup.
df_hashtag_senti_bi[0]  = df_hashtag_senti_bi[0].str.lower().str.strip()

## Anew Scores

In [35]:
# Expanded ANEW norms. Read with the header row; later code uses the 'Word',
# 'V.Mean.Sum', 'A.Mean.Sum' and 'D.Mean.Sum' columns.
df_anew = pd.read_csv(
    os.path.join(dir_name, '..', 'resources', 'Warriner, Kuperman, Brysbaert - 2013 BRM-ANEW expanded.tsv'),
    delimiter=delimiter)

In [36]:
def get_anew_scores(df, df_anew):
    """Mean ANEW scores of the rated words in each tweet.

    Parameters
    ----------
    df : DataFrame
        Dataset whose column 1 holds the tweet text.
    df_anew : DataFrame
        Norms table with 'Word', 'V.Mean.Sum', 'A.Mean.Sum' and 'D.Mean.Sum'
        columns.

    Returns
    -------
    ndarray of shape (len(df), 3), dtype float64
        Per tweet, the averages of the three score columns (in the order
        above) over its lower-cased, whitespace-split words present in the
        table; a row of zeros when no word is rated.
    """
    score_columns = ['V.Mean.Sum', 'A.Mean.Sum', 'D.Mean.Sum']
    # One pass over the table instead of a DataFrame scan per word; the first
    # entry of a duplicated word wins, as with the former ``iloc[0]``. Scores
    # are stored as floats so the result is numeric rather than object-dtype.
    scores_by_word = {}
    for word, scores in zip(df_anew['Word'], df_anew[score_columns].values.astype(float)):
        scores_by_word.setdefault(word, scores)

    rows = []
    for tokens in df[1].str.lower().str.split():
        if not isinstance(tokens, list):
            # Missing tweet text (NaN) has no words.
            tokens = []
        matched = [scores_by_word[token] for token in tokens if token in scores_by_word]
        if matched:
            rows.append(np.sum(matched, axis=0) / len(matched))
        else:
            rows.append(np.zeros(3))
    # Stack once at the end; concatenating inside the loop re-copied the whole
    # matrix for every tweet.
    return np.vstack(rows) if rows else np.zeros((0, 3))

## All Features Concatenated

In [37]:
def get_features(df, emotion):
    """Compute every hand-crafted feature for ``df`` and join them column-wise.

    Parameters
    ----------
    df : DataFrame
        Dataset with the tweet text in column 1 and the label in column 3.
    emotion : str
        Emotion used for the hashtag-emotion lexicon lookup.

    Returns
    -------
    ndarray of shape (len(df), n_features)
        Columns, in order: polarity, subjectivity, tweet vector, emoticon
        lexicon (unigram, bigram), hashtag emotion, emoticon AffLexNegLex
        (unigram, bigram), hashtag AffLexNegLex (unigram, bigram),
        concreteness, ANEW scores, hashtag sentiment lexicon (unigram,
        bigram).

    Notes
    -----
    Three defects are fixed here, which adds three columns to the output:
    the emoticon AffLexNegLex unigram score was overwritten by the bigram
    score before use; the hashtag AffLexNegLex "bigram" feature re-ran the
    unigram lookup; and the hashtag sentiment lexicon features were computed
    but left out of the result. Callers that size a model from
    ``features.shape`` pick the new width up automatically.
    """
    polarity_list, subjectivity_list = get_polarity_and_subjectivity(df, emotion)
    tweet_vectors, _ = get_phrase_vectors(df, emotion)
    emoticon_lexicon_unigram_value = get_emoticon_ngram_value(df, emoticon_lexicon_unigram_df, n_gram_value=1)
    emoticon_lexicon_bigram_value = get_emoticon_ngram_value(df, emoticon_lexicon_bigram_df, n_gram_value=2)
    # ``.values`` replaces ``as_matrix()``, which was removed in pandas 1.0.
    hashtag_feature_nrc_hashtag_emoticon = np.array(
        [get_emot_value_from_hashtag(each, emotion) for each in df[1].values]
    ).reshape(len(df), 1)
    nrc_emoticon_afflexneglex_unigrams = get_emoticon_ngram_value(df, df_emoticon_affneg_uni, n_gram_value=1)
    nrc_emoticon_afflexneglex_bigrams = get_emoticon_ngram_value(df, df_emoticon_affneg_bi, n_gram_value=2)
    hashtag_afflexneg_uni_features = get_unigram_hashtag_afflexneglex(df, df_nrchashtag_afflexneglex_unigrams)
    hashtag_afflexneg_bi_features = get_emoticon_afflex_bigram_value(df, df_hashtag_senti_afflex_bi)
    concreteness_features = get_concreteness_scores(df, df_concrete)
    anew_features = get_anew_scores(df, df_anew)
    sentiment_lexicon_features_uni = get_unigram_hashtag_afflexneglex(df, df_hashtag_senti_uni)
    sentiment_lexicon_features_bi = get_emoticon_afflex_bigram_value(df, df_hashtag_senti_bi)
    return np.concatenate((polarity_list,
                           subjectivity_list,
                           tweet_vectors,
                           emoticon_lexicon_unigram_value,
                           emoticon_lexicon_bigram_value,
                           hashtag_feature_nrc_hashtag_emoticon,
                           nrc_emoticon_afflexneglex_unigrams,
                           nrc_emoticon_afflexneglex_bigrams,
                           hashtag_afflexneg_uni_features,
                           hashtag_afflexneg_bi_features,
                           concreteness_features,
                           anew_features,
                           sentiment_lexicon_features_uni,
                           sentiment_lexicon_features_bi), axis=1)

In [38]:
def get_tokenized_ndarray_from_df(df):
    """Tokenise column 1 of ``df``.

    Each tweet is lower-cased, stripped of every ``string.punctuation``
    character and split on whitespace. Returns an object array holding one
    token list per tweet.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    lowered = df[1].str.lower()
    without_punctuation = lowered.str.translate(strip_punctuation)
    return without_punctuation.str.split().values

## Recurrent Neural Network

In [39]:
from numpy import asarray
from numpy import zeros
from keras.models import Sequential
from keras.layers import *
from keras.constraints import min_max_norm
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import sys

max_length = -1

Using TensorFlow backend.


In [40]:
def get_embedding_parameters(features):
    """Derive model-sizing values from a 2-D feature matrix.

    Returns ``(rows, columns, rows, features)``: the row count serves as both
    ``vocab_size`` and ``max_length``, and the matrix itself is returned as
    the embedding matrix.
    """
    row_count, column_count = features.shape
    return row_count, column_count, row_count, features

In [41]:
def get_embedding_model(vocab_size, embedded_vector_length, embedding_matrix, max_length, 
                  optimizer='adam',
                  loss='mean_squared_logarithmic_error'):
    """Build the (uncompiled) word-embedding branch of the network.

    The branch is a frozen embedding layer initialised from
    ``embedding_matrix``, a bidirectional LSTM with 300 units per direction,
    and a 300-unit dense layer. ``optimizer`` and ``loss`` are accepted but
    unused because the branch is not compiled here. The model summary is
    printed before the model is returned.
    """
    def unit_range():
        # Constraint shared by the LSTM kernel and bias.
        return min_max_norm(min_value=-1.0, max_value=1.0, rate=0.5, axis=0)

    model = Sequential()
    model.add(Embedding(vocab_size, embedded_vector_length, weights=[embedding_matrix],
                        input_length=max_length, trainable=False))
    model.add(Bidirectional(LSTM(300, activation='relu',
                                 kernel_initializer='random_uniform',
                                 bias_initializer='zeros',
                                 kernel_constraint=unit_range(),
                                 bias_constraint=unit_range())))
    model.add(Dense(300, activation='relu'))
    print(model.summary())
    return model

In [42]:
def get_dense_model(vocab_size, embedded_vector_length, embedding_matrix, max_length, 
                  optimizer='adam',
                  loss='mean_squared_logarithmic_error'):
    """Build the (uncompiled) branch for the hand-crafted features.

    The branch is a single 300-unit ReLU dense layer taking vectors of width
    ``embedded_vector_length``. The remaining parameters are accepted but
    unused. The model summary is printed before the model is returned.
    """
    feature_layer = Dense(300,
                          activation='relu',
                          input_shape=(embedded_vector_length,))
    model = Sequential()
    model.add(feature_layer)
    print(model.summary())
    return model

In [43]:
def get_rnn_model(embedding_models,
                  optimizer='adam',
                  loss='mean_squared_logarithmic_error',
                  output_activation='relu'):
    """Join the branch models and add the single-unit regression output.

    Parameters
    ----------
    embedding_models : list of keras models
        Branches whose 2-D outputs are concatenated along axis 1. The
        combined model takes one input per branch, in the same order.
    optimizer, loss : str
        Passed to ``compile``.
    output_activation : str
        Activation of the final one-unit dense layer.

    Returns
    -------
    keras Model
        The compiled model; its summary is printed first.
    """
    # The ``Merge`` layer used previously no longer exists in current Keras;
    # the functional API builds the same concatenate-then-dense graph and
    # works on old and new releases alike.
    from keras.layers import Dense, concatenate
    from keras.models import Model

    merged = concatenate([branch.output for branch in embedding_models], axis=1)
    prediction = Dense(1, activation=output_activation,
                       kernel_initializer='random_uniform',
                       bias_initializer='zeros',
                       kernel_constraint=min_max_norm(min_value=-1.0, max_value=1.0, rate=.5, axis=0),
                       bias_constraint=min_max_norm(min_value=-1.0, max_value=1.0, rate=0.5, axis=0))(merged)
    model = Model(inputs=[branch.input for branch in embedding_models], outputs=prediction)
    # compile the model
    model.compile(optimizer=optimizer, loss=loss)
    # summarize the model
    print(model.summary())
    return model

In [44]:
def write_to_file(tweet_ids, assgn_emotions, tweet_contents, predicted_scores, file_name):
    """Write one tab-separated row per tweet to ``<dir_name>/../output/<file_name>``.

    Each row is ``id, tweet text, emotion, score``. The output directory is
    created if needed. The sequences are zipped, so writing stops at the
    shortest one.
    """
    output_dir = os.path.join(dir_name, '..', 'output')
    # ``exist_ok`` avoids the check-then-create race.
    os.makedirs(output_dir, exist_ok=True)
    # ``newline=''`` is what the csv module requires (otherwise blank rows
    # appear on Windows); UTF-8 keeps emoji and other non-ASCII tweet text
    # writable whatever the platform's default encoding is.
    with open(os.path.join(output_dir, file_name), 'w', newline='', encoding='utf-8') as f:
        file_writer = csv.writer(f, delimiter='\t')
        for each_tweet_id, each_tweet_content, each_emotion, each_score in \
                zip(tweet_ids, tweet_contents, assgn_emotions, predicted_scores):
            file_writer.writerow([each_tweet_id, each_tweet_content, each_emotion, each_score])

In [45]:
def build_vocab(df, embeddings_index=None):
    """Map every distinct token of ``df`` to an integer index.

    Parameters
    ----------
    df : DataFrame
        Dataset whose column 1 holds the tweet text.
    embeddings_index : mapping-like, optional
        If given, only tokens for which ``token in embeddings_index`` is true
        are kept (e.g. a gensim ``KeyedVectors``).

    Returns
    -------
    dict
        ``{token: index}`` with indices starting at 1, plus ``'<unk>': 0``.
    """
    unique_tokens = set()
    for each_tweet in get_tokenized_ndarray_from_df(df):
        if embeddings_index is None:
            unique_tokens.update(each_tweet)
        else:
            # Plain membership works across gensim versions, unlike the
            # version-specific ``.vocab`` attribute.
            unique_tokens.update(word for word in each_tweet if word in embeddings_index)
    # Sorted so the word -> index mapping is the same on every run; iterating
    # the set directly depends on string hash randomisation.
    vocab = {each_word: each_index + 1
             for each_index, each_word in enumerate(sorted(unique_tokens))}
    vocab['<unk>'] = 0
    return vocab

In [46]:
def build_embedding_matrix(vocab, embeddings_index, embedded_vector_length=300):
    """Build the embedding matrix for ``vocab``.

    Parameters
    ----------
    vocab : dict
        ``{word: row index}``; indices must lie in ``[0, len(vocab))``.
    embeddings_index : mapping-like
        Supports ``word in embeddings_index`` and ``embeddings_index[word]``
        (e.g. a gensim ``KeyedVectors``).
    embedded_vector_length : int, default 300
        Width of the word vectors.

    Returns
    -------
    ndarray of shape (len(vocab), embedded_vector_length)
        Row ``vocab[word]`` holds the word's vector; rows of words without a
        vector stay zero.
    """
    embedding_matrix = np.zeros((len(vocab), embedded_vector_length))
    for word, row in vocab.items():
        # Membership and item access work across gensim versions, unlike the
        # ``.syn0`` / ``.vocab[word].index`` internals.
        if word in embeddings_index:
            embedding_matrix[row] = embeddings_index[word]
    return embedding_matrix

In [47]:
def get_encode_docs(df, vocab):
    """Encode every tweet of ``df`` as a list of vocabulary indices.

    Tokens missing from ``vocab`` are encoded as 0. Returns the encoded
    tweets and the length of the longest one (-1 when there are no tweets).
    """
    encoded_docs = [[vocab.get(token, 0) for token in tweet_tokens]
                    for tweet_tokens in get_tokenized_ndarray_from_df(df)]
    max_length = max([-1] + [len(doc) for doc in encoded_docs])
    return encoded_docs, max_length

In [48]:
def get_padded_docs(df, vocab, encoded_size=None):
    """Encode the tweets of ``df`` and pad/truncate them to a common length.

    Parameters
    ----------
    df : DataFrame
        Dataset whose column 1 holds the tweet text.
    vocab : dict
        ``{word: index}`` mapping used for encoding.
    encoded_size : int, optional
        Target length. Defaults to the length of the longest encoded tweet.

    Returns
    -------
    (ndarray of shape (len(df), length), int)
        Float64 matrix of indices, right-padded with 0 or truncated to
        ``length``, and the length used. With no tweets the array is the
        empty ``(1, 0)`` array, as before.
    """
    encoded_docs, max_length = get_encode_docs(df, vocab)
    if encoded_size is not None:
        max_length = encoded_size
    if not encoded_docs:
        return np.array([[]]), max_length

    # Allocate the zero-filled result once and copy each tweet in;
    # concatenating inside the loop re-copied the whole matrix per tweet.
    padded_docs = np.zeros((len(encoded_docs), max_length))
    for row, each_encoded_doc in enumerate(encoded_docs):
        kept = each_encoded_doc[:max_length]
        padded_docs[row, :len(kept)] = kept
    return padded_docs, max_length

In [None]:
# Train and evaluate one model per emotion: a word-embedding branch plus a
# hand-crafted-feature branch, joined by get_rnn_model.
char_level = False
docs = []
labels = []
tweet_ids = []
emotions = []

embedded_vector_length = 300
emotion_names = ['sadness', 'joy', 'anger', 'fear']
embedding_name = 'word2vec'
embeddings_index = word_model

for emot_id, emotion in enumerate(emotion_names):
    # --- training data for this emotion ---
    training_file_name = os.path.join(dir_name, '..','data','en_train','EI-reg-en_'+emotion+'_train.txt')
    df = pd.read_csv(training_file_name, header=None, delimiter=delimiter)
    # Columns 0-3 in order: tweet id, tweet text, emotion, label.
    tweet_ids_train, docs_train, emotions_train, label_train = [df[each].values for each in range(4)]

#     print('Loaded %s word vectors.' % len(embeddings_index))
    # The vocabulary is built from the training tweets only and reused for the dev set.
    vocab = build_vocab(df, embeddings_index)
    print('Built vocabulary...')
    padded_docs_train, em_max_length = get_padded_docs(df, vocab)
    print('Padded docs for training is created...')
    vocab_size = len(vocab)
    embedding_matrix = build_embedding_matrix(vocab, embeddings_index)
    print('Loaded embedding matrix...')
    # define model with word embeddings
    _, embedded_vector_length = embedding_matrix.shape
    embedding_model = get_embedding_model(vocab_size, embedded_vector_length, embedding_matrix, em_max_length)
    # Added features
    current_feature = get_features(df, emotion)
    print('extra features loaded...')
    print('loaded word model...')
    # From here on vocab_size / embedded_vector_length / embedding_matrix describe
    # the feature matrix, not the word embeddings.
    vocab_size, embedded_vector_length, max_length, embedding_matrix = get_embedding_parameters(current_feature)

    phrase_vector_model = get_dense_model(vocab_size, embedded_vector_length, embedding_matrix, max_length)
    print('loaded phrase model...')
    model=get_rnn_model([embedding_model, phrase_vector_model])
    
    print(padded_docs_train.shape)
    print(current_feature.shape)
    
    # No epochs or batch_size is passed, so Keras' defaults apply.
    model.fit([padded_docs_train, current_feature], label_train, verbose=1)
    
    print('model has been fit...')
    # dev set
    dev_file_name=os.path.join(dir_name, '..', 'data', 'en_dev', '2018-EI-reg-En-' + emotion + '-dev.txt')
    # ``df`` now refers to the dev split.
    df = pd.read_csv(dev_file_name, header=None, delimiter=delimiter)
    tweet_ids, docs, emotions, labels = [df[each].values for each in range(4)]
    # Pad/truncate the dev tweets to the training length so they fit the embedding input.
    padded_docs_dev, _ = get_padded_docs(df, vocab, encoded_size = em_max_length)
    dev_features = get_features(df, emotion)
    # Written to the current working directory, not to the output folder.
    np.savetxt(emotion+'_validation.tsv', dev_features, delimiter='\t')
    
    predicted_list = model.predict([padded_docs_dev, dev_features])
    # Gold labels and predictions go to separate files in the same row format.
    write_to_file(tweet_ids, emotions, docs, labels, emotion + '_' + embedding_name + '_dev_labels')
    # Take element 0 of each prediction row to get a flat list of scores.
    predicted_list = [each[0] for each in predicted_list]
    write_to_file(tweet_ids, emotions, docs, predicted_list, emotion+'_'+embedding_name+'_dev')

    print('Mean Squared Error of Validation Set: '+str(mean_squared_error(labels, predicted_list)))

Built vocabulary...
Padded docs for training is created...
Loaded embedding matrix...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 33, 300)           1249200   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 600)               1442400   
_________________________________________________________________
dense_4 (Dense)              (None, 300)               180300    
Total params: 2,871,900
Trainable params: 1,622,700
Non-trainable params: 1,249,200
_________________________________________________________________
None


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
