# Features Explored

## Hashtag Distribution

In [1]:
import pandas as pd
import os
import re
import numpy as np
from collections import Counter
import nltk
import math
import pickle
import gensim
from textblob import Sentence

delimiter = '\t'
dir_name = '/home/dennis/PycharmProjects/Keras_Samples/src'

In [None]:
get_all_tweets_as_whole_text(df):
    texts = df[1].values
    whole_text = ''
    for each in texts:
        whole_text = whole_text + ' ' + each
    return whole_text

In [2]:
train_file_name = os.path.join(dir_name, '..', 'data', 'en_train', 'EI-reg-en_anger_train.txt')
df_anger = pd.read_csv(train_file_name, header=None, delimiter=delimiter)
anger_texts = get_all_tweets_as_whole_text(df_anger)
anger_hashtags = [each[0] for each in Counter(re.findall('#\w+', whole_text)).most_common()]

In [3]:
train_file_name = os.path.join(dir_name, '..', 'data', 'en_train',  'EI-reg-en_fear_train.txt')
df_fear = pd.read_csv(train_file_name, header=None, delimiter=delimiter)
fear_texts = get_all_tweets_as_whole_text(df_fear)
fear_hashtags = [each[0] for each in Counter(re.findall('#\w+', whole_text)).most_common()]

In [4]:
train_file_name = os.path.join(dir_name, '..', 'data', 'en_train',  'EI-reg-en_sadness_train.txt')
df_sadness = pd.read_csv(train_file_name, header=None, delimiter=delimiter)
sadness_texts = get_all_tweets_as_whole_text(df_sadness)
sadness_hashtags = [each[0] for each in Counter(re.findall('#\w+', whole_text)).most_common()]

In [5]:
train_file_name = os.path.join(dir_name, '..', 'data', 'en_train',  'EI-reg-en_joy_train.txt')
df_joy = pd.read_csv(train_file_name, header=None, delimiter=delimiter)
joy_texts = get_all_tweets_as_whole_text(df_joy)
joy_hashtags = [each[0] for each in Counter(re.findall('#\w+', whole_text)).most_common()]

In [6]:
hashtags = anger_hashtags + joy_hashtags + sadness_hashtags + fear_hashtags

In [7]:
unique_hashtags = [each[0] for each in Counter(hashtags).most_common() if each[1]==1]

In [8]:
anger_hashtags = [each for each in anger_hashtags if each in unique_hashtags]
fear_hashtags = [each for each in fear_hashtags if each in unique_hashtags]
joy_hashtags = [each for each in joy_hashtags if each in unique_hashtags]
sadness_hashtags = [each for each in sadness_hashtags if each in unique_hashtags]
print('Anger : '+str(len(anger_hashtags)))
print('fear : '+str(len(fear_hashtags)))
print('joy : '+str(len(joy_hashtags)))
print('sadness : '+str(len(sadness_hashtags)))
print('unique : '+str(len(unique_hashtags)))
print('total : '+str(len(hashtags)))

Anger : 483
fear : 963
joy : 529
sadness : 582
unique : 2557
total : 3363


## Unicode Emo Hashtag

In [9]:
file_name = os.path.join(dir_name, '..','resources', 'NRC-Hashtag-Emotion-Lexicon-v0.2', 'NRC-Hashtag-Emotion-Lexicon-v0.2.txt')
df_emo_hashtag = pd.read_csv(file_name, header=None, delimiter=delimiter)
df_emo_hashtag = df_emo_hashtag[(df_emo_hashtag[0] == 'sadness') |
                               (df_emo_hashtag[0] == 'anger') | 
                               (df_emo_hashtag[0] == 'fear') | 
                               (df_emo_hashtag[0] == 'joy')]
print(df_emo_hashtag[(df_emo_hashtag[0] == 'anger') & (df_emo_hashtag[1] == 'pissed')][2].values[0])
print(df_emo_hashtag)

1.1371909298
          0                 1         2
3908   fear         #westbank  1.951954
3909   fear     #apprehension  1.951954
3910   fear            #su4mh  1.951954
3911   fear          aaaaaaah  1.951954
3912   fear              #ied  1.951954
3913   fear        #coldsweat  1.951954
3914   fear             #isaf  1.951954
3915   fear           cryotek  1.951954
3916   fear          #rushing  1.951954
3917   fear              #shy  1.951954
3918   fear     #apprehensive  1.951954
3919   fear            #mosul  1.951954
3920   fear          #fearful  1.951954
3921   fear           #ashdod  1.951954
3922   fear    #socialanxiety  1.951954
3923   fear        #backtrack  1.951954
3924   fear         terrifies  1.951954
3925   fear   #claustrophobia  1.951954
3926   fear            qassam  1.951954
3927   fear        #hezbollah  1.951954
3928   fear          #hamas25  1.951954
3929   fear              #dfw  1.951954
3930   fear           #feared  1.951954
3931   fear          #talib

In [10]:
emo_hashtag = df_emo_hashtag.as_matrix()
emo_hashtag

array([['fear', '#westbank', 1.95195360556393],
       ['fear', '#apprehension', 1.95195360556393],
       ['fear', '#su4mh', 1.95195360556393],
       ..., 
       ['joy', 'doctor', 0.0020351827030041197],
       ['joy', 'grad', 0.00113671004597709],
       ['joy', '1000', 0.00113671004597709]], dtype=object)

In [11]:
def filter_emo_hashtag(emotion, hashtag):
    temp = None
    for each in emo_hashtag: 
        if each[0] == emotion and (each[1] == hashtag):
            if temp is None:
                temp = each
            else:
                temp = np.concatenate((temp, each), axis=0)
    return temp
filter_emo_hashtag('joy', 'grad')

array(['joy', 'grad', 0.00113671004597709], dtype=object)

In [12]:
def get_hashtags(tweet_content):
    return re.findall('#[a-zA-Z]+', tweet_content)

In [13]:
def get_hashtag_emo_value(hashtag, emotion):
    emo_value = filter_emo_hashtag(emotion, hashtag)
    return emo_value[2] if emo_value is not None else 0

In [14]:
def get_emot_value_from_hashtag(tweet_content, emotion):
    hashtags = get_hashtags(tweet_content)
    if len(hashtags) > 0:
        _t_emo = []
        for each_hashtag in hashtags:
            _t_emo.append(get_hashtag_emo_value(each_hashtag, emotion))
    else:
        _t_emo = [0]
    return np.mean(_t_emo)

In [15]:
anger_hashtag_feature_nrc_hashtag_emoticon = np.array([get_emot_value_from_hashtag(each, 'anger') 
                                                       for each in df_anger.as_matrix()[:,1]]).reshape(len(df_anger), 1)
joy_hashtag_feature_nrc_hashtag_emoticon = np.array([get_emot_value_from_hashtag(each, 'joy') 
                                                     for each in df_joy.as_matrix()[:,1]]).reshape(len(df_joy), 1)
fear_hashtag_feature_nrc_hashtag_emoticon = np.array([get_emot_value_from_hashtag(each, 'fear')
                                                      for each in df_fear.as_matrix()[:,1]]).reshape(len(df_fear), 1)
sadness_hashtag_feature_nrc_hashtag_emoticon = np.array([get_emot_value_from_hashtag(each, 'sadness') 
                                                         for each in df_sadness.as_matrix()[:,1]]).reshape(len(df_sadness), 1)

In [16]:
print(anger_hashtag_feature_nrc_hashtag_emoticon.shape)
print(joy_hashtag_feature_nrc_hashtag_emoticon.shape)
print(fear_hashtag_feature_nrc_hashtag_emoticon.shape)
print(sadness_hashtag_feature_nrc_hashtag_emoticon.shape)

(1701, 1)
(1616, 1)
(2252, 1)
(1533, 1)


## Unicode Emoticon Lexicon

In [17]:
import string
translation_table = dict.fromkeys(map(ord, string.punctuation), None)

In [18]:
file_name = os.path.join(dir_name, '..','resources', 'NRC-Emoticon-Lexicon-v1.0', 'Emoticon-unigrams.txt')
df_emo_words = pd.read_csv(file_name, header=None, delimiter=delimiter)
df_emo_words[0] = df_emo_words[0].str.lower().str.translate(translation_table)
emo_words = df_emo_words.as_matrix() 
print(emo_words)

[['jeffreydonovan' 5.0 6 0]
 ['familar' 5.0 6 0]
 ['vppatel2011' 5.0 6 0]
 ..., 
 ['clarianne' -4.999 0 5]
 ['scrambling' -4.999 0 8]
 ['ballsed' -4.999 0 6]]


In [19]:
def filter_emo_word(word):
    temp = None
    for each in emo_words: 
        if each[0] == word:
            if temp is None:
                temp = each
            else:
                temp = np.concatenate((temp, each), axis=0)
    return temp
print (filter_emo_word('ballsed')[1])

-4.999


In [20]:
def get_emoticon_lexicon_value(df):
    translation_table = dict.fromkeys(map(ord, string.punctuation), None)
    cleaned_tokenized = df[1].str.lower().str.translate(translation_table).str.split().values
    emoticon_lexicon_value = []
    for each_tweet in cleaned_tokenized:
        emoti_word_count = 0
        emoti_value = 0
        for each_word in each_tweet:

            tmp = filter_emo_word(each_word)
            if tmp is not None:
                emoti_value += tmp[1]
                emoti_word_count += 1
        emoti_value = (emoti_value)/(emoti_word_count+1)
        emoticon_lexicon_value.append(emoti_value)
    return emoticon_lexicon_value

In [21]:
anger_emoticon_lexicon_value = get_emoticon_lexicon_value(df_anger)
joy_emoticon_lexicon_value = get_emoticon_lexicon_value(df_joy)
sadness_emoticon_lexicon_value = get_emoticon_lexicon_value(df_sadness)
fear_emoticon_lexicon_value = get_emoticon_lexicon_value(df_fear)

In [22]:
joy_emoticon_lexicon_value = np.array(joy_emoticon_lexicon_value).reshape(len(joy_emoticon_lexicon_value),1)
anger_emoticon_lexicon_value = np.array(anger_emoticon_lexicon_value).reshape(len(anger_emoticon_lexicon_value),1)
fear_emoticon_lexicon_value = np.array(fear_emoticon_lexicon_value).reshape(len(fear_emoticon_lexicon_value),1)
sadness_emoticon_lexicon_value = np.array(sadness_emoticon_lexicon_value).reshape(len(sadness_emoticon_lexicon_value),1)

## Phrase Vector (by averaging the constituent word vectors)

In [29]:
word_model = None

In [30]:
def load_word_vectors():
    global word_model
    global dir_name
    if not os.path.exists(os.path.join(dir_name, '..', 'output')):
        os.makedirs(os.path.join(dir_name, '..', 'output'))
    model_filename = 'GoogleWord2Vec'
    model_filename = os.path.join(dir_name, '..', 'resources', model_filename)
    if not os.path.exists(model_filename):
        embedding_file_loc = os.path.join(dir_name, '..', 'resources', 'GoogleNews-vectors-negative300.bin')
        print("Loading the data file... Please wait...")
        word_model = gensim.models.KeyedVectors.load_word2vec_format(embedding_file_loc, binary=True)
        print("Successfully loaded 3.6 G bin file!")
        pickle.dump(word_model, open(model_filename, 'wb'))
    else:
        word_model = pickle.load(open(model_filename, 'rb'))
        print('Successfully Loaded the model')

In [31]:
def get_phrase_vector_obj(value):
    return PhraseVector(value)

In [32]:
class PhraseVector:
    def __init__(self, phrase):
        self.phrase = phrase
        self.vector = self.phrase_to_vec(phrase)
        self.pos_tag = self.get_words_in_phrase(phrase)

    @staticmethod
    def convert_vector_set_to_average(vector_set, ignore=[]):
        if len(ignore) == 0:
            return np.mean(vector_set, axis=0)
        else:
            return np.dot(np.transpose(vector_set), ignore) / sum(ignore)

    @staticmethod
    def get_unique_token_tags(vector1, vector2):
        tag_list = []
        for each_tag in vector1.pos_tag + vector2.pos_tag:
            if each_tag not in tag_list:
                tag_list.append(each_tag)
        return tag_list

    def phrase_to_vec(self, phrase):
        # _stop_words = stopwords.words("english")
        phrase = phrase.lower()
        verified_words = [word for word in phrase.split()]
        vector_set = []
        for each_word in verified_words:
            try:
                word_vector = word_model[each_word]
                vector_set.append(word_vector)
            except:
                pass
        return self.convert_vector_set_to_average(vector_set)

    def get_cosine_similarity(self, other_vector):
        cosine_similarity = np.dot(self.vector, other_vector.vector) / (
        np.linalg.norm(self.vector) * np.linalg.norm(other_vector.vector))
        try:
            if math.isnan(cosine_similarity):
                cosine_similarity = 0
        except:
            cosine_similarity = 0
        return cosine_similarity

    def get_words_in_phrase(self, phrase):
        if phrase.strip() == '':
            return []
        else:
            tagged_input = nltk.pos_tag(phrase.split(), tagset='universal')
            prev_item, prev_tag = tagged_input[0]
            g_item_list = [prev_item]
            cur_group_index = 0
            space = ' '
            revised_tag = []
            for cur_item, cur_tag in tagged_input[1:]:
                cur_item = cur_item.lower()
                if prev_tag is cur_tag:
                    g_item_list[cur_group_index] += space + cur_item
                else:
                    revised_tag.append((g_item_list[cur_group_index], prev_tag))
                    prev_tag = cur_tag
                    g_item_list.append(cur_item)
                    cur_group_index += 1
            revised_tag.append((g_item_list[cur_group_index], prev_tag))
            return revised_tag

In [33]:
import pickle
load_word_vectors()

Successfully Loaded the model


In [37]:
def get_phrase_vectors(df, emotion):
    tweet_vectors_obj = None
    tweet_vectors = None
    labels = None
    filename = os.path.join(dir_name, '..', 'resources', 'raw_phrase_vectors_obj_'+emotion)
    if not os.path.exists(filename):
        tweet_vectors_obj = np.vectorize(get_phrase_vector_obj)(df[1].values)
        tweet_vectors = np.array([[]])
        for each_vector in tweet_vectors_obj:
            curr_vector = each_vector.vector
            if np.isnan(curr_vector).any():
                curr_vector = np.zeros(shape=(1, 300))
            else:
                curr_vector = curr_vector.reshape(1, len(each_vector.vector))
            if np.min(tweet_vectors.shape) == 0:
                tweet_vectors = np.concatenate((tweet_vectors, curr_vector), axis=1)
            else:
                tweet_vectors = np.concatenate((tweet_vectors, curr_vector), axis=0)
        labels = df[3].values
        with open(filename, 'wb') as f:
            pickle.dump(tweet_vectors_obj, f)
            pickle.dump(tweet_vectors, f)
            pickle.dump(labels, f)
    else:
        with open(filename, 'rb') as f:
            tweet_vectors_obj = pickle.load(f)
            tweet_vectors = pickle.load(f)
            labels = pickle.load(f)
    return tweet_vectors, labels

In [38]:
tweet_vectors_anger, labels_anger = get_phrase_vectors(df_anger, emotion='anger')
tweet_vectors_joy, labels_joy = get_phrase_vectors(df_joy, emotion='joy')
tweet_vectors_sadness, labels_sadness = get_phrase_vectors(df_sadness, emotion='sadness')
tweet_vectors_fear, labels_fear = get_phrase_vectors(df_fear, emotion='fear')

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [39]:
def get_polarity_and_subjectivity(df, emotion):
    polarity_list = []
    subjectivity_list = []
    filename = os.path.join(dir_name, '..', 'resources', 'polarity_and_subjectivity_'+emotion)
    if not os.path.exists(filename):
        polarity_list = np.array(list(map(lambda x: Sentence(x).polarity, df[1].values)))
        subjectivity_list = np.array(list(map(lambda x: Sentence(x).subjectivity, df[1].values)))
        with open(filename, 'wb') as f:
            pickle.dump(polarity_list, f)
            pickle.dump(subjectivity_list, f)
    else:
        with open(filename, 'rb') as f:
            polarity_list = pickle.load(f)
            subjectivity_list = pickle.load(f)
    polarity_list = polarity_list.reshape(len(polarity_list),1)
    subjectivity_list = subjectivity_list.reshape(len(subjectivity_list),1)
    return polarity_list, subjectivity_list

In [40]:
polarity_list_anger, subjectivity_list_anger = get_polarity_and_subjectivity(df_anger, emotion='anger')
polarity_list_fear, subjectivity_list_fear = get_polarity_and_subjectivity(df_fear, emotion='fear')
polarity_list_sadness, subjectivity_list_sadness = get_polarity_and_subjectivity(df_sadness, emotion='sadness')
polarity_list_joy, subjectivity_list_joy = get_polarity_and_subjectivity(df_joy, emotion='joy')

array([[ 0.3       ],
       [ 0.9       ],
       [ 0.9       ],
       ..., 
       [ 0.6       ],
       [ 0.66666667],
       [ 0.53333333]])