In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
# --- Dataset selection -------------------------------------------------------
# Size tag of the dataset dump to load; it becomes the file-name suffix below.
NUMBER_OF_TWEETS = "ALL"
# NUMBER_OF_TWEETS = 100000

MAX_WORDS_PER_TWEET = 30

# Exactly one DATA_LOCATION should be active; the others are alternative dumps.
# DATA_LOCATION = "./train/data/noise/"
DATA_LOCATION = "./train/data/clean/"
# DATA_LOCATION = "./train/data/words/"
RESULT_LOCATION = "./result/"
TWEET_FILE_NAME = "tweet_by_ID_28_4_2018__03_20_05" + "_"

# A missing size tag (None) falls back to the "ALL" suffix.
TWEET_FILE_NAME += "ALL" if NUMBER_OF_TWEETS is None else str(NUMBER_OF_TWEETS)

In [3]:
# Load the three parallel dataset files (<base>.text, <base>.loclabels,
# <base>.emolabels); line k of each file describes tweet k.
base_file_name = DATA_LOCATION + TWEET_FILE_NAME

text_lines = []        # raw tweet text, one string per tweet
text_lines_split = []  # the same tweets, whitespace-tokenised

with open(base_file_name + ".text", 'r', encoding="utf-8") as out_text:
    for line in out_text:
        # rstrip("\n") rather than line[:-1]: the slice would eat a real
        # character if the final line of the file has no trailing newline.
        tweet_text = line.rstrip("\n")
        text_lines.append(tweet_text)
        text_lines_split.append(tweet_text.split())

# Each .loclabels line is a string of digits, one per position in the tweet.
loc_lines = []
with open(base_file_name + ".loclabels", 'r') as loc_labels:
    for line in loc_labels:
        loc_lines.append([int(c) for c in line.rstrip("\n")])

loc_lines = np.asarray(loc_lines)

# Each .emolabels line holds whitespace-separated emoji ids, consumed in order
# for every position flagged with 1 in the matching loc row. Ids are stored
# shifted by +1 so that 0 can mean "no emoji at this position".
emo_lines = []
with open(base_file_name + ".emolabels", 'r') as emo_labels:
    for e_line, loc in zip(emo_labels, loc_lines):
        emo_line = [0] * (MAX_WORDS_PER_TWEET + 1)
        e_line2 = e_line.split()

        br = 0  # index of the next unread emoji id in e_line2
        # NOTE(review): loc[:-1] skips the last position, so an emoji flagged
        # there never gets a type label - confirm this is intended.
        for idx, val in enumerate(loc[:-1]):
            if val == 1:
                emo_line[idx] = int(e_line2[br]) + 1
                br += 1
        emo_lines.append(emo_line)

emo_lines = np.asarray(emo_lines)

print(f"number of tweets {len(text_lines)}")
# print(f"\nexample of tweet texts:")
# for i in range(10):
#     print(f"{i}\t{text_lines[i]}")
# print(f"\nexample of labels (emoji locations):\n{loc_lines[:10]}")
# print(f"\nexample of labels (emoji type):\n{emo_lines[:10]}")

number of tweets 473459


In [4]:
from sklearn.externals import joblib
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score, KFold

# Print NumPy arrays in full (no "..." truncation). This is a global setting
# and affects every later cell that prints an array.
np.set_printoptions(threshold=np.inf)
# Display labels matching, in order, the five values returned by the binary
# branch of calc_scores. Not referenced by the code visible in this notebook.
EVALUATION_RESULTS_KEYS = ['Accuracy: ', 'Precision: ', 'Recall: ', 'F1: ', 'ratio of positive/negative predictions: ']

# 5-fold shuffled splitter. No random_state is passed, so the folds are not
# reproducible between runs.
kf = KFold(n_splits=5, shuffle=True)

def vectorize_func(a, b):
    """Return 1 when ``a`` equals ``b``, otherwise 0 (scalar indicator)."""
    return 1 if a == b else 0

def calc_scores(y_pred, y_test, multi_class):
    """Compute accuracy / precision / recall / F1 for flat label arrays.

    The sklearn metric functions take ``(y_true, y_pred)`` in that order.
    Passing them the other way round swaps precision and recall, so the
    ground truth is always given first here.

    Parameters
    ----------
    y_pred : 1-D array of predicted labels.
    y_test : 1-D array of ground-truth labels, same length as ``y_pred``.
    multi_class : bool
        False: labels are binary and the metrics are computed directly.
        True: labels are class ids 0..20; each class is scored one-vs-rest
        and the per-class scores are averaged.

    Returns
    -------
    multi_class=True  -> ndarray of 4 values [accuracy, precision, recall, f1].
    multi_class=False -> list of 5 values: the same four metrics plus the
        fraction of predictions that are non-zero.
    """
    if multi_class:
        n_label_values = 21  # class ids 0..20 (0 = "no emoji")
        y_test_arr = np.asarray(y_test)
        y_pred_arr = np.asarray(y_pred)

        per_class_scores = []
        for i in range(n_label_values):
            # One-vs-rest indicator vectors for class i.
            y_test_new = (y_test_arr == i).astype(int)
            y_pred_new = (y_pred_arr == i).astype(int)

            # Class 0 is scaled down by 1/5 before the plain mean below.
            # NOTE(review): the mean still divides by 21, so this is a scaled
            # sum rather than a normalised weighted average - confirm intent.
            alfa = 1 / 5 if i == 0 else 1

            per_class_scores.append([
                alfa * accuracy_score(y_test_new, y_pred_new),
                alfa * precision_score(y_test_new, y_pred_new),
                alfa * recall_score(y_test_new, y_pred_new),
                alfa * f1_score(y_test_new, y_pred_new),
            ])

        return np.average(np.asarray(per_class_scores), axis=0)

    return [
        accuracy_score(y_test, y_pred),
        precision_score(y_test, y_pred),
        recall_score(y_test, y_pred),
        f1_score(y_test, y_pred),
        # Share of positions predicted as non-zero.
        np.count_nonzero(y_pred) / y_pred.shape[0],
    ]

def print_scores(scores):
    """Print the metric values in ``scores``, one labelled line each.

    ``scores`` must hold at least four values (accuracy, precision, recall,
    F1, in that order). A fifth value, when present, is printed as the ratio
    of positive predictions.
    """
    for position, label in enumerate(("Accuracy", "Precision", "Recall", "F1")):
        print(f"{label}: {scores[position]}")
    if len(scores) > 4:
        print(f"ratio of positive/negative predictions {scores[4]}")

In [5]:
from pandas import DataFrame

# GloVe vector files are resolved as
# GLOVE_DIR + GLOVE_FILE_NAME + <dimension> + GLOVE_FILE_NAME_EXT.
GLOVE_DIR = "./embeddings/"
GLOVE_FILE_NAME = "glove.twitter.27B."
GLOVE_FILE_NAME_EXT = "d.txt"

# Saved models are written to / read from
# MODEL_DIR + BLSTM_BASE_FILE_NAME + <parameter string> + BLSTM_FILE_NAME_EXT.
MODEL_DIR = "./models/blstm_models/"
BLSTM_BASE_FILE_NAME = "blstm_model_"
BLSTM_FILE_NAME_EXT = ".h5"

# Sequence length seen by the network: one slot per word plus one extra slot.
N_TIMESTEPS = MAX_WORDS_PER_TWEET + 1
# Number of distinct emoji; label 0 is reserved for "no emoji".
NUM_EMOJI_TYPES = 20
# Fraction of tweets held out as the test set (taken from the tail, see INPUT_SIZE).
TEST_SPLIT_SIZE = 0.2
# Fraction of the training data Keras uses for validation during fit().
VALIDATION_SPLIT = 0.1
# Epochs without val_loss improvement before early stopping triggers.
EARLY_STOPPING_PATIENCE = 2
# Upper bound on training epochs.
MAX_EPOCH = 30

# Number of tweets used; defaults to everything that was loaded.
TWEET_NUM = len(text_lines)
# TWEET_NUM = 100000
# Index of the first test tweet: [0, INPUT_SIZE) is train, the rest is test.
INPUT_SIZE = int(TWEET_NUM * (1 - TEST_SPLIT_SIZE))
# Vocabulary cap passed to the Tokenizer; None means no cap.
NUM_OF_VOCAB = None
# Default embedding dimension and LSTM hidden size for get_bi_lstm_model.
EMBEDDING_SIZE = 200
HIDDEN_SIZE = 1000

In [6]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split

# Restrict the three parallel arrays to the first TWEET_NUM tweets.
X_input = text_lines[:TWEET_NUM]
y_input = loc_lines[:TWEET_NUM]
y_emo_input = emo_lines[:TWEET_NUM]

abc = 100000  # only used by the commented-out debugging split below

# X_blstm_train, y_blstm_train, y_blstm_emo_train = X_input[:abc], y_input[:abc], y_emo_input[:abc]
# Sequential (unshuffled) split: the first INPUT_SIZE tweets train, the rest test.
X_blstm_train = X_input[:INPUT_SIZE]
y_blstm_train = y_input[:INPUT_SIZE]
y_blstm_emo_train = y_emo_input[:INPUT_SIZE]
X_blstm_test = X_input[INPUT_SIZE:]
y_blstm_test = y_input[INPUT_SIZE:]
y_blstm_emo_test = y_emo_input[INPUT_SIZE:]

# X_blstm_train, X_blstm_test, y_blstm_train, y_blstm_test = train_test_split(X_input, y_input, test_size=TEST_SPLIT_SIZE)

# NOTE(review): the tokenizer is fitted on X_input, i.e. train *and* test
# tweets, so the vocabulary includes test-only words - confirm this is intended.
tokenizer = Tokenizer() if NUM_OF_VOCAB is None else Tokenizer(num_words=NUM_OF_VOCAB)
tokenizer.fit_on_texts(X_input)
word_index = tokenizer.word_index
txt_to_seq = tokenizer.texts_to_sequences(X_blstm_train)
# print(f"encoded:\n{txt_to_seq[0:5]}\n")

# +1 because index 0 is never assigned to a word (it is the padding value).
vocab_size = (len(word_index) if NUM_OF_VOCAB is None else NUM_OF_VOCAB) + 1

print(f"Vocabulary Size: {vocab_size}")

# Pad to N_TIMESTEPS - 1 word slots, then prepend one all-zero column so each
# row ends up with N_TIMESTEPS entries.
X_blstm = pad_sequences(txt_to_seq, maxlen=N_TIMESTEPS - 1, padding='post')
start_padding = np.zeros((X_blstm.shape[0], 1))
X_blstm = np.concatenate((start_padding, X_blstm), axis=1).astype(int)
print(f"input shape: {X_blstm.shape}")
# print(f"BLSTM input example:\n{X_blstm[:5]}\n")

# One-hot targets for the emoji-location task (2 classes per timestep).
y_loc = y_blstm_train
y_blstm = to_categorical(y_loc, num_classes=2)
print(f"location labels shape: {y_blstm.shape}")
# print(f"\nBLSTM loc labels:\n{y_loc[:5]}\n")

# One-hot targets for the emoji-type task (NUM_EMOJI_TYPES + the "none" class).
y_emo = y_blstm_emo_train
y_emo_blstm = to_categorical(y_emo, num_classes=NUM_EMOJI_TYPES + 1)
print(f"emo labels shape: {y_emo_blstm.shape}")
# print(f"\nBLSTM emo labels:\n{y_emo[:5]}\n")

def calc_sample_weights(y):
    """Return an array shaped like ``y`` holding each entry's balanced class weight.

    Weights come from sklearn's ``compute_class_weight('balanced', ...)`` over
    the labels that actually occur in ``y``.

    Each weight is looked up by class *label* through the inverse index from
    ``np.unique``, not by position in the weight vector. The result therefore
    stays correct when the labels present in ``y`` are not exactly
    ``0..k-1`` (e.g. one emoji class missing from the training slice). The
    lookup is a single vectorised index, not a per-element Python call.

    Parameters
    ----------
    y : array-like of integer class labels, any shape.

    Returns
    -------
    ndarray of float weights with the same shape as ``y``.
    """
    labels = np.asarray(y)
    flat_labels = labels.ravel()
    # classes is sorted; inverse[k] is the position of flat_labels[k] in classes.
    classes, inverse = np.unique(flat_labels, return_inverse=True)
    # Keyword arguments: newer scikit-learn releases reject positional
    # classes / y here.
    weights = class_weight.compute_class_weight('balanced', classes=classes, y=flat_labels)
    return np.asarray(weights)[np.ravel(inverse)].reshape(labels.shape)

# Per-timestep sample weights for the emoji-location labels, same shape as
# y_loc; passed to fit() by do_blstm_test.
sample_weights_loc = calc_sample_weights(y_loc)
# print(f"sample_weights_loc shape: {sample_weights_loc.shape}")
# print(f"sample_weights_loc examples:\n{sample_weights_loc[:5]}\n")

# Per-timestep sample weights for the emoji-type labels, same shape as y_emo.
sample_weights_emo = calc_sample_weights(y_emo)
# print(f"sample_weights_emo shape: {sample_weights_emo.shape}")
# print(f"sample_weights_emo examples:\n{sample_weights_emo[:5]}\n")

  from ._conv import register_converters as _register_converters
Using Theano backend.


Vocabulary Size: 137097
input shape: (378767, 31)
location labels shape: (378767, 31, 2)
emo labels shape: (378767, 31, 21)


In [7]:
import os

def load_embedding_matrix(glove_size):
    """Build the initial Embedding weight matrix from a GloVe text file.

    Reads ``GLOVE_DIR/GLOVE_FILE_NAME<glove_size>GLOVE_FILE_NAME_EXT``, where
    each line is ``word v1 v2 ... v<glove_size>``. Row ``i`` of the returned
    matrix holds the vector of the word whose tokenizer index is ``i``. That
    index is what the network receives as input, so the row must be ``i``
    itself, not ``i + 1``. Words absent from the GloVe file, and row 0 (the
    padding index), stay all-zero.

    Parameters
    ----------
    glove_size : int
        Dimensionality of the GloVe vectors to load.

    Returns
    -------
    ndarray of shape ``(vocab_size, glove_size)``.
    """
    glove_path = os.path.join(GLOVE_DIR, GLOVE_FILE_NAME + str(glove_size) + GLOVE_FILE_NAME_EXT)

    embeddings_index = {}
    # Context manager so the file is closed even if parsing raises.
    with open(glove_path, encoding="utf8") as f:
        for line in f:
            values = line.split()
            # Skip malformed lines (e.g. a token that split() treats as
            # whitespace) instead of storing a vector of the wrong length.
            if len(values) != glove_size + 1:
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

    print('Found %s word vectors.' % len(embeddings_index))

    embedding_matrix = np.zeros((vocab_size, glove_size))
    for word, i in word_index.items():
        if i >= vocab_size:
            # Index falls outside the matrix (vocabulary was capped).
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

    print(f"embedding matrix shape {embedding_matrix.shape}")

    return embedding_matrix

# Cache of already-built embedding matrices, keyed by GloVe dimension.
gloves_dict = {}

def get_embedding_matrix(glove_size):
    """Return the embedding matrix for ``glove_size``, loading it at most once.

    The matrix is built by ``load_embedding_matrix`` on the first request and
    served from ``gloves_dict`` afterwards. Membership is tested with ``in``;
    indexing a missing key would raise ``KeyError`` on the very first call.
    """
    if glove_size not in gloves_dict:
        gloves_dict[glove_size] = load_embedding_matrix(glove_size)

    return gloves_dict[glove_size]

In [8]:
from keras.models import load_model
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint

def get_param_str(embedding_size, hidden_size, n_classes, input_size):
    """Encode the model hyper-parameters as ``CLA-<c>_INP-<i>_EMB-<e>_HID-<h>``."""
    tagged_values = (
        ("CLA", n_classes),
        ("INP", input_size),
        ("EMB", embedding_size),
        ("HID", hidden_size),
    )
    return "_".join(tag + "-" + str(value) for tag, value in tagged_values)

def get_blstm_file_name(embedding_size, hidden_size, n_classes, input_size):
    """Return the path of the saved model file for the given hyper-parameters."""
    suffix = get_param_str(embedding_size, hidden_size, n_classes, input_size)
    return "".join((MODEL_DIR, BLSTM_BASE_FILE_NAME, suffix, BLSTM_FILE_NAME_EXT))

def save_blstm(blstm, embedding_size, hidden_size, n_classes=2, input_size=TWEET_NUM):
    """Save ``blstm`` under the file name derived from its hyper-parameters.

    The ``input_size`` default is the value TWEET_NUM had when this function
    was defined.
    """
    target_path = get_blstm_file_name(embedding_size, hidden_size, n_classes, input_size)
    blstm.save(target_path)

def load_blstm(embedding_size, hidden_size, n_classes=2, input_size=TWEET_NUM):
    """Load the saved model whose file name matches the given hyper-parameters.

    The ``input_size`` default is the value TWEET_NUM had when this function
    was defined.
    """
    source_path = get_blstm_file_name(embedding_size, hidden_size, n_classes, input_size)
    return load_model(source_path)

def get_callbacks():
    """Return the Keras callbacks used for training.

    The list holds, in order: early stopping on ``val_loss`` with
    EARLY_STOPPING_PATIENCE, and a best-only model checkpoint written to
    ``MODEL_DIR + BLSTM_BASE_FILE_NAME + "check" + BLSTM_FILE_NAME_EXT``.
    """
    early_stopping = EarlyStopping(monitor='val_loss',
                                   min_delta=0,
                                   patience=EARLY_STOPPING_PATIENCE,
                                   verbose=0, mode='auto')
    checkpoint_path = MODEL_DIR + BLSTM_BASE_FILE_NAME + "check" + BLSTM_FILE_NAME_EXT
    checkpoint = ModelCheckpoint(filepath=checkpoint_path, save_best_only=True)
    return [early_stopping, checkpoint]

In [9]:
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import TimeDistributed
from keras.layers import Bidirectional
from keras.layers import Dropout

def get_bi_lstm_model(embedding_size=EMBEDDING_SIZE, hidden_size=HIDDEN_SIZE, n_classes=2, use_glove=True,
                      n_timesteps=N_TIMESTEPS, mode="concat"):
    """Build and compile the per-timestep bidirectional LSTM tagger.

    Architecture: Embedding -> Dropout(0.5) -> Bidirectional(LSTM) ->
    TimeDistributed(Dense softmax). It is compiled with categorical
    cross-entropy, Adam, and temporal sample weighting.

    Parameters
    ----------
    embedding_size : int
        Embedding dimension. With ``use_glove`` it also selects which GloVe
        file is loaded.
    hidden_size : int
        Number of LSTM units per direction.
    n_classes : int
        Number of output classes per timestep.
    use_glove : bool
        True: initialise the (trainable) embedding from the GloVe matrix.
        False: use a freshly initialised embedding of ``vocab_size`` rows.
    n_timesteps : int
        Input sequence length. It is now actually used as the embedding's
        ``input_length``.
    mode : str
        ``merge_mode`` of the Bidirectional wrapper.

    Returns
    -------
    A compiled ``Sequential`` model.
    """
    model = Sequential()

    if use_glove:
        embedding_matrix = get_embedding_matrix(embedding_size)
        model.add(Embedding(embedding_matrix.shape[0],
                            embedding_matrix.shape[1],
                            weights=[embedding_matrix],
                            input_length=n_timesteps,
                            trainable=True))
    else:
        # vocab_size is the module-level vocabulary size; the misspelled
        # ``vocab_size_size`` raised NameError here.
        model.add(Embedding(vocab_size,
                            embedding_size,
                            input_length=n_timesteps))

    model.add(Dropout(0.5))
    model.add(Bidirectional(LSTM(hidden_size, return_sequences=True), merge_mode=mode))
    model.add(TimeDistributed(Dense(n_classes, activation='softmax')))
    # sample_weight_mode="temporal" lets fit() take one weight per timestep.
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=["accuracy"], sample_weight_mode="temporal")
    return model

def get_predictions(blstm, texts):
    """Return ``blstm``'s predicted class id per timestep for each text.

    Texts are encoded with the module-level ``tokenizer`` and post-padded to
    N_TIMESTEPS before being passed to ``predict_classes``.

    NOTE(review): the training input is padded to N_TIMESTEPS - 1 with an
    extra zero column prepended, while this function pads straight to
    N_TIMESTEPS with no leading column - confirm the two layouts are meant to
    differ.
    """
    encoded = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(encoded, maxlen=N_TIMESTEPS, padding='post')
    return blstm.predict_classes(padded)

def print_blstm_score(y_pred, y_true):
    """Score flattened predictions against flattened labels and print the result.

    The multi-class path of ``calc_scores`` is used when ``y_true`` contains a
    label greater than 1.
    """
    flat_pred = y_pred.flatten()
    flat_true = y_true.flatten()
    has_many_classes = np.max(y_true) > 1
    print_scores(calc_scores(flat_pred, flat_true, has_many_classes))

def evaluate_blstm(blstm, texts=X_blstm_test, labels=y_blstm_test):
    """Predict on ``texts``, print debug samples, then print the scores.

    The default ``texts`` / ``labels`` are the test-split objects that existed
    when this function was defined. The multi-class scoring path is used when
    ``labels`` contains a value greater than 1.
    """
    predictions = get_predictions(blstm, texts)

    # Debug output: shapes first, then the first ten rows of each array.
    print(predictions.shape)
    print(labels.shape)
    print(predictions[:10])
    print(labels[:10])

    flat_predictions = predictions.flatten()
    flat_labels = labels.flatten()
    scores = calc_scores(flat_predictions, flat_labels, np.max(labels) > 1)
    print_scores(scores)

def do_blstm_test(embedding_size, hidden_size, is_emo=False, use_glove=True, input_size=TWEET_NUM):
    """Train, save and evaluate one BLSTM configuration.

    The trained model is also stored in the module-level ``blstm`` variable.

    Parameters
    ----------
    embedding_size, hidden_size : int
        Passed on to ``get_bi_lstm_model``.
    is_emo : bool
        True: train the emoji-type model (labels ``y_emo_blstm``).
        False: train the emoji-location model (labels ``y_blstm``).
    use_glove : bool
        Passed on to ``get_bi_lstm_model``.
    input_size : int
        Only used in the log line and the saved file name. The training data
        is always the full ``X_blstm``.

    Evaluation is best-effort: if it fails, the model has already been saved,
    and the failure is reported with the exception type and message rather
    than being silently dropped.
    """
    global blstm

    if is_emo:
        y_train = y_emo_blstm
        y_test = y_blstm_emo_test
        sample_weights = sample_weights_emo
    else:
        y_train = y_blstm
        y_test = y_blstm_test
        sample_weights = sample_weights_loc

    # One-hot targets are (samples, timesteps, classes); the last axis is the class count.
    n_classes = y_train.shape[2]
    blstm = get_bi_lstm_model(embedding_size, hidden_size, n_classes, use_glove=use_glove)
    print(f"\nTraining with embedding {embedding_size} hidden {hidden_size} classes {n_classes} input {input_size} glove {use_glove}")
    blstm.fit(X_blstm, y_train, epochs=MAX_EPOCH,
              validation_split=VALIDATION_SPLIT, verbose=2,
              sample_weight=sample_weights, callbacks=get_callbacks())
    save_blstm(blstm, embedding_size, hidden_size, n_classes, input_size)
    try:
        evaluate_blstm(blstm, labels=y_test)
    except Exception as exc:
        # Keep the run going, but say why evaluation failed.
        print(f"couldn't evaluate: {type(exc).__name__}: {exc}")
        
def contract_intervals(y):
    """Collapse every run of consecutive 1s in each row to a single 1.

    For a run occupying columns ``start .. end-1`` the surviving 1 is placed
    at column ``ceil((start + end) / 2)``. A run that reaches the last column
    is handled after the scan. Its marker is clamped to the last column,
    because the formula would otherwise point one past the end of the row.

    Parameters
    ----------
    y : 2-D integer ndarray of 0/1 flags (rows = samples, columns = timesteps).

    Returns
    -------
    A modified copy of ``y``; the input array is left untouched.
    """
    y = y.copy()
    n_rows, n_cols = y.shape[0], y.shape[1]

    for i in range(n_rows):
        start = 0
        is_interval = False
        for j in range(n_cols):
            if y[i, j] == 1:
                if not is_interval:
                    start = j
                    is_interval = True
                y[i, j] = 0
            elif is_interval:
                is_interval = False
                # Integer form of ceil((start + j) / 2); needs no math import.
                y[i, (start + j + 1) // 2] = 1
        if is_interval:
            # The run extended to the end of the row: same formula with the
            # exclusive end n_cols, clamped to a valid column.
            y[i, min((start + n_cols + 1) // 2, n_cols - 1)] = 1
    return y

def get_final_predictions(blstm_loc, blstm_emo, texts):
    """Combine the location and emoji-type models into one label array.

    The location model's predictions are contracted to single markers with
    ``contract_intervals``. At every position still marked, the emoji-type
    model's prediction is kept; every other position becomes 0.

    Parameters
    ----------
    blstm_loc : model predicting emoji / no-emoji per timestep.
    blstm_emo : model predicting the emoji class per timestep.
    texts : list of tweet strings.

    Returns
    -------
    ndarray with the same shape as the location predictions.
    """
    loc_pred = contract_intervals(get_predictions(blstm_loc, texts))
    emo_pred = get_predictions(blstm_emo, texts)
    # Vectorised mask in place of a Python-level zip/list/reshape round trip.
    return np.where(loc_pred > 0, emo_pred, 0)

In [10]:
# -*- coding: UTF-8 -*-

# 1  ❤
# 2  😍
# 3  😂
# 4  💕
# 5  🔥
# 6  😊
# 7  😎
# 8  ✨
# 9  💙
# 10 😘
# 11 📷
# 12 🇺🇸
# 13 ☀
# 14 💜
# 15 😉
# 16 💯
# 17 😁
# 18 🎄
# 19 📸
# 20 😜

# Emoji for labels 1..20: emojis[k - 1] is the character for label k, in the
# same order as the table in the comments above. Entry 19 (camera with flash)
# is '\U0001f4f8'; a stray trailing "0" would render it as the emoji plus "0".
emojis = [
    '\u2764',                  # 1
    '\U0001f60d',              # 2
    '\U0001f602',              # 3
    '\U0001f495',              # 4
    '\U0001f525',              # 5
    '\U0001f60a',              # 6
    '\U0001f60e',              # 7
    '\u2728',                  # 8
    '\U0001f499',              # 9
    '\U0001f618',              # 10
    '\U0001f4f7',              # 11
    '\U0001f1fa\U0001f1f8',    # 12 (two regional-indicator code points)
    '\u2600',                  # 13
    '\U0001f49c',              # 14
    '\U0001f609',              # 15
    '\U0001f4af',              # 16
    '\U0001f601',              # 17
    '\U0001f384',              # 18
    '\U0001f4f8',              # 19
    '\U0001f61c',              # 20
]

def build_tweet(tweet, emoji_labels):
    """Render ``tweet`` with emoji inserted according to ``emoji_labels``.

    ``emoji_labels[i] > 0`` places ``emojis[emoji_labels[i] - 1]`` in front of
    the i-th whitespace-separated token. Index ``len(tokens)`` is the slot
    after the last token. Every emitted piece is followed by one space, so a
    non-empty result ends with a trailing space.

    Positions beyond the end of ``emoji_labels`` are treated as "no emoji".
    A tweet with more tokens than label slots is therefore rendered in full
    instead of raising ``IndexError``.

    Parameters
    ----------
    tweet : str
    emoji_labels : sequence of int labels (0 = none, k = the k-th emoji).

    Returns
    -------
    str
    """
    tokens = tweet.split()
    n_labels = len(emoji_labels)

    pieces = []
    for i in range(len(tokens) + 1):
        if i < n_labels and emoji_labels[i] > 0:
            pieces.append(emojis[emoji_labels[i] - 1])
        if i < len(tokens):
            pieces.append(tokens[i])

    return "".join(piece + ' ' for piece in pieces)

def display_tweet_results(blstm_loc, blstm_emo, tweets):
    """Print, for each raw tweet, its processed form and the emoji-annotated result.

    Predictions are computed on the output of ``process_tweet``, while the
    emoji are inserted into the original, unprocessed tweet.

    NOTE(review): if processing changes the number of whitespace-separated
    tokens, the predicted positions no longer line up with the raw tweet's
    tokens - confirm this is acceptable.
    """
    for raw_tweet in tweets:
        processed = process_tweet(raw_tweet)
        print(f"\nProcessed tweet:\n{processed}")

        # Single-item batch; [0] takes the label row for this one tweet.
        predicted_labels = get_final_predictions(blstm_loc, blstm_emo, [processed])[0]
        rendered = build_tweet(raw_tweet, predicted_labels)
        print(f"\nResult:\n{rendered}\n")

In [11]:
# Load the previously saved models built with embedding size 50 and hidden
# size 100: the 2-class emoji-location model and the 21-class emoji-type model.
# load_blstm's default input_size (TWEET_NUM) is part of the file name, so the
# files must have been saved with the same tweet count.
blstm_loc = load_blstm(50, 100, 2)
blstm_emo = load_blstm(50, 100, 21)

# Alternative: the larger configuration (embedding 200, hidden 500).
# blstm_loc = load_blstm(200, 500, 2)
# blstm_emo = load_blstm(200, 500, 21)

In [19]:
# Qualitative check: for a window of test tweets, print the tweet rendered
# with its true emoji labels, then with the predicted labels.
offset = 0
num_of_tweets_to_test = 10

# Window [offset, offset + num_of_tweets_to_test) into the test split.
s = slice(offset, (offset + num_of_tweets_to_test))
y_true = y_blstm_emo_test[s]
y_pred = get_final_predictions(blstm_loc, blstm_emo, X_blstm_test[s])

# print_blstm_score(y_pred, y_true)

for i in range(num_of_tweets_to_test):
    tweet = X_blstm_test[offset + i]
    # First line: ground truth; second line: model output.
    print(build_tweet(tweet, y_true[i]))
    print(build_tweet(tweet, y_pred[i]))
    print()

s'mores baby ! 😍 @ ocean beach , san francisco 
s'mores baby ! 💙 @ ocean beach , san francisco 

❤ in the vip section somewhere @ hampton , virginia 
in the vip section 😎 somewhere @ hampton , virginia 

so much beauty in the city lights ✨ @user with @user @ urban lights - lacma 
so much beauty in the city lights @user with @user ✨ @ urban lights - lacma 

@user @user ily too ❤ 
@user @user ily too 😘 

my little mini me ! love my sweet little cousin 😊 
my little mini me ❤ ! love my sweet little cousin ❤ 

carolina blue living room inspiration ! go panthers ! 💙 hope everyone enjoys the game 
carolina blue living room inspiration ! go 💙 panthers ! hope everyone enjoys the game 💙 

lowkey feelin these ... 🔥 @ bloomington , minnesota 
lowkey feelin these 😂 ... @ bloomington , minnesota 

got to finally see the san diego zoo for the first time with my love 💕 had so much fun on our 
got to finally see the san diego zoo for the first time with my love 😊 had so much fun on our 

❤ thiskid @ ri

In [20]:
from .demo.process_tweet import process_tweet

# Hand-written example tweets (raw, unprocessed text) for a quick demo.
tweets = ["This is the best day of my life!!! I love you guys!",
         "Merry Christmas everyone! I hope you all have a nice holiday.",
         "hahaha man you're so funny"]

# Prints the processed form and the emoji-annotated result for each tweet.
display_tweet_results(blstm_loc, blstm_emo, tweets)


Processed tweet:
this is the best day of my life ! i love you guys !

Result:
This is the best day of my life!!! ❤ I love you guys! 


Processed tweet:
merry christmas everyone ! i hope you all have a nice holiday .

Result:
Merry Christmas everyone! 🎄 I hope you all have a nice holiday. 


Processed tweet:
hahaha man you're so funny

Result:
hahaha man 😂 you're so funny 



In [21]:
from .demo.process_tweet import process_tweet

# Interactive demo: read one tweet from the user and show the prediction.
# input() blocks until a line is entered, so this cell cannot run unattended.
print("Please input a tweet (text only):")
user_input = input()
display_tweet_results(blstm_loc, blstm_emo, [user_input])

Please input a tweet (text only):
make america great again

Processed tweet:
make america great again

Result:
make america 🇺🇸 great again 

