In [None]:
import numpy as np
import pandas as pd
import re

In [None]:
# Locations of the three dataset splits; each one is loaded with read_file().
train_filename, test_filename, val_filename = (
    'data/data/5_train',
    'data/data/5_test',
    'data/data/5_validation',
)

In [None]:
def read_file(filename):
    """Load a tab-separated tweet/emoji file into a DataFrame.

    Every non-empty line must contain at least one tab character; the text
    after the last tab becomes the label and everything before it becomes
    the tweet, so a tweet that itself contains a tab no longer breaks parsing.

    Parameters
    ----------
    filename : str or path-like
        Path of the file to read. It is decoded as UTF-8.

    Returns
    -------
    pandas.DataFrame
        Two string columns, ``Tweet`` and ``Emoji``, one row per data line.

    Raises
    ------
    ValueError
        If a non-empty line contains no tab separator.
    """
    tweets, emojis = [], []
    # Explicit encoding: the platform default is not guaranteed to decode emoji.
    with open(filename, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            line = line.rstrip('\r\n')
            if not line:
                # Blank lines carry no sample; skip them instead of crashing.
                continue
            tweet, separator, emoji = line.rpartition('\t')
            if not separator:
                raise ValueError(
                    'line %d of %s has no tab separator' % (line_number, filename)
                )
            tweets.append(tweet)
            emojis.append(emoji)
    # Build the frame in one step instead of adding columns one at a time.
    return pd.DataFrame({'Tweet': tweets, 'Emoji': emojis})

# Load the training split and preview its first rows.
df = read_file(filename=train_filename)
df.head(n=5)

In [None]:
# Number of distinct emoji labels, followed by the frame's shape.
(len(df['Emoji'].unique()), df.shape)

In [None]:
# Preview the cleaning steps on the first few training tweets. The patterns
# are raw strings (no invalid escape sequences) and use the same character
# class as tokenize(), so the preview shows what tokenize() will produce.
# head(5) is positional, so it also works for frames with fewer than 5 rows
# or a non-default index.
for tweet in df.Tweet.head(5):
    # Drop punctuation and symbols outright.
    tokens = re.sub(r"""[!.#'",:?(){}<>;@/|=$%^*]""", '', tweet)
    # Underscores and hyphens act as word separators.
    tokens = re.sub(r'[_-]', ' ', tokens)
    # Every occurrence of "user" (even inside a longer word) becomes a placeholder.
    tokens = re.sub(r'user', ' <user> ', tokens)
    tokens = re.sub(r'&+', 'and', tokens)
    # Runs of digits collapse to a single placeholder token.
    tokens = re.sub(r'\d+', ' <num> ', tokens)
    tokens = tokens.split()
    print(tokens)
    print(tweet)
    print()

In [None]:
def tokenize(tweet):
    """Clean a raw tweet and split it into whitespace-separated tokens.

    Steps, in order:
      1. delete the characters ``! . # ' " , : ? ( ) { } < > ; @ / | = $ % ^ *``;
      2. turn ``_`` and ``-`` into spaces;
      3. replace every occurrence of ``user`` with the token ``<user>``
         (there is no word-boundary check, so ``superuser`` becomes
         ``super`` + ``<user>``);
      4. replace each run of ``&`` with ``and``;
      5. replace each run of digits with the token ``<num>``;
      6. split on whitespace.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    list of str
        The tokens; an empty list for an empty or all-punctuation tweet.
    """
    # Raw strings keep the regex escapes out of Python's own string-escape
    # processing (the non-raw '\.', '\@' and '\d' are invalid escape sequences).
    tokens = re.sub(r"""[!.#'",:?(){}<>;@/|=$%^*]""", '', tweet)
    tokens = re.sub(r'[_-]', ' ', tokens)
    tokens = re.sub(r'user', ' <user> ', tokens)
    tokens = re.sub(r'&+', 'and', tokens)
    tokens = re.sub(r'\d+', ' <num> ', tokens)
    return tokens.split()

In [None]:
# Flatten every tokenized training tweet into one long token list (duplicates kept).
words = []
for tweet in df.Tweet:
    words.extend(tokenize(tweet))

In [None]:
# Vocabulary of distinct training tokens. Sorting makes the order (and hence
# the token -> index mapping derived from it) identical on every kernel run;
# plain set iteration order for strings can change between interpreter sessions.
unique_words = sorted(set(words))
print(len(words), len(unique_words))

In [None]:
# Map each vocabulary token to a positive integer id (1, 2, ...), in
# unique_words order; id 0 is reserved for the unknown-token placeholder.
word_to_index = dict(zip(unique_words, range(1, len(unique_words) + 1)))
word_to_index['<unk>'] = 0

In [None]:
# Show the ids assigned to the three placeholder tokens.
tuple(word_to_index[token] for token in ('<unk>', '<num>', '<user>'))

In [None]:
from nltk.probability import FreqDist
# Frequency of every training token.
dist = FreqDist(words)
# Tokens that occur exactly once in the training set.
rare_words = [word for word in unique_words if dist[word] == 1]
# The count used to be a bare expression in the middle of the cell, so it was
# never displayed; print it explicitly.
print(len(rare_words))
print(rare_words[0:50])

In [None]:
def indexify(tweet, vocab=None, unk_index=0):
    """Convert a sequence of tokens into their integer ids.

    Parameters
    ----------
    tweet : iterable of str
        Tokens of one tweet.
    vocab : dict, optional
        Token -> id mapping. Defaults to the notebook-level ``word_to_index``,
        so existing ``indexify(tokens)`` calls behave as before.
    unk_index : int, optional
        Id used for tokens missing from ``vocab``. Defaults to 0, the id the
        notebook assigns to ``'<unk>'``.

    Returns
    -------
    list of int
        One id per input token, in the same order.
    """
    if vocab is None:
        # Resolved at call time so the current global mapping is always used.
        vocab = word_to_index
    return [vocab.get(word, unk_index) for word in tweet]

In [None]:
from sklearn.preprocessing import LabelEncoder
# Fit the label encoder on the training labels (fit() returns the encoder itself).
le = LabelEncoder().fit(df['Emoji'].values)

# Replace the emoji labels with their integer codes and turn each tweet
# into a list of vocabulary ids (tokenize, then look up every token).
df['Emoji'] = le.transform(df['Emoji'].values)
df['indexed_tweets'] = df['Tweet'].apply(lambda text: indexify(tokenize(text)))
df.head()

In [None]:
# Load the validation split and preview its first rows.
val_df = read_file(filename=val_filename)
val_df.head()

In [None]:
# Shape of the validation frame.
val_df.shape

In [None]:
# Encode the validation labels with the encoder fitted on the training set,
# then convert each tweet to a list of vocabulary ids.
val_df['Emoji'] = le.transform(val_df['Emoji'].values)
val_df['indexed_tweets'] = val_df['Tweet'].apply(lambda text: indexify(tokenize(text)))
val_df.head()

In [None]:
# max_length: length every id sequence is padded/truncated to.
# embedding_vector_length: size of each learned token embedding.
max_length, embedding_vector_length = 35, 100

In [None]:
# Model inputs: the per-tweet id lists; targets: the integer label codes.
x_train, y_train = df['indexed_tweets'].values, df['Emoji'].values
x_val, y_val = val_df['indexed_tweets'].values, val_df['Emoji'].values

In [None]:
from keras.preprocessing import sequence
from keras.utils.np_utils import to_categorical

# Width of the one-hot label vectors, taken from the fitted encoder so every
# split gets the same number of columns. Without num_classes, to_categorical
# infers the width from the largest label present in that split.
num_classes = len(le.classes_)

# Bring every id sequence to exactly max_length entries.
x_train = sequence.pad_sequences(x_train, maxlen=max_length)
x_val = sequence.pad_sequences(x_val, maxlen=max_length)

# One-hot encode the integer labels.
# NOTE: this cell overwrites its own inputs; re-create x_*/y_* before re-running it.
y_train = to_categorical(y_train, num_classes=num_classes)
y_val = to_categorical(y_val, num_classes=num_classes)

In [None]:
from keras.layers import GRU, Dropout, Embedding, Dense, Bidirectional, LSTM
from keras.models import Sequential
from keras.losses import categorical_crossentropy
from keras.metrics import categorical_accuracy

# Size of the output layer, derived from the one-hot training labels instead
# of a hard-coded 5 so the model follows the data.
n_outputs = y_train.shape[1]

model = Sequential()
# Vocabulary size is len(unique_words) + 1 because id 0 is used for '<unk>'.
model.add(Embedding(len(unique_words)+1, embedding_vector_length, input_length=max_length))
model.add(Dropout(0.2))
# return_sequences=True so the following LSTM receives the full sequence.
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(LSTM(64))
model.add(Dropout(0.5))
model.add(Dense(n_outputs, activation='softmax'))

model.compile(loss=categorical_crossentropy, optimizer='adam', metrics=[categorical_accuracy])

model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=3, batch_size=512)

In [None]:
# Load the test split and preview its first rows.
test_df = read_file(filename=test_filename)
test_df.head()

In [None]:
# Shape of the test frame.
test_df.shape

In [None]:
# Encode the test labels with the encoder fitted on the training set,
# then convert each tweet to a list of vocabulary ids.
test_df['Emoji'] = le.transform(test_df['Emoji'].values)
test_df['indexed_tweets'] = test_df['Tweet'].apply(lambda text: indexify(tokenize(text)))
test_df.head()

In [None]:
# Pad the test id sequences to the same length used for training.
x_test = sequence.pad_sequences(
    test_df['indexed_tweets'].values,
    maxlen=max_length,
)
x_test.shape

In [None]:
# Predicted class = position of the largest score in each output row.
class_scores = model.predict(x_test)
pred = np.argmax(class_scores, axis=1)

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of test tweets whose predicted label equals the true label.
print(accuracy_score(y_true=test_df['Emoji'].values, y_pred=pred))

In [None]:
from sklearn.metrics import confusion_matrix
import itertools
import matplotlib.pyplot as plt
#confusion_matrix(test_df.Emoji.values, pred)
def plot_confusion_matrix(cm,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues,
                          classes=None):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : 2-D integer array
        Confusion matrix; rows are labelled "True label" and columns
        "Predicted label".
    title : str, optional
        Title placed above the plot.
    cmap : matplotlib colormap, optional
        Colormap used for the cells.
    classes : sequence of str, optional
        Tick labels, one per class. When omitted the ticks show the
        integer class indices.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    # One tick per row/column of the matrix (previously hard-coded to 5).
    n_rows, n_cols = cm.shape
    if classes is None:
        plt.xticks(np.arange(n_cols))
        plt.yticks(np.arange(n_rows))
    else:
        plt.xticks(np.arange(n_cols), classes)
        plt.yticks(np.arange(n_rows), classes)

    fmt = 'd'
    # Cells darker than half the maximum get white text so the count stays readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(n_rows), range(n_cols)):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# Confusion matrix of true vs. predicted test labels.
true_labels = test_df['Emoji'].values
cnf_matrix = confusion_matrix(true_labels, pred)

# Limit numpy's printed float precision to two decimals.
np.set_printoptions(precision=2)

# Render the raw (non-normalized) counts on a fresh figure.
plt.figure()
plot_confusion_matrix(
    cnf_matrix,
    title='Confusion matrix, without normalization',
)
plt.show()

In [None]:
from sklearn.metrics import precision_recall_fscore_support
# Support-weighted averages of the per-class precision, recall and F1 on the test set.
precision, recall, f1_score, support = precision_recall_fscore_support(
    test_df['Emoji'].values,
    pred,
    average='weighted',
)
print("precision: ", precision)
print('recall: ', recall)
print('f1_score: ', f1_score)