In [None]:
import numpy as np
from numpy import array
from numpy import asarray
from numpy import zeros
import keras
from keras.layers.merge import concatenate
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Bidirectional, Dropout, Input, MaxPooling1D, Reshape
from keras.layers import Dense, SimpleRNN, GRU, LSTM, Conv2D, MaxPooling2D, Flatten, Embedding, Conv1D, GlobalMaxPooling1D
from keras.callbacks import EarlyStopping, History
from keras import optimizers
from keras.models import Model
import os
import re

In [None]:
def create_tweets_and_labels(file_name):
    """Parse a tab-separated tweet file into ids, tweet texts and a label matrix.

    The first line of the file is treated as a header and skipped. For every
    remaining non-empty line, column 0 is taken as the tweet id, column 1 as
    the tweet text and columns 2..12 as eleven integer labels.

    Args:
        file_name: Path of the UTF-8 encoded, tab-separated file to read.

    Returns:
        A tuple ``(tweet_ids, tweets, labels)`` where ``tweet_ids`` and
        ``tweets`` are lists of strings and ``labels`` is a numpy array with
        one row of eleven integers per tweet. A label column that is missing
        or is not an integer is stored as 0.
    """
    tweet_ids = []
    tweets = []
    labels = []

    # The context manager closes the file even if parsing raises.
    with open(file_name, encoding="utf8") as f:
        next(f, None)  # skip the header line (no-op on an empty file)
        for line in f:
            values = line.strip().split("\t")
            # Blank lines or lines without a tweet column carry no sample.
            if len(values) < 2:
                continue

            tweet_ids.append(values[0])
            tweets.append(values[1])

            label = []
            for i in range(2, 13):
                try:
                    label.append(int(values[i]))
                except (ValueError, IndexError):
                    # Non-numeric or absent label column: fall back to 0.
                    label.append(0)
            labels.append(label)

    labels = np.array(labels)

    return tweet_ids, tweets, labels

In [None]:
# Parse the training and development splits into ids, tweet texts and label matrices.
(train_tweet_ids,
 train_tweets,
 train_labels) = create_tweets_and_labels('./2018-E-c-En-train.txt')
(dev_tweet_ids,
 dev_tweets,
 dev_labels) = create_tweets_and_labels('./2018-E-c-En-dev.txt')

In [None]:
# Build the word index from the training tweets only.
t = Tokenizer()
t.fit_on_texts(train_tweets)

# One slot more than the number of indexed words
# (presumably so that index 0 stays free for padding -- confirm against Keras docs).
vocab_size = 1 + len(t.word_index)
vocab_size

In [None]:
# Integer-encode the documents with the tokenizer fitted on the training tweets.
encoded_train_tweets, encoded_dev_tweets = (
    t.texts_to_sequences(split) for split in (train_tweets, dev_tweets)
)

In [None]:
# Length of the longest encoded training tweet (0 when there are none).
max_len = max(map(len, encoded_train_tweets), default=0)

max_len

In [None]:
# Pad every encoded tweet at the end so all sequences have length max_len.
padding_options = dict(maxlen=max_len, padding='post')
padded_train_tweets = pad_sequences(encoded_train_tweets, **padding_options)
padded_dev_tweets = pad_sequences(encoded_dev_tweets, **padding_options)

# Model inputs are the padded sequences; targets are the parsed label matrices.
x_train, y_train = padded_train_tweets, train_labels
x_dev, y_dev = padded_dev_tweets, dev_labels

In [None]:
EMBEDDING_DIM = 300
embedding_file = './glove/glove.840B.300d.txt'

# Load the whole embedding file into memory as {token: float32 vector}.
embeddings_index = dict()
# The context manager closes the file even if a line fails unexpectedly.
with open(embedding_file, encoding="utf8") as f:
    for line in f:
        # Split from the right so that exactly EMBEDDING_DIM trailing fields are
        # taken as coefficients. A plain split() breaks on tokens that themselves
        # contain whitespace, which either fails the float conversion or stores
        # a vector under the wrong key with the wrong length.
        values = line.rstrip().rsplit(' ', EMBEDDING_DIM)
        if len(values) != EMBEDDING_DIM + 1:
            # Malformed line: report it and move on, as before.
            print(values)
            continue
        word = values[0]
        try:
            coefs = asarray(values[1:], dtype='float32')
        except ValueError:
            # Non-numeric coefficient: report the line and skip it.
            print(values)
            continue
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

In [None]:
# Weight matrix with one row per tokenizer index; rows for words that have no
# pre-trained vector (and any index not in word_index) stay all-zero.
embedding_matrix = zeros((vocab_size, EMBEDDING_DIM))
for token, row in t.word_index.items():
    if token in embeddings_index:
        embedding_matrix[row] = embeddings_index[token]

In [None]:
# Input: one padded sequence of max_len word indices per tweet.
x = Input(shape=(max_len,))
# Frozen embedding lookup initialised from the pre-built embedding_matrix.
embedding_layer = Embedding(vocab_size, EMBEDDING_DIM, weights=[embedding_matrix],
                            input_shape=(max_len,), trainable=False, name='Embedding')(x)

# Bidirectional GRU with g units per direction; the two directions are
# concatenated, so every timestep carries e = 2 * g features.
g = 100
gru1_layer = Bidirectional(GRU(g, return_sequences=True), merge_mode='concat')(embedding_layer)

e = g * 2
# Add a trailing channel axis so the sequence can be fed to Conv2D.
reshape_layer = Reshape((max_len, e, 1), name='Reshape_Embedding')(gru1_layer)

# Each n-gram branch: a convolution whose kernel spans n consecutive timesteps
# and the full feature width e, followed by a max-pool whose window
# (max_len - n + 1 rows) matches the number of kernel positions along the
# sequence under Conv2D's default padding, then a flatten.
one_gram_conv = Conv2D(200, kernel_size=(1, e), activation='relu', use_bias=True,
                       name='1Gram_Conv')(reshape_layer)
one_gram_maxpool = MaxPooling2D(pool_size=(max_len - 1 + 1, 1),
                                name='1Gram_Maxpool')(one_gram_conv)
one_gram_flatten = Flatten(name='1Gram_TimeDistributed_Flatten')(one_gram_maxpool)

two_gram_conv = Conv2D(200, kernel_size=(2, e), activation='relu', use_bias=True,
                       name='2Gram_Conv')(reshape_layer)
two_gram_maxpool = MaxPooling2D(pool_size=(max_len - 2 + 1, 1),
                                name='2Gram_Maxpool')(two_gram_conv)
two_gram_flatten = Flatten(name='2Gram_Flatten')(two_gram_maxpool)

three_gram_conv = Conv2D(200, kernel_size=(3, e), activation='relu', use_bias=True,
                         name='3Gram_Conv')(reshape_layer)
three_gram_maxpool = MaxPooling2D(pool_size=(max_len - 3 + 1, 1),
                                  name='3Gram_Maxpool')(three_gram_conv)
three_gram_flatten = Flatten(name='3Gram_Flatten')(three_gram_maxpool)

# Concatenate the three n-gram feature vectors.
merge = concatenate([one_gram_flatten, two_gram_flatten, three_gram_flatten], name='Merge_n-grams')

fully_connected = Dense(20, activation='relu', name='Fully_Connected_Layer')(merge)

# Eleven independent sigmoid outputs, one per label column.
output_layer = Dense(11, activation='sigmoid', name='Output_Layer')(fully_connected)

output = output_layer
model = Model(inputs=x, outputs=output)

#a = optimizers.Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line.
model.summary()

In [None]:
# Stop once val_loss has not improved for `patience` epochs and roll the model
# back to the weights of the best epoch.
patience = 5
callbacks = [EarlyStopping(monitor='val_loss', patience=patience, verbose=1, restore_best_weights=True)]

# validation_data is passed as an (inputs, targets) tuple, the documented form;
# a list is not guaranteed to be unpacked the same way across Keras versions.
history = model.fit(x_train, y_train,
                    validation_data=(x_dev, y_dev),
                    epochs=50,
                    batch_size=100,
                    verbose=1,
                    callbacks=callbacks)

In [None]:
# Evaluate on the development split; score holds [loss, accuracy] as compiled.
score = model.evaluate(x_dev, y_dev, verbose=0)
# Labelled "Dev" because these numbers come from x_dev / y_dev, not the test set.
print('\nDev loss:', score[0])
print('Dev accuracy:', score[1])

In [None]:
# Per-label probabilities for every dev tweet, plus an all-zero integer
# matrix (one row per tweet, 11 columns) to receive the thresholded decisions.
dev_predic_probs = model.predict(padded_dev_tweets)
n_dev_rows = len(padded_dev_tweets)
dev_predic_classes = zeros((n_dev_rows, 11), dtype=int)

In [None]:
# Turn probabilities into 0/1 decisions: a label is predicted when its
# probability reaches the threshold. The comparison is vectorized over the
# whole matrix, so it no longer hard-codes the number of label columns.
threshold = 0.4
dev_predic_classes = (dev_predic_probs >= threshold).astype(int)

In [None]:
import sklearn.metrics
# jaccard_similarity_score was deprecated and later removed from scikit-learn.
# jaccard_score with average='samples' computes the same per-tweet Jaccard index
# averaged over tweets; zero_division=1 keeps the old convention that a tweet
# with no true and no predicted labels scores 1.
sklearn.metrics.jaccard_score(dev_labels, dev_predic_classes, average='samples', zero_division=1)

In [None]:
# create dev predictions file
# Mode "w" creates the file when it is missing and truncates it otherwise;
# "r+" failed on a fresh run and left stale trailing rows from a longer
# earlier file. The context manager closes the file even if a write fails.
with open("E-C_en_pred.txt", "w", encoding="utf8") as f:
    f.write("ID\tTweet\tanger\tanticipation\tdisgust\tfear\tjoy\tlove\toptimism\tpessimism\tsadness\tsurprise\ttrust\n")

    for i in range(dev_predic_classes.shape[0]):
        # One row per tweet: id, text, then the 0/1 decision for each label.
        label_fields = "\t".join(str(value) for value in dev_predic_classes[i])
        f.write(dev_tweet_ids[i] + "\t" + dev_tweets[i] + "\t" + label_fields + "\n")

In [None]:
# create predictions for test set
test_tweet_ids, test_tweets, test_labels = create_tweets_and_labels('./2018-E-c-En-test.txt')

# Encode and pad the test tweets with the tokenizer and length fixed on the training data.
encoded_test_tweets = t.texts_to_sequences(test_tweets)

padded_test_tweets = pad_sequences(encoded_test_tweets, maxlen=max_len, padding='post')

test_predic_probs = model.predict(padded_test_tweets)

# A label is predicted when its probability reaches the threshold. The
# comparison is vectorized over the whole matrix instead of looping over every
# cell with a hard-coded column count.
threshold = 0.4
test_predic_classes = (test_predic_probs >= threshold).astype(int)

In [None]:
# create test predictions file
# Mode "w" creates the file when it is missing and truncates it otherwise;
# "r+" failed on a fresh run and left stale trailing rows from a longer
# earlier file. The context manager closes the file even if a write fails.
with open("E-C_en_pred.txt", "w", encoding="utf8") as f:
    f.write("ID\tTweet\tanger\tanticipation\tdisgust\tfear\tjoy\tlove\toptimism\tpessimism\tsadness\tsurprise\ttrust\n")

    for i in range(test_predic_classes.shape[0]):
        # One row per tweet: id, text, then the 0/1 decision for each label.
        label_fields = "\t".join(str(value) for value in test_predic_classes[i])
        f.write(test_tweet_ids[i] + "\t" + test_tweets[i] + "\t" + label_fields + "\n")