# Remark
There is some issue when using Tensorflow Hub in Keras
https://github.com/tensorflow/hub/issues/13

# Data Ingestion

In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split

categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']

train_raw_df = fetch_20newsgroups(subset='train', categories=categories)
test_raw_df = fetch_20newsgroups(subset='test', categories=categories)

x_train, x_val, y_train, y_val = train_test_split(np.array(train_raw_df.data), train_raw_df.target, test_size=0.1)
x_test = np.array(test_raw_df.data)
y_test = test_raw_df.target

# x_train = [x_train[:200] for x in x_train]

print('Train:', len(x_train))
print('Val:', len(x_val))
print('Test:', len(x_test))

Train: 2031
Val: 226
Test: 1502


# Embeddings

In [2]:
%reload_ext  autoreload
%autoreload 2

In [3]:
tfhub_dir = '/data/jupyter/common/model/text/tfhub'

In [4]:
import sys, os
def add_aion(curr_path=None):
    if curr_path is None:
        dir_path = os.getcwd()
        target_path = os.path.dirname(dir_path)
        if target_path not in sys.path:
            print('Added %s into sys.path.' % (target_path))
            sys.path.insert(0, target_path)
            
add_aion()

Added /data/jupyter/common into sys.path.


# Model

In [5]:
vocab = set()
for sentence in x_train:
    tokens = sentence.split(' ')
    for token in tokens:
        vocab.add(token)

vocab_size = len(vocab)
print('Vocab Size: %d' % (vocab_size))

Vocab Size: 105599


In [6]:
max_sentence_length = 80
word2Idx = {'<padding>': 0, '<unknown>': 1}
idx2word = {0: '<padding>', 1: '<unknown>'}

def preprocess(text, word2Idx, idx2word, training=False):
    if training:
        for sentence in text:
            tokens = sentence.split(' ')

            for token in tokens:
                if token not in word2Idx:
                    word2Idx[token] = len(word2Idx)
                    idx2word[len(word2Idx)-1] = token


    word_vectors = np.zeros((len(text), max_sentence_length))
    sentence_vectors = []
    
    for i, sentence in enumerate(text):
        ids = []
        words = []
        tokens = sentence.split(' ')
        for token in tokens:
            if token in word2Idx:
                ids.append(word2Idx[token])
                words.append(token)
            else:
                ids.append(word2Idx['<unknown>'])
                words.append('<unknown>')
                
            if len(ids) >= max_sentence_length:
                break

        for i in range(max_sentence_length - len(ids)):
            ids.append(word2Idx['<padding>'])
            words.append('<padding>')

        word_vectors[i] = np.asarray(ids)
        sentence_vectors.append(' '.join(words))

    sentence_vectors = np.asarray(sentence_vectors)
    
    return word2Idx, idx2word, word_vectors, sentence_vectors

word2Idx, idx2word, x_train_words, x_train_sentences = preprocess(
    text=x_train, word2Idx=word2Idx, idx2word=idx2word, training=True)
print('x_train_words.shape:', x_train_words.shape)

word2Idx, idx2word, x_test_words, x_test_sentences = preprocess(
    text=x_test, word2Idx=word2Idx, idx2word=idx2word, training=False)
print('x_test_words.shape:', x_test_words.shape)

x_train_words.shape: (2031, 80)
x_test_words.shape: (1502, 80)


# Weighted sum of the 3 layers with word embeddings

In [7]:
from aion.embeddings.elmo import ELMoEmbeddings

import tensorflow as tf
from keras import backend as K
from keras.layers import Input, Lambda, Dense, Embedding, BatchNormalization, Concatenate, LSTM
from keras.models import Model

elmo_embs = ELMoEmbeddings(layer='elmo', verbose=20)
elmo_embs.load(dest_dir=tfhub_dir)

Using TensorFlow backend.


2018-10-08 16:47:42.166800. [LOADING] file
2018-10-08 16:47:42.965872. [LOADED] 


<tensorflow_hub.module.Module at 0x7fa4062d9128>

In [8]:
# Input Layers
word_input_layer = Input(shape=(None, ), dtype='int32')
elmo_input_layer = Input(shape=(None, ), dtype=tf.string)

# Output Layers
word_output_layer = Embedding(
    input_dim=vocab_size, output_dim=256)(word_input_layer)
elmo_output_layer = Lambda(
    elmo_embs.to_keras_layer, 
    output_shape=(None, 1024))(elmo_input_layer)
output_layer = Concatenate()(
    [word_output_layer, elmo_output_layer])
output_layer = BatchNormalization()(output_layer)
output_layer = LSTM(
    256, dropout=0.2, recurrent_dropout=0.2)(output_layer)
output_layer = Dense(4, activation='sigmoid')(output_layer)

# Build Model
model = Model(
    inputs=[word_input_layer, elmo_input_layer], 
    outputs=output_layer)
model.compile(
    loss='sparse_categorical_crossentropy', 
    optimizer='adam', metrics=['accuracy'])
model.summary()
model.fit(
    [x_train_words, x_train_sentences], y_train,
#     validation_data=([x_test_words, x_test_sentences], y_test), 
    epochs=10, batch_size=32)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 256)    27033344    input_1[0][0]                    
__________________________________________________________________________________________________
lambda_1 (Lambda)               (None, None, 1024)   0           input_2[0][0]                    
__________________________________________________________________________________________________
concatenat

<keras.callbacks.History at 0x7fa3e830b780>

In [9]:
y_pred = model.predict([x_test_words, x_test_sentences])
y_pred = np.argmax(y_pred, axis=1)

In [10]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

print('Accuracy:%.2f%%' % (accuracy_score(y_test, y_pred)*100))
print('Classification Report:')
print(classification_report(y_test, y_pred))

Accuracy:77.10%
Classification Report:
             precision    recall  f1-score   support

          0       0.72      0.45      0.55       319
          1       0.87      0.85      0.86       389
          2       0.88      0.83      0.85       396
          3       0.65      0.89      0.75       398

avg / total       0.78      0.77      0.76      1502



# Weighted sum of the 3 layers without word embeddings

In [11]:
from aion.embeddings.elmo import ELMoEmbeddings

import tensorflow as tf
from keras import backend as K
from keras.layers import Input, Lambda, Dense, Embedding, BatchNormalization, Concatenate, LSTM
from keras.models import Model

elmo_embs = ELMoEmbeddings(layer='elmo', verbose=20)
elmo_embs.load(dest_dir=tfhub_dir)

2018-10-08 17:32:55.478302. [LOADING] file
2018-10-08 17:32:56.379230. [LOADED] 


<tensorflow_hub.module.Module at 0x7fa404373d30>

In [12]:
# Input Layers
elmo_input_layer = Input(shape=(None, ), dtype=tf.string)

# Output Layers
output_layer = Lambda(
    elmo_embs.to_keras_layer, 
    output_shape=(None, 1024))(elmo_input_layer)
output_layer = BatchNormalization()(output_layer)
output_layer = LSTM(
    256, dropout=0.2, recurrent_dropout=0.2)(output_layer)
output_layer = Dense(4, activation='sigmoid')(output_layer)

# Build Model
model = Model(
    inputs=elmo_input_layer, outputs=output_layer)
model.compile(
    loss='sparse_categorical_crossentropy', 
    optimizer='adam', metrics=['accuracy'])
model.summary()
model.fit(
    x_train_sentences, y_train,
#     validation_data=([x_test_words, x_test_sentences], y_test), 
    epochs=10, batch_size=32)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, None)              0         
_________________________________________________________________
lambda_2 (Lambda)            (None, None, 1024)        0         
_________________________________________________________________
batch_normalization_2 (Batch (None, None, 1024)        4096      
_________________________________________________________________
lstm_2 (LSTM)                (None, 256)               1311744   
_________________________________________________________________
dense_2 (Dense)              (None, 4)                 1028      
Total params: 1,316,868
Trainable params: 1,314,820
Non-trainable params: 2,048
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fa17bff1940>

In [14]:
y_pred = model.predict(x_test_sentences)
y_pred = np.argmax(y_pred, axis=1)

In [15]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

print('Accuracy:%.2f%%' % (accuracy_score(y_test, y_pred)*100))
print('Classification Report:')
print(classification_report(y_test, y_pred))

Accuracy:76.83%
Classification Report:
             precision    recall  f1-score   support

          0       0.62      0.59      0.60       319
          1       0.88      0.84      0.86       389
          2       0.92      0.80      0.86       396
          3       0.66      0.81      0.73       398

avg / total       0.78      0.77      0.77      1502



# Fixed Mean-pooling without word embeddings

In [16]:
from aion.embeddings.elmo import ELMoEmbeddings

import tensorflow as tf
from keras import backend as K
from keras.layers import Input, Lambda, Dense, Embedding, BatchNormalization, Concatenate, LSTM
from keras.models import Model

elmo_embs = ELMoEmbeddings(layer='default', verbose=20)
elmo_embs.load(dest_dir=tfhub_dir)

2018-10-08 18:25:28.769929. [LOADING] file
2018-10-08 18:25:29.650498. [LOADED] 


<tensorflow_hub.module.Module at 0x7fa17b5992e8>

In [17]:
# Input Layers
input_layer = Input(shape=(None,), dtype=tf.string)

# Output Layers
output_layer = Lambda(
    elmo_embs.to_keras_layer, 
    output_shape=(1024,))(input_layer)
output_layer = Dense(
    256, activation='relu')(output_layer)
output_layer = Dense(4, activation='sigmoid')(output_layer)

model = Model(inputs=[input_layer], outputs=output_layer)
model.compile(
    loss='sparse_categorical_crossentropy', 
    optimizer='adam', metrics=['accuracy'])
model.summary()
model.fit(
    x_train_sentences, y_train,
#     validation_data=([x_test_words, x_test_sentences], y_test), 
    epochs=10, batch_size=32)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, None)              0         
_________________________________________________________________
lambda_3 (Lambda)            (None, 1024)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 256)               262400    
_________________________________________________________________
dense_4 (Dense)              (None, 4)                 1028      
Total params: 263,428
Trainable params: 263,428
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fa2e9468fd0>

In [18]:
y_pred = model.predict(x_test_sentences)
y_pred = np.argmax(y_pred, axis=1)

In [19]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

print('Accuracy:%.2f%%' % (accuracy_score(y_test, y_pred)*100))
print('Classification Report:')
print(classification_report(y_test, y_pred))

Accuracy:74.37%
Classification Report:
             precision    recall  f1-score   support

          0       0.71      0.66      0.68       319
          1       0.95      0.66      0.78       389
          2       0.92      0.74      0.83       396
          3       0.57      0.89      0.70       398

avg / total       0.79      0.74      0.75      1502

