# Practical 6.0 : Bidirectional, CNN + RNN

# Character-level Sequence classification model

In [0]:
from __future__ import print_function

### IMDB user review data set

We will use character sequences of IMDB text reviews to predict whether a review is positive (class label = 1) or negative (class label = 0). Download the data set from https://storage.googleapis.com/trl_data/imdb_dataset.zip, then run Practical 5.1 to preprocess the data before running this notebook.

In [0]:
import os
import sys
import numpy as np
import pandas as pd
pd.options.display.max_colwidth = 100
import re
import nltk

# Directory names (relative to the notebook's working directory) for the
# preprocessed data, the embeddings and the saved models, in that order.
DATA_PATH, EMBEDDING_PATH, MODEL_PATH = 'data', 'embedding', 'model'

## Read preprocessed data

In [0]:
# Read the stored character-level vocabulary index.
#
# indices_char.npy wraps a Python dict in a 0-d object array (that is why
# `.item()` is needed), which NumPy stores via pickle. Since NumPy 1.16.3
# `np.load` refuses pickled payloads unless allow_pickle=True is passed.
# SECURITY: unpickling can execute arbitrary code -- only load a file that
# you generated yourself (e.g. with the preprocessing practical).
np_indices_char = np.load(os.path.join(DATA_PATH, 'indices_char.npy'), allow_pickle=True)

import collections

# Unwrap the dict once instead of calling .item() on every loop iteration.
index_to_char = np_indices_char.item()

# index -> character, inserted in increasing index order (keys 0 .. n-1).
indices_char = collections.OrderedDict(
    (i, index_to_char[i]) for i in range(len(index_to_char))
)

# character -> index (inverse mapping, used to encode text).
char_indices = dict((c, i) for i, c in indices_char.items())

In [0]:
# Load the preprocessed character-level arrays from DATA_PATH:
# training inputs/labels first, then validation inputs/labels.
X_train, y_train, X_valid, y_valid = (
    np.load(os.path.join(DATA_PATH, fname))
    for fname in (
        'X_train_char.npy',
        'y_train_char.npy',
        'X_valid_char.npy',
        'y_valid_char.npy',
    )
)

In [0]:
# Train on a smaller subset to keep the practical fast
# (the original set consists of 25,000 reviews).
n_train, n_valid = 10000, 5000

X_train, y_train = X_train[:n_train], y_train[:n_train]
X_valid, y_valid = X_valid[:n_valid], y_valid[:n_valid]

## Character-level Recurrent Neural Networks (RNN) model

In [0]:
from keras.models import Model
from keras.layers import Dense, Input, Dropout
from keras.layers import LSTM, Lambda, Bidirectional, concatenate
import tensorflow as tf
import keras.callbacks

In [0]:
# Vocabulary size: one slot per known character in the one-hot encoding.
num_chars = len(char_indices)

# Sequence length and recurrent layer size.
max_sequence_length, rnn_dim = 100, 32

# Training settings passed to fit() in the bidirectional LSTM examples.
batch_size, epochs = 128, 10

In [0]:
def binarize(x, sz=num_chars):
    """One-hot encode a tensor of integer character indices.

    Args:
        x: integer tensor of character indices.
        sz: depth of the one-hot axis. The default is bound to the value of
            `num_chars` at the time this cell is executed, so re-run this cell
            if `num_chars` changes.

    Returns:
        A float32 tensor with one extra trailing axis of size `sz`, holding
        1.0 at the position of each index and 0.0 elsewhere.
    """
    # tf.to_float is deprecated and not available in TensorFlow 2.x;
    # tf.cast(..., tf.float32) is the exact equivalent and exists in both.
    return tf.cast(tf.one_hot(x, sz, on_value=1, off_value=0, axis=-1), tf.float32)

In [0]:
def binarize_outshape(in_shape):
    """Return the output shape of the one-hot projection for a given input shape.

    The first two entries of `in_shape` are kept and the module-level
    `num_chars` is appended as the size of the new last axis.
    """
    first_dim, second_dim = in_shape[0], in_shape[1]
    return (first_dim, second_dim, num_chars)

## 1. Bidirectional LSTM model

To be included in model architecture:

* Input layer
* Lambda layer as projection layer for one hot encoding of character input
* Bidirectional LSTM
* Dense layer

#### Example-1: using Bidirectional layer

In [63]:
# construct architecture
# Exercise placeholders: this cell does not run until the four assignments
# below are completed (Input -> Lambda one-hot -> Bidirectional LSTM -> Dense).
# The recorded summary under this cell lists the output shapes as
# (None, 100) -> (None, 100, 71) -> (None, 64) -> (None, 1).
char_input = # YOUR CODE HERE
onehot_layer = # YOUR CODE HERE
bilstm_layer = # YOUR CODE HERE
sentiment_prediction = # YOUR CODE HERE

# define the model from its input/output tensors and print the layer summary
bilstm_model = Model(inputs=char_input, outputs=sentiment_prediction)
bilstm_model.summary()

# compile model
bilstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# train model on the character-level subset, validating on the held-out subset
bilstm_model.fit(X_train, y_train, validation_data=(X_valid, y_valid), batch_size=batch_size, epochs=epochs)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
encoder_input (InputLayer)   (None, 100)               0         
_________________________________________________________________
embedding_encoder (Lambda)   (None, 100, 71)           0         
_________________________________________________________________
bidirectional_7 (Bidirection (None, 64)                26624     
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 65        
Total params: 26,689
Trainable params: 26,689
Non-trainable params: 0
_________________________________________________________________
Train on 10000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f756334df60>

#### Example-2: using concatenate layer 

To be included in model architecture:

* Input layer
* Lambda layer as projection layer for one hot encoding of character input
* Forwards LSTM
* Backwards LSTM
* Concatenation layer
* Dense layer

In [64]:
# construct architecture
# Exercise placeholders: this cell does not run until the assignments below
# are completed. Per the recorded summary under this cell, both LSTM layers
# read the same one-hot layer output and each produces (None, 32) before
# being concatenated.


char_input = # YOUR CODE HERE
onehot_layer = # YOUR CODE HERE
forwards = # YOUR CODE HERE
backwards = # YOUR CODE HERE
merged = # YOUR CODE HERE
sentiment_prediction = # YOUR CODE HERE

# define the model and print the layer summary
# NOTE: this rebinds `bilstm_model`, replacing the model built in Example-1.
bilstm_model = Model(inputs=char_input, outputs=sentiment_prediction)
bilstm_model.summary()

# compile and train model
bilstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
bilstm_model.fit(X_train, y_train, validation_data=(X_valid, y_valid), batch_size=batch_size, epochs=epochs)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input (InputLayer)      (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_encoder (Lambda)      (None, 100, 71)      0           encoder_input[0][0]              
__________________________________________________________________________________________________
lstm_8 (LSTM)                   (None, 32)           13312       embedding_encoder[0][0]          
__________________________________________________________________________________________________
lstm_9 (LSTM)                   (None, 32)           13312       embedding_encoder[0][0]          
__________________________________________________________________________________________________
concatenat

<keras.callbacks.History at 0x7f7561e2fd68>

## 2. CNN + RNN

### Preprocessing documents into split sentences

In [0]:
def striphtml(html):
    """Remove HTML-like tags from a string.

    Every shortest `<...>` span is deleted. `.` does not match a newline,
    so a tag that spans several lines is left untouched.
    """
    return re.sub(r'<.*?>', '', html)

def clean(s):
    """Drop every character outside the 7-bit ASCII range (code point > 0x7f)."""
    return ''.join(ch for ch in s if ord(ch) <= 0x7f)

# Load the raw training / validation reviews (tab-separated, first row is the header).
# `local_download_path` is never defined in this notebook, so a fresh kernel
# raised NameError here; use the DATA_PATH constant from the setup cell.
# NOTE(review): assumes the .tsv files sit next to the .npy files in DATA_PATH -- confirm.
data = pd.read_csv(os.path.join(DATA_PATH, "trainingData.tsv"), header=0, delimiter="\t")
valid_data = pd.read_csv(os.path.join(DATA_PATH, "validationData.tsv"), header=0, delimiter="\t")

# Sentence boundary: whitespace that follows '.' or '?', unless the look-behinds
# match (patterns such as "e.g." or "Mr."). Compiled once and shared by the
# training and validation passes instead of being re-parsed for every review.
_SENT_BOUNDARY = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s')


def split_sentences(text):
    """Strip HTML tags and non-ASCII characters, split into sentences, lower-case each."""
    return [sent.lower() for sent in _SENT_BOUNDARY.split(clean(striphtml(text)))]


# Training set: one list of sentences per review; labels come from the `sentiment` column.
docs_sents = [split_sentences(cont) for cont in data.review]
docs_sents_y = list(data.sentiment)

# Validation set: the label is derived from the part of the id after the first
# underscore -- a value >= 7 is labelled positive (1), anything lower negative (0).
val_docs_sents = [split_sentences(cont) for cont in valid_data.review]
val_docs_sents_y = [
    1 if int(docid.split('_')[1]) >= 7 else 0
    for docid in valid_data.id
]

# Read the stored character-level vocabulary index.
#
# `local_download_path` is never defined in this notebook (NameError on a fresh
# kernel); DATA_PATH is where the same file is read from at the top of the notebook.
# The file wraps a pickled Python dict, so NumPy >= 1.16.3 needs allow_pickle=True.
# SECURITY: unpickling can execute arbitrary code -- only load a file that
# you generated yourself.
np_indices_char = np.load(os.path.join(DATA_PATH, 'indices_char.npy'), allow_pickle=True)

import collections

# Unwrap the dict once instead of calling .item() on every loop iteration.
index_to_char = np_indices_char.item()

# index -> character, inserted in increasing index order (keys 0 .. n-1).
indices_char = collections.OrderedDict(
    (i, index_to_char[i]) for i in range(len(index_to_char))
)

# character -> index (inverse mapping, used to encode the sentences below).
char_indices = dict((c, i) for i, c in indices_char.items())

maxlen = 50 # maximum number of characters kept per sentence
max_sentences = 15 # maximum number of sentences kept per document


def encode_docs(docs, vocab, n_sentences, n_chars):
    """Encode documents as an int32 array of character indices.

    Args:
        docs: list of documents, each a list of sentence strings.
        vocab: mapping from character to integer index.
        n_sentences: sentences kept per document (extra sentences are dropped).
        n_chars: characters kept per sentence (extra characters are dropped).

    Returns:
        Array of shape (len(docs), n_sentences, n_chars). Each sentence is
        written back-to-front: its first character lands in the last column,
        so short sentences leave zeros in the leading columns.

    Raises:
        KeyError: if a character is missing from `vocab`.
    """
    # NOTE: unused positions stay 0, which is also a valid vocabulary index
    # (the index built above starts at 0), so padding is not distinguishable
    # from that character.
    encoded = np.zeros((len(docs), n_sentences, n_chars), dtype=np.int32)
    for i, doc in enumerate(docs):
        for j, sentence in enumerate(doc[:n_sentences]):
            for t, char in enumerate(sentence[:n_chars]):
                encoded[i, j, n_chars - 1 - t] = vocab[char]
    return encoded


X = encode_docs(docs_sents, char_indices, max_sentences, maxlen)
y = np.array(docs_sents_y)

X_val = encode_docs(val_docs_sents, char_indices, max_sentences, maxlen)
y_val = np.array(val_docs_sents_y)

## CNN + RNN Model

In [0]:
import tensorflow as tf
from keras.models import Model
from keras.layers import Dense, Input, Dropout, MaxPooling1D, Conv1D, GlobalMaxPool1D
from keras.layers import LSTM, Lambda, Bidirectional, concatenate, BatchNormalization
from keras.layers import TimeDistributed

### Hierarchical input

In [0]:
# sentence input
# One sentence: a fixed-length vector of `maxlen` int32 character indices.
in_sentence = Input(shape=(maxlen,), dtype='int32')

# document input
# One document: `max_sentences` rows, each a sentence vector of length `maxlen`.
in_document = Input(shape=(max_sentences, maxlen), dtype='int32')

### Sentence encoder

#### One-hot projection layer

To be included:
* Lambda layer as projection layer for mapping input into one hot encoding
* Input for this lambda layer is `in_sentence`

In [0]:
# Exercise placeholder: apply a Lambda one-hot projection layer to `in_sentence`.
# The recorded sentence-encoder summary lists this layer's output as (None, 50, 71).
char_embedded = # YOUR CODE HERE

#### Temporal Convolution layer

To be included in this convolutional block (note that there are 3 different filter lengths, so iterate over them):

* Conv1D layer: 
   - number of filters: the entry of `nb_filter` that corresponds to the current filter length
   - kernel size : according to filter length
   - no padding
   - activation : ReLU
   - kernel_initializer='glorot_normal'
   - strides=1
* Dropout layer
* MaxPooling1D layer:
   - pool_size: pool_length

In [0]:
encodes sentence by character sequences with CNN

filter_length = [7, 5, 3]
nb_filter = [64, 128, 256]
pool_length = 2

for i in range(len(nb_filter)):
    char_embedded = # YOUR CODE HERE (Conv1D layer)

    char_embedded = # YOUR CODE HERE (Dropout layer)
    char_embedded = # YOUR CODE HERE (MaxPooling1D layer)

To be included:
* Bidirectional LSTM/GRU layer

In [0]:
# Exercise placeholder: a Bidirectional LSTM/GRU layer, presumably applied to the
# CNN output held in `char_embedded`; its result is the sentence encoder's output.
bilstm_sent = # YOUR CODE HERE

In [46]:
# Package the sentence-level pipeline (from `in_sentence` to `bilstm_sent`) as its
# own model so it can be applied to every sentence of a document, then print its layers.
sent_encoder = Model(inputs=in_sentence, outputs=bilstm_sent)
sent_encoder.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 50)                0         
_________________________________________________________________
lambda_3 (Lambda)            (None, 50, 71)            0         
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 44, 64)            31872     
_________________________________________________________________
dropout_11 (Dropout)         (None, 44, 64)            0         
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, 22, 64)            0         
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 18, 128)           41088     
_________________________________________________________________
dropout_12 (Dropout)         (None, 18, 128)           0         
__________

### Document encoder

In [0]:
# Run the sentence encoder on each of the `max_sentences` slots of the document input.
# The recorded model summary lists this layer's output as (None, 15, 256).
encoded = TimeDistributed(sent_encoder)(in_document)

### Document decoder

To be included:
* Bidirectional LSTM/GRU layer
* Optional : Dropouts and Dense layer
* Dense layer as prediction layer

In [49]:
# YOUR CODE HERE
# Exercise placeholder: the code written here must define `sentiment_prediction`
# from `encoded`. NOTE: if it is left out, `sentiment_prediction` would still refer
# to the tensor from the bidirectional LSTM examples, which is not connected to
# `in_document`.

# Build the document-level model and print its layer summary.
model = Model(inputs=in_document, outputs=sentiment_prediction)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 15, 50)            0         
_________________________________________________________________
time_distributed_3 (TimeDist (None, 15, 256)           565760    
_________________________________________________________________
bidirectional_6 (Bidirection (None, 256)               394240    
_________________________________________________________________
dropout_14 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 128)               32896     
_________________________________________________________________
dropout_15 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 129       
Total para

In [0]:
# Labels are 0/1 sentiment classes, hence binary cross-entropy; optimise with Adam
# and report accuracy during training.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [0]:
# Take the first 10000 encoded training documents and the first 5000 validation
# documents. NOTE: this rebinds X_train / y_train / X_valid / y_valid, replacing
# the character-level arrays loaded at the top of the notebook.
X_train, y_train = X[:10000], y[:10000]
X_valid, y_valid = X_val[:5000], y_val[:5000]

In [52]:
# Train the document-level model and validate after every epoch.
# NOTE: batch size and epoch count are hard-coded here rather than taken from the
# `batch_size` / `epochs` variables defined for the bidirectional LSTM examples.
model.fit(X_train, y_train, validation_data=(X_valid, y_valid), batch_size=64, epochs=10)

Train on 10000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f75670a9780>

## References:

#### CNN for character sequences: 

[1] Zhang, Xiang, and Yann LeCun. "Text understanding from scratch." arXiv preprint arXiv:1502.01710 (2015). https://arxiv.org/pdf/1502.01710v5.pdf

[2] Kim, Yoon. "Convolutional neural networks for sentence classification." arXiv preprint arXiv:1408.5882 (2014). http://www.aclweb.org/anthology/D14-1181

[3] Conneau, Alexis, et al. "Very deep convolutional networks for text classification." Proceedings of the 15th Conference of the European Chapter of the Association for Computational Linguistics: Volume 1, Long Papers. Vol. 1. 2017. http://www.aclweb.org/anthology/E17-1104

#### CNN + LSTM for character sequences:

[1] Vosoughi, Soroush, Prashanth Vijayaraghavan, and Deb Roy. "Tweet2vec: Learning tweet embeddings using character-level cnn-lstm encoder-decoder." Proceedings of the 39th International ACM SIGIR conference on Research and Development in Information Retrieval. ACM, 2016. https://arxiv.org/pdf/1607.07514.pdf

[2] Kim, Yoon, et al. "Character-Aware Neural Language Models." AAAI. 2016. https://arxiv.org/pdf/1508.06615.pdf 