Prepare dependencies and global variables

In [0]:
# Mount Google Drive under /content/drive; every data, GloVe and model path
# defined in this notebook lives below that mount point.
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

import re
import sys
import pickle
import os

import numpy as np
import pandas as pd

from collections import defaultdict
from bs4 import BeautifulSoup

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, concatenate, LSTM, GRU, Bidirectional
from keras.models import Model

from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializers


# Length every tokenised review is padded/truncated to (pad_sequences maxlen,
# Embedding input_length and the model Input shape all use it).
MAX_SEQUENCE_LENGTH = 1000
# Passed to Tokenizer(num_words=...) when vectorising the reviews.
MAX_NB_WORDS = 20000
# Width of one word vector; matches the 100d GloVe file loaded below.
EMBEDDING_DIM = 100
# Fraction of the shuffled samples held out as the validation set.
VALIDATION_SPLIT = 0.2

# NOTE: despite the _DIR suffix this is the path of the TSV *file* itself;
# it is handed straight to pd.read_csv.
TSV_DIR = "/content/drive/My Drive/Text Classification/Testground/data/imdb/labeledTrainData.tsv"
# Directory that must contain 'glove.6B.100d.txt'.
GLOVE_DIR = "/content/drive/My Drive/Text Classification/Testground/data/glove/"
# Directory the trained models are saved into (created on demand by save_model).
TRAINED_MODEL_DIR = "/content/drive/My Drive/Text Classification/Testground/model/RNN/"

Prepare dependent functions

In [0]:
def get_train_data(dir):
    """Read the tab-separated file at `dir` and return it as a DataFrame.

    Args:
        dir: path of the TSV file (a file path, not a directory).

    Returns:
        The pandas DataFrame produced by ``pd.read_csv(dir, sep='\\t')``.
    """
    return pd.read_csv(dir, sep='\t')

def get_pretrained_glove_vector(dir):
    """Load the 100-dimensional GloVe 6B vectors into a dictionary.

    Args:
        dir: directory containing the file 'glove.6B.100d.txt'.

    Returns:
        dict mapping each word (first whitespace-separated field of a line)
        to a float32 numpy array built from the remaining fields.

    Raises:
        OSError: if the GloVe file cannot be opened.
        ValueError: if a coefficient cannot be parsed as a float.
    """
    embeddings_index = {}
    glove_path = os.path.join(dir, 'glove.6B.100d.txt')

    # The context manager closes the file even when parsing fails, and the
    # explicit encoding avoids depending on the platform default for a file
    # that contains non-ASCII tokens.
    with open(glove_path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

    print('Total %s word vectors in Glove 6B 100d.' % len(embeddings_index))
    return embeddings_index

# strip characters that only add noise to the review text
def clean_str(string):
    """
    String cleaning for the dataset.

    Expects a bytes object: removes every backslash, single quote and
    double quote, then trims surrounding whitespace and lower-cases
    the result.
    """
    for unwanted in (b'\\\\', b"\'", b"\""):
        string = re.sub(unwanted, b"", string)
    trimmed = string.strip()
    return trimmed.lower()

def standardized_text(data_train, row=None):
  """Return one review as cleaned, lower-cased plain text.

  Args:
    data_train: object exposing a `review` column indexable by position
      label (the DataFrame returned by get_train_data in this notebook).
    row: index of the review to process. When omitted, the notebook-level
      loop variable `idx` is used, which keeps the original one-argument
      call style working.

  Returns:
    str: the review with HTML tags removed, non-ASCII characters dropped,
    and backslashes/quotes stripped by clean_str.
  """
  # Prefer the explicit argument; fall back to the global `idx` only for
  # backward compatibility with existing callers.
  position = idx if row is None else row

  # parse the review with BeautifulSoup and drop the HTML tags
  # NOTE(review): no parser is named, so BeautifulSoup picks one itself.
  soup = BeautifulSoup(data_train.review[position])
  ascii_bytes = soup.get_text().encode('ascii', 'ignore')
  cleaned_bytes = clean_str(ascii_bytes)
  # clean_str works on bytes; decode back to str for the tokenizer
  return cleaned_bytes.decode('utf-8')

def create_simplified_cnn_network(filter, activate_function="relu"):
    """Build and compile a three-stage 1-D CNN classifier.

    Args:
        filter: input tensor fed to the first convolution.
        activate_function: activation for the Conv1D layers and the hidden
            Dense layer.

    Returns:
        A compiled Model mapping the notebook-level `sequence_input` to a
        2-way softmax (categorical cross-entropy, rmsprop, accuracy metric).
    """
    features = filter
    # Conv1D(128, 5) followed by max pooling, three times. The final window
    # of 35 is meant as global max pooling (see original note); that holds
    # when the remaining sequence length is 35 — TODO confirm for other
    # input lengths.
    for pool_size in (5, 5, 35):
        features = Conv1D(128, 5, activation=activate_function)(features)
        features = MaxPooling1D(pool_size)(features)

    flattened = Flatten()(features)
    hidden = Dense(128, activation=activate_function)(flattened)
    preds = Dense(2, activation='softmax')(hidden)

    # `sequence_input` is read from the notebook's global scope.
    model = Model(sequence_input, preds)
    model.compile(
        loss='categorical_crossentropy',
        optimizer='rmsprop',
        metrics=['acc'],
    )
    return model

def create_deeper_cnn_network(filter, activate_function="relu"):
    """Build and compile a CNN classifier on top of concatenated branches.

    Args:
        filter: the tensors handed to `concatenate(..., axis=1)`
            (presumably a list of convolution branches — verify against
            the caller).
        activate_function: activation for the Conv1D layers and the hidden
            Dense layer.

    Returns:
        A compiled Model mapping the notebook-level `sequence_input` to a
        2-way softmax (categorical cross-entropy, rmsprop, accuracy metric).
    """
    features = concatenate(filter, axis=1)
    # Two Conv1D(128, 5) + max-pooling stages with windows 5 and 30.
    for pool_size in (5, 30):
        features = Conv1D(128, 5, activation=activate_function)(features)
        features = MaxPooling1D(pool_size)(features)

    flattened = Flatten()(features)
    hidden = Dense(128, activation=activate_function)(flattened)
    preds = Dense(2, activation='softmax')(hidden)

    # `sequence_input` is read from the notebook's global scope.
    model = Model(sequence_input, preds)
    model.compile(
        loss='categorical_crossentropy',
        optimizer='rmsprop',
        metrics=['acc'],
    )
    return model

def save_model(model, save_dir, name):
    """Save `model` as `name` inside `save_dir`, creating the directory.

    Args:
        model: object exposing a ``save(path)`` method (a Keras model here).
        save_dir: target directory; created, with parents, when missing.
        name: file name of the saved model, e.g. "LSTM_model.h5".
    """
    # exist_ok=True removes the check-then-create race of the
    # os.path.exists() / os.makedirs() pair.
    os.makedirs(save_dir, exist_ok=True)
    # os.path.join avoids a doubled separator when save_dir already ends
    # with "/", as TRAINED_MODEL_DIR does.
    model.save(os.path.join(save_dir, name))

def load_model(load_dir, model_name, custom_objects=None):
    """Load a model previously written by save_model.

    Args:
        load_dir: directory containing the saved model file.
        model_name: file name of the saved model.
        custom_objects: optional mapping of names to custom classes or
            functions needed to rebuild the model, forwarded to Keras
            (e.g. ``{'AttLayer': AttLayer}`` for the attention model).

    Returns:
        The model returned by ``keras.models.load_model``.
    """
    # Imported locally under an alias: the notebook only uses
    # `from keras... import ...`, so the bare name `keras` is undefined,
    # and this function itself is called `load_model`.
    from keras.models import load_model as keras_load_model

    return keras_load_model(os.path.join(load_dir, model_name),
                            custom_objects=custom_objects)

# The embedding layer is backed by a matrix with one row per word index
# and one column per embedding component, so the embedding width must be
# given explicitly. The GloVe file loaded here has 100-dimensional vectors
# (EMBEDDING_DIM).
def get_embedding_layer():
    """Create a trainable Embedding layer initialised from GloVe.

    Reads the notebook-level `word_index`, `GLOVE_DIR`, `EMBEDDING_DIM` and
    `MAX_SEQUENCE_LENGTH`.

    Returns:
        An Embedding layer with ``len(word_index) + 1`` rows whose weights
        start from the GloVe vectors where available.
    """
    glove_vectors = get_pretrained_glove_vector(GLOVE_DIR)

    # One extra row on top of the vocabulary size (presumably index 0 is
    # left for padding — verify against the tokenizer).
    vocab_rows = len(word_index) + 1
    embedding_matrix = np.random.random((vocab_rows, EMBEDDING_DIM))

    for word, row in word_index.items():
        vector = glove_vectors.get(word)
        if vector is None:
            # Words missing from GloVe keep their random initial values.
            continue
        embedding_matrix[row] = vector

    # trainable=True lets training keep adjusting the pre-trained vectors.
    return Embedding(
        vocab_rows,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=True,
    )


Prepare data to train

In [0]:
# Load the labelled reviews; the `review` and `sentiment` columns are used.
data_train = get_train_data(TSV_DIR)
texts = []
labels = []

# NOTE: the loop variable must be named `idx` — standardized_text reads it
# from the global scope to pick the review to clean.
for idx in range(data_train.review.shape[0]):
  texts.append(standardized_text(data_train))
  labels.append(data_train.sentiment[idx])
    

# vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)


# word -> integer index mapping learned by the tokenizer; also read as a
# global by get_embedding_layer()
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# bring every sequence to exactly MAX_SEQUENCE_LENGTH entries
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# convert the labels to categorical (one column per class) form, matching
# the 2-unit softmax outputs of the models below
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# shuffle samples and labels with the same random permutation, then
# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

# The training data & label for each data set
# training_data  = x_train
# training_label = y_train
# (everything except the last nb_validation_samples rows)
training_data = data[:-nb_validation_samples]
training_label = labels[:-nb_validation_samples]

# The validation data & label
# validation_data     = x_val
# validation_label    = y_val
# (the last nb_validation_samples rows)
validation_data = data[-nb_validation_samples:]
validation_label = labels[-nb_validation_samples:]

# column sums of the categorical labels = number of samples per class
print('Training and validation set number of positive and negative reviews')
print (training_label.sum(axis=0))
print (validation_label.sum(axis=0))

# prepare embedding layer
embedding_layer = get_embedding_layer()

# shared model input and its embedded form; both are reused by the
# training cells further down
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

print("...Data is ready to train")

# **Train data using LSTM model**
The implementation uses a Bidirectional LSTM and concatenates the final outputs of the forward and backward LSTMs.

In [0]:
# Bidirectional wrapper around a 100-unit LSTM applied to the embedded
# reviews (embedded_sequences / sequence_input come from the data cell).
lstm_layer = Bidirectional(LSTM(100))(embedded_sequences)
# 2-way softmax to match the categorical labels
preds = Dense(2, activation='softmax')(lstm_layer)
model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

print("model fitting - Bidirectional LSTM")
model.summary()
# train for 10 epochs in batches of 50, evaluating on the held-out split
model.fit(training_data, training_label, validation_data=(validation_data, validation_label),
          epochs=10, batch_size=50)

# persist the trained model under TRAINED_MODEL_DIR
save_model(model, TRAINED_MODEL_DIR, "LSTM_model.h5")
print("...Trained data model is saved")

# **Train data using Attention Network**



```
This model runs only on the Theano backend, because the TensorFlow backend's matrix dot product does not behave the same way as np.dot.

Make sure to switch your backend to Theano first.
```



Run the cell below first to check which backend Keras is using.

If the backend is TensorFlow, choose "Factory reset runtime" in the "Runtime" menu.

If the backend is Theano, keep going!

In [0]:
# Request the Theano backend, then report which backend Keras actually uses.
# NOTE(review): keras was already imported at the top of the notebook, and
# the recorded output of this cell still shows "tensorflow" — the variable
# presumably only takes effect when set before the first keras import
# (hence the runtime-reset instruction above). TODO confirm.
os.environ['KERAS_BACKEND']='theano'

from keras import __version__
from keras import backend as K

print('Using Keras version:', __version__, 'backend:', K.backend())

Using Keras version: 2.3.1 backend: tensorflow


Build the custom Keras layer used by the attention network

In [0]:


# Attention GRU network

class AttLayer(Layer):
    """Attention pooling over the second axis of a 3-D input.

    A single trainable vector `W` (one entry per feature of the last axis)
    scores every step with ``tanh(x . W)``. The scores are turned into
    weights with an exp-normalisation along axis 1, and the layer returns
    the weighted sum of the steps, i.e. a tensor of shape
    ``(input_shape[0], input_shape[-1])``.

    The computation uses only backend-agnostic ``keras.backend`` functions,
    so it does not rely on Theano-specific tensor methods such as
    ``dimshuffle``.
    """

    def __init__(self, **kwargs):
        # Initialiser used for the attention vector created in build().
        self.init = initializers.get('normal')
        super(AttLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the attention vector; the input must be 3-dimensional.

        Raises:
            ValueError: if `input_shape` does not have exactly 3 entries.
        """
        if len(input_shape) != 3:
            raise ValueError(
                'AttLayer expects a 3-D input, got shape %s' % (input_shape,))
        self.W = self.add_weight(name='kernel',
                                 shape=(input_shape[-1],),
                                 initializer=self.init,
                                 trainable=True)
        super(AttLayer, self).build(input_shape)

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` along axis 1.

        `mask` is accepted for API compatibility but is not used.
        """
        # W is stored as a vector; expand it to a column so K.dot yields one
        # score per step, then drop the trailing singleton axis.
        eij = K.tanh(K.squeeze(K.dot(x, K.expand_dims(self.W)), axis=-1))

        # Normalise the exponentiated scores along the step axis.
        ai = K.exp(eij)
        weights = ai / K.expand_dims(K.sum(ai, axis=1), 1)

        # Broadcast the per-step weights over the feature axis and sum.
        weighted_input = x * K.expand_dims(weights, 2)
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """The step axis is reduced away: (first axis, last axis)."""
        return (input_shape[0], input_shape[-1])

Train model using Attention GRU model

In [0]:
# Bidirectional 100-unit GRU; return_sequences=True keeps the per-step
# outputs so the attention layer receives the 3-D input its build() requires.
gru_layer = Bidirectional(GRU(100, return_sequences=True))(embedded_sequences)
# add an attention layer on top of GRU Output
att_layer = AttLayer()(gru_layer)

# 2-way softmax to match the categorical labels
preds = Dense(2, activation='softmax')(att_layer)
model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

print("model fitting - attention GRU network")
model.summary()
# train for 10 epochs in batches of 50, evaluating on the held-out split
model.fit(training_data, training_label, validation_data=(validation_data, validation_label),
          epochs=10, batch_size=50)

# persist the trained model under TRAINED_MODEL_DIR
save_model(model, TRAINED_MODEL_DIR, "GRU_model.h5")
print("...Trained data model is saved")