# Treating embeddings as separate continuous features

The two embeddings used here are GloVe and fastText. In the model, each embedding is passed to its own embedding layer, and each embedding gets its own feature-extraction layers.

Sequence features are extracted by first applying spatial dropout to the embedding vectors. Then global max and global average features are pooled from the self-attended RNN output.

The max-pooled features for both embeddings are then concatenated and passed to a dense layer for feature extraction.

The same is done for the average-pooled features; the two sets of pooled features are then concatenated and passed to a fully connected network for classification.


In [1]:
# Record the library and interpreter versions this notebook was executed with.
import keras
import tensorflow
import sys

(keras.__version__, tensorflow.__version__, sys.version)

Using TensorFlow backend.


('2.2.2',
 '1.10.0',
 '3.6.6 |Anaconda, Inc.| (default, Jun 28 2018, 11:27:44) [MSC v.1900 64 bit (AMD64)]')

In [2]:
# import required packages
import sys
import warnings
import os
os.environ["CUDA_VISIBLE_DEVICES"]="1" 

if not sys.warnoptions:
    warnings.simplefilter("ignore")

import keras
import tensorflow as tf

from keras.models import Model

from keras.layers import CuDNNLSTM, CuDNNGRU, BatchNormalization, Dense, Dropout, Activation, Embedding, Input
from keras.layers import Bidirectional,SpatialDropout1D, GlobalAveragePooling1D, GlobalMaxPooling1D, Concatenate

from keras.optimizers import Adam

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical

from keras_self_attention import  SeqSelfAttention

from sklearn.metrics import confusion_matrix,f1_score, precision_score, recall_score, roc_auc_score, accuracy_score
from sklearn.cross_validation import train_test_split

import pandas as pd
import numpy as np
import re
from glob import glob

import math
from snapshot import SnapshotCallbackBuilder
import time



In [3]:
def load_imdb_dataset(base_dir="."):
    """Load the IMDB reviews stored under ``<base_dir>/train`` and ``<base_dir>/test``.

    Each split directory must contain a ``pos`` and a ``neg`` sub-directory.
    Every file ending in ``.txt`` inside them is read as UTF-8 and becomes one
    row. Within a split, ``pos`` files come first, then ``neg`` files, each in
    sorted filename order.

    Parameters
    ----------
    base_dir : str, optional
        Directory holding the ``train`` and ``test`` folders. Defaults to the
        current working directory, which is where the original implementation
        always looked.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)``; each frame has the columns ``text`` (review body)
        and ``positive`` (1 for ``pos``, 0 for ``neg``).

    Raises
    ------
    OSError
        If one of the expected directories does not exist or a file cannot
        be read.
    """
    columns = ["text", "positive"]
    frames = []
    for split in ("train", "test"):
        # Collect plain tuples and build the frame once per split; appending
        # rows with ``df.loc[i] = ...`` re-allocates the frame on every insert.
        records = []
        for cls in ("pos", "neg"):
            dset_path = os.path.join(base_dir, split, cls)
            for fname in sorted(os.listdir(dset_path)):
                if not fname.endswith(".txt"):
                    continue
                with open(os.path.join(dset_path, fname), encoding="utf8") as f:
                    records.append((f.read(), int(cls == "pos")))
        frames.append(pd.DataFrame(records, columns=columns))

    train, test = frames
    return train, test

In [4]:
# Read both splits from disk, then report how many rows/columns each one has.
train, test = load_imdb_dataset()

for split_name, split_df in (("Train", train), ("Test", test)):
    print(split_name + " data shape", split_df.shape)

Train data shape (25000, 2)
Test data shape (25000, 2)


In [5]:
# Show how many positive (1) and negative (0) reviews each split contains.
train_class_counts = train.positive.value_counts()
test_class_counts = test.positive.value_counts()

print("Train data class distbn", train_class_counts)
print("Test data class distbn", test_class_counts)

Train data class distbn 1    12500
0    12500
Name: positive, dtype: int64
Test data class distbn 1    12500
0    12500
Name: positive, dtype: int64


In [6]:
# Preview the first five training reviews.
train.head(n=5)

Unnamed: 0,text,positive
0,Bromwell High is a cartoon comedy. It ran at t...,1
1,Homelessness (or Houselessness as George Carli...,1
2,Brilliant over-acting by Lesley Ann Warren. Be...,1
3,This is easily the most underrated film inn th...,1
4,This is not the typical Mel Brooks film. It wa...,1


In [7]:
# Preview the first five test reviews.
test.head(n=5)

Unnamed: 0,text,positive
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1
3,"I saw this film in a sneak preview, and it is ...",1
4,Bill Paxton has taken the true story of the 19...,1


In [8]:
# Distribution of the number of whitespace-separated words per review.
def _word_count(review):
    """Return the number of whitespace-separated tokens in ``review``."""
    return len(review.split())


tr_l = list(map(_word_count, train.text))
te_l = list(map(_word_count, test.text))

train_length_stats = pd.Series(tr_l).describe()
test_length_stats = pd.Series(te_l).describe()

print("Train Sequence length distribution:\n")
print(train_length_stats)
print("\n\nTest Sequence length distribution:\n")
print(test_length_stats)

Train Sequence length distribution:

count    25000.000000
mean       233.787200
std        173.733032
min         10.000000
25%        127.000000
50%        174.000000
75%        284.000000
max       2470.000000
dtype: float64


Test Sequence length distribution:

count    25000.000000
mean       228.526680
std        168.883693
min          4.000000
25%        126.000000
50%        172.000000
75%        277.000000
max       2278.000000
dtype: float64


In [9]:
# Fit a tokenizer on the training reviews only; the number of distinct words
# it has counted is reported as the vocabulary size.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train.text.tolist())

vocab_size = len(tokenizer.word_counts)
print("Vocab size", vocab_size)

Vocab size 88582


In [10]:
# Dimensionality of the word vectors passed to the Embedding layers.
embed_size = 300 

# The median (50th percentile) number of words per review in the train set is
# taken as the maximum sequence length (used as `maxlen` when padding).
max_sent_len = int(np.percentile(tr_l, 50)) 

# Vocabulary size: number of distinct words counted by the fitted tokenizer.
# NOTE(review): Keras Tokenizer word indices appear to start at 1, so the
# largest index would equal this value and fall outside an Embedding created
# with input_dim=num_words — confirm. Switching to len(word_index) + 1 would
# also require the embedding matrices to be built with that many rows.
num_words = len(tokenizer.word_counts)

In [11]:
# Convert each review into a fixed-length sequence of integer word tokens,
# which is the input format required for training and prediction.
def _to_padded_sequences(texts):
    """Tokenize ``texts`` with the fitted tokenizer and pad to ``max_sent_len``."""
    token_ids = tokenizer.texts_to_sequences(texts)
    return pad_sequences(token_ids, maxlen=max_sent_len)


X = _to_padded_sequences(train.text)
x_test = _to_padded_sequences(test.text)

In [12]:
# Hold out 10% of the training reviews as a validation set (fixed seed).
split_parts = train_test_split(X, train.positive, test_size=0.1, random_state=3)
x_train, x_val, y_train, y_val = split_parts

(x_train.shape, x_val.shape)

((22500, 174), (2500, 174))

In [13]:
# Functions to load different embeddings

def _load_embedding_matrix(word_index, embedding_file, max_words=None, min_line_len=0):
    """Build an embedding matrix for ``word_index`` from a text vector file.

    The file holds one entry per line: the word, then its coefficients, all
    separated by single spaces. Rows of the result are first drawn from a
    normal distribution matching the mean/std of all vectors in the file, then
    overwritten with the file's vector for every word that has one.

    Parameters
    ----------
    word_index : dict
        Maps each word to its integer index (row of the matrix).
    embedding_file : str
        Path of the UTF-8 encoded vector file.
    max_words : int, optional
        Upper bound on the number of rows. When omitted, the notebook-level
        ``num_words`` is used.
    min_line_len : int, optional
        Lines with at most this many characters are skipped (used to drop the
        short header line of ``.vec`` files).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(min(max_words, len(word_index) + 1), dim)``.

    Raises
    ------
    ValueError
        If no vector could be read from ``embedding_file``.
    """
    if max_words is None:
        max_words = num_words

    embeddings_index = {}
    # The context manager closes the (potentially multi-GB) file promptly.
    with open(embedding_file, encoding="utf8") as fh:
        for line in fh:
            if len(line) <= min_line_len:
                continue
            # Split on a single space only: some vocabulary entries contain
            # other whitespace characters that a bare split() would break up.
            word, *coefs = line.rstrip().split(" ")
            if not coefs:
                continue
            embeddings_index[word] = np.asarray(coefs, dtype='float32')

    if not embeddings_index:
        raise ValueError("no embedding vectors found in {!r}".format(embedding_file))

    # np.stack needs a real sequence; a dict view is rejected by newer NumPy.
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    # "+ 1" leaves room for the highest index when word_index is 1-based;
    # without it, max_words > len(word_index) indexes past the last row.
    nb_words = min(max_words, len(word_index) + 1)
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix


def load_glove(word_index,
               embedding_file='../embeddings/glove.840B.300d/glove.840B.300d.txt',
               max_words=None):
    """Return the embedding matrix for ``word_index`` built from GloVe vectors.

    Parameters
    ----------
    word_index : dict
        Maps each word to its integer index.
    embedding_file : str, optional
        Path of the GloVe text file.
    max_words : int, optional
        Upper bound on the number of rows; defaults to the notebook-level
        ``num_words``.

    Returns
    -------
    numpy.ndarray
        Matrix whose row ``i`` is the vector of the word with index ``i``
        (random for words absent from the file).
    """
    return _load_embedding_matrix(word_index, embedding_file, max_words=max_words)


def load_fasttext(word_index,
                  embedding_file='../embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec',
                  max_words=None):
    """Return the embedding matrix for ``word_index`` built from fastText vectors.

    Lines of 100 characters or fewer are ignored, which drops the short header
    line at the top of the ``.vec`` file.

    Parameters
    ----------
    word_index : dict
        Maps each word to its integer index.
    embedding_file : str, optional
        Path of the fastText ``.vec`` file.
    max_words : int, optional
        Upper bound on the number of rows; defaults to the notebook-level
        ``num_words``.

    Returns
    -------
    numpy.ndarray
        Matrix whose row ``i`` is the vector of the word with index ``i``
        (random for words absent from the file).
    """
    return _load_embedding_matrix(word_index, embedding_file, max_words=max_words,
                                  min_line_len=100)

In [14]:
# Word -> integer token mapping learned by the tokenizer, used to align the
# rows of each pre-trained embedding matrix with the model's token ids.
word_index = tokenizer.word_index

# GloVe is loaded first, then fastText (same order as the assignments).
embedding_matrix_glove, embedding_matrix_ft = (
    loader(word_index) for loader in (load_glove, load_fasttext)
)

In [15]:
# Configure the snapshot-ensemble callbacks used during training.
# NOTE(review): SnapshotCallbackBuilder comes from the local `snapshot` module;
# presumably it saves a model snapshot every T / M epochs and cosine-anneals
# the learning rate starting from alpha_zero — confirm against that module.

M = 2
T = 50
nb_epoch = T
alpha_zero = 5e-4

snapshot = SnapshotCallbackBuilder(T, M, alpha_zero)

# A timestamp keeps the files of different notebook runs apart.
timestr = time.strftime("%Y-%m-%d_%H-%M-%S")
model_prefix = './imdb' + timestr

callbacks = snapshot.get_callbacks(model_prefix=model_prefix)


In [16]:
def _pooled_sequence_features(embedding_matrix):
    """Build one embedding branch of the network.

    The token input is looked up in a frozen Embedding layer initialised with
    ``embedding_matrix``, passed through spatial dropout and a bidirectional
    CuDNNGRU, and the GRU output is concatenated with its self-attended
    version. Returns ``(input_tensor, avg_pooled, max_pooled)`` where the two
    pooled tensors are the global average / global max over that concatenation.
    """
    embedding_layer = Embedding(num_words, embed_size, input_length=max_sent_len, trainable=False,
                                weights=[embedding_matrix])
    sequence_input = Input(shape=(max_sent_len,), dtype='int32')

    embedded = embedding_layer(sequence_input)
    embedded = SpatialDropout1D(0.2)(embedded)

    rnn_out = Bidirectional(CuDNNGRU(64, return_sequences=True), merge_mode='concat')(embedded)
    attended = SeqSelfAttention()(rnn_out)

    merged = Concatenate()([rnn_out, attended])

    avg_pooled = GlobalAveragePooling1D()(merged)
    max_pooled = GlobalMaxPooling1D()(merged)
    return sequence_input, avg_pooled, max_pooled


def _dense_block(features, units):
    """Apply Dense(units, relu) -> BatchNormalization -> Dropout(0.4)."""
    features = Dense(units, activation="relu", kernel_initializer="glorot_normal")(features)
    features = BatchNormalization()(features)
    return Dropout(0.4)(features)


def _build_model():
    """Assemble the two-input (GloVe + fastText) binary classifier."""
    # One feature-extraction branch per embedding; GloVe is built first.
    input_g, avg_g, max_g = _pooled_sequence_features(embedding_matrix_glove)
    input_f, avg_f, max_f = _pooled_sequence_features(embedding_matrix_ft)

    # Max-pooled features of both embeddings -> dense feature extraction.
    max_features = _dense_block(Concatenate()([max_g, max_f]), 128)

    # Average-pooled features of both embeddings -> dense feature extraction.
    avg_features = _dense_block(Concatenate()([avg_g, avg_f]), 128)

    # Fully connected classifier on top of both feature sets.
    x = Concatenate()([max_features, avg_features])
    for units in (128, 64, 16):
        x = _dense_block(x, units)

    out = Dense(1, activation="sigmoid", kernel_initializer="glorot_normal")(x)
    return Model([input_g, input_f], out)


pred_avg = []  # one list of hard 0/1 test predictions per trained model
real = list(test.positive)

# Train 5 independently initialised models on the same train/validation split
# (this is repeated training for an ensemble, not k-fold cross validation).
# NOTE(review): the same `callbacks` list is reused for every run — confirm
# that the snapshot callbacks carry no unwanted state between fits.
for cv in range(5):
    model = _build_model()

    model.compile(loss="binary_crossentropy", optimizer=Adam(5e-5), metrics=['accuracy'])
    # The same token sequences feed both inputs; only the embeddings differ.
    model.fit([x_train, x_train], y_train, validation_data=([x_val, x_val], y_val),
              epochs=nb_epoch, verbose=0, batch_size=100, shuffle=True, callbacks=callbacks)

    probabilities = model.predict(x=[x_test, x_test])
    pred = [int(p[0]) for p in probabilities > 0.5]
    pred_avg.append(pred)
    print("Model:", cv, ", Accuracy_score:", accuracy_score(real, pred))
    del model

# Majority vote: a review is positive when more than half of the models say so.
vote_share = np.mean(pred_avg, axis=0)
pred = [int(p) for p in vote_share > 0.5]


Model: 0 , Accuracy_score: 0.88964
Model: 1 , Accuracy_score: 0.89272
Model: 2 , Accuracy_score: 0.8908
Model: 3 , Accuracy_score: 0.88996
Model: 4 , Accuracy_score: 0.89064


## F1 Score: 90.59 

In [17]:
# Evaluate the majority-vote ensemble predictions against the test labels.
print("Confusion Matrix:\n", confusion_matrix(real, pred))

score_report = (
    "f1_score:", f1_score(real, pred),
    "precision_score:", precision_score(real, pred),
    "recall_score:", recall_score(real, pred),
    "accuracy_score:", accuracy_score(real, pred),
)
print(*score_report)

Confusion Matrix:
 [[11306  1194]
 [ 1161 11339]]
f1_score: 0.9059241800822915 precision_score: 0.9047315088167238 recall_score: 0.90712 accuracy_score: 0.9058
