## Implementation of Highway Networks

This is based on the following paper: <i>Highway Networks</i>. An important implementation note is that when x and H(x,W_h) are not the same shape, a plain layer (without highways) is used to change dimensionality and then continue with the stacking of highway layers. Additionally, the bias terms for the highway block gates are initialized to negative values to promote the "carry" behavior.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Input,Dense,Dropout,Activation
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.initializers import Constant
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,balanced_accuracy_score
from sklearn.utils import shuffle

import warnings
warnings.filterwarnings("ignore")

In [2]:
## load the two pretrained text-embedding models used as feature extractors
universal_embed,nnlm_embed = [
    hub.load(saved_model_dir)
    for saved_model_dir in (
        "../other/universal-sentence-encoder_4",
        "../other/nnlm-en-dim128_2",
    )
]

In [3]:
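## Using Amazon review data. The triple-quoted block below is disabled code that builds
## x / y from the raw review text with the two encoders (presumably how the cached .npy
## files were produced); the cached arrays are loaded from disk instead.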
"""
all_feats = []
all_labels = []
with open("../data/amazon_reviews/amazon_reviews.txt") as data_file:
    for line in data_file.readlines():
        label = line[:10][-1]
        string = line[10:].strip().lower()
        
        if label=="2": # positive review
            all_labels.append(1)
        else: # negative review
            all_labels.append(0)
        
        string = np.expand_dims(np.asarray(string),axis=0)
        all_feats.append(np.hstack([universal_embed(string),nnlm_embed(string)]))

x = np.vstack(all_feats)
y = np.asarray(all_labels).astype("int32")
"""
# read the cached feature matrix and label vector from disk
x,y = [
    np.load(npy_path)
    for npy_path in (
        "../data/amazon_reviews/x.npy",
        "../data/amazon_reviews/y.npy",
    )
]
print(x.shape,y.shape)

(400000, 640) (400000,)


In [4]:
## hold out 5% of the samples as a test set (fixed seed keeps the split reproducible)
x_train,x_test,y_train,y_test = train_test_split(
    x,
    y,
    test_size=0.05,
    random_state=2,
)
print(*(split.shape for split in (x_train,x_test,y_train,y_test)))

(380000, 640) (20000, 640) (380000,) (20000,)


### Highway Network Model

$$ y = H(x,W_h)*T(x,W_t) \;+\; x*(1-T(x,W_t)) $$

- x: input to the NN layer
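- H(...): can be thought of as a standard dense layer, i.e. an affine transformation followed by a non-linearity
- T(...): transform gate (sigmoid) that decides, per unit, how much of H(x) versus the carried input x reaches the output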

In [5]:
def highway_block(x,out_dim):
    """Apply a single highway block: y = T(x)*H(x) + (1 - T(x))*x.

    Args:
        x: input tensor. Its last dimension must equal ``out_dim`` because the
            carry path combines ``x`` element-wise with the transformed output.
        out_dim: number of units of the transform (H) and gate (T) layers.

    Returns:
        The gated combination of the transformed input and the input itself.

    Raises:
        ValueError: if the statically known last dimension of ``x`` differs
            from ``out_dim``.
    """
    in_dim = x.shape[-1]
    # fail early with a clear message instead of a cryptic broadcast error
    # (or a silent broadcast when x has a single feature)
    if in_dim is not None and in_dim != out_dim:
        raise ValueError(
            "highway_block needs matching dimensions for the carry path: "
            "input has {} features but out_dim={}; change the dimensionality "
            "with a plain Dense layer first".format(in_dim,out_dim)
        )
    # H: non-linear transform of the input
    H_tran = Dense(out_dim,activation="relu")(x)
    # T: transform gate; the negative bias keeps the gate mostly closed at
    # initialization so the block starts out carrying its input through
    T_gate = Dense(out_dim,activation="sigmoid",bias_initializer=Constant(value=-2))(x)
    return (T_gate*H_tran)+((1-T_gate)*x)

In [6]:
def highway_network(input_dim=640,highway_dim=256,use_highway=True,num_layers=2):
    """Build a binary classifier made of stacked highway (or plain dense) layers.

    Args:
        input_dim: number of features of each input sample.
        highway_dim: width of every hidden layer.
        use_highway: stack highway blocks when True, plain relu Dense layers
            otherwise.
        num_layers: number of stacked hidden layers after the initial
            projection layer.

    Returns:
        A Keras ``Model`` mapping the input to a single sigmoid output.
    """
    # note the trailing comma: `(input_dim)` is just an int, not a shape tuple
    x = Input(shape=(input_dim,))
    # plain layer that brings the input to highway_dim, so the input and output
    # of every following highway block have the same shape
    h = Dense(highway_dim,activation="relu")(x)
    for _ in range(num_layers):
        if use_highway:
            h = highway_block(h,out_dim=highway_dim)
        else:
            h = Dense(highway_dim,activation="relu")(h) # standard Dense layer
    pred = Dense(1,activation="sigmoid")(h)

    return Model(inputs=[x],outputs=[pred])

In [7]:
def loss_function(y_true,y_pred):
    """Return the binary cross-entropy between the labels and the predictions.

    Args:
        y_true: ground-truth labels.
        y_pred: model outputs for the same samples.
    """
    bce = BinaryCrossentropy()
    return bce(y_true,y_pred)

In [8]:
@tf.function
def train(model,optimizer,x_train_subset,y_train_subset):
    """Run one optimization step on a mini-batch and return its loss.

    Args:
        model: the network being trained.
        optimizer: optimizer used to apply the gradients.
        x_train_subset: features of the mini-batch.
        y_train_subset: labels of the mini-batch.

    Returns:
        The loss computed on the mini-batch before the weight update.
    """
    with tf.GradientTape() as tape:
        # training=True so that layers which behave differently while training
        # (e.g. Dropout, BatchNormalization) run in training mode
        y_train_pred = model(x_train_subset,training=True)
        loss = loss_function(y_train_subset,y_train_pred)
    trainable_vars = model.trainable_variables
    gradients = tape.gradient(loss, trainable_vars)
    optimizer.apply_gradients(zip(gradients, trainable_vars))
    return loss

In [9]:
# five stacked highway blocks, each 128 units wide
model = highway_network(
    highway_dim=128,
    use_highway=True,
    num_layers=5,
)

In [10]:
batch_size=25
# `learning_rate` is the supported keyword; `lr` is only a deprecated alias
optimizer=Adam(learning_rate=0.001)
epochs=4


def _hard_predictions(net,features):
    """Return the 0/1 class predictions of `net` for `features` (threshold 0.5)."""
    probs = net(features).numpy()
    # keep the shape and dtype of the model output, only binarize the values
    return np.where(probs >= 0.5,1,0).astype(probs.dtype)


def _rounded_scores(labels,predictions):
    """Return (accuracy, balanced accuracy), each rounded to 4 decimals."""
    return (round(accuracy_score(labels,predictions),4),
            round(balanced_accuracy_score(labels,predictions),4))


for epoch_i in range(epochs):
    # one pass over the training data in consecutive mini-batches
    losses = []
    for i in range(0,len(x_train),batch_size):
        x_train_subset = x_train[i:i+batch_size]
        y_train_subset = y_train[i:i+batch_size]
        batch_loss = train(model,optimizer,x_train_subset,y_train_subset)
        losses.append(float(batch_loss))

    print("epoch {}; loss:{}".format(epoch_i,round(sum(losses)/len(losses),4)))

    # evaluate on the full train and test sets after every epoch
    y_train_pred = _hard_predictions(model,x_train)
    y_test_pred = _hard_predictions(model,x_test)

    train_acc,train_bal_acc = _rounded_scores(y_train,y_train_pred)
    test_acc,test_bal_acc = _rounded_scores(y_test,y_test_pred)

    print("-Train; accuracy:{}; bal_accuracy:{}".format(train_acc,train_bal_acc))
    print("-Test; accuracy:{}; bal_accuracy:{}".format(test_acc,test_bal_acc))

epoch 0; loss:0.2699
-Train; accuracy:0.896; bal_accuracy:0.896
-Test; accuracy:0.8878; bal_accuracy:0.8879
epoch 1; loss:0.2475
-Train; accuracy:0.9019; bal_accuracy:0.9019
-Test; accuracy:0.8904; bal_accuracy:0.8906
epoch 2; loss:0.2363
-Train; accuracy:0.9063; bal_accuracy:0.9063
-Test; accuracy:0.8935; bal_accuracy:0.8937
epoch 3; loss:0.2265
-Train; accuracy:0.91; bal_accuracy:0.91
-Test; accuracy:0.8943; bal_accuracy:0.8945
