## Implementation of deep domain confusion

This is based on the paper: <i>Deep Domain Confusion: Maximizing for Domain Invariance</i>. In this case I am not fine-tuning a pretrained model but rather using a pretrained feature extractor. The task is sentiment analysis in which the target domain (Yelp reviews) has no labeled data (Amazon reviews are the source).

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Input,Dense,Dropout,Activation
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.initializers import Constant
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,balanced_accuracy_score
from sklearn.utils import shuffle

import warnings
warnings.filterwarnings("ignore")

In [2]:
## feature extraction using pretrained feature extractor
universal_embed = hub.load("../other/universal-sentence-encoder_4")
nnlm_embed = hub.load("../other/nnlm-en-dim128_2")

In [3]:
## Getting the Yelp review data
"""
all_yelp_reviews = pd.read_csv('../data/yelp_reviews/train.csv')
all_feats = []
all_labels = []
for _,row in all_yelp_reviews.iterrows():
    label,string = int(row['score'])-1,row['review'].strip().lower()
    string = np.expand_dims(np.asarray(string),axis=0)
    all_feats.append(np.hstack([universal_embed(string),nnlm_embed(string)]))
    all_labels.append(label)

x = np.vstack(all_feats)
y = np.asarray(all_labels).astype("int32")
"""
yelp_x = np.load("../data/yelp_reviews/x.npy")
yelp_y = np.load("../data/yelp_reviews/y.npy")
yelp_x = yelp_x[:10000]
yelp_y = yelp_y[:10000]
print(yelp_x.shape,yelp_y.shape)

(10000, 640) (10000,)


In [4]:
## Getting the Amazon review data
amazon_x = np.load("../data/amazon_reviews/x.npy")
amazon_y = np.load("../data/amazon_reviews/y.npy")
amazon_x,amazon_y = amazon_x[:11000],amazon_y[:11000]
amazon_x_val,amazon_y_val = amazon_x[10000:],amazon_y[10000:]
amazon_x,amazon_y = amazon_x[:10000],amazon_y[:10000]
print(amazon_x.shape,amazon_y.shape,amazon_x_val.shape,amazon_y_val.shape)

(10000, 640) (10000,) (1000, 640) (1000,)


### Model implementation

As defined in <i>Beyond Sharing Weights for Deep Domain Adaptation<i>:
$$ MMD^2({f^s_i},{f^t_j}) = \sum_{i,i^`}\frac{k(f^s_i,f^s_{i^`})}{(N^s)^2} + \sum_{j,j^`}\frac{k(f^t_j,f^t_{j^`})}{(N^t)^2} - 2 \sum_{i,j}\frac{k(f^s_i,f^t_j)}{N^s N^t} $$

$$ k(u,v) = exp(-|| u-v ||^2 / \sigma) $$

In [5]:
def get_model(input_dim=640):
    """ model implementation
        -also returns the last hidden state (to use w/ MMD loss component)
        -using the h2 vector prior to the relu activation makes no difference in practice
    """
    x = Input(shape=(input_dim))
    h1 = Dense(512,activation='relu')(x)
    h2 = Dense(256,activation='relu')(h1)
    out = Dense(1,activation='sigmoid')(h2)
    
    model = Model(inputs=x,outputs=[out,h2])
    return model

In [6]:
def get_mmd_unit(mat1,mat2,sigma=1,c=1e-18):
    """ calculates MMD components
        -c is used to stabilize gradient flow
    """
    mat1 = tf.expand_dims(mat1,axis=0) # done for tf broadcasting
    mat2 = tf.expand_dims(mat2,axis=1)
    diff = tf.reshape(tf.subtract(mat1,mat2),[-1,2]) # difference between all rows in mat1 and mat2, stacked
    squared_euclid_distance = tf.sqrt(tf.reduce_sum(tf.square(diff),axis=1)+c)
    kernel_sum = tf.reduce_mean(tf.exp(-squared_euclid_distance/sigma)) # calculating RBF kernel
    return kernel_sum

In [7]:
def ddc_loss(amazon_y_subset,amazon_pred,amazon_h,yelp_h,mmd_lam):
    """ DDC loss implementation
    args:
        mmd_lam: amount to scale MMD loss component by
    """
    class_loss = BinaryCrossentropy()(amazon_y_subset,amazon_pred) # automatic avg over batch
    mmd_loss = get_mmd_unit(amazon_h,amazon_h)+get_mmd_unit(yelp_h,yelp_h)-2*get_mmd_unit(yelp_h,amazon_h)
    total_loss = class_loss+(mmd_lam*mmd_loss)
    return total_loss

In [8]:
@tf.function
def train_model(model,optimizer,amazon_x_subset,amazon_y_subset,yelp_x_subset,mmd_lam=0.25):
    """ used to train the model
    """
    with tf.GradientTape() as tape:
        amazon_pred,amazon_h = model(amazon_x_subset)
        _,yelp_h = model(yelp_x_subset)
        loss = ddc_loss(amazon_y_subset,amazon_pred,amazon_h,yelp_h,mmd_lam)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss

In [9]:
model = get_model()

In [10]:
# In practice the MMD loss seems to helps stabilize the learning so that the Yelp performance degrades less during training
batch_size=50
optimizer = Adam(lr=0.01)
epochs=3

for epoch_i in range(epochs):
    losses = []
    for i in range(0,len(amazon_x),batch_size):
        x_train_subset = amazon_x[i:i+batch_size]
        y_train_subset = amazon_y[i:i+batch_size]
        yelp_x_subset = yelp_x[i:i+batch_size]
        batch_loss = train_model(model,optimizer,x_train_subset,y_train_subset,yelp_x_subset)
        losses.append(float(batch_loss))
    
    print("epoch {}; loss:{}".format(epoch_i,round(sum(losses)/len(losses),4)))
    y_train_pred,_ = model(amazon_x)
    y_train_pred = y_train_pred.numpy()
    y_train_pred[y_train_pred >= 0.5]=1 ; y_train_pred[y_train_pred < 0.5]=0
    
    y_val_pred,_ = model(amazon_x_val)
    y_val_pred = y_val_pred.numpy()
    y_val_pred[y_val_pred >= 0.5]=1 ; y_val_pred[y_val_pred < 0.5]=0
    
    yelp_pred,_ = model(yelp_x)
    yelp_pred = yelp_pred.numpy()
    yelp_pred[yelp_pred >= 0.5]=1 ; yelp_pred[yelp_pred < 0.5]=0
    
    train_acc,train_bal_acc = round(accuracy_score(amazon_y,y_train_pred),4),round(balanced_accuracy_score(amazon_y,y_train_pred),4)    
    val_acc,val_bal_acc = round(accuracy_score(amazon_y_val,y_val_pred),4),round(balanced_accuracy_score(amazon_y_val,y_val_pred),4)
    yelp_acc,yelp_bal_acc = round(accuracy_score(yelp_y,yelp_pred),4),round(balanced_accuracy_score(yelp_y,yelp_pred),4)
    
    print("-Train; accuracy:{}; bal_accuracy:{}".format(train_acc,train_bal_acc))
    print("-Test; accuracy:{}; bal_accuracy:{}".format(val_acc,val_bal_acc))
    print("-YELP; accuracy:{}; bal_accuracy:{}".format(yelp_acc,yelp_bal_acc))

epoch 0; loss:0.3808
-Train; accuracy:0.8855; bal_accuracy:0.8859
-Test; accuracy:0.874; bal_accuracy:0.8734
-YELP; accuracy:0.8698; bal_accuracy:0.8724
epoch 1; loss:0.3009
-Train; accuracy:0.8956; bal_accuracy:0.8958
-Test; accuracy:0.877; bal_accuracy:0.8763
-YELP; accuracy:0.8625; bal_accuracy:0.8661
epoch 2; loss:0.2752
-Train; accuracy:0.8995; bal_accuracy:0.9003
-Test; accuracy:0.88; bal_accuracy:0.8788
-YELP; accuracy:0.8708; bal_accuracy:0.8713


### Baseline model

In [11]:
def get_standard_model(input_dim=640):
    x = Input(shape=(input_dim))
    h = Dense(512,activation='relu')(x)
    h = Dense(256,activation='relu')(h)
    out = Dense(1,activation='sigmoid')(h)
    model = Model(inputs=x,outputs=out)
    model.compile(loss='binary_crossentropy',optimizer=Adam(lr=0.01)) # train_on_batch()
    return model

In [12]:
model = get_standard_model()

In [13]:
batch_size=50
epochs=10

for epoch_i in range(epochs):
    losses = []
    for i in range(0,len(amazon_x),batch_size):
        x_train_subset = amazon_x[i:i+batch_size]
        y_train_subset = amazon_y[i:i+batch_size]
        batch_loss = model.train_on_batch(x_train_subset,y_train_subset)
        losses.append(float(batch_loss))
    
    print("epoch {}; loss:{}".format(epoch_i,round(sum(losses)/len(losses),4)))
    y_train_pred = model(amazon_x).numpy()
    y_train_pred[y_train_pred >= 0.5]=1 ; y_train_pred[y_train_pred < 0.5]=0
    
    y_val_pred = model(amazon_x_val).numpy()
    y_val_pred[y_val_pred >= 0.5]=1 ; y_val_pred[y_val_pred < 0.5]=0
    
    yelp_pred = model(yelp_x).numpy()
    yelp_pred[yelp_pred >= 0.5]=1 ; yelp_pred[yelp_pred < 0.5]=0
    
    train_acc,train_bal_acc = round(accuracy_score(amazon_y,y_train_pred),4),round(balanced_accuracy_score(amazon_y,y_train_pred),4)    
    val_acc,val_bal_acc = round(accuracy_score(amazon_y_val,y_val_pred),4),round(balanced_accuracy_score(amazon_y_val,y_val_pred),4)
    yelp_acc,yelp_bal_acc = round(accuracy_score(yelp_y,yelp_pred),4),round(balanced_accuracy_score(yelp_y,yelp_pred),4)
    
    print("-Train; accuracy:{}; bal_accuracy:{}".format(train_acc,train_bal_acc))
    print("-Test; accuracy:{}; bal_accuracy:{}".format(val_acc,val_bal_acc))
    print("-YELP; accuracy:{}; bal_accuracy:{}".format(yelp_acc,yelp_bal_acc))