Prototype a model with a switching module that selectively incorporates visual cues based on the language cues (and possibly the image as well?).

## idea 1
Gate-like architecture

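c = sigma(g_f(x_lng) * x_lng + g_i(x_lng) * x_vis)
p = sigmoid(W c)

Here g_f and g_i are the forget and input gates (both computed from the language features x_lng), `*` is element-wise multiplication, and sigma is a nonlinearity.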

In [24]:
import chainer
import chainer.links as L
import chainer.functions as F
from chainer import initializers

class BaseNet(chainer.Chain):
    """Encode a pair of inputs into one ``out_size``-dimensional vector.

    Both inputs are projected by the *same* first linear layer and the two
    projections are summed, so the pair is treated symmetrically. The sum is
    batch-normalised, passed through ReLU, then through a second linear
    layer, batch normalisation and tanh.
    """

    def __init__(self, out_size):
        super(BaseNet, self).__init__()
        he_normal = initializers.HeNormal()
        with self.init_scope():
            # Bias terms are omitted; each linear layer feeds a batch norm.
            self.l_0 = L.Linear(None, out_size, nobias=True, initialW=he_normal)
            self.l_1 = L.Linear(None, out_size, nobias=True, initialW=he_normal)
            self.bn_0 = L.BatchNormalization(out_size)
            self.bn_1 = L.BatchNormalization(out_size)

    def __call__(self, x0, x1):
        """Return the tanh-squashed joint encoding of ``x0`` and ``x1``."""
        proj_0 = self.l_0(x0)
        proj_1 = self.l_0(x1)
        merged = F.relu(self.bn_0(proj_0 + proj_1))
        return F.tanh(self.bn_1(self.l_1(merged)))

class GatedClassifierNet(chainer.Chain):
    """Binary classifier that fuses phrase and visual features through gates.

    Two sigmoid gates, both computed from the phrase features alone, scale
    the phrase features (forget gate) and the visual features (input gate)
    element-wise. The gated sum goes through a hidden layer and a
    single-logit output layer.
    """

    def __init__(self):
        super(GatedClassifierNet, self).__init__()
        with self.init_scope():
            # Hidden layer applied to the fused representation.
            self.l_1 = L.Linear(None, 128, initialW=initializers.HeNormal(), nobias=True)
            self.bn_1 = L.BatchNormalization(128)

            # Output layer: one logit per example.
            self.cls = L.Linear(None, 1, initialW=initializers.LeCunNormal())

            # Gate projections; their 128-wide output must match the width
            # of the features they are multiplied with in __call__.
            self.l_forget = L.Linear(None, 128, nobias=True)
            self.l_input = L.Linear(None, 128, nobias=True)

    def __call__(self, x_p, x_v, l):
        """Compute the loss for a batch and report metrics.

        Args:
            x_p: Phrase features. They drive both gates and are themselves
                scaled by the forget gate, so they must be element-wise
                compatible with the 128-wide gate outputs.
            x_v: Visual features, scaled by the input gate; same width
                requirement as ``x_p``.
            l: Binary labels, one per example.

        Returns:
            The sigmoid cross-entropy loss between the logits and ``l``.

        Side effects:
            When ``chainer.config.train`` is false, stores the predicted
            probabilities in ``self.y`` and the labels in ``self.t``.
            Reports loss, precision, recall and f1 through ``reporter``.
        """
        # gates
        i = F.sigmoid(self.l_input(x_p))
        f = F.sigmoid(self.l_forget(x_p))

        h = f * x_p + i * x_v

        h = F.relu(self.bn_1(self.l_1(h)))
        h = self.cls(h)
        h = F.flatten(h)

        if not chainer.config.train:
            # Keep predictions and targets around for evaluation code.
            self.y = F.sigmoid(h)
            self.t = l

        loss = F.sigmoid_cross_entropy(h, l)

        # NOTE(review): `binary_classification_summary` and `reporter` are not
        # imported in the visible cell; presumably defined earlier - confirm.
        # The labels are `l`; the original passed `L` (the chainer.links
        # module alias) here by mistake.
        precision, recall, fbeta = binary_classification_summary(h, l)
        reporter.report({
            'loss': loss,
            'precision': precision,
            'recall': recall,
            'f1': fbeta
        }, self)

        return loss

class Switching_iParaphraseNet(chainer.Chain):
    """Paraphrase classifier that gates visual evidence by the phrase pair.

    A phrase encoder and a vision encoder (both ``BaseNet(128)``) each turn a
    pair of inputs into one vector; ``GatedClassifierNet`` then fuses the two
    vectors and produces the loss.
    """

    def __init__(self):
        super(Switching_iParaphraseNet, self).__init__()
        with self.init_scope():
            self.phrase_net = BaseNet(128)
            self.vision_net = BaseNet(128)

            self.classifier = GatedClassifierNet()

    def predict(self, phr_1, phr_2, xvis_1, xvis_2, l):
        """Return the classifier's predicted probabilities for a batch.

        The forward pass is run with ``chainer.config.train`` forced to
        False: the classifier only stores its predictions (``classifier.y``)
        in that mode, so running under the default training config would
        either raise ``AttributeError`` or return stale predictions from an
        earlier call.

        Args:
            phr_1, phr_2: The two phrase feature batches.
            xvis_1, xvis_2: The two visual feature batches.
            l: Labels; required because the forward pass also computes the
                loss.

        Returns:
            ``self.classifier.y`` as set by this forward pass.
        """
        with chainer.using_config('train', False):
            self(phr_1, phr_2, xvis_1, xvis_2, l)
        return self.classifier.y

    def __call__(self, phr_1, phr_2, vis_1, vis_2, l):
        """Encode both modalities and return the classifier's loss."""
        h_p = self.phrase_net(phr_1, phr_2)
        h_v = self.vision_net(vis_1, vis_2)

        loss = self.classifier(h_p, h_v, l)

        return loss
    


In [25]:
# Build the full network; it takes no constructor arguments.
model = Switching_iParaphraseNet()

In [26]:
import numpy as np

# Synthetic smoke-test batch of 10 examples: two 300-d phrase feature
# arrays and two 1000-d visual feature arrays, all float32 in [0, 1).
x_p1 = np.random.random(size=(10, 300)).astype('f')
x_p2 = np.random.random(size=(10, 300)).astype('f')

x_v1 = np.random.random(size=(10, 1000)).astype('f')
x_v2 = np.random.random(size=(10, 1000)).astype('f')

# Random binary labels: 1 where a uniform draw exceeds 0.5, otherwise 0.
l = (np.random.random(size=(10,)) > 0.5).astype('i')

In [27]:
# Run one forward pass on the synthetic batch; the call evaluates to the loss.
model(x_p1, x_p2, x_v1, x_v2, l)

variable(0.76262265)