Prototype a model with a switching (gating) module that selectively weights the predictions of a phrase-only model and a visual-only model.

## idea 2

In [11]:
import chainer
import chainer.links as L
import chainer.functions as F
from chainer import initializers

class BaseNet(chainer.Chain):
    """Score a pair of inputs with a weight-shared two-layer network.

    Both inputs go through the same first layer (``l_0`` + ``bn_0`` + ReLU).
    The two encodings are then projected by the same ``l_1``, summed,
    batch-normalized, rectified, and mapped to a single output unit.

    Args:
        h_size: Number of hidden units in both hidden layers.
    """

    def __init__(self, h_size):
        super(BaseNet, self).__init__()
        he_init = initializers.HeNormal()
        with self.init_scope():
            # in_size=None: the input dimension is not fixed here.
            self.l_0 = L.Linear(None, h_size, nobias=True, initialW=he_init)
            self.l_1 = L.Linear(None, h_size, nobias=True, initialW=he_init)
            self.bn_0 = L.BatchNormalization(h_size)
            self.bn_1 = L.BatchNormalization(h_size)
            self.cls = L.Linear(None, 1)

    def _encode(self, x):
        """Apply the shared first layer: linear -> batch norm -> ReLU."""
        return F.relu(self.bn_0(self.l_0(x)))

    def __call__(self, x0, x1):
        """Return the single-unit output of ``cls`` for the pair (x0, x1)."""
        enc0 = self._encode(x0)
        enc1 = self._encode(x1)

        # The same projection is applied to both encodings before summing.
        merged = self.l_1(enc0) + self.l_1(enc1)
        hidden = F.relu(self.bn_1(merged))
        return self.cls(hidden)

class GateNet(chainer.Chain):
    """Compute a sigmoid gate from a phrase pair and a visual pair.

    Each modality has its own first layer, shared by the two items of the
    pair; the pair is fused by summation. The two modality encodings are
    then projected, summed, and reduced to one sigmoid unit.

    Args:
        h_size: Number of hidden units in every hidden layer.
    """

    def __init__(self, h_size):
        super(GateNet, self).__init__()
        he_init = initializers.HeNormal()
        with self.init_scope():
            # Phrase (language) branch.
            self.l_l0 = L.Linear(None, h_size, nobias=True, initialW=he_init)
            self.l_l1 = L.Linear(h_size, h_size, nobias=True, initialW=he_init)
            # Visual branch.
            self.l_v0 = L.Linear(None, h_size, nobias=True, initialW=he_init)
            self.l_v1 = L.Linear(h_size, h_size, nobias=True, initialW=he_init)
            # Final single-unit gate layer (has a bias, unlike the others).
            self.l = L.Linear(h_size, 1, initialW=he_init)

            self.bn_l = L.BatchNormalization(h_size)
            self.bn_v = L.BatchNormalization(h_size)
            self.bn_vl = L.BatchNormalization(h_size)

    def __call__(self, x_p1, x_p2, x_i1, x_i2):
        """Return the gate value in (0, 1) produced by the final sigmoid."""
        phrase_sum = self.l_l0(x_p1) + self.l_l0(x_p2)
        phrase_enc = F.relu(self.bn_l(phrase_sum))

        visual_sum = self.l_v0(x_i1) + self.l_v0(x_i2)
        visual_enc = F.relu(self.bn_v(visual_sum))

        fused = self.l_l1(phrase_enc) + self.l_v1(visual_enc)
        fused = F.relu(self.bn_vl(fused))
        return F.sigmoid(self.l(fused))
        
            
class LateSwitching_iParaphraseNet(chainer.Chain):
    """Late-fusion model that gates between a phrase scorer and a visual scorer.

    Two ``BaseNet`` instances score the phrase pair and the visual pair
    independently. A ``GateNet`` sees all four inputs and emits a weight
    ``w`` in (0, 1); the final score is ``w * y_l + (1 - w) * y_v``.

    Args:
        base_h_size: Hidden size of the language and vision ``BaseNet``s.
        gate_h_size: Hidden size of the ``GateNet``.
    """

    def __init__(self, base_h_size=1000, gate_h_size=128):
        super(LateSwitching_iParaphraseNet, self).__init__()
        with self.init_scope():
            self.language_net = BaseNet(base_h_size)
            self.vision_net = BaseNet(base_h_size)
            self.gate_net = GateNet(gate_h_size)

    def __call__(self, phr_1, phr_2, vis_1, vis_2, l):
        """Compute the sigmoid cross-entropy loss for one batch.

        Args:
            phr_1, phr_2: Features of the two phrases.
            vis_1, vis_2: Features of the two visual inputs.
            l: Target labels passed to ``F.sigmoid_cross_entropy``.

        Returns:
            The loss variable. Outside training mode, the sigmoid of the
            fused score and the labels are also stored on ``self.y`` and
            ``self.t``.
        """
        y_l = self.language_net(phr_1, phr_2)
        y_v = self.vision_net(vis_1, vis_2)

        # The gate mixes the two pre-sigmoid scores, not probabilities.
        w = self.gate_net(phr_1, phr_2, vis_1, vis_2)
        y = F.flatten(w * y_l + (1 - w) * y_v)

        if not chainer.config.train:
            # Keep predictions and targets around for evaluation.
            self.y = F.sigmoid(y)
            self.t = l

        loss = F.sigmoid_cross_entropy(y, l)

        # TODO: metric reporting is disabled; `binary_classification_summary`
        # and `reporter` are not imported in this cell.
        # precision, recall, fbeta = binary_classification_summary(y, l)
        # reporter.report({
        #     'loss': loss,
        #     'precision': precision,
        #     'recall': recall,
        #     'f1': fbeta
        # }, self)

        return loss


In [12]:
# Build the switching model; no arguments are passed, so the layer sizes
# defined in the class are used.
model = LateSwitching_iParaphraseNet()

In [13]:
import numpy as np

# Random smoke-test batch of 10 examples.
# Phrase features: two (10, 300) single-precision arrays.
x_p1 = np.random.random((10, 300)).astype(np.float32)
x_p2 = np.random.random((10, 300)).astype(np.float32)

# Visual features: two (10, 1000) single-precision arrays.
x_v1 = np.random.random((10, 1000)).astype(np.float32)
x_v2 = np.random.random((10, 1000)).astype(np.float32)

# Binary labels obtained by thresholding uniform noise at 0.5.
l = (np.random.random((10,)) > 0.5).astype(np.intc)

In [14]:
# Smoke test: one forward pass on the random batch. The call returns the
# sigmoid cross-entropy loss, which the notebook displays below.
model(x_p1, x_p2, x_v1, x_v2, l)

variable(0.68231475)