# [WIP][GSoC 2018] Similarity Learning #2050
Changes from 107 commits
New file (`@@ -0,0 +1,9 @@`), a documentation stub for the `drmm_tks` module:

```rst
:mod:`models.experimental.drmm_tks` -- Similarity Learning
============================================================================

.. automodule:: gensim.models.experimental.drmm_tks
    :synopsis: Neural Network Similarity Learning
    :members:
    :inherited-members:
    :undoc-members:
    :show-inheritance:
```
*(One large file in this diff is not rendered by default.)*
New file (`@@ -0,0 +1,6 @@`), the `experimental` package `__init__.py`:

```python
"""This package will host some experimental modules for Similarity Learning"""

from .drmm_tks import DRMM_TKS  # noqa:F401
from .custom_losses import rank_hinge_loss  # noqa:F401
from .custom_layers import TopKLayer  # noqa:F401
from .custom_callbacks import ValidationCallback  # noqa:F401
```

Review thread:

> **Reviewer:** All of these imports require Keras, so this will be broken by default. Need to add conditional imports (probably directly in your NN models).
>
> **Author:** Something like the below?
>
> **Reviewer:** 2 variants:
>
> ```python
> """My module"""
> try:
>     import keras
>     IS_KERAS_AVAILABLE = True
> except ImportError:
>     IS_KERAS_AVAILABLE = False
> ...
> class MyNet:
>     def __init__(...):
>         if not IS_KERAS_AVAILABLE:
>             raise ...
> ```
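The reviewer's guard pattern can be sketched end to end; a minimal, runnable version (the module name `keras_not_installed` is a deliberate stand-in so the fallback path runs even without Keras installed):

```python
# Sketch of the conditional-import guard suggested in review.
# "keras_not_installed" is a hypothetical name used to force the
# ImportError branch; a real module would `import keras` here.
try:
    import keras_not_installed as keras  # stand-in for `import keras`
    KERAS_AVAILABLE = True
except ImportError:
    KERAS_AVAILABLE = False


class MyNet:
    """Model class that fails loudly, but only on use, when Keras is missing."""
    def __init__(self):
        if not KERAS_AVAILABLE:
            raise ImportError("Please install Keras to use MyNet")


try:
    MyNet()
except ImportError as e:
    print(e)  # prints: Please install Keras to use MyNet
```

The point of the pattern is that merely importing the package stays safe; the error is deferred until someone actually instantiates a Keras-dependent class.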
New file `custom_callbacks.py` (`@@ -0,0 +1,73 @@`):

```python
import logging

try:
    from keras.callbacks import Callback
    KERAS_AVAILABLE = True
except ImportError:
    KERAS_AVAILABLE = False
    Callback = object  # fallback base class so this module still imports without Keras

logger = logging.getLogger(__name__)

logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


class ValidationCallback(Callback):
    """Callback for providing validation metrics on the model trained so far"""

    def __init__(self, test_data):
        """
        Parameters
        ----------
        test_data : dict
            A dictionary which holds the validation data. It consists of
            the following keys:

            "X1" : numpy array
                The queries as a numpy array of shape (n_samples, text_maxlen)
            "X2" : numpy array
                The candidate docs as a numpy array of shape (n_samples, text_maxlen)
            "y" : list of int
                The labels for each query-doc pair, 1 or 0, with shape (n_samples,),
                where 1 means the doc is relevant to the query and 0 means it is not
            "doc_lengths" : list of int
                The length of each document group, i.e. the number of queries
                which represent one topic. It is needed for calculating the metrics.
        """
        if not KERAS_AVAILABLE:
            raise ImportError("Please install Keras to use this class")

        # Check that test_data is a dictionary with all the right keys
        try:
            # If an empty dict is passed
            if len(test_data.keys()) == 0:
                raise ValueError(
                    "test_data dictionary is empty. It doesn't have the keys: 'X1', 'X2', 'y', 'doc_lengths'"
                )
            for key in test_data.keys():
                if key not in ['X1', 'X2', 'y', 'doc_lengths']:
                    raise ValueError("test_data dictionary doesn't have the keys: 'X1', 'X2', 'y', 'doc_lengths'")
        except AttributeError:
            raise ValueError("test_data must be a dictionary with the keys: 'X1', 'X2', 'y', 'doc_lengths'")
        self.test_data = test_data
```

Review comments on this part:

> General question: does it make sense to have distinct files for callbacks/layers/losses here? Why not place them directly in the model file, or at least merge them into one file like utils?
>
> Why do you need a logger here?
>
> Are you sure that the documentation will be built correctly?
>
> Good idea to check that the input matches your criteria here (because this is a non-trivial structure).
>
> Incorrect check: if a required key is missing but every key that is present is valid, this check still passes.
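One way to fix the check the reviewer flagged is a set comparison that catches missing keys as well as unexpected ones; a minimal standalone sketch (not the PR's code; only the key names are taken from the docstring):

```python
REQUIRED_KEYS = {'X1', 'X2', 'y', 'doc_lengths'}


def validate_test_data(test_data):
    """Reject non-dicts, missing keys, and unexpected keys.

    The PR's loop only rejected unexpected keys, so a dict with a
    missing required key slipped through.
    """
    try:
        keys = set(test_data.keys())
    except AttributeError:
        raise ValueError(
            "test_data must be a dictionary with the keys: %s" % sorted(REQUIRED_KEYS))
    missing = REQUIRED_KEYS - keys
    extra = keys - REQUIRED_KEYS
    if missing or extra:
        raise ValueError(
            "test_data has missing keys %s and unexpected keys %s"
            % (sorted(missing), sorted(extra)))


validate_test_data({'X1': [], 'X2': [], 'y': [], 'doc_lengths': []})  # passes silently
```

With this version, `validate_test_data({'X1': []})` raises because `'X2'`, `'y'`, and `'doc_lengths'` are missing, which the original per-key loop would have accepted.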
`custom_callbacks.py` continued (inside `ValidationCallback`):

```python
    def on_epoch_end(self, epoch, logs=None):
        # Import has to be here to prevent a cyclic import
        from evaluation_metrics import mapk, mean_ndcg

        X1 = self.test_data["X1"]
        X2 = self.test_data["X2"]
        y = self.test_data["y"]
        doc_lengths = self.test_data["doc_lengths"]

        predictions = self.model.predict(x={"query": X1, "doc": X2})

        Y_pred = []
        Y_true = []
        offset = 0

        for doc_size in doc_lengths:
            Y_pred.append(predictions[offset: offset + doc_size])
            Y_true.append(y[offset: offset + doc_size])
            offset += doc_size

        logger.info("MAP: %.2f", mapk(Y_true, Y_pred))
        for k in [1, 3, 5, 10, 20]:
            logger.info("nDCG@%d : %.2f", k, mean_ndcg(Y_true, Y_pred, k=k))
```

> **Reviewer:** what's the type of `X1`?
>
> **Author:** done
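The offset-based grouping in `on_epoch_end` can be seen in isolation; a pure-Python sketch with made-up scores (5 query-doc pairs belonging to two topics of sizes 3 and 2):

```python
# Illustrative values only; in the callback these come from
# model.predict and test_data.
predictions = [0.9, 0.2, 0.7, 0.4, 0.8]
labels = [1, 0, 1, 0, 1]
doc_lengths = [3, 2]

# Split the flat prediction/label arrays into one sublist per topic,
# exactly as on_epoch_end does before computing MAP and nDCG.
Y_pred, Y_true = [], []
offset = 0
for doc_size in doc_lengths:
    Y_pred.append(predictions[offset: offset + doc_size])
    Y_true.append(labels[offset: offset + doc_size])
    offset += doc_size

print(Y_pred)  # [[0.9, 0.2, 0.7], [0.4, 0.8]]
print(Y_true)  # [[1, 0, 1], [0, 1]]
```

This is why `doc_lengths` is required: the ranking metrics are averaged per topic, so the flat batch has to be re-segmented first.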
New file `custom_layers.py` (`@@ -0,0 +1,42 @@`):

```python
"""Script where all the custom keras layers are kept."""

try:
    from keras.engine.topology import Layer
    import keras.backend as K
    KERAS_AVAILABLE = True
except ImportError:
    KERAS_AVAILABLE = False
    Layer = object  # fallback base class so this module still imports without Keras


class TopKLayer(Layer):
    """Layer to get the top k values from the interaction matrix in the drmm_tks model"""

    def __init__(self, output_dim, topk, **kwargs):
        """
        Parameters
        ----------
        output_dim : tuple of int
            The dimension of the tensor after going through this layer.
        topk : int
            The k topmost values to be returned.
        """
        self.output_dim = output_dim
        self.topk = topk
        super(TopKLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        super(TopKLayer, self).build(input_shape)

    def call(self, x):
        return K.tf.nn.top_k(x, k=self.topk, sorted=True)[0]

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.output_dim[0], self.output_dim[1])

    def get_config(self):
        config = {
            'topk': self.topk,
            'output_dim': self.output_dim
        }
        base_config = super(TopKLayer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
```

> Docstring formatting: an empty line is needed before any section start.
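What `K.tf.nn.top_k(..., sorted=True)[0]` computes can be mimicked in NumPy for intuition; a small sketch (not the layer itself, just the same selection along the last axis):

```python
import numpy as np


def top_k_last_axis(x, k):
    # Sort descending along the last axis and keep the first k values,
    # mirroring tf.nn.top_k(x, k, sorted=True)[0] (values only, no indices).
    return -np.sort(-x, axis=-1)[..., :k]


m = np.array([[0.1, 0.9, 0.5, 0.3],
              [0.7, 0.2, 0.8, 0.4]])
print(top_k_last_axis(m, 2))
# [[0.9 0.5]
#  [0.8 0.7]]
```

In the DRMM-TKS model this is applied to the query-doc interaction matrix, so each query term keeps only its k strongest matching signals.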
New file `custom_losses.py` (`@@ -0,0 +1,29 @@`):

```python
"""Script where all the custom loss functions will be defined"""

try:
    from keras import backend as K
    from keras.layers import Lambda
    KERAS_AVAILABLE = True
except ImportError:
    KERAS_AVAILABLE = False


def rank_hinge_loss(y_true, y_pred):
    """Loss function for Ranking Similarity Learning tasks.
    More details here: https://en.wikipedia.org/wiki/Hinge_loss

    Parameters
    ----------
    y_true : list of list of int
        The true relation between a query and a doc.
        It can be either 1 (relevant) or 0 (not relevant).
    y_pred : list of list of float
        The predicted relation between a query and a doc.
    """
    if not KERAS_AVAILABLE:
        raise ImportError("Please install Keras to use this function")
    margin = 0.5
    y_pos = Lambda(lambda a: a[::2, :], output_shape=(1,))(y_pred)
    y_neg = Lambda(lambda a: a[1::2, :], output_shape=(1,))(y_pred)
    loss = K.maximum(0., margin + y_neg - y_pos)
    return K.mean(loss)
```

> Docstring (with a link to the description of the current loss?)
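The even/odd slicing assumes positive and negative docs are interleaved in the batch; a NumPy sketch of the same arithmetic (illustrative numbers only, not the Keras tensor version):

```python
import numpy as np


def rank_hinge_loss_np(y_pred, margin=0.5):
    # Predictions are interleaved: even rows score the positive doc,
    # odd rows the paired negative doc, as in the Keras implementation.
    y_pos = y_pred[::2, :]
    y_neg = y_pred[1::2, :]
    # Penalize pairs where the positive doc does not beat the negative
    # doc by at least the margin.
    return np.maximum(0., margin + y_neg - y_pos).mean()


preds = np.array([[0.9], [0.1],   # gap 0.8 > margin -> zero loss
                  [0.4], [0.3]])  # gap 0.1 < margin -> loss 0.5 + 0.3 - 0.4 = 0.4
print(rank_hinge_loss_np(preds))  # prints 0.2 (mean of 0.0 and 0.4)
```

So the loss is zero only when every positive doc outranks its paired negative doc by at least `margin`.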
Closing review thread:

> **Reviewer:** Also need to include the other files (callbacks, layers, etc.) in the documentation build.
>
> **Author:** @menshikh-iv Please refer to the link below, which shows the diff of the requested changes: 451e3b1?utf8=%E2%9C%93&diff=unified
>
> **Author:** Please note that `tox -e docs` will throw errors, not on my files but on some Keras files, since I am inheriting from the Keras `Layer` class, which has some unformatted docs.
>
> **Reviewer:** @aneesh-joshi that shouldn't happen (because you include only your files, not Keras). Can you show me a log of `tox -e docs` that mentions the error in some Keras file (not yours)?
>
> **Author:** I haven't implemented any of the above functions, just inherited the `Layer` class.
>
> **Reviewer:** Aha, looks like you are right (an issue with the docstring of the parent class that we can't control). A simple workaround is to define these methods yourself and call super, but don't worry much about it now; you have more critical tasks.