diff --git a/docs/src/apiref.rst b/docs/src/apiref.rst index 29f91d9f47..a7d3b19ded 100644 --- a/docs/src/apiref.rst +++ b/docs/src/apiref.rst @@ -68,6 +68,11 @@ Modules: models/deprecated/keyedvectors models/deprecated/fasttext_wrapper models/base_any2vec + models/experimental/drmm_tks + models/experimental/custom_callbacks + models/experimental/custom_layers + models/experimental/custom_losses + models/experimental/evaluation_metrics similarities/docsim similarities/index sklearn_api/atmodel diff --git a/docs/src/models/experimental/custom_callbacks.rst b/docs/src/models/experimental/custom_callbacks.rst new file mode 100644 index 0000000000..4fdf371992 --- /dev/null +++ b/docs/src/models/experimental/custom_callbacks.rst @@ -0,0 +1,9 @@ +:mod:`models.experimental.custom_callbacks` -- Custom Callbacks for Similarity Learning +======================================================================================= + +.. automodule:: gensim.models.experimental.custom_callbacks + :synopsis: Custom Callbacks for Similarity Learning + :members: + :inherited-members: + :undoc-members: + :show-inheritance: diff --git a/docs/src/models/experimental/custom_layers.rst b/docs/src/models/experimental/custom_layers.rst new file mode 100644 index 0000000000..51cc70f63e --- /dev/null +++ b/docs/src/models/experimental/custom_layers.rst @@ -0,0 +1,9 @@ +:mod:`models.experimental.custom_layers` -- Custom Layers for Similarity Learning +================================================================================= + +.. automodule:: gensim.models.experimental.custom_layers + :synopsis: Custom Layers for Similarity Learning + :members: + :inherited-members: + :undoc-members: + :show-inheritance: diff --git a/docs/src/models/experimental/custom_losses.rst b/docs/src/models/experimental/custom_losses.rst new file mode 100644 index 0000000000..f59afcfaa5 --- /dev/null +++ b/docs/src/models/experimental/custom_losses.rst @@ -0,0 +1,9 @@ +:mod:`models.experimental.custom_losses` -- Loss for Similarity Learning +======================================================================== + +.. automodule:: gensim.models.experimental.custom_losses + :synopsis: Loss functions for Similarity Learning + :members: + :inherited-members: + :undoc-members: + :show-inheritance: diff --git a/docs/src/models/experimental/drmm_tks.rst b/docs/src/models/experimental/drmm_tks.rst new file mode 100644 index 0000000000..d569eac61c --- /dev/null +++ b/docs/src/models/experimental/drmm_tks.rst @@ -0,0 +1,9 @@ +:mod:`models.experimental.drmm_tks` -- Neural Nets for Similarity Learning +========================================================================== + +.. automodule:: gensim.models.experimental.drmm_tks + :synopsis: Neural Network Similarity Learning + :members: + :inherited-members: + :undoc-members: + :show-inheritance: diff --git a/docs/src/models/experimental/evaluation_metrics.rst b/docs/src/models/experimental/evaluation_metrics.rst new file mode 100644 index 0000000000..2d47acd9c7 --- /dev/null +++ b/docs/src/models/experimental/evaluation_metrics.rst @@ -0,0 +1,9 @@ +:mod:`models.experimental.evaluation_metrics` -- Evaluation Metrics for Similarity Learning +=========================================================================================== + +.. 
automodule:: gensim.models.experimental.evaluation_metrics + :synopsis: Evaluation Metrics for Similarity Learning + :members: + :inherited-members: + :undoc-members: + :show-inheritance: diff --git a/gensim/models/__init__.py b/gensim/models/__init__.py index 4114724027..5e48318a9e 100644 --- a/gensim/models/__init__.py +++ b/gensim/models/__init__.py @@ -24,6 +24,7 @@ from . import wrappers # noqa:F401 from . import deprecated # noqa:F401 +from . import experimental # noqa:F401 from gensim import interfaces, utils diff --git a/gensim/models/experimental/UI_Example.ipynb b/gensim/models/experimental/UI_Example.ipynb new file mode 100644 index 0000000000..25c9e0a658 --- /dev/null +++ b/gensim/models/experimental/UI_Example.ipynb @@ -0,0 +1,795 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Getting the dataset\n", + "A script has been provided to download all the datasets required for running the below examples.\n", + "It will dowload and unzip the WikiQA Corpus and the Quora Duplicate Questions dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python experimental_data/get_data.py" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Installing dependencies for running the Similarity Learning task" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using TensorFlow backend.\n", + "2018-07-06 00:34:19,104 : INFO : 'pattern' package not found; tag filters are not available for English\n" + ] + } + ], + "source": [ + "import os\n", + "import csv\n", + "import re\n", + "from gensim.models.experimental import DRMM_TKS\n", + "from gensim.utils import simple_preprocess" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Format" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We have to provide data in a format which is understood by the model.\n", + "The model understands sentences as a list of words. \n", + "Further, we need to give a :\n", + " 1. Queries List\n", + " 2. Candidate Document List\n", + " 3. Correct Label List\n", + "\n", + "1 is a list of list of words\n", + "2 and 3 is actually a list of list of list of words/ints\n", + "\n", + "Example:\n", + "```\n", + "queries = [\"When was Abraham Lincoln born ?\".split(), \n", + " \"When was the first World War ?\".split()]\n", + "docs = [\n", + "\t\t [\"Abraham Lincoln was the president of the United States of America\".split(),\n", + "\t\t \"He was born in 1809\".split()],\n", + "\t\t [\"The first world war was bad\".split(),\n", + "\t\t \"It was fought in 1914\".split(),\n", + "\t\t \"There were over a million deaths\".split()]\n", + " ]\n", + "labels = [[0,\n", + " 1],\n", + "\t\t [0,\n", + " 1,\n", + " 0]\n", + " ]\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## About the dataset : WikiQA\n", + "\n", + "The WikiQA corpus is a set of question-answer pairs in which for every query there are several candidate documents of which none, one or more documents might be relevant.\n", + "Relevance is purely binary, i.e., 1: relavant, 0: not relevant\n", + "\n", + "Sample data:\n", + "\n", + "QuestionID | Question | DocumentID | DocumentTitle | SentenceID | Sentence | Label\n", + "-- | -- | -- | -- | -- | -- | --\n", + "Q1 | how are glacier caves formed? 
| D1 | Glacier cave | D1-0 | A partly submerged glacier cave on Perito Moreno Glacier . | 0\n", + "Q1 | how are glacier caves formed? | D1 | Glacier cave | D1-1 | The ice facade is approximately 60 m high | 0\n", + "Q1 | how are glacier caves formed? | D1 | Glacier cave | D1-2 | Ice formations in the Titlis glacier cave | 0\n", + "Q1 | how are glacier caves formed? | D1 | Glacier cave | D1-3 | A glacier cave is a cave formed within the ice of a glacier . | 1\n", + "Q1 | how are glacier caves formed? | D1 | Glacier cave | D1-4 | Glacier caves are often called ice caves , but this term is properly used to describe bedrock caves that contain year-round ice. | 0\n", + "Q2 | How are the directions of the velocity and force vectors related in a circular motion | D2 | Circular motion | D2-0 | In physics , circular motion is a movement of an object along the circumference of a circle or rotation along a circular path. | 0\n", + "Q2 | How are the directions of the velocity and force vectors related in a circular motion | D2 | Circular motion | D2-1 | It can be uniform, with constant angular rate of rotation (and constant speed), or non-uniform with a changing rate of rotation. | 0\n", + "Q2 | How are the directions of the velocity and force vectors related in a circular motion | D2 | Circular motion | D2-2 | The rotation around a fixed axis of a three-dimensional body involves circular motion of its parts. | 0\n", + "Q2 | How are the directions of the velocity and force vectors related in a circular motion | D2 | Circular motion | D2-3 | The equations of motion describe the movement of the center of mass of a body. | 0\n", + "Q2 | How are the directions of the velocity and force vectors related in a circular motion | D2 | Circular motion | D2-4 | Examples of circular motion include: an artificial satellite orbiting the Earth at constant height, a stone which is tied to a rope and is being swung in circles, a car turning through a curve in a race track , an electron moving perpendicular to a uniform magnetic field , and a gear turning inside a mechanism. | 0\n", + "Q2 | How are the directions of the velocity and force vectors related in a circular motion | D2 | Circular motion | D2-5 | Since the object's velocity vector is constantly changing direction, the moving object is undergoing acceleration by a centripetal force in the direction of the center of rotation. | 0\n", + "Q2 | How are the directions of the velocity and force vectors related in a circular motion | D2 | Circular motion | D2-6 | Without this acceleration, the object would move in a straight line, according to Newton's laws of motion . | 0" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Preprocessing\n", + "We need to take the above text and make it into `queries, docs, labels` form. For this, we will create an iterable object with the below class which will allow the data to be streamed into the model as the need arises." 
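To make the target structure concrete before introducing the streaming class, here is a small, non-streaming sketch. The `group_wikiqa_rows` helper is hypothetical (not part of this PR); it groups already-parsed `(question_id, question, answer, label)` rows into the `queries, docs, labels` form shown earlier, dropping questions that have no relevant answer, which is also what the iterable class below does, only lazily.

```
from collections import OrderedDict
from gensim.utils import simple_preprocess

def group_wikiqa_rows(rows):
    """Group parsed (question_id, question, answer, label) rows into queries/docs/labels."""
    grouped = OrderedDict()
    for qid, question, answer, label in rows:
        entry = grouped.setdefault(qid, {"query": simple_preprocess(question), "docs": [], "labels": []})
        entry["docs"].append(simple_preprocess(answer))
        entry["labels"].append(int(label))

    queries, docs, labels = [], [], []
    for entry in grouped.values():
        if sum(entry["labels"]) > 0:  # keep only questions with at least one relevant answer
            queries.append(entry["query"])
            docs.append(entry["docs"])
            labels.append(entry["labels"])
    return queries, docs, labels
```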
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "class MyWikiIterable:\n", + " \"\"\"\"Yields the next data point in the data set based on the `iter_type`\n", + " \n", + " Based on `iter_type` the object can yield the following:\n", + " 'query' : list of str words\n", + " 'doc' : list of docs\n", + " where a doc is a list of str words\n", + " 'label' : list of int\n", + " The relevance between adjacent queries and docs\n", + " \"\"\"\n", + "\n", + " def __init__(self, iter_type, fpath):\n", + " \"\"\"\n", + " Parameters\n", + " ----------\n", + " iter_type : {'query', 'doc', 'label'}\n", + " the type of iterable to be yielded\n", + " fpath : str\n", + " path to the dataset\n", + " \"\"\"\n", + "\n", + " # To map the `iter_type` to an index\n", + " self.type_translator = {'query': 0, 'doc': 1, 'label': 2}\n", + " self.iter_type = iter_type\n", + "\n", + " with open(fpath, encoding='utf8') as tsv_file:\n", + " tsv_reader = csv.reader(tsv_file, delimiter='\\t', quoting=csv.QUOTE_NONE)\n", + " self.data_rows = []\n", + " self.data_rows = [row for row in tsv_reader]\n", + "\n", + " def preprocess_sent(self, sent):\n", + " \"\"\"Utility function to lower, strip and tokenize each sentence\n", + " Replace this function if you want to handle preprocessing differently\"\"\"\n", + "\n", + " return simple_preprocess(sent)\n", + "\n", + " def __iter__(self):\n", + " # Defining some consants for .tsv reading\n", + " # They represent the columns of the respective values\n", + " QUESTION_ID_INDEX = 0\n", + " QUESTION_INDEX = 1\n", + " ANSWER_INDEX = 5\n", + " LABEL_INDEX = 6\n", + "\n", + "\n", + " # The group of documents and labels that belong to one question\n", + " document_group = []\n", + " label_group = []\n", + "\n", + " # Number of relevant documents per query\n", + " n_relevant_docs = 0\n", + " # Number of filtered docs (query-doc pairs which have zero relevant docs)\n", + " n_filtered_docs = 0\n", + "\n", + " # The data\n", + " queries = []\n", + " docs = []\n", + " labels = []\n", + "\n", + " # The code below goes through the data line by line\n", + " # It checks the current document id with the next document id\n", + " for i, line in enumerate(self.data_rows[1:], start=1):\n", + " if i < len(self.data_rows) - 1: # check if out of bounds might occur\n", + " if self.data_rows[i][QUESTION_ID_INDEX] == self.data_rows[i + 1][QUESTION_ID_INDEX]:\n", + " document_group.append(self.preprocess_sent(self.data_rows[i][ANSWER_INDEX]))\n", + " label_group.append(int(self.data_rows[i][LABEL_INDEX]))\n", + " n_relevant_docs += int(self.data_rows[i][LABEL_INDEX])\n", + " else:\n", + " document_group.append(self.preprocess_sent(self.data_rows[i][ANSWER_INDEX]))\n", + " label_group.append(int(self.data_rows[i][LABEL_INDEX]))\n", + "\n", + " n_relevant_docs += int(self.data_rows[i][LABEL_INDEX])\n", + "\n", + " if n_relevant_docs > 0:\n", + " docs.append(document_group)\n", + " labels.append(label_group)\n", + " queries.append(self.preprocess_sent(self.data_rows[i][QUESTION_INDEX]))\n", + "\n", + " yield [queries[-1], document_group, label_group][self.type_translator[self.iter_type]]\n", + " else:\n", + " n_filtered_docs += 1\n", + "\n", + " n_relevant_docs = 0\n", + " document_group = []\n", + " label_group = []\n", + "\n", + " else:\n", + " # If we are on the last line\n", + " document_group.append(self.preprocess_sent(self.data_rows[i][ANSWER_INDEX]))\n", + " label_group.append(int(self.data_rows[i][LABEL_INDEX]))\n", + " n_relevant_docs += 
int(self.data_rows[i][LABEL_INDEX])\n", + "\n", + "                if n_relevant_docs > 0:\n", + "                    docs.append(document_group)\n", + "                    labels.append(label_group)\n", + "                    queries.append(self.preprocess_sent(self.data_rows[i][QUESTION_INDEX]))\n", + "                    yield [queries[-1], document_group, label_group][self.type_translator[self.iter_type]]\n", + "                else:\n", + "                    n_filtered_docs += 1\n", + "                n_relevant_docs = 0\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, we will use this class to create the training data iterables" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "q_iterable = MyWikiIterable('query', os.path.join('experimental_data', 'WikiQACorpus', 'WikiQA-train.tsv'))\n", + "d_iterable = MyWikiIterable('doc', os.path.join('experimental_data', 'WikiQACorpus', 'WikiQA-train.tsv'))\n", + "l_iterable = MyWikiIterable('label', os.path.join('experimental_data', 'WikiQACorpus', 'WikiQA-train.tsv'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will also initialize the validation iterables.\n", + "Note: the path now points to the `dev` split of the corpus." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "q_val_iterable = MyWikiIterable('query', os.path.join('experimental_data', 'WikiQACorpus', 'WikiQA-dev.tsv'))\n", + "d_val_iterable = MyWikiIterable('doc', os.path.join('experimental_data', 'WikiQACorpus', 'WikiQA-dev.tsv'))\n", + "l_val_iterable = MyWikiIterable('label', os.path.join('experimental_data', 'WikiQACorpus', 'WikiQA-dev.tsv'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Using word embeddings\n", + "We also need word embeddings for training. For this, we will use GloVe embeddings.\n", + "Luckily, [gensim-data](https://github.com/RaRe-Technologies/gensim-data) provides an easy interface for them.\n", + "\n", + "We will use the [KeyedVectors](https://radimrehurek.com/gensim/models/keyedvectors.html) object that we get from the gensim-data API and pass it as the `word_embedding` parameter of the model."
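If downloading and loading the 300-dimensional vectors is too slow for a first experiment, a smaller gensim-data model can be swapped in. The snippet below assumes the `glove-wiki-gigaword-50` dataset (the same one used in the `drmm_tks` module docstring) is available through gensim-data; switch back to the 300-dimensional vectors to reproduce the numbers shown later.

```
import gensim.downloader as api

# Smaller vectors load much faster; use "glove-wiki-gigaword-300" for the full run.
kv_model = api.load("glove-wiki-gigaword-50")
print(kv_model.vector_size)  # 50
```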
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2018-07-06 00:34:23,010 : INFO : loading projection weights from /home/aneeshj/gensim-data/glove-wiki-gigaword-300/glove-wiki-gigaword-300.gz\n", + "2018-07-06 00:36:07,145 : INFO : loaded (400000, 300) matrix from /home/aneeshj/gensim-data/glove-wiki-gigaword-300/glove-wiki-gigaword-300.gz\n" + ] + } + ], + "source": [ + "import gensim.downloader as api\n", + "kv_model = api.load(\"glove-wiki-gigaword-300\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Training the Model\n", + "Now that we have the preprocessed extracted data and word embeddings, training the model just takes one line:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2018-07-06 00:36:07,151 : INFO : Starting Vocab Build\n", + "2018-07-06 00:36:08,602 : INFO : Vocab Build Complete\n", + "2018-07-06 00:36:08,603 : INFO : Vocab Size is 18814\n", + "2018-07-06 00:36:08,605 : INFO : Building embedding index using KeyedVector pretrained word embeddings\n", + "2018-07-06 00:36:08,605 : INFO : The embeddings_index built from the given file has 400000 words of 300 dimensions\n", + "2018-07-06 00:36:08,606 : INFO : Building the Embedding Matrix for the model's Embedding Layer\n", + "2018-07-06 00:36:08,836 : INFO : There are 642 words out of 18814 (3.41%) not in the embeddings. Setting them to random\n", + "2018-07-06 00:36:08,836 : INFO : Adding additional words from the embedding file to embedding matrix\n", + "2018-07-06 00:36:10,775 : INFO : Normalizing the word embeddings\n", + "2018-07-06 00:36:59,403 : INFO : Embedding Matrix build complete. 
It now has shape (400644, 300)\n", + "2018-07-06 00:37:06,320 : INFO : Pad word has been set to index 400642\n", + "2018-07-06 00:37:06,815 : INFO : Unknown word has been set to index 400643\n", + "2018-07-06 00:37:06,901 : INFO : Embedding index build complete\n", + "2018-07-06 00:37:22,881 : INFO : Input is an iterable amd will be streamed\n", + "2018-07-06 00:38:24,108 : INFO : __________________________________________________________________________________________________\n", + "2018-07-06 00:38:24,203 : INFO : Layer (type) Output Shape Param # Connected to \n", + "2018-07-06 00:38:24,236 : INFO : ==================================================================================================\n", + "2018-07-06 00:38:24,342 : INFO : query (InputLayer) (None, 200) 0 \n", + "2018-07-06 00:38:24,343 : INFO : __________________________________________________________________________________________________\n", + "2018-07-06 00:38:24,343 : INFO : doc (InputLayer) (None, 200) 0 \n", + "2018-07-06 00:38:24,344 : INFO : __________________________________________________________________________________________________\n", + "2018-07-06 00:38:24,345 : INFO : embedding_1 (Embedding) (None, 200, 300) 120193200 query[0][0] \n", + "2018-07-06 00:38:24,346 : INFO : doc[0][0] \n", + "2018-07-06 00:38:24,346 : INFO : __________________________________________________________________________________________________\n", + "2018-07-06 00:38:24,432 : INFO : dot_1 (Dot) (None, 200, 200) 0 embedding_1[0][0] \n", + "2018-07-06 00:38:24,455 : INFO : embedding_1[1][0] \n", + "2018-07-06 00:38:24,456 : INFO : __________________________________________________________________________________________________\n", + "2018-07-06 00:38:24,478 : INFO : top_k_layer_1 (TopKLayer) (None, 200, 20) 0 dot_1[0][0] \n", + "2018-07-06 00:38:24,479 : INFO : __________________________________________________________________________________________________\n", + "2018-07-06 00:38:24,496 : INFO : dense_2 (Dense) (None, 200, 100) 2100 top_k_layer_1[0][0] \n", + "2018-07-06 00:38:24,517 : INFO : __________________________________________________________________________________________________\n", + "2018-07-06 00:38:24,518 : INFO : dense_3 (Dense) (None, 200, 1) 101 dense_2[0][0] \n", + "2018-07-06 00:38:24,519 : INFO : __________________________________________________________________________________________________\n", + "2018-07-06 00:38:24,553 : INFO : dropout_1 (Dropout) (None, 200, 1) 0 dense_3[0][0] \n", + "2018-07-06 00:38:24,554 : INFO : __________________________________________________________________________________________________\n", + "2018-07-06 00:38:24,554 : INFO : dense_1 (Dense) (None, 200, 1) 301 embedding_1[0][0] \n", + "2018-07-06 00:38:24,555 : INFO : __________________________________________________________________________________________________\n", + "2018-07-06 00:38:24,555 : INFO : reshape_2 (Reshape) (None, 200) 0 dropout_1[0][0] \n", + "2018-07-06 00:38:24,556 : INFO : __________________________________________________________________________________________________\n", + "2018-07-06 00:38:24,557 : INFO : reshape_1 (Reshape) (None, 200) 0 dense_1[0][0] \n", + "2018-07-06 00:38:24,557 : INFO : __________________________________________________________________________________________________\n", + "2018-07-06 00:38:24,558 : INFO : dot_2 (Dot) (None, 1) 0 reshape_2[0][0] \n", + "2018-07-06 00:38:24,575 : INFO : reshape_1[0][0] \n", + "2018-07-06 00:38:24,576 : INFO : 
__________________________________________________________________________________________________\n", + "2018-07-06 00:38:24,577 : INFO : reshape_3 (Reshape) (None, 1) 0 dot_2[0][0] \n", + "2018-07-06 00:38:24,577 : INFO : ==================================================================================================\n", + "2018-07-06 00:38:24,588 : INFO : Total params: 120,195,702\n", + "2018-07-06 00:38:24,588 : INFO : Trainable params: 2,502\n", + "2018-07-06 00:38:24,589 : INFO : Non-trainable params: 120,193,200\n", + "2018-07-06 00:38:24,590 : INFO : __________________________________________________________________________________________________\n", + "2018-07-06 00:38:33,655 : INFO : Found 14 unknown words. Set them to unknown word index : 400643\n", + "2018-07-06 00:38:53,729 : INFO : Found 90 unknown words. Set them to unknown word index : 400643\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/3\n", + "900/900 [==============================] - 85s 95ms/step - loss: 1.0646 - acc: 0.0197\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2018-07-06 00:41:17,729 : INFO : MAP: 0.55\n", + "2018-07-06 00:41:17,735 : INFO : nDCG@1 : 0.38\n", + "2018-07-06 00:41:17,740 : INFO : nDCG@3 : 0.54\n", + "2018-07-06 00:41:17,746 : INFO : nDCG@5 : 0.60\n", + "2018-07-06 00:41:17,751 : INFO : nDCG@10 : 0.66\n", + "2018-07-06 00:41:17,756 : INFO : nDCG@20 : 0.67\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 2/3\n", + "900/900 [==============================] - 84s 94ms/step - loss: 0.9310 - acc: 0.0321\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2018-07-06 00:42:46,586 : INFO : MAP: 0.61\n", + "2018-07-06 00:42:46,592 : INFO : nDCG@1 : 0.46\n", + "2018-07-06 00:42:46,597 : INFO : nDCG@3 : 0.61\n", + "2018-07-06 00:42:46,604 : INFO : nDCG@5 : 0.67\n", + "2018-07-06 00:42:46,616 : INFO : nDCG@10 : 0.71\n", + "2018-07-06 00:42:46,621 : INFO : nDCG@20 : 0.72\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 3/3\n", + "900/900 [==============================] - 85s 94ms/step - loss: 0.8035 - acc: 0.1486\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2018-07-06 00:44:15,788 : INFO : MAP: 0.62\n", + "2018-07-06 00:44:15,793 : INFO : nDCG@1 : 0.46\n", + "2018-07-06 00:44:15,800 : INFO : nDCG@3 : 0.60\n", + "2018-07-06 00:44:15,809 : INFO : nDCG@5 : 0.67\n", + "2018-07-06 00:44:15,815 : INFO : nDCG@10 : 0.71\n", + "2018-07-06 00:44:15,821 : INFO : nDCG@20 : 0.72\n" + ] + } + ], + "source": [ + "# Train the model\n", + "drmm_tks_model = DRMM_TKS(\n", + " queries=q_iterable, docs=d_iterable, labels=l_iterable, word_embedding=kv_model, epochs=3,\n", + " validation_data=[q_val_iterable, d_val_iterable, l_val_iterable], topk=20\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Testing the model on new data\n", + "\n", + "The testing of the data can be done on completely unseen data using `model.predict(queries, docs)` where\n", + "queries: list of list of words\n", + "docs: list of list of list of words" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "queries = [simple_preprocess(\"how are glacier caves formed\"),\n", + " simple_preprocess(\"What is AWS\")]\n", + "\n", + "docs = [[simple_preprocess(\"A partly submerged glacier cave on Perito Moreno Glacier\"),\n", + " simple_preprocess(\"A glacier cave is 
a cave formed within the ice of a glacier\")],\n", + " [simple_preprocess(\"AWS stands for Amazon Web Services\"),\n", + " simple_preprocess(\"AWS was established in 2001\"),\n", + " simple_preprocess(\"It is a cloud service\")]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The predict function returns the similarity between a query-document pair in a list format\n", + "\n", + "For example\n", + "```\n", + "queries = [q1, q2]\n", + "docs = [[d1_1, d1_2],\n", + " [d2_1, d2_2, d2_3]]\n", + "\n", + "model.predict(queries, docs)\n", + "\n", + "Output\n", + "------\n", + "q1-d1_1 similarity\n", + "q1-d1_2 similarity\n", + "q2-d2_1 similarity\n", + "q2-d2_2 similarity\n", + "q2-d2_3 similarity\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2018-07-06 00:46:33,249 : INFO : Found 0 unknown words. Set them to unknown word index : 400643\n", + "2018-07-06 00:46:33,283 : INFO : Found 0 unknown words. Set them to unknown word index : 400643\n", + "2018-07-06 00:46:33,778 : INFO : Predictions in the format query, doc, similarity\n", + "2018-07-06 00:46:33,800 : INFO : ['how', 'are', 'glacier', 'caves', 'formed']\t['partly', 'submerged', 'glacier', 'cave', 'on', 'perito', 'moreno', 'glacier']\t0.75623834\n", + "2018-07-06 00:46:33,801 : INFO : ['how', 'are', 'glacier', 'caves', 'formed']\t['glacier', 'cave', 'is', 'cave', 'formed', 'within', 'the', 'ice', 'of', 'glacier']\t0.88229656\n", + "2018-07-06 00:46:33,802 : INFO : ['what', 'is', 'aws']\t['aws', 'stands', 'for', 'amazon', 'web', 'services']\t0.5922452\n", + "2018-07-06 00:46:33,802 : INFO : ['what', 'is', 'aws']\t['aws', 'was', 'established', 'in']\t0.581025\n", + "2018-07-06 00:46:33,803 : INFO : ['what', 'is', 'aws']\t['it', 'is', 'cloud', 'service']\t0.65737\n" + ] + }, + { + "data": { + "text/plain": [ + "array([[0.75623834],\n", + " [0.88229656],\n", + " [0.5922452 ],\n", + " [0.581025 ],\n", + " [0.65737 ]], dtype=float32)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "drmm_tks_model.predict(queries, docs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As can be seen from the logs and results above, within each query-document group, the correct answer has the highest score\n", + "\n", + "For example,\n", + "In the first group\n", + "```\n", + "['how', 'are', 'glacier', 'caves', 'formed'] ['partly', 'submerged', 'glacier', 'cave', 'on', 'perito', 'moreno', 'glacier']\t0.7\n", + "['how', 'are', 'glacier', 'caves', 'formed'] ['glacier', 'cave', 'is', 'cave', 'formed', 'within', 'the', 'ice', 'of', 'glacier']\t0.8\n", + "```\n", + "\n", + "The correct answer, \"glacier cave is cave ...\" has the higher score as compared to the first answer\n", + "The same can be seen for the second part" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Testing on a test set\n", + "We can pass a whole dataset and get evaluations based on that. 
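The MAP and nDCG numbers reported by the `evaluate` call used below can also be reproduced by hand from `predict` output: regroup the flat scores per query and pass the groups to the `mapk` and `mean_ndcg` helpers exported by `gensim.models.experimental` (this mirrors the bookkeeping done by the validation callback). A rough sketch, reusing `queries` and `docs` from the prediction example above with illustrative relevance labels:

```
import numpy as np
from gensim.models.experimental import mapk, mean_ndcg

labels = [[0, 1], [1, 0, 0]]  # illustrative: one relevant document per query group

scores = drmm_tks_model.predict(queries, docs)  # one score per query-document pair

Y_true, Y_pred, offset = [], [], 0
for doc_group, label_group in zip(docs, labels):
    Y_pred.append(list(np.ravel(scores[offset: offset + len(doc_group)])))
    Y_true.append(label_group)
    offset += len(doc_group)

print("MAP: %.2f" % mapk(Y_true, Y_pred))
print("nDCG@3: %.2f" % mean_ndcg(Y_true, Y_pred, k=3))
```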
Let's try with the test set of WikiQA Corpus" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "q_test_iterable = MyWikiIterable('query', os.path.join( 'experimental_data', 'WikiQACorpus', 'WikiQA-test.tsv'))\n", + "d_test_iterable = MyWikiIterable('doc', os.path.join('experimental_data', 'WikiQACorpus', 'WikiQA-test.tsv'))\n", + "l_test_iterable = MyWikiIterable('label', os.path.join('experimental_data', 'WikiQACorpus', 'WikiQA-test.tsv'))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2018-07-06 00:48:00,129 : INFO : Found 21 unknown words. Set them to unknown word index : 400643\n", + "2018-07-06 00:48:00,202 : INFO : Found 253 unknown words. Set them to unknown word index : 400643\n", + "2018-07-06 00:48:09,461 : INFO : MAP: 0.60\n", + "2018-07-06 00:48:09,523 : INFO : nDCG@1 : 0.47\n", + "2018-07-06 00:48:09,541 : INFO : nDCG@3 : 0.60\n", + "2018-07-06 00:48:09,567 : INFO : nDCG@5 : 0.66\n", + "2018-07-06 00:48:09,591 : INFO : nDCG@10 : 0.70\n", + "2018-07-06 00:48:09,607 : INFO : nDCG@20 : 0.71\n" + ] + } + ], + "source": [ + "drmm_tks_model.evaluate(q_test_iterable, d_test_iterable, l_test_iterable)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Comparing DRMM TKS with other models\n", + "\n", + "It would be good to get an idea of how our model works against some unsupervised models like word2vec and FastText.\n", + "For this, we will, given a query-document pair, we will get a vector for the query and document. We can get the similarity between them using the cosine similarity between their vectors.\n", + "\n", + "### For word2vec\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "def cosine_similarity(vec1, vec2):\n", + " return np.dot(vec1, vec2)/(np.linalg.norm(vec1)* np.linalg.norm(vec2))" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "from gensim.models.experimental import mapk, mean_ndcg\n", + "\n", + "def eval_model(queries, docs, labels, model):\n", + " long_doc_list = []\n", + " long_label_list = []\n", + " long_query_list = []\n", + " doc_lens = []\n", + "\n", + " def sent2vec(sentence):\n", + " vec = np.zeros((model.vector_size))\n", + " for word in sentence:\n", + " if word in model:\n", + " vec += model[word]\n", + " return vec/len(sentence)\n", + " \n", + " for query, doc, label in zip(queries, docs, labels):\n", + " i = 0\n", + " for d, l in zip(doc, label):\n", + " if len(d) == 0 or len(query) == 0:\n", + " print(\"skipping query-doc pair due to no words in vocab\")\n", + " continue\n", + " long_query_list.append(sent2vec(query))\n", + " long_doc_list.append(sent2vec(d))\n", + " long_label_list.append(l)\n", + " i += 1\n", + " doc_lens.append(len(doc))\n", + "\n", + " doc_lens = np.array(doc_lens)\n", + "\n", + " predictions = []\n", + " for q, d in zip(long_query_list, long_doc_list):\n", + " predictions.append(cosine_similarity(q, d))\n", + "\n", + " Y_pred = []\n", + " Y_true = []\n", + " offset = 0\n", + "\n", + " for doc_size in doc_lens:\n", + " Y_pred.append(predictions[offset: offset + doc_size])\n", + " Y_true.append(long_label_list[offset: offset + doc_size])\n", + " offset += doc_size\n", + " \n", + " print(\"MAP: %.2f\"% mapk(Y_true, Y_pred))\n", + " for k in [1, 3, 5, 10, 20]:\n", + " 
print(\"nDCG@%d : %.2f \" % (k, mean_ndcg(Y_true, Y_pred, k=k)))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "skipping query-doc pair due to no words in vocab\n", + "skipping query-doc pair due to no words in vocab\n", + "MAP: 0.58\n", + "nDCG@1 : 0.43 \n", + "nDCG@3 : 0.60 \n", + "nDCG@5 : 0.66 \n", + "nDCG@10 : 0.70 \n", + "nDCG@20 : 0.71 \n" + ] + } + ], + "source": [ + "eval_model(q_test_iterable, d_test_iterable, l_test_iterable, kv_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's compare that with our model" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2018-07-06 00:49:11,315 : INFO : Found 21 unknown words. Set them to unknown word index : 400643\n", + "2018-07-06 00:49:11,379 : INFO : Found 253 unknown words. Set them to unknown word index : 400643\n", + "2018-07-06 00:49:21,218 : INFO : MAP: 0.60\n", + "2018-07-06 00:49:21,229 : INFO : nDCG@1 : 0.47\n", + "2018-07-06 00:49:21,246 : INFO : nDCG@3 : 0.60\n", + "2018-07-06 00:49:21,263 : INFO : nDCG@5 : 0.66\n", + "2018-07-06 00:49:21,274 : INFO : nDCG@10 : 0.70\n", + "2018-07-06 00:49:21,286 : INFO : nDCG@20 : 0.71\n" + ] + } + ], + "source": [ + "drmm_tks_model.evaluate(q_test_iterable, d_test_iterable, l_test_iterable)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "While the accuracy isn't any better, it is worse, this is still a Work In Progress and we hope to improve it further soon." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Saving and loading the model\n", + "The trained model can be saved and loaded from memory for future use." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2018-07-06 00:44:16,527 : INFO : saving DRMM_TKS object under drmm_tks_model, separately None\n", + "2018-07-06 00:44:16,529 : INFO : storing np array 'vectors' to drmm_tks_model.word_embedding.vectors.npy\n", + "2018-07-06 00:45:09,654 : INFO : storing np array 'embedding_matrix' to drmm_tks_model.embedding_matrix.npy\n", + "2018-07-06 00:45:18,682 : INFO : not storing attribute model\n", + "2018-07-06 00:45:18,684 : INFO : not storing attribute _get_pair_list\n", + "2018-07-06 00:45:18,685 : INFO : not storing attribute _get_full_batch_iter\n", + "2018-07-06 00:45:18,687 : INFO : not storing attribute queries\n", + "2018-07-06 00:45:18,688 : INFO : not storing attribute docs\n", + "2018-07-06 00:45:18,690 : INFO : not storing attribute labels\n", + "2018-07-06 00:45:18,691 : INFO : not storing attribute pair_list\n", + "2018-07-06 00:45:36,062 : INFO : saved drmm_tks_model\n" + ] + } + ], + "source": [ + "drmm_tks_model.save('drmm_tks_model')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "del drmm_tks_model\n", + "drmm_tks_model = DRMM_TKS.load('drmm_tks_model')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/gensim/models/experimental/__init__.py b/gensim/models/experimental/__init__.py new file mode 100644 index 0000000000..d9459d307a --- /dev/null +++ b/gensim/models/experimental/__init__.py @@ -0,0 +1,7 @@ +"""This package will host some experimental modules for Similarity Learning""" + +from .drmm_tks import DRMM_TKS # noqa:F401 +from .custom_losses import rank_hinge_loss # noqa:F401 +from .custom_layers import TopKLayer # noqa:F401 +from .custom_callbacks import ValidationCallback # noqa:F401 +from .evaluation_metrics import mean_ndcg, mapk # noqa:F401 diff --git a/gensim/models/experimental/custom_callbacks.py b/gensim/models/experimental/custom_callbacks.py new file mode 100644 index 0000000000..737fa21f02 --- /dev/null +++ b/gensim/models/experimental/custom_callbacks.py @@ -0,0 +1,72 @@ +import logging +try: + from keras.callbacks import Callback + KERAS_AVAILABLE = True +except ImportError: + KERAS_AVAILABLE = False + +logger = logging.getLogger(__name__) +logging.basicConfig( + format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) + + +class ValidationCallback(Callback): + """Callback for providing validation metrics on the model trained so far""" + def __init__(self, test_data): + """ + Parameters + ---------- + test_data : dict + A dictionary which holds the validation data. It consists of the following keys: + - "X1" : numpy array + The queries as a numpy array of shape (n_samples, text_maxlen) + - "X2" : numpy array + The candidate docs as a numpy array of shape (n_samples, text_maxlen) + - "y" : list of int + It is the labels for each of the query-doc pairs as a 1 or 0 with shape (n_samples,) + where 1 : doc is relevant to query, 0 : doc is not relevant to query + - "doc_lengths" : list of int + It contains the length of each document group. 
I.e., the number of candidate documents + that belong to each query. It is needed for calculating the metrics. + + """ + + if not KERAS_AVAILABLE: + raise ImportError("Please install Keras to use this class") + + # Check that test_data is a dictionary with all the right keys + try: + # If an empty dict is passed + if len(test_data.keys()) == 0: + raise ValueError( + "test_data dictionary is empty. It doesn't have the keys: 'X1', 'X2', 'y', 'doc_lengths'" + ) + for key in test_data.keys(): + if key not in ['X1', 'X2', 'y', 'doc_lengths']: + raise ValueError("test_data dictionary doesn't have the keys: 'X1', 'X2', 'y', 'doc_lengths'") + except AttributeError: + raise ValueError("test_data must be a dictionary with the keys: 'X1', 'X2', 'y', 'doc_lengths'") + self.test_data = test_data + + def on_epoch_end(self, epoch, logs={}): + # Import has to be here to prevent a cyclic import + from gensim.models.experimental.evaluation_metrics import mapk, mean_ndcg + X1 = self.test_data["X1"] + X2 = self.test_data["X2"] + y = self.test_data["y"] + doc_lengths = self.test_data["doc_lengths"] + + predictions = self.model.predict(x={"query": X1, "doc": X2}) + + Y_pred = [] + Y_true = [] + offset = 0 + + for doc_size in doc_lengths: + Y_pred.append(predictions[offset: offset + doc_size]) + Y_true.append(y[offset: offset + doc_size]) + offset += doc_size + + logger.info("MAP: %.2f", mapk(Y_true, Y_pred)) + for k in [1, 3, 5, 10, 20]: + logger.info("nDCG@%d : %.2f", k, mean_ndcg(Y_true, Y_pred, k=k)) diff --git a/gensim/models/experimental/custom_layers.py b/gensim/models/experimental/custom_layers.py new file mode 100644 index 0000000000..d1959e4ba2 --- /dev/null +++ b/gensim/models/experimental/custom_layers.py @@ -0,0 +1,42 @@ +try: + from keras.engine.topology import Layer + import keras.backend as K + KERAS_AVAILABLE = True +except ImportError: + KERAS_AVAILABLE = False + +"""Script where all the custom keras layers are kept.""" + + +class TopKLayer(Layer): + """Layer to get top k values from the interaction matrix in drmm_tks model""" + def __init__(self, output_dim, topk, **kwargs): + """ + + Parameters + ---------- + output_dim : tuple of int + The dimension of the tensor after going through this layer. + topk : int + The k topmost values to be returned. 
+ """ + self.output_dim = output_dim + self.topk = topk + super(TopKLayer, self).__init__(**kwargs) + + def build(self, input_shape): + super(TopKLayer, self).build(input_shape) + + def call(self, x): + return K.tf.nn.top_k(x, k=self.topk, sorted=True)[0] + + def compute_output_shape(self, input_shape): + return (input_shape[0], self.output_dim[0], self.topk) + + def get_config(self): + config = { + 'topk': self.topk, + 'output_dim': self.output_dim + } + base_config = super(TopKLayer, self).get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/gensim/models/experimental/custom_losses.py b/gensim/models/experimental/custom_losses.py new file mode 100644 index 0000000000..1ad037b62c --- /dev/null +++ b/gensim/models/experimental/custom_losses.py @@ -0,0 +1,29 @@ +try: + from keras import backend as K + from keras.layers import Lambda + KERAS_AVAILABLE = True +except ImportError: + KERAS_AVAILABLE = False + +"""Script where all the custom loss functions will be defined""" + + +def rank_hinge_loss(y_true, y_pred): + """Loss function for Ranking Similarity Learning tasks + More details here : https://en.wikipedia.org/wiki/Hinge_loss + + Parameters + ---------- + y_true : list of list of int + The true relation between a query and a doc + It can be either 1 : relevant or 0 : not relevant + y_pred : list of list of float + The predicted relation between a query and a doc + """ + if not KERAS_AVAILABLE: + raise ImportError("Please install Keras to use this function") + margin = 1 + y_pos = Lambda(lambda a: a[::2, :], output_shape=(1,))(y_pred) + y_neg = Lambda(lambda a: a[1::2, :], output_shape=(1,))(y_pred) + loss = K.maximum(0., margin + y_neg - y_pos) + return K.mean(loss) diff --git a/gensim/models/experimental/drmm_tks.py b/gensim/models/experimental/drmm_tks.py new file mode 100644 index 0000000000..894dcaa319 --- /dev/null +++ b/gensim/models/experimental/drmm_tks.py @@ -0,0 +1,843 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Author: Aneesh Joshi +# Copyright (C) 2018 RaRe Technologies s.r.o. +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html + +"""This module makes a trainable and usable model for getting similarity between documents using the DRMM_TKS model. + +Once the model is trained with the query-candidate-relevance data, the model can provide a vector for each new +document which is entered into it. The similarity between any 2 documents can then be measured using the +cosine similarty between the vectors. + +Abbreviations +============= +- DRMM : Deep Relevance Matching Model +- TKS : Top K Solutions + +About DRMM_TKS +============== +This is a variant version of DRMM, which applied topk pooling in the matching matrix. +It has the following steps: + +1. embed queries and docs into embedding vector named `q_embed` and `d_embed` respectively. +2. computing `q_embed` and `d_embed` with element-wise multiplication. +3. computing output of upper layer with dense layer operation. +4. take softmax operation on the output of this layer named `g` and find the k largest entries named `mm_k`. +5. input `mm_k` into hidden layers, with specified length of layers and activation function. +6. compute `g` and `mm_k` with element-wise multiplication. + +On predicting, the model returns the score list between queries and documents. 
+ +The trained model needs to be trained on data in the format: + +>>> from gensim.models.experimental import DRMM_TKS +>>> import gensim.downloader as api +>>> queries = ["When was World War 1 fought ?".lower().split(), "When was Gandhi born ?".lower().split()] +>>> docs = [["The world war was bad".lower().split(), "It was fought in 1996".lower().split()], ["Gandhi was born in" +... "the 18th century".lower().split(), "He fought for the Indian freedom movement".lower().split(), +... "Gandhi was assasinated".lower().split()]] +>>> labels = [[0, 1], [1, 0, 0]] +>>> word_embeddings_kv = api.load('glove-wiki-gigaword-50') +>>> model = DRMM_TKS(queries, docs, labels, word_embedding=word_embeddings_kv, verbose=0) + +Persist a model to disk with : + +>>> from gensim.test.utils import get_tmpfile +>>> file_path = get_tmpfile('DRMM_TKS.model') +>>> model.save(file_path) +>>> model = DRMM_TKS.load(file_path) + +You can also create the modela and train it later : + +>>> model = DRMM_TKS() +>>> model.train(queries, docs, labels, word_embeddings_kv, epochs=12, verbose=0) + +Testing on new data : + +>>> from gensim.test.utils import datapath +>>> model = DRMM_TKS.load(datapath('drmm_tks')) +>>> +>>> queries = ["how are glacier caves formed ?".lower().split()] +>>> docs = [["A partly submerged glacier cave on Perito Moreno Glacier".lower().split(), "glacier cave is cave formed" +... " within the ice of glacier".lower().split()]] +>>> print(model.predict(queries, docs)) +[[0.9915068 ] + [0.99228466]] +>>> print(model.predict([["hello", "world"]], [[["i", "am", "happy"], ["good", "morning"]]])) +[[0.9975487] + [0.999115 ]] + + +More information can be found in: +`Jiafeng Guo, Yixing Fan, Qingyao Ai, W. Bruce Croft "A Deep Relevance Matching Model for Ad-hoc Retrieval" +`_ +`MatchZoo Repository `_ +`Similarity Learning Wikipedia Page `_ + +""" + +import logging +import numpy as np +import hashlib +from numpy import random as np_random +from gensim.models import KeyedVectors +from collections import Counter +from gensim.models.experimental.custom_losses import rank_hinge_loss +from gensim.models.experimental.custom_layers import TopKLayer +from gensim.models.experimental.custom_callbacks import ValidationCallback +from gensim.models.experimental.evaluation_metrics import mapk, mean_ndcg +from sklearn.preprocessing import normalize +from gensim import utils +from collections import Iterable + +try: + import keras.backend as K + from keras import optimizers + from keras.models import load_model + from keras.losses import hinge + from keras.models import Model + from keras.layers import Input, Embedding, Dot, Dense, Reshape, Dropout + KERAS_AVAILABLE = True +except ImportError: + KERAS_AVAILABLE = False + +logger = logging.getLogger(__name__) + + +def _get_full_batch_iter(pair_list, batch_size): + """Provides all the data points int the format: X1, X2, y with + alternate positive and negative examples of `batch_size` in a streamable format. + + Parameters + ---------- + pair_list : iterable list of tuple + See docstring for _get_pair_list for more details + batch_size : int + half the size in which the generator will yield datapoints. The size is doubled since + we include positive and negative examples. 
+ + Yields + ------- + X1 : numpy array of shape (batch_size * 2, text_maxlen) + the queries + X2 : numpy array of shape (batch_size * 2, text_maxlen) + the docs + y : numpy array with {0, 1} of shape (batch_size * 2, 1) + The relation between X1[i] and X2[j] + 1 : X2[i] is relevant to X1[i] + 0 : X2[i] is not relevant to X1[i] + """ + + X1, X2, y = [], [], [] + while True: + for i, (query, pos_doc, neg_doc) in enumerate(pair_list): + X1.append(query) + X2.append(pos_doc) + y.append(1) + X1.append(query) + X2.append(neg_doc) + y.append(0) + if i % batch_size == 0 and i != 0: + yield ({'query': np.array(X1), 'doc': np.array(X2)}, np.array(y)) + X1, X2, y = [], [], [] + + +def _get_pair_list(queries, docs, labels, _make_indexed, is_iterable): + """Yields a tuple with query document pairs in the format + (query, positive_doc, negative_doc) + [(q1, d+, d-), (q2, d+, d-), (q3, d+, d-), ..., (qn, d+, d-)] + where each query or document is a list of int + + Parameters + ---------- + queries : iterable list of list of str + The queries to the model + docs : iterable list of list of list of str + The candidate documents for each query + labels : iterable list of int + The relevance of the document to the query. 1 = relevant, 0 = not relevant + _make_indexed : function + Translates the given sentence as a list of list of str into a list of list of int + based on the model's internal dictionary + is_iterable : bool + Whether the input data is streamable + + Example + ------- + [(['When', 'was', 'Abraham', 'Lincoln', 'born', '?'], + ['He', 'was', 'born', 'in', '1809'], + ['Abraham', 'Lincoln', 'was', 'the', 'president', + 'of', 'the', 'United', 'States', 'of', 'America']), + + (['When', 'was', 'the', 'first', 'World', 'War', '?'], + ['It', 'was', 'fought', 'in', '1914'], + ['There', 'were', 'over', 'a', 'million', 'deaths']), + + (['When', 'was', 'the', 'first', 'World', 'War', '?'], + ['It', 'was', 'fought', 'in', '1914'], + ['The', 'first', 'world', 'war', 'was', 'bad']) + ] + + """ + if is_iterable: + while True: + j=0 + for q, doc, label in zip(queries, docs, labels): + doc, label = (list(t) for t in zip(*sorted(zip(doc, label), reverse=True))) + for item in zip(doc, label): + if item[1] == 1: + for new_item in zip(doc, label): + if new_item[1] == 0: + j+=1 + yield(_make_indexed(q), _make_indexed(item[0]), _make_indexed(new_item[0])) + else: + for q, doc, label in zip(queries, docs, labels): + doc, label = (list(t) for t in zip(*sorted(zip(doc, label), reverse=True))) + for item in zip(doc, label): + if item[1] == 1: + for new_item in zip(doc, label): + if new_item[1] == 0: + yield(_make_indexed(q), _make_indexed(item[0]), _make_indexed(new_item[0])) + + +class DRMM_TKS(utils.SaveLoad): + """Model for training a Similarity Learning Model using the DRMM TKS model. + You only have to provide sentences in the data as a list of words. + """ + + def __init__(self, queries=None, docs=None, labels=None, word_embedding=None, + text_maxlen=200, normalize_embeddings=True, epochs=10, unk_handle_method='random', + validation_data=None, topk=50, target_mode='ranking', verbose=1): + """Initializes the model and trains it + + Parameters + ---------- + queries: iterable list of list of string words, optional + The questions for the similarity learning model. + docs: iterable list of list of list of string words, optional + The candidate answers for the similarity learning model. 
+ labels: iterable list of list of int, optional + Indicates when a candidate document is relevant to a query + - 1 : relevant + - 0 : irrelevant + word_embedding : :class:`~gensim.models.keyedvectors.KeyedVectors`, optional + a KeyedVector object which has the embeddings pre-loaded. + If None, random word embeddings will be used. + text_maxlen : int, optional + The maximum possible length of a query or a document. + This is used for padding sentences. + normalize_embeddings : bool, optional + Whether the word embeddings provided should be normalized. + epochs : int, optional + The number of epochs for which the model should train on the data. + unk_handle_method : {'zero', 'random'}, optional + The method for handling unkown words. + - 'zero' : unknown words are given a zero vector + - 'random' : unknown words are given a uniformly random vector bassed on the word string hash + validation_data: list of the form [test_queries, test_docs, test_labels], optional + where test_queries, test_docs and test_labels are of the same form as + their counter parts stated above. + topk : int, optional + the k topmost values in the interaction matrix between the queries and the docs + target_mode : {'ranking', 'classification'}, optional + the way the model should be trained, either to rank or classify + verbose : {0, 1, 2} + the level of information shared while training + - 0 : silent + - 1 : progress bar + - 2 : one line per epoch + + + Examples + -------- + The trained model needs to be trained on data in the format + + >>> queries = ["When was World War 1 fought ?".lower().split(), "When was Gandhi born ?".lower().split()] + >>> docs = [["The world war was bad".lower().split(), "It was fought in 1996".lower().split()], ["Gandhi was" + ... "born in the 18th century".lower().split(), "He fought for the Indian freedom movement".lower().split(), + ... "Gandhi was assasinated".lower().split()]] + >>> labels = [[0, 1], [1, 0, 0]] + >>> import gensim.downloader as api + >>> word_embeddings_kv = api.load('glove-wiki-gigaword-50') + >>> model = DRMM_TKS(queries, docs, labels, word_embedding=word_embeddings_kv, verbose=0) + """ + self.queries = queries + self.docs = docs + self.labels = labels + self.word_counter = Counter() + self.text_maxlen = text_maxlen + self.topk = topk + self.word_embedding = word_embedding + self.word2index, self.index2word = {}, {} + self.normalize_embeddings = normalize_embeddings + self.model = None + self.epochs = epochs + self.validation_data = validation_data + self.target_mode = target_mode + self.verbose = verbose + self.first_train = True # Whether the model has been trained before + self.needs_vocab_build = True + + # These functions have been defined outside the class and set as attributes here + # so that they can be ignored when saving the model to file + self._get_pair_list = _get_pair_list + self._get_full_batch_iter = _get_full_batch_iter + + if self.target_mode not in ['ranking', 'classification']: + raise ValueError( + "Unkown target_mode %s. 
It must be either 'ranking' or 'classification'" % self.target_mode + ) + + if unk_handle_method not in ['random', 'zero']: + raise ValueError("Unkown token handling method %s" % str(unk_handle_method)) + self.unk_handle_method = unk_handle_method + + if self.queries is not None and self.docs is not None and self.labels is not None: + self.build_vocab(self.queries, self.docs, self.labels, self.word_embedding) + self.train(self.queries, self.docs, self.labels, self.word_embedding, + self.text_maxlen, self.normalize_embeddings, self.epochs, self.unk_handle_method, + self.validation_data, self.topk, self.target_mode, self.verbose) + + def build_vocab(self, queries, docs, labels, word_embedding): + """Indexes all the words and makes an embedding_matrix which + can be fed directly into an Embedding layer + """ + + logger.info("Starting Vocab Build") + + # get all the vocab words + for q in self.queries: + self.word_counter.update(q) + for doc in self.docs: + for d in doc: + self.word_counter.update(d) + for i, word in enumerate(self.word_counter.keys()): + self.word2index[word] = i + self.index2word[i] = word + + self.vocab_size = len(self.word2index) + logger.info("Vocab Build Complete") + logger.info("Vocab Size is %d", self.vocab_size) + + logger.info("Building embedding index using KeyedVector pretrained word embeddings") + if type(self.word_embedding) == KeyedVectors: + kv_model = self.word_embedding + embedding_vocab_size, self.embedding_dim = len(kv_model.vocab), kv_model.vector_size + else: + raise ValueError( + "Unknown value of word_embedding : %s. Must be either a KeyedVector object", + str(word_embedding) + ) + + logger.info( + "The embeddings_index built from the given file has %d words of %d dimensions", + embedding_vocab_size, self.embedding_dim + ) + + logger.info("Building the Embedding Matrix for the model's Embedding Layer") + + # Initialize the embedding matrix + # UNK word gets the vector based on the method + if self.unk_handle_method == 'random': + self.embedding_matrix = np.random.uniform(-0.2, 0.2, (self.vocab_size, self.embedding_dim)) + elif self.unk_handle_method == 'zero': + self.embedding_matrix = np.zeros((self.vocab_size, self.embedding_dim)) + + n_non_embedding_words = 0 + for word, i in self.word2index.items(): + if word in kv_model: + # words not found in keyed vectors will get the vector based on unk_handle_method + self.embedding_matrix[i] = kv_model[word] + else: + if self.unk_handle_method == 'random': + # Creates the same random vector for the given string each time + self.embedding_matrix[i] = self._seeded_vector(word, self.embedding_dim) + n_non_embedding_words += 1 + logger.info( + "There are %d words out of %d (%.2f%%) not in the embeddings. 
Setting them to %s", n_non_embedding_words, + self.vocab_size, n_non_embedding_words * 100 / self.vocab_size, self.unk_handle_method + ) + + # Include embeddings for words in embedding file but not in the train vocab + # It will be useful for embedding words encountered in validation and test set + logger.info( + "Adding additional words from the embedding file to embedding matrix" + ) + + # The point where vocab words end + vocab_offset = self.vocab_size + extra_embeddings = [] + # Take the words in the embedding file which aren't there int the train vocab + for word in list(kv_model.vocab): + if word not in self.word2index: + # Add the new word's vector and index it + extra_embeddings.append(kv_model[word]) + # We also need to keep an additional indexing of these + # words + self.word2index[word] = vocab_offset + vocab_offset += 1 + + # Set the pad and unk word to second last and last index + self.pad_word_index = vocab_offset + self.unk_word_index = vocab_offset + 1 + + if self.unk_handle_method == 'random': + unk_embedding_row = np.random.uniform(-0.2, 0.2, (1, self.embedding_dim)) + elif self.unk_handle_method == 'zero': + unk_embedding_row = np.zeros((1, self.embedding_dim)) + + pad_embedding_row = np.random.uniform(-0.2, + 0.2, (1, self.embedding_dim)) + + if len(extra_embeddings) > 0: + self.embedding_matrix = np.vstack( + [self.embedding_matrix, np.array(extra_embeddings), + pad_embedding_row, unk_embedding_row] + ) + else: + self.embedding_matrix = np.vstack( + [self.embedding_matrix, pad_embedding_row, unk_embedding_row] + ) + + if self.normalize_embeddings: + logger.info("Normalizing the word embeddings") + self.embedding_matrix = normalize(self.embedding_matrix) + + logger.info("Embedding Matrix build complete. It now has shape %s", str(self.embedding_matrix.shape)) + logger.info("Pad word has been set to index %d", self.pad_word_index) + logger.info("Unknown word has been set to index %d", self.unk_word_index) + logger.info("Embedding index build complete") + self.needs_vocab_build = False + + def _string2numeric_hash(self, text): + "Gets a numeric hash for a given string" + return int(hashlib.md5(text.encode()).hexdigest()[:8], 16) + + def _seeded_vector(self, seed_string, vector_size): + """Create one 'random' vector (but deterministic by seed_string)""" + # Note: built-in hash() may vary by Python version or even (in Py3.x) per launch + once = np_random.RandomState(self._string2numeric_hash(seed_string) & 0xffffffff) + return (once.rand(vector_size) - 0.5) / vector_size + + def _make_indexed(self, sentence): + """Gets the indexed version of the sentence based on the self.word2index dict + in the form of a list + + This function should never encounter any OOV words since it only indexes + in vocab words + + Parameters + ---------- + sentence : iterable list of list of str + The sentence to be indexed + + Raises + ------ + ValueError : If the sentence has a lenght more than text_maxlen + """ + + indexed_sent = [] + for word in sentence: + indexed_sent.append(self.word2index[word]) + + if len(indexed_sent) > self.text_maxlen: + raise ValueError( + "text_maxlen: %d isn't big enough. Error at sentence of length %d." 
+ "Sentence is %s" % (self.text_maxlen, len(sentence), sentence) + ) + indexed_sent = indexed_sent + [self.pad_word_index] * (self.text_maxlen - len(indexed_sent)) + return indexed_sent + + def _get_full_batch(self): + """Provides all the data points int the format: X1, X2, y with + alternate positive and negative examples + + Returns + ------- + X1 : numpy array of shape (num_samples, text_maxlen) + the queries + X2 : numpy array of shape (num_samples, text_maxlen) + the docs + y : numpy array with {0, 1} of shape (num_samples,) + The relation between X1[i] and X2[j] + 1 : X2[i] is relevant to X1[i] + 0 : X2[i] is not relevant to X1[i] + """ + X1, X2, y = [], [], [] + for i, (query, pos_doc, neg_doc) in enumerate(self.pair_list): + X1.append(query) + X2.append(pos_doc) + y.append(1) + X1.append(query) + X2.append(neg_doc) + y.append(0) + return np.array(X1), np.array(X2), np.array(y) + + def train(self, queries, docs, labels, word_embedding=None, + text_maxlen=200, normalize_embeddings=True, epochs=10, unk_handle_method='zero', + validation_data=None, topk=20, target_mode='ranking', verbose=1, batch_size=5, steps_per_epoch=900): + """Trains a DRMM_TKS model using specified parameters + + This method is called from on model initialization if the data is provided. + It can also be trained in an online manner or after initialization + """ + + self.queries = queries or self.queries + self.docs = docs or self.docs + self.labels = labels or self.labels + + # This won't change the embedding layer TODO + self.word_embedding = word_embedding or self.word_embedding + self.text_maxlen = text_maxlen or self.text_maxlen + self.normalize_embeddings = normalize_embeddings or self.normalize_embeddings + self.epochs = epochs or self.epochs + self.unk_handle_method = unk_handle_method or self.unk_handle_method + self.validation_data = validation_data or self.validation_data + self.topk = topk or self.topk + self.target_mode = target_mode or self.target_mode + + if verbose != 0: # Check needed since 0 or 2 will always give 2 + self.verbose = verbose or self.verbose + else: + self.verbose = 0 + + if self.queries is None or self.docs is None or self.labels is None: + raise ValueError("queries, docs and labels have to be specified") + # We need to build these each time since any of the parameters can change from each train to trian + if self.needs_vocab_build: + self.build_vocab(self.queries, self.docs, self.labels, self.word_embedding) + + is_iterable = False + if isinstance(self.queries, Iterable) and not isinstance(self.queries, list): + is_iterable = True + logger.info("Input is an iterable amd will be streamed") + + self.pair_list = self._get_pair_list(self.queries, self.docs, self.labels, self._make_indexed, is_iterable) + if is_iterable: + train_generator = self._get_full_batch_iter(self.pair_list, 10) + else: + X1_train, X2_train, y_train = self._get_full_batch() + + if self.first_train: + # The settings below should be set only once + self.model = self._get_keras_model() + optimizer = 'adam' + optimizer = 'adadelta' + optimizer = optimizers.get(optimizer) + learning_rate = 0.0001 + learning_rate = 1 + K.set_value(optimizer.lr, learning_rate) + # either one can be selected. Currently, the choice is manual. 
+ loss = hinge + loss = 'mse' + loss = rank_hinge_loss + self.model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy']) + else: + logger.info("Model will be retrained") + + self.model.summary(print_fn=logger.info) + + # Put the validation data in as a callback + val_callback = None + if self.validation_data is not None: + test_queries, test_docs, test_labels = self.validation_data + + long_doc_list = [] + long_label_list = [] + long_query_list = [] + doc_lens = [] + + for query, doc, label in zip(test_queries, test_docs, test_labels): + i = 0 + for d, l in zip(doc, label): + long_query_list.append(query) + long_doc_list.append(d) + long_label_list.append(l) + i += 1 + doc_lens.append(len(doc)) + + indexed_long_query_list = self._translate_user_data(long_query_list) + indexed_long_doc_list = self._translate_user_data(long_doc_list) + + val_callback = ValidationCallback( + {"X1": indexed_long_query_list, "X2": indexed_long_doc_list, "doc_lengths": doc_lens, + "y": long_label_list} + ) + val_callback = [val_callback] # since `model.fit` requires a list + + # If train is called again, not all values should be reset + if self.first_train is True: + self.first_train = False + + if is_iterable: + self.model.fit_generator(train_generator, steps_per_epoch=steps_per_epoch, callbacks=val_callback, + epochs=self.epochs, shuffle=False, ) + else: + self.model.fit(x={"query": X1_train, "doc": X2_train}, y=y_train, batch_size=5, + verbose=self.verbose, epochs=self.epochs, shuffle=False, callbacks=val_callback) + + def _translate_user_data(self, data): + """Translates given user data into an indexed format which the model understands. + If a model is not in the vocabulary, it is assigned the `unk_word_index` which maps + to the unk vector decided by `unk_handle_method` + + Parameters + ---------- + data : list of list of string words + The data to be tranlsated + + Examples + -------- + >>> from gensim.test.utils import datapath + >>> model = DRMM_TKS.load(datapath('drmm_tks')) + >>> + >>> queries = ["When was World War 1 fought ?".split(), "When was Gandhi born ?".split()] + >>> print(model._translate_user_data(queries)) + [[31 1 23 31 4 5 6 30 30 30] + [31 1 31 8 6 30 30 30 30 30]] + """ + translated_data = [] + n_skipped_words = 0 + for sentence in data: + translated_sentence = [] + for word in sentence: + if word in self.word2index: + translated_sentence.append(self.word2index[word]) + else: + # If the key isn't there give it the zero word index + translated_sentence.append(self.unk_word_index) + n_skipped_words += 1 + if len(sentence) > self.text_maxlen: + logger.info( + "text_maxlen: %d isn't big enough. Error at sentence of length %d." + "Sentence is %s", self.text_maxlen, len(sentence), str(sentence) + ) + translated_sentence = translated_sentence + (self.text_maxlen - len(sentence)) * [self.pad_word_index] + translated_data.append(np.array(translated_sentence)) + + logger.info( + "Found %d unknown words. 
Set them to unknown word index : %d", n_skipped_words, self.unk_word_index + ) + return np.array(translated_data) + + def predict(self, queries, docs): + """Predcits the similarity between a query-document pair + based on the trained DRMM TKS model + + Parameters + ---------- + queries : list of list of str + The questions for the similarity learning model + docs : list of list of list of str + The candidate answers for the similarity learning model + + + Examples + -------- + >>> from gensim.test.utils import datapath + >>> model = DRMM_TKS.load(datapath('drmm_tks')) + >>> + >>> queries = ["When was World War 1 fought ?".split(), "When was Gandhi born ?".split()] + >>> docs = [["The world war was bad".split(), "It was fought in 1996".split()], ["Gandhi was born in the 18th" + ... " century".split(), "He fought for the Indian freedom movement".split(), "Gandhi was" + ... " assasinated".split()]] + >>> print(model.predict(queries, docs)) + [[0.9933108 ] + [0.9925415 ] + [0.9827911 ] + [0.99258184] + [0.9960481 ]] + """ + + long_query_list = [] + long_doc_list = [] + for query, doc in zip(queries, docs): + for d in doc: + long_query_list.append(query) + long_doc_list.append(d) + + indexed_long_query_list = self._translate_user_data(long_query_list) + indexed_long_doc_list = self._translate_user_data(long_doc_list) + + predictions = self.model.predict(x={'query': indexed_long_query_list, 'doc': indexed_long_doc_list}) + + logger.info("Predictions in the format query, doc, similarity") + for i, (q, d) in enumerate(zip(long_query_list, long_doc_list)): + logger.info("%s\t%s\t%s", str(q), str(d), str(predictions[i][0])) + + return predictions + + def evaluate(self, queries, docs, labels): + """Evaluates the model and provides the results in terms of metrics (MAP, nDCG) + This should ideally be called on the test set. + + Parameters + ---------- + queries : list of list of str + The questions for the similarity learning model + docs : list of list of list of str + The candidate answers for the similarity learning model + labels : list of list of int + The relevance of the document to the query. 1 = relevant, 0 = not relevant + """ + long_doc_list = [] + long_label_list = [] + long_query_list = [] + doc_lens = [] + for query, doc, label in zip(queries, docs, labels): + i = 0 + for d, l in zip(doc, label): + long_query_list.append(query) + long_doc_list.append(d) + long_label_list.append(l) + i += 1 + doc_lens.append(len(doc)) + indexed_long_query_list = self._translate_user_data(long_query_list) + indexed_long_doc_list = self._translate_user_data(long_doc_list) + predictions = self.model.predict(x={'query': indexed_long_query_list, 'doc': indexed_long_doc_list}) + Y_pred = [] + Y_true = [] + offset = 0 + for doc_size in doc_lens: + Y_pred.append(predictions[offset: offset + doc_size]) + Y_true.append(long_label_list[offset: offset + doc_size]) + offset += doc_size + logger.info("MAP: %.2f", mapk(Y_true, Y_pred)) + for k in [1, 3, 5, 10, 20]: + logger.info("nDCG@%d : %.2f", k, mean_ndcg(Y_true, Y_pred, k=k)) + + def save(self, fname, *args, **kwargs): + """Save the model. + This saved model can be loaded again using :func:`~gensim.models.experimental.drmm_tks.DRMM_TKS.load` + The keras model shouldn't be serialized using pickle or cPickle. So, the non-keras + variables will be saved using gensim's SaveLoad and the keras model will be saved using + the keras save method with ".keras" prefix. 
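+        Note that two files are written to disk, `fname` and `fname + ".keras"`, and both
+        are needed when loading the model back.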
+ + Also see :func:`~gensim.models.experimental.drmm_tks.DRMM_TKS.load` + + Parameters + ---------- + fname : str + Path to the file. + + Examples + -------- + >>> from gensim.test.utils import datapath, get_tmpfile + >>> model = DRMM_TKS.load(datapath('drmm_tks')) + >>> model_save_path = get_tmpfile('drmm_tks_model') + >>> model.save(model_save_path) + """ + # don't save the keras model as it needs to be saved with a keras function + # Also, we can't save iterable properties. So, ignore them. + kwargs['ignore'] = kwargs.get( + 'ignore', ['model', '_get_pair_list', '_get_full_batch_iter', + 'queries', 'docs', 'labels', 'pair_list']) + kwargs['fname_or_handle'] = fname + super(DRMM_TKS, self).save(*args, **kwargs) + self.model.save(fname + ".keras") + + @classmethod + def load(cls, *args, **kwargs): + """Loads a previously saved `DRMM TKS` model. Also see `save()`. + Collects the gensim and the keras models and returns it as on gensim model. + + Parameters + ---------- + fname : str + Path to the saved file. + + Returns + ------- + :obj: `~gensim.models.experimental.DRMM_TKS` + Returns the loaded model as an instance of :class: `~gensim.models.experimental.DRMM_TKS`. + + + Examples + -------- + >>> from gensim.test.utils import datapath, get_tmpfile + >>> model_file_path = datapath('drmm_tks') + >>> model = DRMM_TKS.load(model_file_path) + """ + fname = args[0] + gensim_model = super(DRMM_TKS, cls).load(*args, **kwargs) + keras_model = load_model( + fname + '.keras', custom_objects={'TopKLayer': TopKLayer}) + gensim_model.model = keras_model + gensim_model._get_pair_list = _get_pair_list + gensim_model._get_full_batch_iter = _get_full_batch_iter + return gensim_model + + def _get_keras_model(self, embed_trainable=False, dropout_rate=0.5, hidden_sizes=[100, 1]): + """Builds and returns the keras class for drmm tks model + + About DRMM_TKS + -------------- + This is a variant version of DRMM, which applied topk pooling in the matching matrix. + It has the following steps: + 1. embed queries into embedding vector named 'q_embed' and 'd_embed' respectively + 2. computing 'q_embed' and 'd_embed' with element-wise multiplication + 3. computing output of upper layer with dense layer operation + 4. take softmax operation on the output of this layer named 'g' and find the k largest entries named 'mm_k'. + 5. input 'mm_k' into hidden layers, with specified length of layers and activation function + 6. compute 'g' and 'mm_k' with element-wise multiplication. + + On predicting, the model returns the score list between queries and documents. + + Parameters + ---------- + embed_trainable : bool, optional + Whether the embeddings should be trained + if True, the embeddings are trianed + dropout_rate : float between 0 and 1, optional + The probability of making a neuron dead + Used for regularization. 
+ hidden_sizes : list of int, optional + The list of hidden sizes for the fully connected layers connected to the matching matrix + Example : + hidden_sizes = [10, 20, 30] + will add 3 fully connected layers of 10, 20 and 30 hidden neurons + + """ + + if not KERAS_AVAILABLE: + raise ImportError("Please install Keras to use this model") + + n_layers = len(hidden_sizes) + + query = Input(name='query', shape=(self.text_maxlen,)) + doc = Input(name='doc', shape=(self.text_maxlen,)) + embedding = Embedding(self.embedding_matrix.shape[0], self.embedding_dim, + weights=[self.embedding_matrix], trainable=embed_trainable) + + q_embed = embedding(query) + d_embed = embedding(doc) + + mm = Dot(axes=[2, 2], normalize=True)([q_embed, d_embed]) + + # compute term gating + w_g = Dense(1, activation='softmax')(q_embed) + g = Reshape((self.text_maxlen,))(w_g) + + mm_k = TopKLayer(topk=self.topk, output_dim=( + self.text_maxlen, self.embedding_dim))(mm) + + for i in range(n_layers): + mm_k = Dense(hidden_sizes[i], activation='softplus', kernel_initializer='he_uniform', + bias_initializer='zeros')(mm_k) + + mm_k_dropout = Dropout(rate=dropout_rate)(mm_k) + + mm_reshape = Reshape( + (self.text_maxlen,))(mm_k_dropout) + + mean = Dot(axes=[1, 1])([mm_reshape, g]) + + if self.target_mode == 'classification': + out_ = Dense(2, activation='softmax')(mean) + elif self.target_mode in ['regression', 'ranking']: + out_ = Reshape((1,))(mean) + + model = Model(inputs=[query, doc], outputs=out_) + return model diff --git a/gensim/models/experimental/evaluation_metrics.py b/gensim/models/experimental/evaluation_metrics.py new file mode 100644 index 0000000000..d55f1a09c9 --- /dev/null +++ b/gensim/models/experimental/evaluation_metrics.py @@ -0,0 +1,97 @@ +import numpy as np +import logging + +logger = logging.getLogger(__name__) +logging.basicConfig( + format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO +) + + +def mapk(Y_true, Y_pred): + """Calculates Mean Average Precision(MAP) for a given set of Y_true, Y_pred + + Note: Currently doesn't support mapping at k. Couldn't use only map as it's a + reserved word + + Parameters + ---------- + Y_true : numpy array or list of ints either 1 or 0 + Contains the true, ground truth values of the relevance between a query and document + Y_pred : numpy array or list of floats + Contains the predicted similarity score between a query and document + + Examples + -------- + >>> Y_true = [[0, 1, 0, 1], [0, 0, 0, 0, 1, 0], [0, 1, 0]] + >>> Y_pred = [[0.1, 0.2, -0.01, 0.4], [0.12, -0.43, 0.2, 0.1, 0.99, 0.7], [0.5, 0.63, 0.92]] + >>> print(mapk(Y_true, Y_pred)) + 0.75 + """ + aps = [] + n_skipped = 0 + for y_true, y_pred in zip(Y_true, Y_pred): + # skip datapoints where there is no solution + if np.sum(y_true) < 1: + n_skipped += 1 + continue + + pred_sorted = sorted(zip(y_true, y_pred), key=lambda x: x[1], reverse=True) + avg = 0 + n_relevant = 0 + + for i, val in enumerate(pred_sorted): + if val[0] == 1: + avg += 1. / (i + 1.) 
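+                # 'avg' accumulates the reciprocal of this relevant document's 1-based rank;
+                # the per-query score appended below is avg / n_relevant, i.e. an average of
+                # reciprocal ranks rather than of the precision-at-rank values used in the
+                # textbook definition of average precision.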
+ n_relevant += 1 + + if n_relevant != 0: + ap = avg / n_relevant + aps.append(ap) + return np.mean(np.array(aps)) + + +def mean_ndcg(Y_true, Y_pred, k=10): + """Calculates the mean discounted normalized cumulative gain over all + the entries limited to the integer k + + Parameters + ---------- + Y_true : numpy array or list of ints either 1 or 0 + Contains the true, ground truth values of the relevance between a query and document + Y_pred : numpy array or list of floats + Contains the predicted similarity score between a query and document + + + Examples + -------- + >>> Y_true = [[0, 1, 0, 1], [0, 0, 0, 0, 1, 0], [0, 1, 0]] + >>> Y_pred = [[0.1, 0.2, -0.01, 0.4], [0.12, -0.43, 0.2, 0.1, 0.19, 0.7], [0.5, 0.63, 0.72]] + >>> for k in [1, 3, 5, 10]: + ... print("nDCG@{} is {}".format(k, mean_ndcg(Y_true, Y_pred, k))) + nDCG@1 is 0.3333333333333333 + nDCG@3 is 0.7103099178571526 + nDCG@5 is 0.7103099178571526 + nDCG@10 is 0.7103099178571526 + + """ + ndcgs = [] + n_skipped = 0 + for y_true, y_pred in zip(Y_true, Y_pred): + if np.sum(y_true) < 1: + n_skipped += 1 + continue + pred_sorted = sorted(zip(y_true, y_pred), key=lambda x: x[1], reverse=True) + true_sorted = sorted(zip(y_true, y_pred), key=lambda x: x[0], reverse=True) + pred_sorted = pred_sorted[:k] + true_sorted = true_sorted[:k] + dcg = 0 + for i, val in enumerate(pred_sorted): + if val[0] == 1: + dcg += 1. / np.log2(i + 2) + idcg = 0 + for i, val in enumerate(true_sorted): + if val[0] == 1: + idcg += 1. / np.log2(i + 2) + if idcg != 0: + ndcgs.append(dcg / idcg) + return np.mean(np.array(ndcgs)) diff --git a/gensim/models/experimental/experimental_data/get_data.py b/gensim/models/experimental/experimental_data/get_data.py new file mode 100644 index 0000000000..95b50a74a0 --- /dev/null +++ b/gensim/models/experimental/experimental_data/get_data.py @@ -0,0 +1,89 @@ +""" +Utility script to download the datsets for Similarity Learning +Currently supports: +- WikiQA +- Quora Duplicate Question Pairs +- Glove 6 Billion tokens Word Embeddings + +Example Usage: +To get wikiqa +$ python get_data.py --datafile wikiqa + +To get quoraqp +$ python get_data.py --datafile quoraqp + +To get Glove Word Embeddings +$ python get_data.py --datafile glove +""" +import requests +import argparse +import zipfile +import logging +import os +import gensim.downloader as api + +logger = logging.getLogger(__name__) + +# The urls and filepaths of currently supported files +wikiqa_url, wikiqa_file = "https://download.microsoft.com/download/E/5/F/E5FCFCEE-7005-4814-853D-DAA7C66507E0/", "WikiQACorpus.zip" # noqa +quoraqp_url, quoraqp_file = "http://qim.ec.quoracdn.net/", "quora_duplicate_questions.tsv" + + +def download(url, file_name, output_dir, unzip=False): + """Utility function to download a given file from the given url + Paramters: + --------- + url: str + Url of the file, without the file + + file_name: str + name of the file ahead of the url path + + Example: + url = www.example.com/datasets/ + file_name = example_dataset.zip + """ + logger.info("Downloading %s" % file_name) + req = requests.get(url + file_name) + file_save_path = os.path.join(output_dir, file_name) + try: + with open(file_save_path, "wb") as code: + code.write(req.content) + logger.info("Download of %s complete" % file_name) + except Exception as e: + logger.info(str(e)) + + if unzip: + logger.info("Unzipping %s" % file_name) + with zipfile.ZipFile(file_save_path, "r") as zip_ref: + zip_ref.extractall(path=output_dir) + logger.info("Unzip complete") + + +if __name__ == '__main__': + 
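+    # Example invocation (illustrative): python get_data.py --datafile wikiqa --output_dir ./data
+    # The 'glove' option goes through gensim.downloader, which caches the vectors under
+    # ~/gensim-data instead of --output_dir.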
    logging.basicConfig(
+        format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
+        level=logging.INFO
+    )
+    parser = argparse.ArgumentParser(
+        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+    parser.add_argument('--datafile', default='all',
+                        help='file you want to download. Options: wikiqa, quoraqp, glove, all')
+    parser.add_argument('--output_dir', default='./',
+                        help='the directory where you want to save the data')
+
+    args = parser.parse_args()
+    if args.datafile == 'wikiqa':
+        download(wikiqa_url, wikiqa_file, args.output_dir, unzip=True)
+    elif args.datafile == 'quoraqp':
+        download(quoraqp_url, quoraqp_file, args.output_dir)
+    elif args.datafile == 'glove':
+        api.load('glove-wiki-gigaword-50')
+    elif args.datafile == 'all':
+        logger.info("Downloading all files.")
+        download(wikiqa_url, wikiqa_file, args.output_dir, unzip=True)
+        download(quoraqp_url, quoraqp_file, args.output_dir)
+        api.load('glove-wiki-gigaword-50')
+    else:
+        logger.info("Unknown dataset %s" % args.datafile)
diff --git a/gensim/test/test_data/drmm_tks b/gensim/test/test_data/drmm_tks
new file mode 100644
index 0000000000..4a4355c043
Binary files /dev/null and b/gensim/test/test_data/drmm_tks differ
diff --git a/gensim/test/test_data/drmm_tks.keras b/gensim/test/test_data/drmm_tks.keras
new file mode 100644
index 0000000000..d05d52ed1c
Binary files /dev/null and b/gensim/test/test_data/drmm_tks.keras differ
diff --git a/gensim/test/test_drmm_tks.py b/gensim/test/test_drmm_tks.py
new file mode 100644
index 0000000000..6e5a08b193
--- /dev/null
+++ b/gensim/test/test_drmm_tks.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2010 Radim Rehurek
+# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
+
+"""
+Automated tests for checking transformation algorithms (the models package).
+"""
+
+import unittest
+import gensim.downloader as api
+from gensim.test.utils import datapath, get_tmpfile
+from gensim.models.experimental import DRMM_TKS
+
+
+class TestDrmmTksModel(unittest.TestCase):
+
+    def testLoadModel(self):
+        model = DRMM_TKS.load(datapath('drmm_tks'))
+        self.assertTrue(model.model is not None)
+        self.assertTrue(model._get_pair_list is not None)
+        self.assertTrue(model._get_full_batch_iter is not None)
+
+    def testSaveModel(self):
+        model = DRMM_TKS.load(datapath('drmm_tks'))
+        model.save(get_tmpfile('temp_drmm_tks_model'))
+
+    def testTrainModel(self):
+        queries = ["When was World War 1 fought ?".lower().split(), "When was Gandhi born ?".lower().split()]
+        docs = [["The world war was bad".lower().split(), "It was fought in 1996".lower().split()], ["Gandhi was born"
+                " in the 18th century".lower().split(), "He fought for the Indian freedom movement".lower().split(),
+                "Gandhi was assassinated".lower().split()]]
+        labels = [[0, 1], [1, 0, 0]]
+        word_embeddings_kv = api.load('glove-wiki-gigaword-50')
+        model = DRMM_TKS(queries, docs, labels, word_embedding=word_embeddings_kv, verbose=0)  # noqa:F841
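Taken together, the patch exposes a small train / predict / evaluate / save / load API. The sketch below is a minimal way to exercise it end to end, assuming Keras is installed and `gensim.downloader` can fetch the `glove-wiki-gigaword-50` vectors; the toy data mirrors `testTrainModel` above and the save path is illustrative.

```python
import gensim.downloader as api
from gensim.models.experimental import DRMM_TKS

queries = ["when was world war 1 fought ?".split(), "when was gandhi born ?".split()]
docs = [
    ["the world war was bad".split(), "it was fought in 1996".split()],
    ["gandhi was born in the 18th century".split(),
     "he fought for the indian freedom movement".split(),
     "gandhi was assassinated".split()],
]
labels = [[0, 1], [1, 0, 0]]

kv = api.load('glove-wiki-gigaword-50')   # pretrained KeyedVectors

# Training runs inside the constructor when queries, docs and labels are passed in
model = DRMM_TKS(queries, docs, labels, word_embedding=kv, verbose=0)

# One similarity score per (query, candidate document) pair
print(model.predict(queries, docs))

# Logs MAP and nDCG@k; ideally call this on held-out data
model.evaluate(queries, docs, labels)

# Writes 'drmm_tks_model' (gensim part) and 'drmm_tks_model.keras' (keras part)
model.save('drmm_tks_model')
reloaded = DRMM_TKS.load('drmm_tks_model')
```

The same `predict` and `evaluate` calls accept unseen queries and candidate documents, since out-of-vocabulary words are mapped to the unknown index chosen by `unk_handle_method`.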