diff --git a/docs/src/apiref.rst b/docs/src/apiref.rst
index c7dfee18d6..a7d3b19ded 100644
--- a/docs/src/apiref.rst
+++ b/docs/src/apiref.rst
@@ -69,6 +69,10 @@ Modules:
    models/deprecated/fasttext_wrapper
    models/base_any2vec
    models/experimental/drmm_tks
+   models/experimental/custom_callbacks
+   models/experimental/custom_layers
+   models/experimental/custom_losses
+   models/experimental/evaluation_metrics
    similarities/docsim
    similarities/index
    sklearn_api/atmodel
diff --git a/docs/src/models/experimental/custom_callbacks.rst b/docs/src/models/experimental/custom_callbacks.rst
new file mode 100644
index 0000000000..4fdf371992
--- /dev/null
+++ b/docs/src/models/experimental/custom_callbacks.rst
@@ -0,0 +1,9 @@
+:mod:`models.experimental.custom_callbacks` -- Custom Callbacks for Similarity Learning
+=======================================================================================
+
+.. automodule:: gensim.models.experimental.custom_callbacks
+   :synopsis: Custom Callbacks for Similarity Learning
+   :members:
+   :inherited-members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/src/models/experimental/custom_layers.rst b/docs/src/models/experimental/custom_layers.rst
new file mode 100644
index 0000000000..51cc70f63e
--- /dev/null
+++ b/docs/src/models/experimental/custom_layers.rst
@@ -0,0 +1,9 @@
+:mod:`models.experimental.custom_layers` -- Custom Layers for Similarity Learning
+=================================================================================
+
+.. automodule:: gensim.models.experimental.custom_layers
+   :synopsis: Custom Layers for Similarity Learning
+   :members:
+   :inherited-members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/src/models/experimental/custom_losses.rst b/docs/src/models/experimental/custom_losses.rst
new file mode 100644
index 0000000000..f59afcfaa5
--- /dev/null
+++ b/docs/src/models/experimental/custom_losses.rst
@@ -0,0 +1,9 @@
+:mod:`models.experimental.custom_losses` -- Loss Functions for Similarity Learning
+==================================================================================
+
+.. automodule:: gensim.models.experimental.custom_losses
+   :synopsis: Loss functions for Similarity Learning
+   :members:
+   :inherited-members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/src/models/experimental/drmm_tks.rst b/docs/src/models/experimental/drmm_tks.rst
index e66d42b809..d569eac61c 100644
--- a/docs/src/models/experimental/drmm_tks.rst
+++ b/docs/src/models/experimental/drmm_tks.rst
@@ -1,5 +1,5 @@
-:mod:`models.experimental.drmm_tks` -- Similarity Learning
-============================================================================
+:mod:`models.experimental.drmm_tks` -- Neural Nets for Similarity Learning
+==========================================================================
 
 .. automodule:: gensim.models.experimental.drmm_tks
    :synopsis: Neural Network Similarity Learning
diff --git a/docs/src/models/experimental/evaluation_metrics.rst b/docs/src/models/experimental/evaluation_metrics.rst
new file mode 100644
index 0000000000..2d47acd9c7
--- /dev/null
+++ b/docs/src/models/experimental/evaluation_metrics.rst
@@ -0,0 +1,9 @@
+:mod:`models.experimental.evaluation_metrics` -- Evaluation Metrics for Similarity Learning
+===========================================================================================
+
+.. automodule:: gensim.models.experimental.evaluation_metrics
+   :synopsis: Evaluation Metrics for Similarity Learning
+   :members:
+   :inherited-members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/gensim/models/experimental/custom_callbacks.py b/gensim/models/experimental/custom_callbacks.py
index 29edf3d522..737fa21f02 100644
--- a/gensim/models/experimental/custom_callbacks.py
+++ b/gensim/models/experimental/custom_callbacks.py
@@ -17,19 +17,18 @@ def __init__(self, test_data):
         Parameters
         ----------
         test_data : dict
-            A dictionary which holds the validation data
-            It consists of the following keys:
-            "X1" : numpy array
+            A dictionary which holds the validation data. It consists of the following keys:
+            - "X1" : numpy array
                 The queries as a numpy array of shape (n_samples, text_maxlen)
-            "X2" : numpy array
+            - "X2" : numpy array
                 The candidate docs as a numpy array of shape (n_samples, text_maxlen)
-            "y" : list of int
-                It is the labels for each of the query-doc pairs as a 1 or 0 with shape (n_samples,)
-                where 1: doc is relevant to query
-                0: doc is not relevant to query
-            "doc_lengths" : list of int
-                It contains the length of each document group. I.e., the number of queries
-                which represent one topic. It is needed for calculating the metrics.
+            - "y" : list of int
+                It is the labels for each of the query-doc pairs as a 1 or 0 with shape (n_samples,)
+                where 1 : doc is relevant to query, 0 : doc is not relevant to query
+            - "doc_lengths" : list of int
+                It contains the length of each document group. I.e., the number of queries
+                which represent one topic. It is needed for calculating the metrics.
+
         """
 
         if not KERAS_AVAILABLE:
diff --git a/gensim/models/experimental/custom_layers.py b/gensim/models/experimental/custom_layers.py
index aec5325baf..250bca6c29 100644
--- a/gensim/models/experimental/custom_layers.py
+++ b/gensim/models/experimental/custom_layers.py
@@ -15,10 +15,10 @@ def __init__(self, output_dim, topk, **kwargs):
 
         Parameters
         ----------
-        output_dim : tuple of ints
-            The dimension of the tensor after going through this layer
+        output_dim : tuple of int
+            The dimension of the tensor after going through this layer.
         topk : int
-            The k topmost values to be returned
+            The k topmost values to be returned.
         """
         self.output_dim = output_dim
         self.topk = topk
diff --git a/gensim/models/experimental/drmm_tks.py b/gensim/models/experimental/drmm_tks.py
index 04866cd6a3..39a50b1fb7 100644
--- a/gensim/models/experimental/drmm_tks.py
+++ b/gensim/models/experimental/drmm_tks.py
@@ -13,9 +13,8 @@
 Abbreviations
 =============
 
-DRMM : Deep Relevance Matching Model
-
-TKS : Top K Solutions
+- DRMM : Deep Relevance Matching Model
+- TKS : Top K Solutions
 
 About DRMM_TKS
 ==============
@@ -33,15 +32,13 @@
 
 The trained model needs to be trained on data in the format:
 
->>> queries = ["When was World War 1 fought ?".lower().split(),
-...            "When was Gandhi born ?".lower().split()]
->>> docs = [["The world war was bad".lower().split(),
-...          "It was fought in 1996".lower().split()],
-...         ["Gandhi was born in the 18th century".lower().split(),
-...          "He fought for the Indian freedom movement".lower().split(),
-...          "Gandhi was assasinated".lower().split()]]
->>> labels = [[0, 1], [1, 0, 0]]
+>>> from gensim.models.experimental import DRMM_TKS
 >>> import gensim.downloader as api
+>>> queries = ["When was World War 1 fought ?".lower().split(), "When was Gandhi born ?".lower().split()]
+>>> docs = [["The world war was bad".lower().split(), "It was fought in 1996".lower().split()], ["Gandhi was born in "
+...         "the 18th century".lower().split(), "He fought for the Indian freedom movement".lower().split(),
+...         "Gandhi was assassinated".lower().split()]]
+>>> labels = [[0, 1], [1, 0, 0]]
 >>> word_embeddings_kv = api.load('glove-wiki-gigaword-50')
 >>> model = DRMM_TKS(queries, docs, labels, word_embedding=word_embeddings_kv, verbose=0)
 
@@ -59,27 +56,24 @@
 
 Testing on new data :
 
->>> queries = ["how are glacier caves formed ?".lower().split()]
->>> docs = ["A partly submerged glacier cave on Perito Moreno Glacier".lower().split(),
-...         "A glacier cave is a cave formed within the ice of a glacier".lower().split()]
-
-
-Predicting on new data :
-
 >>> from gensim.test.utils import datapath
 >>> model = DRMM_TKS.load(datapath('drmm_tks'))
->>> print(model.predict([["hello", "world"]], [["i", "am", "happy"], ["good", "morning"]]))
-[[0.99346054]
- [0.999115  ]
- [0.9989991 ]]
+>>>
+>>> queries = ["how are glacier caves formed ?".lower().split()]
+>>> docs = [["A partly submerged glacier cave on Perito Moreno Glacier".lower().split(), "glacier cave is cave formed"
+...          " within the ice of glacier".lower().split()]]
+>>> print(model.predict(queries, docs))
+[[0.9915068 ]
+ [0.99228466]]
+>>> print(model.predict([["hello", "world"]], [[["i", "am", "happy"], ["good", "morning"]]]))
+[[0.9975487]
+ [0.999115 ]]
 
-More information can be found in:
+More information can be found in:
 
 `Jiafeng Guo, Yixing Fan, Qingyao Ai, W. Bruce Croft "A Deep Relevance Matching Model for Ad-hoc Retrieval"
 `_
-
 `MatchZoo Repository `_
-
 `Similarity Learning Wikipedia Page `_
 
 """
@@ -224,8 +218,8 @@ def __init__(self, queries=None, docs=None, labels=None, word_embedding=None,
             The candidate answers for the similarity learning model.
         labels: iterable list of list of int, optional
             Indicates when a candidate document is relevant to a query
-            1 : relevant
-            0 : irrelevant
+            - 1 : relevant
+            - 0 : irrelevant
         word_embedding : :class:`~gensim.models.keyedvectors.KeyedVectors`, optional
             a KeyedVector object which has the embeddings pre-loaded.
             If None, random word embeddings will be used.
@@ -249,22 +243,20 @@
             the way the model should be trained, either to rank or classify
         verbose : {0, 1, 2}
             the level of information shared while training
-            0 = silent, 1 = progress bar, 2 = one line per epoch
+            - 0 : silent
+            - 1 : progress bar
+            - 2 : one line per epoch
 
         Examples
         --------
         The trained model needs to be trained on data in the format
 
-        >>> queries = ["When was World War 1 fought ?".lower().split(),
-        ...            "When was Gandhi born ?".lower().split()]
-        >>> docs = [["The world war was bad".lower().split(),
-        ...          "It was fought in 1996".lower().split()],
-        ...         ["Gandhi was born in the 18th century".lower().split(),
-        ...          "He fought for the Indian freedom movement".lower().split(),
-        ...          "Gandhi was assasinated".lower().split()]]
-        >>> labels = [[0, 1],
-        ...           [1, 0, 0]]
+        >>> queries = ["When was World War 1 fought ?".lower().split(), "When was Gandhi born ?".lower().split()]
+        >>> docs = [["The world war was bad".lower().split(), "It was fought in 1996".lower().split()], ["Gandhi was "
+        ...         "born in the 18th century".lower().split(), "He fought for the Indian freedom movement".lower().split(),
+        ...         "Gandhi was assassinated".lower().split()]]
+        >>> labels = [[0, 1], [1, 0, 0]]
         >>> import gensim.downloader as api
         >>> word_embeddings_kv = api.load('glove-wiki-gigaword-50')
         >>> model = DRMM_TKS(queries, docs, labels, word_embedding=word_embeddings_kv, verbose=0)
@@ -292,8 +284,9 @@ def __init__(self, queries=None, docs=None, labels=None, word_embedding=None,
         self._get_full_batch_iter = _get_full_batch_iter
 
         if self.target_mode not in ['ranking', 'classification']:
-            raise ValueError("Unkown target_mode %s. It must be either"
-                             "'ranking' or 'classification'" % self.target_mode)
+            raise ValueError(
+                "Unknown target_mode %s. It must be either 'ranking' or 'classification'" % self.target_mode
+            )
 
         if unk_handle_method not in ['random', 'zero']:
             raise ValueError("Unkown token handling method %s" % str(unk_handle_method))
@@ -346,8 +339,7 @@ def build_vocab(self, queries, docs, labels, word_embedding):
         # Initialize the embedding matrix
         # UNK word gets the vector based on the method
         if self.unk_handle_method == 'random':
-            self.embedding_matrix = np.random.uniform(-0.2, 0.2,
-                                                      (self.vocab_size, self.embedding_dim))
+            self.embedding_matrix = np.random.uniform(-0.2, 0.2, (self.vocab_size, self.embedding_dim))
         elif self.unk_handle_method == 'zero':
             self.embedding_matrix = np.zeros((self.vocab_size, self.embedding_dim))
@@ -361,9 +353,10 @@ def build_vocab(self, queries, docs, labels, word_embedding):
                 # Creates the same random vector for the given string each time
                 self.embedding_matrix[i] = self._seeded_vector(word, self.embedding_dim)
                 n_non_embedding_words += 1
-        logger.info("There are %d words out of %d (%.2f%%) not in the embeddings. Setting them to %s" %
-                    (n_non_embedding_words, self.vocab_size, n_non_embedding_words * 100 / self.vocab_size,
-                     self.unk_handle_method))
+        logger.info(
+            "There are %d words out of %d (%.2f%%) not in the embeddings. Setting them to %s", n_non_embedding_words,
+            self.vocab_size, n_non_embedding_words * 100 / self.vocab_size, self.unk_handle_method
+        )
 
         # Include embeddings for words in embedding file but not in the train vocab
         # It will be useful for embedding words encountered in validation and test set
@@ -410,11 +403,9 @@ def build_vocab(self, queries, docs, labels, word_embedding):
             logger.info("Normalizing the word embeddings")
             self.embedding_matrix = normalize(self.embedding_matrix)
 
-        logger.info("Embedding Matrix build complete. It now has shape %s" %
-                    str(self.embedding_matrix.shape))
-        logger.info("Pad word has been set to index %d" % self.pad_word_index)
-        logger.info("Unknown word has been set to index %d" %
-                    self.unk_word_index)
It now has shape %s", str(self.embedding_matrix.shape)) + logger.info("Pad word has been set to index %d", self.pad_word_index) + logger.info("Unknown word has been set to index %d", self.unk_word_index) logger.info("Embedding index build complete") self.needs_vocab_build = False @@ -566,8 +557,10 @@ def train(self, queries, docs, labels, word_embedding=None, indexed_long_query_list = self._translate_user_data(long_query_list) indexed_long_doc_list = self._translate_user_data(long_doc_list) - val_callback = ValidationCallback({"X1": indexed_long_query_list, "X2": indexed_long_doc_list, - "doc_lengths": doc_lens, "y": long_label_list}) + val_callback = ValidationCallback( + {"X1": indexed_long_query_list, "X2": indexed_long_doc_list, "doc_lengths": doc_lens, + "y": long_label_list} + ) val_callback = [val_callback] # since `model.fit` requires a list # If train is called again, not all values should be reset @@ -613,16 +606,17 @@ def _translate_user_data(self, data): translated_sentence.append(self.unk_word_index) n_skipped_words += 1 if len(sentence) > self.text_maxlen: - logger.info("text_maxlen: %d isn't big enough. Error at sentence of length %d." - "Sentence is %s" % ( - self.text_maxlen, len(sentence), str(sentence)) - ) + logger.info( + "text_maxlen: %d isn't big enough. Error at sentence of length %d." + "Sentence is %s", self.text_maxlen, len(sentence), str(sentence) + ) translated_sentence = translated_sentence + \ (self.text_maxlen - len(sentence)) * [self.pad_word_index] translated_data.append(np.array(translated_sentence)) - logger.info("Found %d unknown words. Set them to unknown word index : %d" % - (n_skipped_words, self.unk_word_index)) + logger.info( + "Found %d unknown words. Set them to unknown word index : %d", n_skipped_words, self.unk_word_index + ) return np.array(translated_data) def predict(self, queries, docs): @@ -643,9 +637,9 @@ def predict(self, queries, docs): >>> model = DRMM_TKS.load(datapath('drmm_tks')) >>> >>> queries = ["When was World War 1 fought ?".split(), "When was Gandhi born ?".split()] - >>> docs = [["The world war was bad".split(), "It was fought in 1996".split()], - ... ["Gandhi was born in the 18th century".split(), "He fought for the Indian freedom movement".split(), - ... "Gandhi was assasinated".split()]] + >>> docs = [["The world war was bad".split(), "It was fought in 1996".split()], ["Gandhi was born in the 18th" + ... " century".split(), "He fought for the Indian freedom movement".split(), "Gandhi was" + ... " assasinated".split()]] >>> print(model.predict(queries, docs)) [[0.9933108 ] [0.9925415 ] @@ -672,9 +666,9 @@ def predict(self, queries, docs): return predictions - def evaluate(self, queries, docs, labels): """Evaluates the model and provides the results in terms of metrics (MAP, nDCG) + This should ideally be called on the test set. Parameters ---------- @@ -685,7 +679,6 @@ def evaluate(self, queries, docs, labels): labels : list of list of int The relevance of the document to the query. 
         """
-
         long_doc_list = []
         long_label_list = []
         long_query_list = []
@@ -698,19 +691,16 @@
             long_label_list.append(l)
             i += 1
         doc_lens.append(len(doc))
-
         indexed_long_query_list = self._translate_user_data(long_query_list)
         indexed_long_doc_list = self._translate_user_data(long_doc_list)
         predictions = self.model.predict(x={'query': indexed_long_query_list, 'doc': indexed_long_doc_list})
         Y_pred = []
         Y_true = []
         offset = 0
-
         for doc_size in doc_lens:
             Y_pred.append(predictions[offset: offset + doc_size])
             Y_true.append(long_label_list[offset: offset + doc_size])
             offset += doc_size
-
         logger.info("MAP: %.2f", mapk(Y_true, Y_pred))
         for k in [1, 3, 5, 10, 20]:
             logger.info("nDCG@%d : %.2f", k, mean_ndcg(Y_true, Y_pred, k=k))
diff --git a/gensim/models/experimental/evaluation_metrics.py b/gensim/models/experimental/evaluation_metrics.py
index 904f1c87d8..3a8dd4ed52 100644
--- a/gensim/models/experimental/evaluation_metrics.py
+++ b/gensim/models/experimental/evaluation_metrics.py
@@ -3,7 +3,8 @@
 logger = logging.getLogger(__name__)
 
 logging.basicConfig(
-    format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
+    format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO
+)
 
 
 def mapk(Y_true, Y_pred):
@@ -19,8 +20,8 @@
     Y_pred : numpy array or list of floats
         Contains the predicted similarity score between a query and document
 
-    Usage
-    -----
+    Examples
+    --------
     >>> Y_true = [[0, 1, 0, 1], [0, 0, 0, 0, 1, 0], [0, 1, 0]]
     >>> Y_pred = [[0.1, 0.2, -0.01, 0.4], [0.12, -0.43, 0.2, 0.1, 0.99, 0.7], [0.5, 0.63, 0.92]]
     >>> print(mapk(Y_true, Y_pred))
@@ -61,8 +62,9 @@
     Y_pred : numpy array or list of floats
         Contains the predicted similarity score between a query and document
 
-    Usage
-    -----
+
+    Examples
+    --------
     >>> Y_true = [[0, 1, 0, 1], [0, 0, 0, 0, 1, 0], [0, 1, 0]]
     >>> Y_pred = [[0.1, 0.2, -0.01, 0.4], [0.12, -0.43, 0.2, 0.1, 0.19, 0.7], [0.5, 0.63, 0.72]]
     >>> for k in [1, 3, 5, 10]:
diff --git a/gensim/test/test_drmm_tks.py b/gensim/test/test_drmm_tks.py
index d1cc038231..6e5a08b193 100644
--- a/gensim/test/test_drmm_tks.py
+++ b/gensim/test/test_drmm_tks.py
@@ -27,14 +27,10 @@ def testSaveModel(self):
         model.save(get_tmpfile('temp_drmm_tks_model'))
 
     def testTrainModel(self):
-        queries = ["When was World War 1 fought ?".lower().split(),
-                   "When was Gandhi born ?".lower().split()]
-        docs = [["The world war was bad".lower().split(),
-                 "It was fought in 1996".lower().split()],
-                ["Gandhi was born in the 18th century".lower().split(),
-                 "He fought for the Indian freedom movement".lower().split(),
+        queries = ["When was World War 1 fought ?".lower().split(), "When was Gandhi born ?".lower().split()]
+        docs = [["The world war was bad".lower().split(), "It was fought in 1996".lower().split()], ["Gandhi was born"
+                " in the 18th century".lower().split(), "He fought for the Indian freedom movement".lower().split(),
                 "Gandhi was assasinated".lower().split()]]
         labels = [[0, 1], [1, 0, 0]]
         word_embeddings_kv = api.load('glove-wiki-gigaword-50')
-        model = DRMM_TKS(queries, docs, labels,
-                         word_embedding=word_embeddings_kv, verbose=0)
+        model = DRMM_TKS(queries, docs, labels, word_embedding=word_embeddings_kv, verbose=0)  # noqa:F841
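
The `mapk` and `mean_ndcg` functions this patch documents operate on grouped data: one inner list of relevance labels (`Y_true`) and predicted similarity scores (`Y_pred`) per query. To make that contract concrete, here is a minimal, self-contained sketch of MAP and binary-relevance NDCG@k over the same toy data used in the doctests. It is an independent illustration using only numpy and the standard textbook formulas; the helper names `average_precision` and `ndcg_at_k` are hypothetical and are not part of gensim's API.

    # Illustrative re-implementation of the grouped-metric semantics; this is
    # NOT gensim.models.experimental.evaluation_metrics itself.
    import numpy as np


    def ndcg_at_k(y_true, y_score, k=10):
        """NDCG@k for one query: binary relevance gains, log2 rank discount."""
        order = np.argsort(y_score)[::-1][:k]           # doc indices, best score first
        gains = np.take(np.asarray(y_true, dtype=float), order)
        dcg = np.sum(gains / np.log2(np.arange(2, gains.size + 2)))
        ideal = np.sort(np.asarray(y_true, dtype=float))[::-1][:k]   # perfect ranking
        idcg = np.sum(ideal / np.log2(np.arange(2, ideal.size + 2)))
        return dcg / idcg if idcg > 0 else 0.0


    def average_precision(y_true, y_score):
        """AP for one query: mean of precision-at-rank over the relevant hits."""
        order = np.argsort(y_score)[::-1]
        hits, precisions = 0, []
        for rank, idx in enumerate(order, start=1):
            if y_true[idx]:
                hits += 1
                precisions.append(hits / float(rank))
        return float(np.mean(precisions)) if precisions else 0.0


    # Same toy data as the mapk/mean_ndcg doctests: one inner list per query.
    Y_true = [[0, 1, 0, 1], [0, 0, 0, 0, 1, 0], [0, 1, 0]]
    Y_pred = [[0.1, 0.2, -0.01, 0.4], [0.12, -0.43, 0.2, 0.1, 0.99, 0.7], [0.5, 0.63, 0.92]]

    print("MAP: %.2f" % np.mean([average_precision(t, p) for t, p in zip(Y_true, Y_pred)]))
    for k in [1, 3, 5]:
        print("nDCG@%d: %.2f" % (k, np.mean([ndcg_at_k(t, p, k) for t, p in zip(Y_true, Y_pred)])))

Because this is a re-implementation of the standard definitions rather than the module's own code, the values it prints need not match the doctest outputs digit-for-digit; scores near 1.0 simply mean relevant documents are concentrated at the top of each per-query ranking, which is also how `evaluate()` aggregates its per-group predictions above.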