In [1]:
import numpy as np
import requests
import re
from rank_bm25 import BM25Okapi
import xmltodict

In [2]:
def make_doclist(geonetwork_output_xml):
    """
    Read a geonetwork XML ranking output file and return its records.

    Parameters
    ----------
    geonetwork_output_xml : str or path-like
        Path of the XML file holding the geonetwork response.

    Returns
    -------
    list of dict
        One dictionary per ``metadata`` element found under the
        ``response`` root, in the order they appear in the file (i.e. the
        geonetwork ranking order).  An empty list is returned when the
        response contains no ``metadata`` element.

    Raises
    ------
    KeyError
        If the parsed document has no ``response`` root element.
    """
    # Read raw bytes so the XML parser decodes the content itself instead of
    # relying on the platform's default text encoding.
    with open(geonetwork_output_xml, 'rb') as fd:
        doc = xmltodict.parse(fd.read())

    # An empty <response/> element is parsed as None.
    response = doc['response'] or {}
    records = response.get('metadata')

    if records is None:
        return []

    # xmltodict only builds a list for repeated sibling elements; a response
    # with exactly one record yields a single mapping instead.
    if not isinstance(records, list):
        records = [records]

    # Empty <metadata/> elements are parsed as None and carry no data.
    return [dict(record) for record in records if record is not None]

In [3]:
def make_clean_doc(document):
    """
    Collect the relevant information from a single document dictionary
    into a new, slimmer dictionary.

    Parameters
    ----------
    document : dict
        A record dictionary.  ``identifier`` is required; ``abstract`` and
        ``keyword`` are optional.

    Returns
    -------
    dict
        ``{'abstract': str, 'keywords': list, 'url': <identifier value>}``.
        A missing or empty abstract becomes ``''``.  ``keywords`` is always
        a list: a single keyword string is wrapped, a missing value becomes
        ``[]`` and ``None`` entries are dropped.

    Raises
    ------
    KeyError
        If the document has no ``identifier`` entry.
    """
    keywords = document.get('keyword')
    if keywords is None:
        keywords = []
    elif isinstance(keywords, str):
        # A record with exactly one keyword carries a bare string, not a list.
        keywords = [keywords]
    else:
        keywords = [keyword for keyword in keywords if keyword is not None]

    return {
        'abstract': document.get('abstract') or '',  # single string
        'keywords': keywords,                        # list of keywords
        'url': document['identifier'],               # link to the dataset
    }

In [4]:
def clean_doclist(doclist):
    """
    Convert every document of a geonetwork ranking into its slimmer
    representation (see ``make_clean_doc``).

    The returned list keeps the ordering of ``doclist``.
    """
    return [make_clean_doc(entry) for entry in doclist]

In [5]:
def handle_geonetwork_xml(geonetwork_output_xml):
    """
    Run the two loading steps for a geonetwork XML ranking:
    ``make_doclist`` on the given input, then ``clean_doclist`` on its
    result, whose return value is handed back unchanged.
    """
    raw_documents = make_doclist(geonetwork_output_xml)
    return clean_doclist(raw_documents)

In [6]:
def make_bm250_corpus(clean_doclist):
    """
    Turn a clean doclist into a list of token lists.

    Each sublist represents one document and contains the lower-cased,
    whitespace-separated words of its abstract followed by its lower-cased
    keywords (each keyword is kept as one token, even if it contains
    spaces).

    This is a simple bag-of-words representation, as used by e.g. the
    Okapi BM25 module.

    Parameters
    ----------
    clean_doclist : iterable of dict
        Documents with an ``'abstract'`` and a ``'keywords'`` entry.
        ``abstract`` may be ``None``; ``keywords`` may be a list, a single
        string or ``None``.

    Returns
    -------
    list of list of str
        One token list per document, in input order.
    """
    corpus = []

    for doc in clean_doclist:
        abstract_tokens = (doc['abstract'] or '').split()

        keywords = doc['keywords']
        if keywords is None:
            keywords = []
        elif isinstance(keywords, str):
            # A lone keyword may arrive as a bare string; treat it as one
            # token instead of failing on list + str concatenation.
            keywords = [keywords]

        corpus.append([
            token.lower()
            for token in (*abstract_tokens, *keywords)
            if token is not None
        ])

    return corpus

In [7]:
class RRS:
    """
    Re-ranks an existing search ranking with a configurable scoring method.

    Currently only Okapi BM25 scoring and the geonetwork XML input format
    (``'gnxml'``) are implemented.

    Attributes
    ----------
    method : str
        Name of the ranking method given at construction time.
    corpus : list of dict or None
        Clean documents loaded by the most recent ``rank`` call.
    """

    # 'BM250kapi' (with a digit zero) is the spelling existing callers use;
    # the correctly spelled 'BM25Okapi' is accepted as an alias.
    _BM25_METHODS = ('BM250kapi', 'BM25Okapi')
    _SUPPORTED_FORMATS = ('gnxml',)

    def __init__(self, method):
        self.method = method
        self.corpus = None

    def rank(self, query, old_ranking, old_ranking_format):
        """
        Score the documents of an existing ranking against ``query``.

        Parameters
        ----------
        query : list of str or str
            Query tokens.  A plain string is split on whitespace.  Tokens
            are lower-cased because the corpus tokens are lower-cased too.
        old_ranking : str or path-like
            Location of the existing ranking (an XML file for ``'gnxml'``).
        old_ranking_format : str
            Format of ``old_ranking``; only ``'gnxml'`` is supported.

        Returns
        -------
        list of tuple
            ``(score, url)`` pairs sorted by descending score.  Documents
            with equal scores keep their order from the old ranking.  An
            empty list is returned when the old ranking has no documents.

        Raises
        ------
        ValueError
            If the format or the configured method is not supported.
        """
        if old_ranking_format not in self._SUPPORTED_FORMATS:
            raise ValueError(
                'unsupported ranking format: %r (supported: %s)'
                % (old_ranking_format, ', '.join(self._SUPPORTED_FORMATS))
            )
        if self.method not in self._BM25_METHODS:
            raise ValueError('unsupported ranking method: %r' % (self.method,))

        if isinstance(query, str):
            # Iterating a str would score single characters, not words.
            query = query.split()
        query_tokens = [token.lower() for token in query]

        self.corpus = handle_geonetwork_xml(old_ranking)
        if not self.corpus:
            # Nothing to rank; skip building a BM25 index over zero documents.
            return []

        bm25 = BM25Okapi(make_bm250_corpus(self.corpus))
        scores = bm25.get_scores(query_tokens)

        doc_scorings = [(scores[i], doc['url']) for i, doc in enumerate(self.corpus)]
        # A stable descending sort keeps the old ranking's order among ties.
        doc_scorings.sort(key=lambda pair: pair[0], reverse=True)

        return doc_scorings

    def insert_feedback(self, feedback):
        """
        Feed relevance feedback to the ranking method.

        The BM25 method cannot learn from feedback, so for it an
        explanatory message string is returned.  For any other method
        nothing is done and ``None`` is returned.
        """
        if self.method in self._BM25_METHODS:
            return 'current ranking method has not feedback learning mechanism, please make a different instance of RRS with a learning method'

        return None

In [8]:
# Re-rank the documents of 'output_climate.xml' for the query
# "air temperature" and display the resulting (score, url) pairs.
rrs_BM25 = RRS(method='BM250kapi')
ranking = rrs_BM25.rank(
    query=['air', 'temperature'],
    old_ranking='output_climate.xml',
    old_ranking_format='gnxml',
)
ranking

[(1.4551969422258917,
  'http://w3.avignon.inra.fr/geonetwork_anaee/srv/eng/catalog.search#/metadata/5934deaf-ee90-4338-b9a5-ecb6f200d0f3'),
 (1.1673837780400242,
  'http://w3.avignon.inra.fr/geonetwork_anaee/d16472fe-ca94-4e35-8dac-934eeced2ea4'),
 (1.0945819450550336,
  'http://w3.avignon.inra.fr/geonetwork_anaee/c32a4228-b3b7-4eda-b06c-97d0e59c0dfb'),
 (1.0631752367484597,
  'http://w3.avignon.inra.fr/geonetwork_anaee/546ed900-ded6-4770-be14-4e702b136309'),
 (0.9916641795252035,
  'http://w3.avignon.inra.fr/geonetwork_anaee/248a7f0c-cdbf-4002-b813-1384d3bba4b0'),
 (0.9353622768269519,
  'http://w3.avignon.inra.fr/geonetwork_anaee/80894c9d-aa65-4808-8870-a04a7bf402f4'),
 (0.9025665213209669,
  'http://w3.avignon.inra.fr/geonetwork_anaee/srv/eng/catalog.search#/metadata/ee19974d-36a4-4803-89a5-2637eb6d58d9'),
 (0.6792817854251807,
  'http://w3.avignon.inra.fr/geonetwork_anaee/50264564-795a-45e0-bc28-7ef8568b62fa'),
 (0.6715291128741324,
  'http://w3.avignon.inra.fr/geonetwork_anaee/sr

In [9]:
# Display what the instance created above returns when it is given feedback.
rrs_BM25.insert_feedback(feedback="some feedback input stream")

'current ranking method has not feedback learning mechanism, please make a different instance of RRS with a learning method'