Download the blog corpus and the judgment list.

In [None]:
from ltr import download

# Remote locations of the OSC blog corpus and its judgment list.
corpus = 'http://es-learn-to-rank.labs.o19s.com/blog.jsonl'
judgments = 'http://es-learn-to-rank.labs.o19s.com/osc_judgments.txt'

# Fetch both files with dest='data/'. The trailing semicolon keeps the
# notebook from echoing the call's return value.
download(
    [corpus, judgments],
    dest='data/',
);

Parse out OSC's blog into `articles`

In [None]:
import json

# Each non-blank line of the JSON Lines file is one blog post.
# Decode explicitly as UTF-8 rather than relying on the platform's default
# encoding, and skip blank lines so a stray empty line (for example at the
# end of the file) does not make json.loads raise.
with open('data/blog.jsonl', encoding='utf-8') as f:
    articles = [json.loads(line) for line in f if line.strip()]

# Show one parsed article as the cell output to sanity-check the load.
articles[-7]

Instantiate an Elasticsearch client (as opposed to a `SolrClient`). Hello LTR can work with either.

In [None]:
from ltr.client import ElasticClient

# Default-constructed client; the later cells pass it to every ltr helper.
client = ElasticClient()

Reindex from the corpus into the `blog` index. The JSON file at `docker/elasticsearch/<index_name>_settings.json` is loaded to configure the index.

In [None]:
from ltr.index import rebuild

# Rebuild the `blog` index, using the parsed articles as the document source.
rebuild(
    client,
    index='blog',
    doc_src=articles,
)

A set of features that we've come up with that seem to work well for OSC's blog. Note that these are Elasticsearch-specific.

In [None]:
# Reset the LTR store before defining features. This must target the same
# index as every other call in this notebook (`blog`), not `tmdb`.
client.reset_ltr(index='blog')

# Feature set definition: each feature is a named, templated Elasticsearch
# query whose `{{keywords}}` placeholder is filled with the user's query.
config = {
    "featureset": {
        "features": [
            # 1.0 if the title matches any keyword term, regardless of BM25.
            {
                "name": "title_term_match",
                "params": ["keywords"],
                "template": {
                    "constant_score": {
                        "filter": {
                            "match": {
                                "title": "{{keywords}}"
                            }
                        },
                        "boost": 1.0
                    }
                }
            },
            # Relevance score of the keywords against the post content.
            {
                "name": "content_bm25",
                "params": ["keywords"],
                "template": {
                    "match": {
                        "content": {
                            "query": "{{keywords}}"
                        }
                    }
                }
            },
            # Relevance score of the keywords as a phrase in the title.
            {
                "name": "title_phrase_bm25",
                "params": ["keywords"],
                "template": {
                    "match_phrase": {
                        "title": "{{ keywords }}"
                    }
                }
            },
            # 1.0 if the title contains the keywords as a phrase.
            {
                "name": "title_phrase_match",
                "params": ["keywords"],
                "template": {
                    "constant_score": {
                        "filter": {
                            "match_phrase": {
                                "title": "{{keywords}}"
                            }
                        },
                        "boost": 1.0
                    }
                }
            },
            # Recency feature. The three range filters overlap and
            # score_mode "sum" adds the weight of every filter that matches,
            # so more recent posts accumulate a larger value; boost_mode
            # "replace" discards the match_all score itself.
            {
                "name": "stepwise_post_date",
                "params": ["keywords"],
                "template": {
                    "function_score": {
                        "query": {
                            "match_all": {
                            }
                        },
                        "boost_mode": "replace",
                        "score_mode": "sum",
                        "functions": [
                            {
                                "filter": {
                                    "range": {
                                        "post_date": {
                                            "gte": "now-180d"
                                        }
                                    }
                                },
                                "weight": "100"
                            },
                            {
                                "filter": {
                                    "range": {
                                        "post_date": {
                                            "gte": "now-360d"
                                        }
                                    }
                                },
                                "weight": "100"
                            },
                            {
                                "filter": {
                                    "range": {
                                        "post_date": {
                                            "gte": "now-90d"
                                        }
                                    }
                                },
                                "weight": "100"
                            }
                        ]
                    }
                }
            },
            # Relevance score of the keywords as a phrase in the categories.
            {
                "name": "category_phrase_bm25",
                "params": ["keywords"],
                "template": {
                    "match_phrase": {
                        "categories": "{{ keywords }}"
                    }
                }
            },
            # Relevance score of the keywords against the excerpt.
            {
                "name": "excerpt_bm25",
                "params": ["keywords"],
                "template": {
                    "match": {
                        "excerpt": "{{ keywords }}"
                    }
                }
            },
            # Relevance score of the keywords as a phrase in the excerpt.
            {
                "name": "excerpt_phrase_bm25",
                "params": ["keywords"],
                "template": {
                    "match_phrase": {
                        "excerpt": "{{ keywords }}"
                    }
                }
            },
        ]
    },
    # Sample parameters and index supplied for validating the feature
    # templates when the feature set is created.
    "validation": {
        "index": "blog",
        "params": {
            "keywords": "rambo"
        }
    }
}

# Store the feature set under the name `test` for the `blog` index.
client.create_featureset(index='blog', name='test', ftr_config=config)

With features loaded, transform the judgment list (`query,doc,label`) into a full training set with `query,doc,label,ftr1,ftr2,...` to prepare for training

In [None]:
from ltr.judgments import judgments_open
from ltr.log import FeatureLogger
from itertools import groupby

ftr_logger = FeatureLogger(client, index='blog', feature_set='test')

# Log feature values one query at a time. groupby only merges adjacent
# items, so this relies on the judgment list keeping each qid's rows together.
with judgments_open('data/osc_judgments.txt') as judgment_list:
    per_query = groupby(judgment_list, key=lambda judgment: judgment.qid)
    for qid, query_judgments in per_query:
        ftr_logger.log_for_qid(
            judgments=query_judgments,
            qid=qid,
            keywords=judgment_list.keywords(qid),
        )

Train using RankyMcRankFace with the training set, optimizing search for a specific metric (here `NDCG@10`). Note that `ltr.train` has additional capabilities for performing k-fold cross-validation to ensure the model isn't overfit to the training data.

The model is stored in the search engine under the name `test`, which can be used to refer to it later when searching.

In [None]:
from ltr.ranklib import train

# Train on the feature values logged above, optimizing NDCG@10, and store
# the result as model `test` for feature set `test` on the `blog` index.
trainLog = train(
    client,
    training_set=ftr_logger.logged,
    metric2t='NDCG@10',
    featureSet='test',
    index='blog',
    modelName='test',
)



In [None]:
!java -jar /var/folders/7_/cvjz84n54vx7zv_pw3gmdqr00000gn/T/RankyMcRankFace.jar -ranker 6 -shrinkage 0.1 -metric2t NDCG@10 -tree 50 -bag 1 -leaf 10 -frate 1.0 -srate 1.0 -train /var/folders/7_/cvjz84n54vx7zv_pw3gmdqr00000gn/T/training.txt

Search! Pass some configuration in (`blog_fields`) for display purposes.

In [None]:
from ltr import search

# Display configuration: which document field holds the title and which
# additional fields to show for each hit.
blog_fields = {
    'title': 'title',
    'display_fields': ['url', 'author', 'categories', 'post_date'],
}

# Search the `blog` index for "beer" using the model named `test`.
search(
    client,
    "beer",
    modelName='test',
    index='blog',
    fields=blog_fields,
)