# Elastic Client

In [1]:
from ltr.client import ElasticClient
client = ElasticClient()

# Download & Build Index (run once)

If you don't already have the downloaded dependencies; if you don't have TheMovieDB data indexed run this

In [None]:
def add_collection_name(src_movie, base_doc):
    if 'belongs_to_collection' in src_movie and src_movie['belongs_to_collection'] is not None:
        if 'name' in src_movie['belongs_to_collection']:
            base_doc['collection_name'] = src_movie['belongs_to_collection']['name']
    return base_doc

from ltr.index import rebuild_tmdb
rebuild_tmdb(client, enrich=add_collection_name)

## Features for movie titles

We'll be searching movie titles (think searching for a specific movie on Netflix). And we have a set of judgments around the appropriatte movie to return. IE search for "Star Wars" return good star wars matches, in quality order...

These cover various aspects of the problem (searching title by phrase, title bm25 score, release date, etc). We'll use this to explore and analyze a simple model

In [2]:
config = {"validation": {
              "index": "tmdb",
              "params": {
                  "keywords": "rambo"
              }
    
           },
           "featureset": {
            "features": [
            {
                "name": "title_phrase",
                "params": ["keywords"],
                "template": {
                    "constant_score": {
                        "filter": {
                            "match_phrase": {"title": "{{keywords}}"}
                        },
                        "boost": 1.0
                    }  
                }
            },
            {
                "name": "title",
                "params": ["keywords"],
                "template": {
                    "constant_score": {
                        "filter": {
                            "match": {"title": "{{keywords}}"}
                        },
                        "boost": 1.0
                    }  
                }
            },
            {
                "name": "title_bm25",
                "params": ["keywords"],
                "template": {
                    "match": {"title": "{{keywords}}"}
                }
            },
            {
                "name": "overview_bm25",
                "params": ["keywords"],
                "template": {
                    "match": {"overview": "{{keywords}}"}
                }
            },
            {
                "name": "overview_phrase_bm25",
                "params": ["keywords"],
                "template": {
                    "match_phrase": {"overview": "{{keywords}}"}
                }
            },
            {
                "name": "title_fuzzy",
                "params": ["keywords"],
                "template": {
                    "match": {"title": 
                                {"query": "{{keywords}}",
                                 "fuzziness": "AUTO"}}
                }
            },
             {
                "name": "release_year",
                "params": [],
                "template": {
                    "function_score": {
                        "field_value_factor": {
                            "field": "release_year",
                            "missing": 2000
                        },
                        "query": { "match_all": {} }
                    }
                }
            },
            {
                "name": "coll_name_bm25",
                "params": ["keywords"],
                "template": {
                    "match": {"collection_name": 
                                {"query": "{{keywords}}"}}
                }
            },
             {
                "name": "coll_name_phrase_bm25",
                "params": ["keywords"],
                "template": {
                    "match_phrase": {"collection_name": 
                                {"query": "{{keywords}}"}}
                }
            }
            
            
            ]
    }}




from ltr import setup
setup(client, config=config, index='tmdb', featureset='title_rf')

Removed Default LTR feature store [Status: 200]
Initialize Default LTR feature store [Status: 200]
Create title_rf feature set [Status: 201]


## Training Set Generation

Log out features for each of the above queries out to a training set file

In [3]:
from ltr.log import judgments_to_training_set
trainingSet = judgments_to_training_set(client, 
                                        judgmentInFile='data/title_judgments.txt', 
                                        trainingOutFile='data/title_rf_judgments_train.txt', 
                                        featureSet='title_rf')

Recognizing 40 queries...
Searching tmdb - [{'terms': {'_id': [ [Status: 200]
REBUILDING TRAINING DATA for rambo (0/40)
Searching tmdb - [{'terms': {'_id': [ [Status: 200]
REBUILDING TRAINING DATA for rocky (1/40)
Searching tmdb - [{'terms': {'_id': [ [Status: 200]
REBUILDING TRAINING DATA for war games (2/40)
Searching tmdb - [{'terms': {'_id': [ [Status: 200]
REBUILDING TRAINING DATA for crocodile dundee (3/40)
Searching tmdb - [{'terms': {'_id': [ [Status: 200]
REBUILDING TRAINING DATA for matrix (4/40)
Searching tmdb - [{'terms': {'_id': [ [Status: 200]
REBUILDING TRAINING DATA for contact (5/40)
Searching tmdb - [{'terms': {'_id': [ [Status: 200]
REBUILDING TRAINING DATA for space jam (6/40)
Searching tmdb - [{'terms': {'_id': [ [Status: 200]
REBUILDING TRAINING DATA for battlestar galactica (7/40)
Searching tmdb - [{'terms': {'_id': [ [Status: 200]
REBUILDING TRAINING DATA for her (8/40)
Searching tmdb - [{'terms': {'_id': [ [Status: 200]
REBUILDING TRAINING DATA for jobs (9/40)


# Random Feature Selections

Feature searches are very time consuming for anything other than trivial data. To deal with feature dependencies, one strategy is to select a random subset of features at every decision tree split for consideration. This prevents overfitting and allows feature impacts to give a more accurate impact to how they effect the relevance.

In [4]:
from ltr.train import kcv
res  = kcv(client,
            trainingInFile='data/title_rf_judgments_train.txt',
            metric2t='NDCG@10',
            leafs=4,
            trees=100,
            ranker=8, # Use a "Random Forests Model"
            frate=0.5,
            bag=1, # Number of ensembles in the forest bag=1, 1 LambdaMART model with random features chosen
            index='tmdb',
            kcv=5,
            features=[1,2,3,4,5,6,7,8,9],
            featureSet='title_rf',
            modelName='title_rf')

print()
print("Impact of each feature on the model")
for ftrId, impact in res.trainingLogs[0].impacts.items():
    print("{} - {}".format(ftrId, impact))
    
    
print("Test NDCG@10 %s" % res.kcvTestAvg)

Running java -jar data/RankyMcRankFace.jar -ranker 8 -metric2t NDCG@10 -tree 100 -bag 1 -leaf 4 -train data/title_rf_judgments_train.txt -save data/title_rf_model.txt  -feature features.txt  -kcv 5 
DONE

Impact of each feature on the model
3 - 9.382218556036284
6 - 1.9349179859698624
4 - 1.2775141769150764
1 - 0.9987069005026618
2 - 0.23289798113217317
7 - 0.15750725027092136
5 - 0.005959954174888
8 - 0.0
9 - 0.0
Test NDCG@10 0.8807


# Train the model

In [7]:
from ltr.train import train
res  = train(client,
             trainingInFile='data/title_rf_judgments_train.txt',
             metric2t='NDCG@10',
             leafs=4,
             trees=100,
             ranker=8, # Use a "Random Forests Model"
             frate=0.5,
             bag=3, # Number of ensembles in the forest bag=1, 1 LambdaMART model with random features chosen
             index='tmdb',
             features=[1,2,3,4,5,6,7,8,9],
             featureSet='title_rf',
             modelName='title_rf')

Running java -jar data/RankyMcRankFace.jar -ranker 8 -metric2t NDCG@10 -tree 100 -bag 3 -leaf 4 -train data/title_rf_judgments_train.txt -save data/title_rf_model.txt  -feature features.txt 
DONE
Delete model title_rf: 200
Created Model title_rf [Status: 201]


# Whoopsie Report

In [8]:
from ltr.client import ElasticClient
from ltr.MART_model import eval_model
from ltr.judgments import judgments_from_file, judgments_by_qid

features, _ = client.feature_set(index='tmdb', name='title_rf')

judgmentDict = judgments_by_qid(judgments_from_file(filename='data/title_rf_judgments_train.txt'))


rambo=judgmentDict[1]
model = eval_model(modelName='title_rf',
                       features=features,
                       judgments=rambo)

print()
print("## Evaluating graded docs for search keywords '%s'" % rambo[0].keywords)
print()
print(model)

Fetched FeatureSet title_rf [Status: 200]
Recognizing 40 queries...


ValueError: Whoopsies only support LambdaMART of Random Forest of bags=1