# Elastic Client

In [None]:
# Shared client handle: the later cells pass this object to the ltr helpers
# (index rebuild, feature setup, feature logging, training).
# NOTE(review): constructed with no arguments, so connection settings come from
# ElasticClient's own defaults — confirm host/port in ltr.client.
from ltr.client import ElasticClient
client = ElasticClient()

# Download & Build Index (run once)

Run this if you don't already have the downloaded dependencies, or if TheMovieDB data isn't indexed yet.

In [None]:
def add_collection_name(src_movie, base_doc):
    """Copy the movie's collection name onto the document being indexed.

    Args:
        src_movie: Mapping for the source movie. Its optional
            'belongs_to_collection' entry is either None or a mapping that
            may carry a 'name' key.
        base_doc: Document to enrich; modified in place.

    Returns:
        The same ``base_doc`` object, with 'collection_name' set when the
        movie belongs to a named collection and left untouched otherwise.
    """
    collection = None
    if 'belongs_to_collection' in src_movie:
        collection = src_movie['belongs_to_collection']

    # Movies without a collection (key absent or explicitly None) pass through.
    if collection is None:
        return base_doc

    # A collection entry without a name contributes nothing either.
    if 'name' not in collection:
        return base_doc

    base_doc['collection_name'] = collection['name']
    return base_doc

# Rebuild the TMDB index, passing add_collection_name as the `enrich` hook so
# indexed documents can carry a 'collection_name' field.
# NOTE(review): presumably rebuild_tmdb calls the hook once per movie with
# (source movie, document to index) — confirm in ltr.index.
from ltr.index import rebuild_tmdb
rebuild_tmdb(client, enrich=add_collection_name)

## Features for movie titles

We'll be searching movie titles (think searching for a specific movie on Netflix). And we have a set of judgments around the appropriate movie to return, i.e., a search for "Star Wars" should return good Star Wars matches, in quality order...

These features cover various aspects of the problem (searching title by phrase, title BM25 score, release date, etc.). We'll use them to explore and analyze a simple model.

In [None]:
# Feature set definition: a "validation" section (index plus sample params)
# and the list of feature templates. Every keyword-driven template receives
# the user's query through the "{{keywords}}" placeholder.
config = {
    "validation": {
        "index": "tmdb",
        "params": {
            "keywords": "rambo",
        },
    },
    "featureset": {
        "features": [
            # Constant 1.0 when the title matches the keywords as a phrase.
            {
                "name": "title_phrase",
                "params": ["keywords"],
                "template": {
                    "constant_score": {
                        "filter": {
                            "match_phrase": {"title": "{{keywords}}"},
                        },
                        "boost": 1.0,
                    },
                },
            },
            # Constant 1.0 when the title matches the keywords.
            {
                "name": "title",
                "params": ["keywords"],
                "template": {
                    "constant_score": {
                        "filter": {
                            "match": {"title": "{{keywords}}"},
                        },
                        "boost": 1.0,
                    },
                },
            },
            # Scored match of the keywords against the title.
            {
                "name": "title_bm25",
                "params": ["keywords"],
                "template": {
                    "match": {"title": "{{keywords}}"},
                },
            },
            # Scored match of the keywords against the overview.
            {
                "name": "overview_bm25",
                "params": ["keywords"],
                "template": {
                    "match": {"overview": "{{keywords}}"},
                },
            },
            # Scored phrase match of the keywords against the overview.
            {
                "name": "overview_phrase_bm25",
                "params": ["keywords"],
                "template": {
                    "match_phrase": {"overview": "{{keywords}}"},
                },
            },
            # Title match with automatic fuzziness.
            {
                "name": "title_fuzzy",
                "params": ["keywords"],
                "template": {
                    "match": {
                        "title": {
                            "query": "{{keywords}}",
                            "fuzziness": "AUTO",
                        },
                    },
                },
            },
            # Query-independent feature: the release year itself, falling
            # back to 2000 for documents that lack the field.
            {
                "name": "release_year",
                "params": [],
                "template": {
                    "function_score": {
                        "field_value_factor": {
                            "field": "release_year",
                            "missing": 2000,
                        },
                        "query": {"match_all": {}},
                    },
                },
            },
            # Scored match of the keywords against the collection name.
            {
                "name": "coll_name_bm25",
                "params": ["keywords"],
                "template": {
                    "match": {
                        "collection_name": {"query": "{{keywords}}"},
                    },
                },
            },
            # Scored phrase match of the keywords against the collection name.
            {
                "name": "coll_name_phrase_bm25",
                "params": ["keywords"],
                "template": {
                    "match_phrase": {
                        "collection_name": {"query": "{{keywords}}"},
                    },
                },
            },
        ],
    },
}




# Hand the feature definitions in `config` to the ltr helper for the 'tmdb'
# index under the feature set name 'title_rf' — the same name the logging and
# training cells pass as featureSet.
# NOTE(review): presumably this (re)creates the feature store on the cluster —
# confirm in ltr.setup before running against shared infrastructure.
from ltr import setup
setup(client, config=config, index='tmdb', featureset='title_rf')

## Training Set Generation

Log the features for each of the above queries out to a training set file.

In [None]:
# Build the training set: judgments come from judgmentInFile, feature values
# are logged for the 'title_rf' feature set, and the result goes to
# trainingOutFile (the file the training cells read as trainingInFile).
# NOTE(review): the shape of the returned trainingSet is not visible here —
# check ltr.log before relying on it.
from ltr.log import judgments_to_training_set
trainingSet = judgments_to_training_set(client, 
                                        judgmentInFile='data/title_judgments.txt', 
                                        trainingOutFile='data/title_rf_judgments_train.txt', 
                                        featureSet='title_rf')

# Random Feature Selections

Feature searches are very time consuming for anything other than trivial data. To deal with feature dependencies, one strategy is to select a random subset of features at every decision tree split for consideration. This helps prevent overfitting and lets the reported feature impacts give a more accurate picture of how each feature affects relevance.

In [None]:
# Cross-validate a random-feature model on the logged training set, then
# report per-feature impact and the averaged test metric.
# NOTE(review): parameter meanings below follow their names and the inline
# notes; presumably they map onto RankLib options — confirm in ltr.train.
from ltr.train import kcv
res  = kcv(client,
            trainingInFile='data/title_rf_judgments_train.txt',
            metric2t='NDCG@10',  # metric named in the final report line below
            leafs=4,
            trees=100,
            ranker=8, # Use a "Random Forests Model"
            frate=0.5,  # presumably the fraction of features sampled — TODO confirm
            bag=1, # Number of ensembles in the forest; bag=1 means a single LambdaMART model with random features chosen
            index='tmdb',
            kcv=5,  # presumably the number of cross-validation folds — TODO confirm
            features=[1,2,3,4,5,6,7,8,9],  # nine ids; the config defines nine features
            featureSet='title_rf',
            modelName='title_rf')

print()
print("Impact of each feature on the model")
# Only the first training log is reported.
# NOTE(review): presumably there is one log per fold — confirm.
for ftrId, impact in res.trainingLogs[0].impacts.items():
    print("{} - {}".format(ftrId, impact))
    
    
print("Test NDCG@10 %s" % res.kcvTestAvg)

# Train the model

In [None]:
# Train the 'title_rf' model on the full logged training set, with the same
# settings as the cross-validation cell except for `bag`.
# NOTE(review): whether train() also uploads the model under modelName is not
# visible here — confirm in ltr.train.
from ltr.train import train
res  = train(client,
             trainingInFile='data/title_rf_judgments_train.txt',
             metric2t='NDCG@10',
             leafs=4,
             trees=100,
             ranker=8, # Use a "Random Forests Model"
             frate=0.5,
             bag=3, # Number of ensembles in the forest; bag=3 means three LambdaMART models, each with random features chosen
             # NOTE(review): the cross-validation cell used bag=1, so its NDCG
             # was measured on a different configuration — confirm bag=3 is intended.
             index='tmdb',
             features=[1,2,3,4,5,6,7,8,9],
             featureSet='title_rf',
             modelName='title_rf')