# Solr Client

In [None]:
from ltr.client import SolrClient
# Client object that the later cells pass to the ltr helpers.
client = SolrClient()

# Download & Build Index (run once)

Run the cells below if you don't already have the downloaded dependencies, or if you don't yet have TheMovieDB data indexed.

In [None]:
from ltr import download
# The trailing semicolon is kept on purpose: in Jupyter it stops the cell from echoing the call's result.
download();

In [None]:
from ltr.index import rebuild_tmdb
# Build the index through the client created in the first cell (no enrich hook yet).
rebuild_tmdb(client)

## Features for movie titles

We'll be searching movie titles (think searching for a specific movie on Netflix), and we have a set of judgments around the appropriate movie to return. I.e., a search for "Star Wars" should return good Star Wars matches, in quality order...

These cover various aspects of the problem (searching title by phrase, title bm25 score, release date, etc). We'll use this to explore and analyze a simple model

In [None]:
# Feature definitions for the 'title' feature store. The #N markers number the
# entries in order; those numbers appear to be the ids that later cells pass
# as `features=[...]` -- confirm against ltr.setup.
config = [
    #1 Constant score of 1 (^=1) when the keywords match the title as a phrase
    {
        "name": "title_has_phrase",
        "store": "title",
        "class": "org.apache.solr.ltr.feature.SolrFeature",
        "params": {
            # The phrase is exactly the keywords; a stray ')' used to sit
            # inside the quotes, unlike the other phrase queries below.
            "q": "title:\"${keywords}\"^=1"
        }
    },
    #2 Constant score of 1 (^=1) when the keywords match title terms
    {
        "name": "title_has_terms",
        "store": "title",
        "class": "org.apache.solr.ltr.feature.SolrFeature",
        "params": {
            "q": "title:(${keywords})^=1"
        }
    },
    #3 Title score for the keywords
    {
        "name": "title_bm25",
        "store": "title",
        "class": "org.apache.solr.ltr.feature.SolrFeature",
        "params": {
            "q": "title:(${keywords})"
        }
    },
    #4 Overview score for the keywords
    {
        "name": "overview_bm25",
        "store": "title",
        "class": "org.apache.solr.ltr.feature.SolrFeature",
        "params": {
            "q": "overview:(${keywords})"
        }
    },
    #5 Overview score for the keywords as a phrase
    {
        "name": "overview_phrase_bm25",
        "store": "title",
        "class": "org.apache.solr.ltr.feature.SolrFeature",
        "params": {
            "q": "overview:\"${keywords}\""
        }
    },
    #6 Fuzzy match of the keywords against the title
    {
        "name": "title_fuzzy",
        "store": "title",
        "class": "org.apache.solr.ltr.feature.SolrFeature",
        "params": {
            "q": "{!lucene df=title}${keywords}~"
        }
    },
    #7 Release year, defaulting to 2000 when the field is missing
    {
        "name": "release_year",
        "store": "title",
        "class": "org.apache.solr.ltr.feature.SolrFeature",
        "params": {
            "q": "{!func}def(release_year,2000)"
        }
    }
]



from ltr import setup
# Hand the feature definitions in `config` to setup() for index 'tmdb' under the feature set name 'title'.
setup(client, config=config, index='tmdb', featureset='title')

## Training Set Generation

Log the values of the features defined above for each judgment, and write them out to a training set file.

In [None]:
from ltr.log import judgments_to_training_set

# Judgments are read from judgmentInFile; the logged 'title' features are
# written to trainingOutFile, which the training cells below read.
trainingSet = judgments_to_training_set(
    client,
    judgmentInFile='data/title_judgments.txt',
    trainingOutFile='data/title_judgments_train.txt',
    featureSet='title',
)

## Feature Search: which features work best?

Which combination of these features works best? Train a model with every combination, and use k-fold cross validation (see `kcv=15` below). The combination with the best NDCG is output.

In [None]:
from ltr.train import feature_search

# Search over features 1-7 of the 'title' feature set, measured by NDCG@10
# with kcv=15.
rankLibResult, ndcgPerFeature = feature_search(
    client,
    trainingInFile='data/title_judgments_train.txt',
    metric2t='NDCG@10',
    leafs=20,
    trees=20,
    kcv=15,
    features=[1, 2, 3, 4, 5, 6, 7],
    featureSet='title',
)

print()
print("Impact of each feature on the model")
trainLogs = rankLibResult.trainingLogs
finalLog = trainLogs[-1]  # only the last training log is reported
for featureId, featureImpact in finalLog.impacts.items():
    print("{} - {}".format(featureId, featureImpact))

for roundScore in finalLog.rounds:
    print(roundScore)

print("Avg NDCG@10 when feature included:")
for featureId, avgNdcg in ndcgPerFeature.items():
    print("%s => %s" % (featureId, avgNdcg))

print("Avg K-Fold NDCG@10 %s" % rankLibResult.kcvTestAvg)

## Compare to model w/ all features

Compare the features output above (something like...)

```
Impact of each feature on the model
7 - 17618.35445148437
4 - 16165.586045512271
3 - 10958.610341321868
5 - 9256.821192289186
1 - 1436.0640878600943
```

to one trained with the full model. Notice how features have different impacts. This is due to feature dependency

In [None]:
from ltr import train

# Train on all seven features of the 'title' feature set, under the model
# name 'title'.
trainLog = train(
    client,
    trainingInFile='data/title_judgments_train.txt',
    metric2t='NDCG@10',
    leafs=20,
    trees=20,
    features=[1, 2, 3, 4, 5, 6, 7],
    featureSet='title',
    index='tmdb',
    modelName='title',
)

print()
print("Impact of each feature on the model")
for featureId, featureImpact in trainLog.impacts.items():
    print("{} - {}".format(featureId, featureImpact))

for roundScore in trainLog.rounds:
    print(roundScore)

# The last entry of `rounds` is reported as the training NDCG@10.
finalRound = trainLog.rounds[-1]
print("Train NDCG@10 %s" % finalRound)

## Bias towards fewer features

By adding a 'cost' to feature search, we add a multiplier that slightly punishes models with more features. This results in a tiny bias towards simpler models, all things being equal, as we'd prefer a model that doesn't need to execute more features.

In [None]:
from ltr.train import feature_search

# Same search as before, plus a per-feature cost.
rankLibResult, ndcgPerFeature = feature_search(
    client,
    trainingInFile='data/title_judgments_train.txt',
    metric2t='NDCG@10',
    leafs=20,
    trees=20,
    kcv=15,
    featureCost=0.1,  # adjustedNDCG = NDCG * ( (1.0-cost) ^ num_features)
    features=[1, 2, 3, 4, 5, 6, 7],
    featureSet='title',
)

print()
print("Impact of each feature on the model")
trainLogs = rankLibResult.trainingLogs
finalLog = trainLogs[-1]  # only the last training log is reported
for featureId, featureImpact in finalLog.impacts.items():
    print("{} - {}".format(featureId, featureImpact))

for roundScore in finalLog.rounds:
    print(roundScore)

print("Avg NDCG@10 when feature included:")
for featureId, avgNdcg in ndcgPerFeature.items():
    print("%s => %s" % (featureId, avgNdcg))

print("Avg K-Fold NDCG@10 %s" % rankLibResult.kcvTestAvg)

# Evaluating the Model

It's interesting to see what features our model makes use of, but we need guidance on adding additional features to the model. We know our model is an ensemble of decision trees. Wouldn't it be cool if we could trace where documents end up on that decision tree?

Specifically, we care about problems. Or what we will call affectionately *whoopsies*. 

As a 'whoopsie' example, consider the query "Rambo". If a '0' document like 'First Daughter' ranked the same as or higher than a '4' document ("Rambo"), that's a problem. It's also an opportunity for improvement. We'd want to isolate that, see if it's indicative of a broader trend, and thus worth adding a feature for.

Let's see a concrete example

In [None]:
from ltr.MART_model import eval_model
from ltr.judgments import judgments_from_file, judgments_by_qid

# Only the first element returned by feature_set() is used.
features, _ = client.feature_set(index='tmdb', name='title')

# Group the logged judgments by query id, then take the ones for query id 1.
allJudgments = judgments_from_file(filename='data/title_judgments_train.txt')
judgmentDict = judgments_by_qid(allJudgments)
rambo = judgmentDict[1]

model = eval_model(modelName='title', features=features, judgments=rambo)

print()
print("## Evaluating graded docs for search keywords '%s'" % rambo[0].keywords)
print()
print(model)

## Examining our evaluation for whoopsies

Let's look at one tree in our ensemble, to see how it was evaluated.

```
if title_bm25 > 10.664251:
  if title_phrase > 0.0:
    if title_bm25 > 13.815164:
      if release_year > 2000.0:
        <= 0.1215(0/0/)
      else:
        <= 0.1240(0/0/)
    else:
      if title_bm25 > 10.667803:
        if overview_bm25 > 0.0:
          <= 0.1194(0/0/)
        else:
          <= 0.1161(1/0/)
      else:
        <= 0.1264(0/0/)
  else:
    <= 0.0800(0/0/)
else:
  if title_phrase > 0.0:
    if title_bm25 > 8.115499:
      if title_bm25 > 8.217656:
        <= 0.1097(2/1/qid:40:2(12180)-3(140607))
      else:
        <= 0.1559(0/0/)
    else:
      <= -0.0021(2/1/qid:40:2(1895)-3(330459))
  else:
    <= -0.1093(25/1/qid:40:0(85783)-3(1892))
```

You'll notice here this tree is represented by a series of if statements, where the feature's name is used. This is handy as it lets us take apart the structure of the tree.

You'll also notice the leaf nodes starting with 

```
<=
```

These leaf nodes have a floating point value, corresponding to the relevance score that documents ending up here will have. Each leaf also has three items in parentheses, such as `(2/1/qid:40:2(1895)-3(330459))`. This is a report summarizing the result of evaluating the tree on the provided judgment list, indicating:


```



   +--- 2 Documents evaluated to this leaf node                   +-- max grade doc eval'd to this leaf
   |                                                              |
   | +----- 1 'whoopsie' occured                                  |  +-- corresp. doc id of max doc
   | |                                                            |  |
   | |   +--- details on each whoopsie ----------- qid:40:2(1895)-3(330459)
   | |   |                                              | |  |
  (2/1/qid:40:2(1895)-3(330459))                        | |  |
                                                        | |  + doc id of min graded doc
                                                        | |
                                                        | + min grade of docs eval'd to this leaf
                                                        |
                                                        + query id of whoopsie from judgments
```


Looking at Star Wars, our biggest issues in this tree are with the bottom-most leaf. Here

```
if title_bm25 > 10.664251:
  ...
else:
  if title_phrase > 0.0:
    ...
  else:
    <= -0.1093(25/1/qid:40:0(85783)-3(1892))
```


Document 85783 (a '0') and document 1892 (a '3') end up in the same leaf, so this tree gives them the same score.

### Whoopsie, from the query perspective

Whoopsies can also be examined at the "query" level to see for a query id, how many whoopsies existed, and what was the evaluation for that query at each tree. This can help see if an error was fixed later in the ensemble of trees.

In [None]:
# Per-query whoopsie report for the model evaluated above.
whoopsies = model.whoopsies()
for queryId, report in whoopsies.items():
    print("== QID %s ==" % queryId)
    totals = (report.count, report.totalMagnitude)
    print("%s - %s" % totals)
    print(report.perTreeReport())

## Examine problem doc 319074

(notice nothing mentions 'star wars')

In [None]:


# NOTE(review): the heading above names doc 319074, but this fetches doc 1368 -- confirm which is intended.
client.get_doc(index='tmdb', doc_id=1368)

## Add a feature: collection name

We have an intuition about our data: there is a field for the movie's "collection name". See it below:

In [None]:
from ltr.helpers.movies import get_movie
# Show the source record for movie 1368 (displayed as the cell's result).
get_movie(1368)

## Now reindex with collection name...

We'll add collection name, and reindex.

In [None]:
def add_collection_and_char_name(src_movie, base_doc):
    """Copy the collection name and leading character names onto a doc.

    Args:
        src_movie: Source movie record (a dict). The optional keys
            ``belongs_to_collection`` (a dict with ``name``) and ``cast``
            (a list of dicts with ``character``) are read.
        base_doc: Document being built for indexing; mutated in place.

    Returns:
        ``base_doc``, with ``collection_name_en`` and/or ``top_cast_en``
        added when the source movie provides them.
    """
    collection = src_movie.get('belongs_to_collection')
    if collection is not None and 'name' in collection:
        base_doc['collection_name_en'] = collection['name']

    cast = src_movie.get('cast')
    if cast is not None:
        # Use this movie's own cast. The previous version read
        # get_movie(1368)['cast'] here, which stamped one fixed movie's
        # characters onto every indexed document.
        base_doc['top_cast_en'] = [member['character'] for member in cast[:5]]
    return base_doc

from ltr.index import rebuild_tmdb
# Rebuild the index again, this time passing the function above as the `enrich` hook.
rebuild_tmdb(client, enrich=add_collection_and_char_name)

Confirm it's in our doc now...

In [None]:
# Fetch doc 319074 from the rebuilt index to inspect the fields the enrich step adds.
client.get_doc(index='tmdb', doc_id=319074)

## Add it to the features, and regenerate training data....

In [None]:
# Feature definitions for the 'title2' feature store: the seven original
# features plus two on the collection_name_en field. The #N markers number
# the entries in order; those numbers appear to be the ids that later cells
# pass as `features=[...]` -- confirm against ltr.setup.
config = [
    #1 Constant score of 1 (^=1) when the keywords match the title as a phrase
    {
        "name": "title_has_phrase",
        "store": "title2",
        "class": "org.apache.solr.ltr.feature.SolrFeature",
        "params": {
            # The phrase is exactly the keywords; a stray ')' used to sit
            # inside the quotes, unlike the other phrase queries below.
            "q": "title:\"${keywords}\"^=1"
        }
    },
    #2 Constant score of 1 (^=1) when the keywords match title terms
    {
        "name": "title_has_terms",
        "store": "title2",
        "class": "org.apache.solr.ltr.feature.SolrFeature",
        "params": {
            "q": "title:(${keywords})^=1"
        }
    },
    #3 Title score for the keywords
    {
        "name": "title_bm25",
        "store": "title2",
        "class": "org.apache.solr.ltr.feature.SolrFeature",
        "params": {
            "q": "title:(${keywords})"
        }
    },
    #4 Overview score for the keywords
    {
        "name": "overview_bm25",
        "store": "title2",
        "class": "org.apache.solr.ltr.feature.SolrFeature",
        "params": {
            "q": "overview:(${keywords})"
        }
    },
    #5 Overview score for the keywords as a phrase
    {
        "name": "overview_phrase_bm25",
        "store": "title2",
        "class": "org.apache.solr.ltr.feature.SolrFeature",
        "params": {
            "q": "overview:\"${keywords}\""
        }
    },
    #6 Fuzzy match of the keywords against the title
    {
        "name": "title_fuzzy",
        "store": "title2",
        "class": "org.apache.solr.ltr.feature.SolrFeature",
        "params": {
            "q": "{!lucene df=title}${keywords}~"
        }
    },
    #7 Release year, defaulting to 2000 when the field is missing
    {
        "name": "release_year",
        "store": "title2",
        "class": "org.apache.solr.ltr.feature.SolrFeature",
        "params": {
            "q": "{!func}def(release_year,2000)"
        }
    },
    #8 Collection Name BM25 Score
    {
        "name": "coll_name_bm25",
        "store": "title2",
        "class": "org.apache.solr.ltr.feature.SolrFeature",
        "params": {
            "q": "collection_name_en:(${keywords})"
        }
    },
    #9 Collection Name Phrase BM25 Score
    {
        "name": "coll_name_phrase_bm25",
        "store": "title2",
        "class": "org.apache.solr.ltr.feature.SolrFeature",
        "params": {
            "q": "collection_name_en:\"${keywords}\""
        }
    }
]




from ltr import setup
from ltr.log import judgments_to_training_set

# Hand the expanded feature list to setup() as feature set 'title2'.
setup(client, config=config, index='tmdb', featureset='title2')

# Log the same judgments against 'title2' into a separate training file.
trainingSet = judgments_to_training_set(
    client,
    judgmentInFile='data/title_judgments.txt',
    trainingOutFile='data/title2_judgments_train.txt',
    featureSet='title2',
)

## Now a feature search

And do a feature search over these new features (go get some coffee).

We also up the number of trees & leafs to see if it has an impact

In [None]:
from ltr.train import feature_search

# Search over all nine features of the 'title2' feature set.
# NOTE(review): the text above says trees/leafs were increased, but both are
# still 20, the same as in the earlier searches -- confirm what was intended.
rankLibResult, ndcgPerFeature = feature_search(
    client,
    trainingInFile='data/title2_judgments_train.txt',
    metric2t='NDCG@10',
    leafs=20,
    trees=20,
    kcv=15,
    features=[1, 2, 3, 4, 5, 6, 7, 8, 9],
    featureSet='title2',
)

print()
print("Impact of each feature on the model")
trainLogs = rankLibResult.trainingLogs
finalLog = trainLogs[-1]  # only the last training log is reported
for featureId, featureImpact in finalLog.impacts.items():
    print("{} - {}".format(featureId, featureImpact))

for roundScore in finalLog.rounds:
    print(roundScore)

print("Avg NDCG@10 when feature included:")
for featureId, avgNdcg in ndcgPerFeature.items():
    print("%s => %s" % (featureId, avgNdcg))

print("Avg K-Fold NDCG@10 %s" % rankLibResult.kcvTestAvg)

## Review new feature impacts

Impact of each feature on the model... this is the best mix. Feature 8 helps, but not feature 9 as much. Interesting

```
4 - 18032.527656827504
3 - 9801.409052757816
5 - 8051.741259194476
7 - 5711.964176322393
8 - 3798.6132329430748
1 - 1439.2180228991883
```

## Now save away this model

In [None]:
from ltr import train

# Train only on the features listed in the review above (1, 3, 4, 5, 7, 8),
# under the model name 'title2'.
trainLog = train(
    client,
    trainingInFile='data/title2_judgments_train.txt',
    metric2t='NDCG@10',
    leafs=20,
    trees=20,
    features=[1, 3, 4, 5, 7, 8],
    featureSet='title2',
    index='tmdb',
    modelName='title2',
)

print()
print("Impact of each feature on the model")
for featureId, featureImpact in trainLog.impacts.items():
    print("{} - {}".format(featureId, featureImpact))

for roundScore in trainLog.rounds:
    print(roundScore)

# The last entry of `rounds` is reported as the training NDCG@10.
finalRound = trainLog.rounds[-1]
print("Train NDCG@10 %s" % finalRound)

In [None]:
from ltr import search
# Run the keywords "rambo" through search() using the model named 'title2'.
search(client, "rambo", modelName='title2')

## Examine Model 2

In [None]:
from ltr.MART_model import eval_model
from ltr.judgments import judgments_from_file, judgments_by_qid

# Only the first element returned by feature_set() is used.
features, _ = client.feature_set(index='tmdb', name='title2')

# Group the logged judgments by query id, then take the ones for query id 1.
allJudgments = judgments_from_file(filename='data/title2_judgments_train.txt')
judgmentDict = judgments_by_qid(allJudgments)
rambo = judgmentDict[1]

model = eval_model(modelName='title2', features=features, judgments=rambo)

print()
print("## Evaluating graded docs for search keywords '%s'" % rambo[0].keywords)
print()
print(model)

In [None]:
# Per-query whoopsie report for the 'title2' model evaluated above.
whoopsies = model.whoopsies()
for queryId, report in whoopsies.items():
    print("== QID %s ==" % queryId)
    totals = (report.count, report.totalMagnitude)
    print("%s - %s" % totals)
    print(report.perTreeReport())

```
== QID 1 ==
10 - 40
tree:0=>0(319074)-4(1368);tree:1=>0(319074)-4(1368);tree:2=>0(319074)-4(1368);tree:3=>0(319074)-4(1368);tree:4=>0(319074)-4(1368);tree:5=>0(319074)-4(1368);tree:6=>0(319074)-4(1368);tree:7=>0(319074)-4(1368);tree:8=>0(319074)-4(1368);tree:9=>0(319074)-4(1368)
```

In [None]:
from ltr.helpers.movies import get_movie
# Character names of the first five cast entries of movie 1368 (displayed as the cell's result).
[cast['character'] for cast in get_movie(1368)['cast']][:5]

In [None]:
def add_collection_and_char_name(src_movie, base_doc):
    """Copy the collection name and leading character names onto a doc.

    Args:
        src_movie: Source movie record (a dict). The optional keys
            ``belongs_to_collection`` (a dict with ``name``) and ``cast``
            (a list of dicts with ``character``) are read.
        base_doc: Document being built for indexing; mutated in place.

    Returns:
        ``base_doc``, with ``collection_name_en`` and/or ``top_chars_en``
        added when the source movie provides them.
    """
    collection = src_movie.get('belongs_to_collection')
    if collection is not None and 'name' in collection:
        base_doc['collection_name_en'] = collection['name']

    cast = src_movie.get('cast')
    if cast is not None:
        # Use this movie's own cast. The previous version read
        # get_movie(1368)['cast'] here, which stamped one fixed movie's
        # characters onto every indexed document.
        base_doc['top_chars_en'] = [member['character'] for member in cast[:5]]
    return base_doc

from ltr.index import rebuild_tmdb
# Rebuild the index with the redefined enrich function, which writes the `top_chars_en` field.
rebuild_tmdb(client, enrich=add_collection_and_char_name)

In [None]:
# Fetch doc 1892 from the rebuilt index to inspect the fields the enrich step adds.
client.get_doc(index='tmdb', doc_id=1892)

In [None]:
#4 - 18032.527656827504
#3 - 9801.409052757816
#5 - 8051.741259194476
#7 - 5711.964176322393
#8 - 3798.6132329430748
#1 - 1439.2180228991883

# Feature definitions for the 'title_doug' feature store: the six features
# kept from the previous model (renumbered; old ids noted on each entry) plus
# two on the top_chars_en field.
config = [
    #1 Constant score of 1 (^=1) when the keywords match the title as a phrase
    {
        "name": "title_has_phrase",
        "store": "title_doug",
        "class": "org.apache.solr.ltr.feature.SolrFeature",
        "params": {
            # The phrase is exactly the keywords; a stray ')' used to sit
            # inside the quotes, unlike the other phrase queries below.
            "q": "title:\"${keywords}\"^=1"
        }
    },
    #2 (old 3)
    {
        "name": "title_bm25",
        "store": "title_doug",
        "class": "org.apache.solr.ltr.feature.SolrFeature",
        "params": {
            "q": "title:(${keywords})"
        }
    },
    #3 (old 4)
    {
        "name": "overview_bm25",
        "store": "title_doug",
        "class": "org.apache.solr.ltr.feature.SolrFeature",
        "params": {
            "q": "overview:(${keywords})"
        }
    },
    #4 (old 5)
    {
        "name": "overview_phrase_bm25",
        "store": "title_doug",
        "class": "org.apache.solr.ltr.feature.SolrFeature",
        "params": {
            "q": "overview:\"${keywords}\""
        }
    },
    #5 (old 7)
    {
        "name": "release_year",
        "store": "title_doug",
        "class": "org.apache.solr.ltr.feature.SolrFeature",
        "params": {
            "q": "{!func}def(release_year,2000)"
        }
    },
    #6 (old 8) Collection Name BM25 Score
    {
        "name": "coll_name_bm25",
        "store": "title_doug",
        "class": "org.apache.solr.ltr.feature.SolrFeature",
        "params": {
            "q": "collection_name_en:(${keywords})"
        }
    },
    #7 Top Characters Phrase BM25 Score
    {
        "name": "top_chars_phrase_bm25",
        "store": "title_doug",
        "class": "org.apache.solr.ltr.feature.SolrFeature",
        "params": {
            "q": "top_chars_en:\"${keywords}\""
        }
    },
    #8 Top Characters BM25 Score
    {
        "name": "top_chars_bm25",
        "store": "title_doug",
        "class": "org.apache.solr.ltr.feature.SolrFeature",
        "params": {
            "q": "top_chars_en:(${keywords})"
        }
    }
]




from ltr import setup
from ltr.log import judgments_to_training_set

# Hand the renumbered feature list to setup() as feature set 'title_doug'.
setup(client, config=config, index='tmdb', featureset='title_doug')

# Log the same judgments against 'title_doug' into a separate training file.
trainingSet = judgments_to_training_set(
    client,
    judgmentInFile='data/title_judgments.txt',
    trainingOutFile='data/title_doug_judgments_train.txt',
    featureSet='title_doug',
)

In [None]:
from ltr.train import feature_search

# Search over all eight features of the 'title_doug' feature set; note kcv=5
# here, whereas the earlier searches used 15.
rankLibResult, ndcgPerFeature = feature_search(
    client,
    trainingInFile='data/title_doug_judgments_train.txt',
    metric2t='NDCG@10',
    leafs=20,
    trees=20,
    kcv=5,
    features=[1, 2, 3, 4, 5, 6, 7, 8],
    featureSet='title_doug',
)

print()
print("Impact of each feature on the model")
trainLogs = rankLibResult.trainingLogs
finalLog = trainLogs[-1]  # only the last training log is reported
for featureId, featureImpact in finalLog.impacts.items():
    print("{} - {}".format(featureId, featureImpact))

for roundScore in finalLog.rounds:
    print(roundScore)

print("Avg NDCG@10 when feature included:")
for featureId, avgNdcg in ndcgPerFeature.items():
    print("%s => %s" % (featureId, avgNdcg))

print("Avg K-Fold NDCG@10 %s" % rankLibResult.kcvTestAvg)