In [1]:
import requests
import json

from ltr.client.solr_client import SolrClient

# Create the Solr client that later cells pass to rebuild(), query with, and use
# to reset/create the LTR feature store.
client = SolrClient()
# NOTE(review): `host` is not referenced by any cell visible in this section.
host = client.get_host()

## Download, reindex...

1. Download the corpus & judgments
2. Rebuild the index from the tmdb solr config
3. Reindex movies loaded from the corpus

In [2]:
from ltr import download

# URL of the TMDB movie corpus (JSON).
tmdb_corpus='http://es-learn-to-rank.labs.o19s.com/tmdb_ai_pow_search.json'
# URL of the binary title judgments (text).
# NOTE: the name `judgments` is rebound further down (the feature-logging cell
# binds it to the judgments reader), so it only holds this URL until then.
judgments='http://es-learn-to-rank.labs.o19s.com/title_judgments_binary.txt'
# Fetch both files into data/. The trailing ';' suppresses the cell's result echo.
download([tmdb_corpus, judgments], dest='data/');

data/tmdb_ai_pow_search.json already exists
data/title_judgments_binary.txt already exists


In [None]:
from ltr.index import rebuild
from ltr.helpers.movies import indexable_movies
# Load the movies from the downloaded corpus file and rebuild the 'tmdb' index
# with them as the document source.
# NOTE(review): this cell is saved without an execution count (In [None]), so it
# was presumably not run (or its output was cleared) in the session that produced
# the outputs shown in the rest of the notebook.
movies=indexable_movies(movies='data/tmdb_ai_pow_search.json')
rebuild(client, index='tmdb', doc_src=movies)

## Manual boosting

One 'generalizable' relevance solution that gets at the long tail is a manually derived relevance function.

In [3]:
# Hand-weighted relevance function with the boosts written directly into the
# query: title match ^10, overview match ^20, plus release_year scaled by 0.01.
# The doubled braces in {{!func}} survive str.format() as a literal {!func}.
q = """title:({keywords})^10
     overview:({keywords})^20
     {{!func}}release_year^0.01""".format(keywords='mark zuckerberg college')

# Request only the title field back. The trailing space in 'title ' is kept
# exactly as in the original request.
solr_q = dict(defType='edismax', fl='title ', q=q)

client.query(index='tmdb', query=solr_q)

[{'title': ['The Social Network']},
 {'title': ['Waxwork']},
 {'title': ['Mark Twain']},
 {'title': ['College Girls']},
 {'title': ['College']},
 {'title': ['Six: The Mark Unleashed']},
 {'title': ['Mark Shoots First']},
 {'title': ['College Swing']},
 {'title': ['The Adventures of Mark Twain']},
 {'title': ['College Humor']}]

In [4]:
# Same relevance function as the previous query cell, but with the three boosts
# exposed as named placeholders so they can be tuned (or later learned) instead
# of being baked into the query text.
# The doubled braces in {{!func}} survive str.format() as a literal {!func}.
q = """title:({keywords})^{ti_bm25_weight}
     overview:({keywords})^{ov_bm25_weight}
     {{!func}}release_year^{release_year_weight}""".format(
    keywords='mark zuckerberg college',
    ti_bm25_weight=10,
    ov_bm25_weight=20,
    release_year_weight=0.01)

# Request only the title field back.
solr_q = dict(defType='edismax', fl='title', q=q)

client.query(index='tmdb', query=solr_q)

[{'title': ['The Social Network']},
 {'title': ['Waxwork']},
 {'title': ['Mark Twain']},
 {'title': ['College Girls']},
 {'title': ['College']},
 {'title': ['Six: The Mark Unleashed']},
 {'title': ['Mark Shoots First']},
 {'title': ['College Swing']},
 {'title': ['The Adventures of Mark Twain']},
 {'title': ['College Humor']}]

In [5]:
from ltr.helpers.movies import get_movie
from ltr.judgments import judgments_from_file

def judg_csv(judgment):
    """Render one judgment as a ``grade,'title',keywords`` text line.

    The movie title is looked up in the downloaded corpus file by the
    judgment's ``docId``. The title is wrapped in single quotes as-is; no
    escaping is applied to quotes or commas inside it.

    judgment -- object exposing ``grade``, ``docId`` and ``keywords``.
    Returns the formatted line as a string (no trailing newline).
    """
    movie = get_movie(movies='data/tmdb_ai_pow_search.json', tmdb_id=judgment.docId)
    fields = {'grade': judgment.grade,
              'title': movie['title'],
              'keywords': judgment.keywords}
    return "{grade},'{title}',{keywords}".format(**fields)

# Make a baby judgment list for book display: for each sampled query id, keep
# only the judgments at the listed row positions within that query.
from itertools import groupby

to_sample = {
    1: [0, 1, 6, 9],  # qid->rows in qid to sample
    2: [0, 1, 12],
}

judgment_dict = {}  # not filled in by this cell; kept so the name stays defined
mini_judg_list = []

with open('data/title_judgments_binary.txt') as f:
    # groupby() only merges *consecutive* items with the same key.
    # NOTE(review): this assumes the judgment file is ordered by qid - confirm.
    for qid, query_judgments in groupby(judgments_from_file(f), key=lambda j: j.qid):
        if qid not in to_sample:
            continue
        # Materialize the group so it can be indexed by row position.
        query_judgments = list(query_judgments)
        for row in to_sample[qid]:
            mini_judg_list.append(query_judgments[row])

mini_judg_list

Recognizing 65 queries...


[Judgment(grade=1,qid=1,keywords=rambo,docId=7555,features=[],weight=1,
 Judgment(grade=1,qid=1,keywords=rambo,docId=1370,features=[],weight=1,
 Judgment(grade=0,qid=1,keywords=rambo,docId=61410,features=[],weight=1,
 Judgment(grade=0,qid=1,keywords=rambo,docId=35868,features=[],weight=1,
 Judgment(grade=1,qid=2,keywords=rocky,docId=1366,features=[],weight=1,
 Judgment(grade=1,qid=2,keywords=rocky,docId=1374,features=[],weight=1,
 Judgment(grade=0,qid=2,keywords=rocky,docId=21501,features=[],weight=1]

In [6]:
def judg_csv(judgment):
    """Render one judgment as a ``grade,'title',keywords`` text line.

    NOTE: a function with this same name and behavior is already defined in
    the preceding code cell; this later definition is the one in effect for
    the cells that follow.

    The movie title is looked up in the downloaded corpus file by the
    judgment's ``docId``. The title is wrapped in single quotes as-is; no
    escaping is applied to quotes or commas inside it.

    judgment -- object exposing ``grade``, ``docId`` and ``keywords``.
    Returns the formatted line as a string (no trailing newline).
    """
    movie = get_movie(movies='data/tmdb_ai_pow_search.json', tmdb_id=judgment.docId)
    fields = {'grade': judgment.grade,
              'title': movie['title'],
              'keywords': judgment.keywords}
    return "{grade},'{title}',{keywords}".format(**fields)

### As CSV

In [7]:
# Print the sampled judgments one CSV line at a time. map() is lazy, so each
# line is still formatted right before it is printed.
for csv_line in map(judg_csv, mini_judg_list):
    print(csv_line)

1,'Rambo',rambo
1,'Rambo III',rambo
0,'Spud',rambo
0,'Green Dragon',rambo
1,'Rocky',rocky
1,'Rocky IV',rocky
0,'Christmas Story',rocky


### Dump the file...

In [8]:
from ltr.judgments import judgments_to_file
from io import StringIO

# Serialize the sampled judgments into an in-memory text buffer instead of a file.
# NOTE: `string_f` is read again by the feature-setup cell further down
# (judgments_string=string_f.getvalue()) and is later rebound to a fresh buffer
# by the training-set dump cell, so execution order matters for this name.
string_f = StringIO()
judgments_to_file(string_f, judgmentsList=mini_judg_list)

# print() so the tabs and newlines in the dump render literally.
print(string_f.getvalue())

# qid:1: rambo*1
# qid:2: rocky*1

1	qid:1	 # 7555	rambo
1	qid:1	 # 1370	rambo
0	qid:1	 # 61410	rambo
0	qid:1	 # 35868	rambo
1	qid:2	 # 1366	rocky
1	qid:2	 # 1374	rocky
0	qid:2	 # 21501	rocky



### Log the same plausible features for each judgment

In [11]:
# Setup some features for this dummy dataset: clear any existing LTR state on
# the index, then register three features in a feature store named "dummy".
client.reset_ltr(index='tmdb')

# (feature name, Solr query) pairs, in the order they are registered.
# NOTE(review): list position appears to become the feature ordinal (1:, 2:, 3:)
# seen in the dumped training set - confirm against the logger.
feature_queries = [
    ("title_bm25", "title:(${keywords})"),               #1
    ("overview_bm25", "overview:(${keywords})"),         #2
    ("release_year", "{!func}def(release_year,2000)"),   #3
]

# Every feature shares the same store and feature class; only name and query differ.
ftr_config = [
    {
        "name": feature_name,
        "store": "dummy",
        "class": "org.apache.solr.ltr.feature.SolrFeature",
        "params": {"q": feature_query},
    }
    for feature_name, feature_query in feature_queries
]

# Snapshot the judgment text held in `string_f`.
# NOTE: `string_f` must still be the buffer written by the "Dump the file" cell;
# the training-set dump cell rebinds it, so re-running this cell after that one
# would capture the feature-annotated dump instead.
judgments_string = string_f.getvalue()
client.create_featureset(index='tmdb', name='dummy', ftr_config=ftr_config)

from ltr.judgments import judgments_reader
from ltr.log import FeatureLogger

# Log feature values from the "dummy" feature set for every judged document,
# one log_for_qid() call per query id.
# NOTE: `groupby` and `StringIO` are not imported in this cell; they come from
# imports made in earlier cells of this notebook.
ftr_logger = FeatureLogger(client, index='tmdb', feature_set='dummy')
judgments_buffer = StringIO(judgments_string)
with judgments_reader(judgments_buffer) as judgments:
    # groupby() yields each run of consecutive judgments sharing a qid.
    by_qid = groupby(judgments, key=lambda judgment: judgment.qid)
    for qid, query_judgments in by_qid:
        ftr_logger.log_for_qid(qid=qid,
                               keywords=judgments.keywords(qid),
                               judgments=query_judgments)

Deleted dummy Featurestore [Status: 200]
Created dummy feature store under tmdb: [Status: 200]
Recognizing 2 queries...
Searching tmdb [Status: 200]
Discarded 0 Keep 4
Searching tmdb [Status: 200]
Discarded 0 Keep 3


## Dump the training set

In [13]:
from ltr.judgments import judgments_writer
from io import StringIO

# Write every judgment collected by the feature logger into a new in-memory
# buffer and show the resulting training set text.
# NOTE: this rebinds `string_f`, replacing the buffer created by the earlier
# "Dump the file" cell.
string_f = StringIO()
with judgments_writer(string_f) as writer:
    for logged_judgment in ftr_logger.logged:
        writer.write(logged_judgment)

training_set_text = string_f.getvalue()
print(training_set_text)

# qid:1: rambo*1
# qid:2: rocky*1

1	qid:1	1:13.038148	2:11.173398	3:2008.0 # 7555	rambo
1	qid:1	1:11.056428	2:12.652582	3:1988.0 # 1370	rambo
0	qid:1	1:0.0	2:4.6697874	3:2010.0 # 61410	rambo
0	qid:1	1:0.0	2:0.0	3:2001.0 # 35868	rambo
1	qid:2	1:11.020994	2:8.675259	3:1976.0 # 1366	rocky
1	qid:2	1:9.345869	2:7.304178	3:1985.0 # 1374	rocky
0	qid:2	1:0.0	2:0.0	3:2007.0 # 21501	rocky

