# Solr Client

In [1]:
from ltr.client import SolrClient
# Shared client handle: the same `client` object is passed to the indexing,
# feature-setup, logging, model-upload, and search calls in the cells below.
client = SolrClient()

import numpy as np

# Download & Build Index (run once)

Run the cells in this section if you have not yet downloaded the dependencies or have not yet indexed the TheMovieDB data.

In [2]:
from ltr import download
# Fetch the data files used by this notebook. The recorded output shows files
# that are already on disk being reported as existing.
# The trailing `;` suppresses display of the call's return value.
download();

data/tmdb.json already exists
data/blog.jsonl already exists
data/osc_judgments.txt already exists
data/RankyMcRankFace.jar already exists
data/title_judgments.txt already exists
data/genome_judgments.txt already exists
data/sample_judgments_train.txt already exists
Done.


In [2]:
from ltr.index import rebuild_tmdb
# Per the recorded output, this deletes the existing `tmdb` index, recreates it,
# and reindexes the movies, so it is destructive. Run it only when a rebuild is intended.
rebuild_tmdb(client)

Reconfig from disk...
Deleted index tmdb [Status: 200]
Created index tmdb [Status: 200]
Reindexing...
Indexed 0 movies (last Black Mirror: White Christmas)
Indexed 100 movies (last Apocalypse Now)
Indexed 200 movies (last Crooks in Clover)
Indexed 300 movies (last For a Few Dollars More)
Indexed 400 movies (last Downfall)
Indexed 500 movies (last Finding Nemo)
Indexed 600 movies (last Platoon)
Indexed 700 movies (last Night of the Living Dead)
Indexed 800 movies (last Evangelion: 1.0: You Are (Not) Alone)
Indexed 900 movies (last Batman: Assault on Arkham)
Indexed 1000 movies (last Riley's First Date?)
Indexed 1100 movies (last The Raid)
Indexed 1200 movies (last Falling Down)
Indexed 1300 movies (last Kal Ho Naa Ho)
Indexed 1400 movies (last Elizabeth)
Indexed 1500 movies (last Irreversible)
Indexed 1600 movies (last Friday Night Lights)
Indexed 1700 movies (last Ben X)
Indexed 1800 movies (last Pump up the Volume)
Indexed 1900 movies (last Armour of God)
Indexed 2000 movies (last Swi

Done [Status: 200]
Indexed 20000 movies (last Left Behind III: World at War)
Indexed 20100 movies (last Dragon Ball Z: Lord Slug)
Indexed 20200 movies (last The Adventures of Sherlock Holmes)
Indexed 20300 movies (last Billy's Hollywood Screen Kiss)
Indexed 20400 movies (last Short Night of Glass Dolls)
Indexed 20500 movies (last Kawa)
Indexed 20600 movies (last Bears)
Indexed 20700 movies (last Pyrates)
Indexed 20800 movies (last Bastard Out of Carolina)
Indexed 20900 movies (last The Mole People)
Indexed 21000 movies (last Till Human Voices Wake Us)
Indexed 21100 movies (last It's a Wonderful Afterlife)
Indexed 21200 movies (last The Bingo Long Traveling All-Stars & Motor Kings)
Indexed 21300 movies (last Ciao! Manhattan)
Indexed 21400 movies (last The Night They Raided Minsky's)
Indexed 21500 movies (last The Girl Can't Help It)
Indexed 21600 movies (last Sam Peckinpah's West: Legacy of a Hollywood Renegade)
Indexed 21700 movies (last A Guy Named Joe)
Indexed 21800 movies (last Odd 

## Features for movie titles

We'll be searching movie titles (think of searching for a specific movie on Netflix), and we have a set of judgments about the appropriate movies to return. For example, a search for "Star Wars" should return good Star Wars matches, in quality order.

The features below cover various aspects of the problem (searching the title by phrase, title BM25 score, release date, etc.). We'll use them to explore and analyze a simple model.

In [2]:
# Every feature is a SolrFeature registered in the "title" feature store;
# only the feature name and its query differ from one entry to the next.
ftr_config = [
    {
        "name": feature_name,
        "store": "title",
        "class": "org.apache.solr.ltr.feature.SolrFeature",
        "params": {"q": solr_query},
    }
    for feature_name, solr_query in (
        # 1: keywords queried against the title field
        ("title_bm25", "title:(${keywords})"),
        # 2: keywords queried against the overview field
        ("overview_bm25", "overview:(${keywords})"),
        # 3: function query over the release_year field
        ("release_year", "{!func}def(release_year,2000)"),
    )
]



from ltr import setup
# Register the features in `ftr_config` as the 'title' feature set on the tmdb index.
# Per the recorded output, an existing 'title' feature store is deleted and then recreated.
setup(client, config=ftr_config, index='tmdb', featureset='title')

Deleted title Featurestore [Status: 200]
Created title feature store under tmdb: [Status: 200]


## Training Set Generation

Log the feature values defined above for each judged query, and write them out to a training set file.

In [3]:
from ltr.log import judgments_to_training_set
# Read the judgments from `judgmentInFile`, log the 'title' feature set for them,
# and write the result to `trainingOutFile` (the same path is read back by the next cell).
# NOTE(review): data/title_judgments_binary.txt is not among the files the download
# cell reports; confirm where this file comes from.
trainingSet = judgments_to_training_set(client, 
                                        judgmentInFile='data/title_judgments_binary.txt', 
                                        trainingOutFile='data/title_judgments_binary_train.txt', 
                                        featureSet='title')

Recognizing 65 queries...
Searching tmdb [Status: 200]
REBUILDING TRAINING DATA for rambo (0/65)
Searching tmdb [Status: 200]
REBUILDING TRAINING DATA for rocky (1/65)
Searching tmdb [Status: 200]
REBUILDING TRAINING DATA for war games (2/65)
Searching tmdb [Status: 200]
REBUILDING TRAINING DATA for crocodile dundee (3/65)
Searching tmdb [Status: 200]
REBUILDING TRAINING DATA for matrix (4/65)
Searching tmdb [Status: 200]
REBUILDING TRAINING DATA for contact (5/65)
Searching tmdb [Status: 200]
REBUILDING TRAINING DATA for space jam (6/65)
Searching tmdb [Status: 200]
REBUILDING TRAINING DATA for battlestar galactica (7/65)
Searching tmdb [Status: 200]
REBUILDING TRAINING DATA for her (8/65)
Searching tmdb [Status: 200]
REBUILDING TRAINING DATA for jobs (9/65)
Searching tmdb [Status: 200]
REBUILDING TRAINING DATA for social network (10/65)
Searching tmdb [Status: 200]
REBUILDING TRAINING DATA for rocky horror (11/65)
Searching tmdb [Status: 200]
REBUILDING TRAINING DATA for shawshank re

In [4]:
import numpy as np
from ltr.judgments import judgments_from_file, judgments_to_nparray

def pairwise_transform(features, predictors):
    """Turn graded, per-query samples into pairwise difference samples.

    Informed by
    https://gist.github.com/agramfort/2071994

    For every ordered pair of rows (i, j) that share a query id but carry
    different grades, one output sample is produced: the feature difference
    ``features[i] - features[j]`` labelled with the grade difference
    ``grade[i] - grade[j]``. Both orderings of a pair are emitted, and
    samples appear in row-major order (by i, then by j).

    Args:
        features: 2-D array with one row per sample and at least one column.
        predictors: 2-D array with exactly two columns per row; column 0 is
            the grade and column 1 is the query id.

    Returns:
        Tuple ``(pair_features, pair_grades)`` with shapes
        ``(n_pairs, n_features)`` and ``(n_pairs, 1)``. When no qualifying
        pair exists, both arrays are empty but keep those 2-D shapes.

    Raises:
        AssertionError: if the row counts differ, ``predictors`` does not
            have exactly two columns, or ``features`` has no columns.
    """
    GRADE = 0
    QID = 1

    assert features.shape[0] == predictors.shape[0]
    assert predictors.shape[1] == 2
    assert features.shape[1] > 0

    num_samples = features.shape[0]
    grades = predictors[:, GRADE]
    qids = predictors[:, QID]

    feature_chunks = []
    grade_chunks = []

    for i in range(num_samples):
        # All rows j in the same query as row i whose grade differs from row i's.
        # flatnonzero yields them in ascending j, matching a nested i/j loop.
        partners = np.flatnonzero((grades != grades[i]) & (qids == qids[i]))
        if partners.size == 0:
            continue
        feature_chunks.append(features[i, :] - features[partners, :])
        grade_chunks.append(grades[i] - grades[partners])

    if not feature_chunks:
        # Keep the documented 2-D shapes even when there is nothing to pair,
        # so downstream code can rely on a consistent column count.
        return (np.empty((0, features.shape[1]), dtype=features.dtype),
                np.empty((0, 1), dtype=predictors.dtype))

    return (np.concatenate(feature_chunks, axis=0),
            np.concatenate(grade_chunks)[:, np.newaxis])

def samples_from_training_data(fname):
    """Load the judgments in `fname` and build standardized pairwise samples.

    Returns a 3-tuple: the pairwise feature differences, the pairwise grade
    differences flattened to 1-D, and the StandardScaler that was fitted on
    the raw (un-paired) features.
    """
    raw_features, grades_and_qids = judgments_to_nparray(judgments_from_file(fname))

    # Standardize each feature column before the rows are differenced
    print("Scaling")
    from sklearn.preprocessing import StandardScaler
    fitted_scaler = StandardScaler().fit(raw_features)
    scaled_features = fitted_scaler.transform(raw_features)

    print("Pairwise Transform")
    pair_features, pair_grades = pairwise_transform(scaled_features, grades_and_qids)
    return pair_features, pair_grades.ravel(), fitted_scaler

# Build the pairwise training samples from the logged training-set file. The fitted
# scaler is kept because its statistics are reused when the model is exported below.
features, predictors, scaler = samples_from_training_data(fname='data/title_judgments_binary_train.txt')
# Bare expression: displays the (truncated) pairwise feature matrix as the cell output.
features

Recognizing 65 queries...
Scaling
Pairwise Transform


array([[ 2.83389957,  0.90158636,  0.04698828],
       [ 2.83389957,  0.65713776,  0.93976563],
       [ 2.83389957,  1.50750909, -0.09397656],
       ...,
       [-2.45946495,  0.00583288, -1.55061329],
       [-2.42642102,  0.01318964, -4.08798048],
       [-2.50627219,  0.01421584, -0.61084766]])

In [5]:
from sklearn import svm, linear_model
# Fit a linear SVM on the pairwise samples: `features` holds the feature differences
# and `predictors` the matching grade differences built by the previous cell.
# random_state is pinned so that any data shuffling done by the solver, and therefore
# the learned weights, are identical on every Restart & Run All.
# NOTE(review): the `linear_model` module imported above is not used in this cell, and
# the same name is rebound to a dict by the model-export cell further down.
model = svm.LinearSVC(max_iter=1000, verbose=1, random_state=0)
model.fit(features, predictors)

[LibLinear]

LinearSVC(verbose=1)

In [6]:
# Inspect the fitted coefficients; the recorded output shows a single row holding
# one weight per feature column.
model.coef_

array([[0.41365997, 0.06618855, 0.07113071]])

In [7]:
# Skeleton of the Solr LTR linear model. The per-feature normalizers and the
# weights are filled in by the loop below, so re-running this cell starts clean.
linear_model = {
  "store": "title",
  "class": "org.apache.solr.ltr.model.LinearModel",
  "name": "movie_titles",
  "features": [
  ],
  "params": {
      "weights": {
      }
  }
}

import math  # no longer used in this cell; left in place in case later cells rely on it
ftr_model = {}  # never populated in this cell
ftr_names = [ftr['name'] for ftr in ftr_config]
# assumes the columns of the training features follow the order of ftr_config; TODO confirm
for idx, ftr_name in enumerate(ftr_names):
    config = {
        "name": ftr_name,
        "norm": {
            "class": "org.apache.solr.ltr.norm.StandardNormalizer",
            "params": {
                "avg": str(float(scaler.mean_[idx])),
                # Export the divisor the scaler actually applied during training.
                # `scale_` equals sqrt(var_) except for zero-variance columns, where
                # the scaler divides by 1.0; sqrt(var_) would export std = 0 there,
                # which no longer matches training and is not a usable divisor.
                "std": str(float(scaler.scale_[idx]))
            }
        }
    }
    linear_model['features'].append(config)
    # Plain Python float keeps the payload and its displayed repr free of NumPy scalar types.
    linear_model['params']['weights'][ftr_name] = float(model.coef_[0][idx])

linear_model

{'store': 'title',
 'class': 'org.apache.solr.ltr.model.LinearModel',
 'name': 'movie_titles',
 'features': [{'name': 'title_bm25',
   'norm': {'class': 'org.apache.solr.ltr.norm.StandardNormalizer',
    'params': {'avg': '2.6569321019364898', 'std': '4.346827995987903'}}},
  {'name': 'overview_bm25',
   'norm': {'class': 'org.apache.solr.ltr.norm.StandardNormalizer',
    'params': {'avg': '2.4539911729986588', 'std': '4.059675416378239'}}},
  {'name': 'release_year',
   'norm': {'class': 'org.apache.solr.ltr.norm.StandardNormalizer',
    'params': {'avg': '1993.2558139534883', 'std': '21.281902017834096'}}}],
 'params': {'weights': {'title_bm25': 0.41365997393478904,
   'overview_bm25': 0.06618854827973347,
   'release_year': 0.07113070638739571}}}

In [9]:
import json
# Upload the assembled `linear_model` definition under the name 'movie_titles'.
# Per the recorded output, an existing model with that name is deleted and then recreated.
# NOTE(review): `json` is imported above but not used in this cell.
client.submit_model(featureset='title', 
                    index='tmdb', 
                    model_name='movie_titles', 
                    solr_model=linear_model)

Deleted Model movie_titles [Status: 200]
Created Model movie_titles [Status: 200]


In [10]:
from ltr import search
# Search for 'rambo', passing the name of the model uploaded above. The recorded
# output lists each hit's title, score, release year, genres, and overview.
search(client, keywords='rambo', modelName='movie_titles')

['Rambo'] 
1.1010934 
2008 
['Action', 'Thriller'] 
["When governments fail to act on behalf of captive missionaries, ex-Green Beret John James Rambo sets aside his peaceful existence along the Salween River in a war-torn region of Thailand to take action.  Although he's still haunted by violent memories of his time as a U.S. soldier during the Vietnam War, Rambo can hardly turn his back on the aid workers who so desperately need his help."] 
---------------------------------------
['Rambo III'] 
0.87011147 
1988 
['Action', 'Adventure', 'Thriller', 'War'] 
["Combat has taken its toll on Rambo, but he's finally begun to find inner peace in a monastery. When Rambo's friend and mentor Col. Trautman asks for his help on a top secret mission to Afghanistan, Rambo declines but must reconsider when Trautman is captured."] 
---------------------------------------
['Rambo: First Blood Part II'] 
0.5296307 
1985 
['Action', 'Adventure', 'Drama', 'Thriller', 'War'] 
["Col. Troutman recruits ex-G