# Solr Client

In [None]:
from ltr.client import SolrClient
# Shared Solr client instance; the cells below pass it to the ltr helpers
# (index rebuild, feature setup, feature logging, model upload, search).
client = SolrClient()

import numpy as np

# Download & Build Index (run once)

Run the cells below if you don't already have the downloaded dependencies, or if you don't have TheMovieDB data indexed.

In [None]:
from ltr import download
# Fetch the downloadable dependencies (run once).
# NOTE(review): the trailing semicolon presumably just suppresses the
# notebook's echo of the return value -- confirm.
download();

In [None]:
from ltr.index import rebuild_tmdb
# (Re)build the TMDB index through the shared client (run once).
rebuild_tmdb(client)

## Features for movie titles

We'll be searching movie titles (think searching for a specific movie on Netflix), and we have a set of judgments around the appropriate movie to return. That is, a search for "Star Wars" should return good Star Wars matches, in quality order...

The features below cover various aspects of the problem (searching title by phrase, title BM25 score, release date, etc). We'll use them to explore and analyze a simple model.

In [None]:
# Feature definitions for the "title" feature store. Every entry is a
# SolrFeature whose "q" param holds a Solr query; ${keywords} is a template
# placeholder (presumably filled with the search keywords when features are
# logged or a model is applied -- confirm against the ltr client).
# The order of this list matters: a later cell pairs each entry, by position,
# with the scaler statistics and model coefficients.
ftr_config = [
    # 1: score of the keywords queried against the title field
    {
      "name" : "title_bm25",
      "store": "title",
      "class" : "org.apache.solr.ltr.feature.SolrFeature",
      "params" : {
        "q" : "title:(${keywords})"
      }
    },
    # 2: score of the keywords queried against the overview field
    {
      "name" : "overview_bm25",
      "store": "title",
      "class" : "org.apache.solr.ltr.feature.SolrFeature",
      "params" : {
        "q" : "overview:(${keywords})"
      }
    },
    {  # 3: function query on release_year, with 2000 as the def() fallback
      "name" : "release_year",
      "store": "title",
      "class" : "org.apache.solr.ltr.feature.SolrFeature",
      "params" : {
        "q" : "{!func}def(release_year,2000)"
      }
    }

]



from ltr import setup
# Hand the feature definitions in ftr_config to ltr's setup for the 'tmdb'
# index under the featureset name 'title' (the same name the later logging
# and model-upload cells refer to).
setup(client, config=ftr_config, index='tmdb', featureset='title')

## Training Set Generation

Log the features for each of the above queries out to a training set file.

In [None]:
from ltr.log import judgments_to_training_set
# Log 'title' featureset values for the judgments in judgmentInFile.
# NOTE(review): going by the argument names, the result is also written to
# trainingOutFile, which a later cell reads back -- confirm in ltr.log.
trainingSet = judgments_to_training_set(client, 
                                        judgmentInFile='data/title_judgments_binary.txt', 
                                        trainingOutFile='data/title_judgments_binary_train.txt', 
                                        featureSet='title')

In [None]:
import numpy as np
from ltr.judgments import judgments_from_file, judgments_to_nparray

def pairwise_transform(features, predictors):
    """Turn per-document samples into pairwise difference samples.

    Informed by https://gist.github.com/agramfort/2071994

    For every ordered pair of rows (i, j) that share a query id but have
    different grades, one output sample is produced:
    ``features[i] - features[j]`` with target ``grade[i] - grade[j]``.
    Both orderings of a pair are emitted, so the output is symmetric.

    Args:
        features: 2-D array-like, one row of feature values per sample.
        predictors: 2-D array-like with exactly two columns per sample:
            column 0 is the grade, column 1 is the query id.

    Returns:
        Tuple ``(pair_features, pair_predictors)`` of numpy arrays with
        shapes ``(n_pairs, n_features)`` and ``(n_pairs, 1)``. Pairs are
        ordered by ``i`` and then by ``j``. When no pair qualifies, the
        arrays are empty but keep those two-dimensional shapes.

    Raises:
        AssertionError: if the row counts differ, ``predictors`` does not
            have two columns, or ``features`` has no columns.
    """
    GRADE = 0
    QID = 1

    features = np.asarray(features)
    predictors = np.asarray(predictors)

    assert features.shape[0] == predictors.shape[0]
    assert predictors.shape[1] == 2
    assert features.shape[1] > 0

    grades = predictors[:, GRADE]
    qids = predictors[:, QID].tolist()

    # Bucket row indices by query id so each row is only compared with rows
    # of its own query instead of with every row in the training set.
    rows_by_qid = {}
    for row, qid in enumerate(qids):
        rows_by_qid.setdefault(qid, []).append(row)
    rows_by_qid = {qid: np.array(rows) for qid, rows in rows_by_qid.items()}

    feature_diffs = []
    grade_diffs = []
    for i, qid in enumerate(qids):
        same_query = rows_by_qid[qid]
        # Rows are stored in ascending order, so the j-order of the pairs
        # matches a plain nested loop over all samples.
        others = same_query[grades[same_query] != grades[i]]
        if others.size == 0:
            continue
        feature_diffs.append(features[i] - features[others])
        grade_diffs.append(grades[i] - grades[others])

    if not feature_diffs:
        # Keep the 2-D shapes so callers can rely on them even with no pairs.
        return (np.empty((0, features.shape[1]), dtype=features.dtype),
                np.empty((0, 1), dtype=predictors.dtype))

    return np.concatenate(feature_diffs), np.concatenate(grade_diffs).reshape(-1, 1)

def samples_from_training_data(fname):
    """Build standardized, pairwise training samples from a judgments file.

    The judgments in ``fname`` are converted to feature/predictor arrays,
    the features are standardized with a freshly fitted ``StandardScaler``,
    and the result is run through ``pairwise_transform``.

    Returns:
        Tuple ``(features, predictors, scaler)``: the pairwise feature
        differences, the flattened pairwise targets, and the fitted scaler
        (its statistics are needed later to normalize features the same way).
    """
    raw_features, raw_predictors = judgments_to_nparray(judgments_from_file(fname))

    # Standardize each feature column before differencing.
    print("Scaling")
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(raw_features)

    print("Pairwise Transform")
    pair_features, pair_predictors = pairwise_transform(scaled_features, raw_predictors)

    # Flatten the column of targets into a 1-D vector.
    return pair_features, pair_predictors.ravel(), scaler

# Build the pairwise training data from the logged training set; the fitted
# scaler is kept because a later cell reads its statistics.
features, predictors, scaler = samples_from_training_data(fname='data/title_judgments_binary_train.txt')
# Bare expression: shows the feature matrix as the notebook cell output.
features

In [None]:
# NOTE: `linear_model` is imported here but not used in this cell; a later
# cell rebinds the name to the Solr model dict.
from sklearn import svm, linear_model
# Fit a linear SVM on the pairwise feature differences and their targets.
model = svm.LinearSVC(max_iter=1000, verbose=1)
model.fit(features, predictors)

In [None]:
# Inspect the fitted coefficients (a later cell reads row 0 as the
# per-feature weights).
model.coef_

In [None]:
# Solr LTR LinearModel definition for the "title" feature store. The
# "features" list and "weights" map start empty and are filled in below, one
# entry per feature in ftr_config.
linear_model = {
  "store": "title",
  "class": "org.apache.solr.ltr.model.LinearModel",
  "name": "movie_titles",
  "features": [
  ],
  "params": {
      "weights": {
      }
  }
}

# `math` and `ftr_model` are not needed by this cell any more; they are kept
# in case other cells rely on them.
import math
ftr_model = {}
ftr_names = [ftr['name'] for ftr in ftr_config]
# Feature position idx in ftr_config lines up with column idx of the scaler
# statistics and of the model coefficients.
for idx, ftr_name in enumerate(ftr_names):
    config = {
        "name": ftr_name,
        "norm": {
            "class": "org.apache.solr.ltr.norm.StandardNormalizer",
            "params": {
                "avg": str(scaler.mean_[idx]),
                # Use the scale the scaler actually applied during training.
                # For a zero-variance feature scale_ is 1.0, whereas
                # sqrt(var_) would be 0 and yield an invalid (non-positive)
                # std for the normalizer.
                "std": str(scaler.scale_[idx])
            }
        }
    }
    linear_model['features'].append(config)
    # Store a plain Python float rather than a numpy scalar.
    linear_model['params']['weights'][ftr_name] = float(model.coef_[0][idx])

# Bare expression: shows the assembled model as the notebook cell output.
linear_model

In [None]:
# NOTE: `json` is imported but not used in this cell.
import json
# Submit the assembled linear model under the name 'movie_titles' for the
# 'title' featureset on the 'tmdb' index.
client.submit_model(featureset='title', 
                    index='tmdb', 
                    model_name='movie_titles', 
                    solr_model=linear_model)

In [None]:
from ltr import search
# Try the submitted model: search for 'rambo' with modelName set to the
# model submitted above.
search(client, keywords='rambo', modelName='movie_titles')