# Setup

In [1]:
#import
from elasticsearch import Elasticsearch
from elasticsearch import helpers
import pickle

In [2]:
# Initialize the Elasticsearch client. No arguments are passed, so the client
# library's default connection settings are used.
es = Elasticsearch()

In [3]:
# Load the movie documents from the pickle at the relative path ../movies.p.
# The file is opened in a `with` block so the handle is closed once loading
# finishes instead of being left open for the life of the kernel.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load a movies.p that comes from a source you trust.
with open("../movies.p", "rb") as movies_file:
    movies = pickle.load(movies_file)

# Search Completion

## Completions from the Documents Being Searched

In [4]:
# Start from a clean slate: drop any existing "tmdb" index. A 404 (the index
# does not exist yet) is ignored; any other failure, such as an unreachable
# cluster, is raised instead of being silently swallowed.
es.indices.delete("tmdb", ignore=[404])

# Index definition for completions drawn from the documents being searched.
body = {
    "settings": {
        "analysis": {
            "filter": {
                # Shingle token filter with unigram output switched off.
                "shingle_2": {
                    "type": "shingle",
                    "output_unigrams": "false"}},
            "analyzer": {
                # Analyzer for the "completion" field: standard tokenizer,
                # then the standard, lowercase and shingle_2 filters.
                "completion_analyzer": {
                    "tokenizer": "standard",
                    "filter": [
                        "standard",
                        "lowercase",
                        "shingle_2"]}}}},
    "mappings": {
        "movie": {
            "properties": {
                "genres": {
                    "properties": {
                        # Not analyzed, so a genre such as 'science fiction'
                        # is not split on white space.
                        "name": {
                            "type": "string",
                            "index": "not_analyzed"}}},
                # Titles are analyzed as English text and also copied into
                # the "completion" field.
                "title": {
                    "type": "string",
                    "analyzer": "english",
                    "copy_to": ["completion"]},
                "completion": {
                    "type": "string",
                    "analyzer": "completion_analyzer"}}}}}
es.indices.create("tmdb", body=body)



{u'acknowledged': True}

In [5]:
#doc indexer
def format_doc(doc):
    """Wrap a movie document in a bulk action for the "tmdb" index.

    The action targets type "movie", uses ``doc['id']`` as the document id
    and carries ``doc`` itself (not a copy) as the ``_source``.
    """
    return dict(_index="tmdb", _type="movie", _id=doc['id'], _source=doc)

def index_movies():
    """Bulk-index every entry of ``movies`` and return the failure details.

    Each movie is turned into a bulk action with ``format_doc`` and streamed
    to Elasticsearch through ``helpers.streaming_bulk``. The returned list
    holds the detail object of every action that was not reported as
    successful.

    NOTE(review): streaming_bulk may raise on the first failed action by
    default, in which case this list would always come back empty - confirm
    against the helper's documentation.
    """
    failures = []
    bulk_actions = (format_doc(movie) for movie in movies)
    for ok, info in helpers.streaming_bulk(es, bulk_actions):
        if not ok:
            failures.append(info)
    return failures

In [6]:
# Bulk-index all movies; `results` receives whatever index_movies() returns,
# i.e. the details of actions that were not reported as successful.
results = index_movies()

In [7]:
def get_completion_query(input_string, min_length=3):
    """Build the search body used to offer completions for ``input_string``.

    The body always contains a ``match_phrase_prefix`` query on ``title``
    (returning only the ``title`` field) built from the input exactly as
    given. When the input, ignoring leading whitespace, is at least
    ``min_length`` characters long, a ``terms`` aggregation on the
    ``completion`` field is added whose ``include`` pattern selects the terms
    that start with the text currently being typed.

    Args:
        input_string: Text the user has typed so far.
        min_length: Minimum number of characters (leading whitespace not
            counted) required before the aggregation is attempted.

    Returns:
        dict: Request body suitable for ``es.search``.
    """
    query_body = {
        "fields": ["title"],
        "query": {
            "match_phrase_prefix": {
                "title": {
                    "query": input_string}}}}

    # Leading whitespace carries no information, so it must not count
    # towards the minimum length.
    typed = input_string.lstrip()

    # If the input string is too short, don't attempt completion.
    if len(typed) < min_length:
        return query_body

    # The text being completed is whatever follows the last space.
    last_space_index = typed.rfind(' ')
    prefix = typed[last_space_index + 1:]

    # With one character or less to go on, include the previous word as well.
    if len(prefix) <= 1:
        previous_space_index = typed[:last_space_index].rfind(' ')
        prefix = typed[previous_space_index + 1:]

    # The "completion" field is indexed through an analyzer that lowercases
    # its tokens, so the prefix has to be lowercased to match any term.
    # Characters reserved by the regular-expression syntax are escaped so the
    # typed text is matched literally rather than interpreted as a pattern.
    reserved = '.?+*|{}[]()"\\#@&<>~'
    escaped_prefix = ''.join(
        '\\' + ch if ch in reserved else ch for ch in prefix.lower())

    query_body['aggs'] = {
        'completion': {
            'terms': {
                'field': 'completion',
                'include': escaped_prefix + '.*'
            }
        }
    }
    return query_body
    

# Build the completion request for a partially typed title, show the request
# body, then run it; the search response is the cell's displayed result.
query_body = get_completion_query("star tr")
print(query_body)  # parenthesised form is valid on both Python 2 and Python 3
es.search(index="tmdb", doc_type="movie", body=query_body)

{'fields': ['title'], 'aggs': {'completion': {'terms': {'field': 'completion', 'include': 'tr.*'}}}, 'query': {'match_phrase_prefix': {'title': {'query': 'star tr'}}}}


{u'_shards': {u'failed': 0, u'successful': 5, u'total': 5},
 u'aggregations': {u'completion': {u'buckets': [],
   u'doc_count_error_upper_bound': 0,
   u'sum_other_doc_count': 0}},
 u'hits': {u'hits': [{u'_id': u'13475',
    u'_index': u'tmdb',
    u'_score': 2.982868,
    u'_type': u'movie',
    u'fields': {u'title': [u'Star Trek']}}],
  u'max_score': 2.982868,
  u'total': 1},
 u'timed_out': False,
 u'took': 176}

## Fast Completions Via Specialized Search Indices

In [8]:
# Start from a clean slate: drop any existing "tmdb" index. A 404 (the index
# does not exist yet) is ignored; any other failure, such as an unreachable
# cluster, is raised instead of being silently swallowed.
es.indices.delete("tmdb", ignore=[404])

# Index definition for completions served from a dedicated completion field.
body = {
    "mappings": {
        "movie": {
            "properties": {
                "genres": {
                    "properties": {
                        # Not analyzed, so a genre such as 'science fiction'
                        # is not split on white space.
                        "name": {
                            "type": "string",
                            "index": "not_analyzed"}}},
                "title": {
                    "type": "string",
                    "analyzer": "english"},
                # Field of type "completion"; it is the field that suggest
                # requests in this section point at.
                "completion": {
                    "type": "completion"}}}}}
es.indices.create("tmdb", body=body)

{u'acknowledged': True}

In [9]:
#doc indexer
def format_doc(doc):
    """Build the bulk action for one movie, adding completion data.

    The indexed source is a shallow copy of ``doc`` with an extra
    ``completion`` entry: the title as the only suggestion input, and the
    popularity multiplied by 100 and truncated to an int as the weight.

    The caller's ``doc`` is deliberately left untouched, so the shared movie
    dicts do not pick up a stray ``completion`` entry that would then be
    carried into any later indexing run over the same data.

    Args:
        doc: Movie mapping providing at least ``id``, ``title`` and
            ``popularity``.

    Returns:
        dict: Bulk action targeting index "tmdb", type "movie".
    """
    source = dict(doc)  # shallow copy: only the new top-level key differs
    source["completion"] = {
        "input": [doc["title"]],
        "weight": int(doc["popularity"] * 100),
    }
    return {
        "_index": "tmdb",
        "_type": "movie",
        "_id": doc['id'],
        "_source": source,
    }

def index_movies():
    """Stream all movies into Elasticsearch; return details of failed actions.

    Actions are produced lazily, one ``format_doc`` call per entry of
    ``movies``, and handed to ``helpers.streaming_bulk``. Only the detail
    objects of actions not reported as successful are kept.
    """
    def movie_actions():
        for movie in movies:
            yield format_doc(movie)

    failed = []
    for ok, info in helpers.streaming_bulk(es, movie_actions()):
        if ok:
            continue
        failed.append(info)
    return failed

In [10]:
# Re-index all movies into the freshly created index; `results` receives the
# details index_movies() returns for actions not reported as successful.
results = index_movies()

In [11]:
# Ask for completions of the text "star" from the "completion" field; the
# suggestion is returned under the name "title_completion".
completion_options = {"field": "completion"}
suggest_body = {
    "title_completion": {
        "text": "star",
        "completion": completion_options,
    },
}

es.suggest(index="tmdb", body=suggest_body)

{u'_shards': {u'failed': 0, u'successful': 5, u'total': 5},
 u'title_completion': [{u'length': 4,
   u'offset': 0,
   u'options': [{u'score': 312.0,
     u'text': u'Star Wars: Episode IV - A New Hope'},
    {u'score': 298.0, u'text': u'Star Trek Into Darkness'},
    {u'score': 280.0, u'text': u'Star Trek'},
    {u'score': 221.0, u'text': u'Star Wars: Episode I - The Phantom Menace'},
    {u'score': 187.0, u'text': u'Star Wars: Episode VI - Return of the Jedi'}],
   u'text': u'star'}]}

# Post-Search Suggest

In [12]:
# Start from a clean slate: drop any existing "tmdb" index. A 404 (the index
# does not exist yet) is ignored; any other failure, such as an unreachable
# cluster, is raised instead of being silently swallowed.
es.indices.delete("tmdb", ignore=[404])

# Index definition for post-search suggestions.
body = {
    "mappings": {
        "movie": {
            "properties": {
                "genres": {
                    "properties": {
                        # Not analyzed, so a genre such as 'science fiction'
                        # is not split on white space.
                        "name": {
                            "type": "string",
                            "index": "not_analyzed"}}},
                # Titles are analyzed as English text and also copied into
                # the "suggestion" field.
                "title": {
                    "type": "string",
                    "analyzer": "english",
                    "copy_to": ["suggestion"]},
                # Plain string field with no analyzer specified; it is the
                # field the phrase suggester requests point at.
                "suggestion": {
                    "type": "string"}}}}}
es.indices.create("tmdb", body=body)

{u'acknowledged': True}

In [13]:
#doc indexer
def format_doc(doc):
    """Return the bulk action that indexes ``doc`` as a "movie" in "tmdb".

    The document id is taken from ``doc['id']`` and the document object
    itself, unmodified and uncopied, becomes the ``_source``.
    """
    bulk_action = {"_index": "tmdb", "_type": "movie"}
    bulk_action["_id"] = doc['id']
    bulk_action["_source"] = doc
    return bulk_action

def index_movies():
    """Bulk-index ``movies`` and return details for unsuccessful actions.

    Every movie is converted with ``format_doc`` and sent through
    ``helpers.streaming_bulk``; the detail object of each action whose
    success flag is false is collected and returned as a list.
    """
    bulk_results = helpers.streaming_bulk(
        es, (format_doc(movie) for movie in movies))
    return [info for ok, info in bulk_results if not ok]

In [14]:
# Index all movies into the suggestion-enabled index; `results` receives the
# details index_movies() returns for actions not reported as successful.
results = index_movies()

In [15]:
# Ask the phrase suggester for corrections of the misspelled "star trec",
# using the "suggestion" field; the result is returned under the name
# "title_suggestion".
phrase_options = {"field": "suggestion"}
suggest_body = {
    "title_suggestion": {
        "text": "star trec",
        "phrase": phrase_options,
    },
}

es.suggest(index="tmdb", body=suggest_body)

{u'_shards': {u'failed': 0, u'successful': 5, u'total': 5},
 u'title_suggestion': [{u'length': 9,
   u'offset': 0,
   u'options': [{u'score': 0.011547032, u'text': u'star trek'}],
   u'text': u'star trec'}]}

In [19]:
# Works in Elasticsearch 1.5
query_body = { 
  "fields": ["title"],
  "query": {
    "match": {"title":"star trec"}},
  "suggest": { "title_completion": {
    "text": "star trec",
    "phrase": {
      "field": "suggestion",
      "max_errors": 2,
      "collate": {
        "query": { 
          "match_phrase": {
            "title" : "{{suggestion}}"
          }
        }}}}}}
        
es.search(index="tmdb",body=query_body,size=2)

{u'_shards': {u'failed': 0, u'successful': 5, u'total': 5},
 u'hits': {u'hits': [{u'_id': u'13475',
    u'_index': u'tmdb',
    u'_score': 0.70915216,
    u'_type': u'movie',
    u'fields': {u'title': [u'Star Trek']}},
   {u'_id': u'1891',
    u'_index': u'tmdb',
    u'_score': 0.572458,
    u'_type': u'movie',
    u'fields': {u'title': [u'Star Wars: Episode V - The Empire Strikes Back']}}],
  u'max_score': 0.70915216,
  u'total': 9},
 u'suggest': {u'title_completion': [{u'length': 9,
    u'offset': 0,
    u'options': [{u'score': 0.012083458, u'text': u'star trek'}],
    u'text': u'star trec'}]},
 u'timed_out': False,
 u'took': 362}

In [18]:
# Breaks in Elasticsearch 2.1 :-(

query_body = { 
  "fields": ["title"],
  "query": {
    "match": {"title":"star trec"}},
  "suggest": { "title_completion": {
    "text": "star trec",
    "phrase": {
      "field": "suggestion",
      "max_errors": 2,
      "collate": {
        "query": { 
          "match_phrase": {
            "title" : "{{suggestion}}"
          }
        }}}}}}
        
es.search(index="tmdb",body=query_body,size=2)



TransportError: TransportError(500, {u'failed_shards': [{u'node': u'IlIfI3wcS9W769hCXxxvOQ', u'index': u'tmdb', u'reason': {u'reason': u'unexpected field [match_phrase]', u'type': u'script_parse_exception'}, u'shard': 0}], u'root_cause': [{u'reason': u'unexpected field [match_phrase]', u'type': u'script_parse_exception'}], u'grouped': True, u'reason': u'all shards failed', u'phase': u'query', u'type': u'search_phase_execution_exception'})