# Setup

In [43]:
#import
from elasticsearch import Elasticsearch
from elasticsearch import helpers
import pickle

In [44]:
# Create the Elasticsearch client shared by every cell below. No arguments are
# passed, so the connection target is whatever the installed client library
# uses by default.
# NOTE(review): which host that resolves to depends on the client version — confirm.
es = Elasticsearch()

In [45]:
# Load the movie collection from a local pickle file. The context manager
# closes the file handle as soon as loading finishes instead of leaking it.
# NOTE: unpickling can execute arbitrary code — only load a movies.p file that
# comes from a trusted source.
with open("../movies.p", "rb") as movies_file:
    movies = pickle.load(movies_file)

# Search Completion

## Completions from the Documents Being Searched

In [46]:
# Start from a clean slate: drop any existing "tmdb" index. This stays
# best-effort (the index may simply not exist yet), but only ordinary errors
# are ignored so KeyboardInterrupt / SystemExit are no longer swallowed.
try:
    es.indices.delete(index="tmdb")
except Exception:
    pass

# Index definition for completions taken from the documents being searched:
#   * "completion_analyzer" = standard tokenizer + lowercase + a shingle filter
#     configured not to emit single-word tokens.
#   * "title" is copied into "completion", which uses that analyzer and has
#     fielddata enabled; get_completion_query() runs its terms aggregation on
#     the "completion" field.
# NOTE(review): an earlier note here said genres.name should be keyword
# tokenized so that 'science fiction' is not split on whitespace, and suggested
# a combined title+overview text field. The mapping below instead declares
# genres.name as non-indexed text and has no combined field — confirm intent.
body = {
    "settings": {
        "analysis": {
            "filter": {
                "shingle_2": {
                    "type": "shingle",
                    "output_unigrams": "false"}},
            "analyzer": {
                "completion_analyzer": {
                    "tokenizer": "standard",
                    "filter": [
                        "lowercase",
                        "shingle_2"]}}}},
    "mappings": {
        "properties": {
            "genres": {
                "properties": {
                    "name": {
                        "type": "text",
                        "index": "false"}}},
            "title": {
                "type": "text",
                "analyzer": "english",
                "copy_to": ["completion"]},
            "completion": {
                "type": "text",
                "fielddata": "true",
                "analyzer": "completion_analyzer"}}}}
es.indices.create(index="tmdb",settings=body['settings'],mappings=body['mappings'])

{u'acknowledged': True, u'index': u'tmdb', u'shards_acknowledged': True}

In [47]:
#doc indexer
def format_doc(doc):
    """Wrap one movie document in a bulk-helper action for the "tmdb" index.

    The document's own 'id' value is used as the Elasticsearch ``_id`` and the
    document object itself is passed through, unmodified, as ``_source``.
    """
    return dict(_index="tmdb", _id=doc['id'], _source=doc)

def index_movies():
    """Bulk-index every entry of the global ``movies`` through ``es``.

    Returns:
        list: the detail item of each action that ``helpers.streaming_bulk``
        reported as unsuccessful (empty when none were).
    """
    failures = []
    pending = (format_doc(movie) for movie in movies)
    for ok, info in helpers.streaming_bulk(es, pending):
        if not ok:
            failures.append(info)
    return failures

In [48]:
# Run the bulk indexing; `results` keeps what index_movies() returns, i.e. the
# details of any actions that streaming_bulk reported as unsuccessful.
results = index_movies()

In [96]:
def get_completion_query(input_string):
    """Build the search body used for type-ahead completion on movie titles.

    The body always contains a ``match_phrase_prefix`` query on ``title`` that
    returns only the ``title`` source field. When the input is at least two
    characters long, a ``terms`` aggregation on the ``completion`` field is
    added; its ``include`` pattern keeps only the terms that start with the
    part of the input that is still being typed.

    Args:
        input_string: the text typed so far.

    Returns:
        dict: a request body for ``es.search``.
    """
    query_body = {
    "_source": ["title"],
    "query" : {
        "match_phrase_prefix" : {
            "title" : {
                "query" : input_string}}}}

    # If the input string is too short, don't attempt completion.
    if len(input_string) < 2:
        return query_body

    # Isolate the last, still incomplete word.
    input_string = input_string.lstrip()
    last_space_index = input_string.rfind(' ')
    prefix = input_string[last_space_index+1:]

    # If the prefix is 1 or fewer chars, include the previous word as well so
    # the pattern is not hopelessly broad.
    if len(prefix) <= 1:
        previous_space_index = input_string[:last_space_index].rfind(' ')
        prefix = input_string[previous_space_index+1:]

    # The terms of the "completion" field are produced by an analyzer with a
    # lowercase filter, so the pattern has to be lowercase to match anything.
    prefix = prefix.lower()

    # Parenthesised single-argument form prints identically on Python 2 and 3.
    print('prefix %s' % prefix)

    # `include` is a regular expression: backslash-escape the reserved regex
    # characters so typed punctuation is matched literally instead of producing
    # a malformed or unintended pattern.
    reserved_chars = '.?+*|{}[]()"\\#@&<>~'
    escaped_prefix = ''.join(
        '\\' + ch if ch in reserved_chars else ch for ch in prefix)

    query_body['aggs'] = {
        'completion': {
            'terms': {
                'field':'completion',
                 "size": 10000,
                'include': '%s.*' % escaped_prefix
            }
        }
    }
    return query_body
    



# Demo: build the completion request for a partially typed title, show the
# request body, then run it against the "tmdb" index.
query_body = get_completion_query("lost in")
# Parenthesised single-argument form prints identically on Python 2 and 3.
print(str(query_body))
es.search(index="tmdb",body=query_body)

prefix in
{'query': {'match_phrase_prefix': {'title': {'query': 'lost in'}}}, 'aggs': {'completion': {'terms': {'field': 'completion', 'include': 'in.*', 'size': 10000}}}, '_source': ['title']}




{u'_shards': {u'failed': 0, u'skipped': 0, u'successful': 1, u'total': 1},
 u'aggregations': {u'completion': {u'buckets': [{u'doc_count': 1,
     u'key': u'in new'}],
   u'doc_count_error_upper_bound': 0,
   u'sum_other_doc_count': 0}},
 u'hits': {u'hits': [{u'_id': u'85',
    u'_index': u'tmdb',
    u'_score': 4.2908716,
    u'_source': {u'title': u'Raiders of the Lost Ark'},
    u'_type': u'_doc'},
   {u'_id': u'330',
    u'_ignored': [u'overview.keyword'],
    u'_index': u'tmdb',
    u'_score': 3.7148085,
    u'_source': {u'title': u'The Lost World: Jurassic Park'},
    u'_type': u'_doc'},
   {u'_id': u'772',
    u'_ignored': [u'overview.keyword'],
    u'_index': u'tmdb',
    u'_score': 2.9284902,
    u'_source': {u'title': u'Home Alone 2: Lost In New York'},
    u'_type': u'_doc'}],
  u'max_score': 4.2908716,
  u'total': {u'relation': u'eq', u'value': 3}},
 u'timed_out': False,
 u'took': 1}

## Fast Completions Via Specialized Search Indices

In [139]:
# Start from a clean slate: drop any existing "tmdb" index. The index name is
# passed by keyword, matching the other client calls in this notebook, and
# only ordinary errors are ignored so KeyboardInterrupt / SystemExit are no
# longer swallowed by a bare except.
try:
    es.indices.delete(index="tmdb")
except Exception:
    pass

# Index definition for the completion suggester:
#   * "my_analyzer" = a path_hierarchy tokenizer that splits on spaces with
#     "reverse" enabled, followed by a lowercase filter.
#   * "completion" is a field of type "completion" analyzed with it.
# NOTE(review): presumably the reversed space-delimited hierarchy is what lets
# a suggestion match from any word of a title (the sample request for "war"
# returns "World War Z") — confirm against the tokenizer documentation.
# NOTE(review): an earlier note here said genres.name should be keyword
# tokenized so that 'science fiction' is not split on whitespace, and suggested
# a combined title+overview text field. The mapping below instead declares
# genres.name as non-indexed text and has no combined field — confirm intent.
body = {
    "settings": {
        "analysis": {
            "analyzer": {
                "my_analyzer": {
                    "tokenizer": "my_tokenizer",
                    "filter": ["lowercase"]
                }
            },
            "tokenizer": {
                "my_tokenizer": {
                    "type": "path_hierarchy",
                    "delimiter": " ",
                    "reverse": "true"
                }
            }
        }
    },
    "mappings": {
        "properties": {
            "genres": {
                "properties": {
                    "name": {
                        "type": "text",
                        "index": "false"}}},
            "title": {
                "type": "text",
                "analyzer": "english"},
            "completion": {
                "type": "completion",
                "analyzer": "my_analyzer"
            }}}}
es.indices.create(index="tmdb",mappings=body['mappings'],settings=body['settings'])

  


{u'acknowledged': True, u'index': u'tmdb', u'shards_acknowledged': True}

In [140]:
#doc indexer
def format_doc(doc):
    """Build the bulk action for one movie, adding its completion entry.

    The indexed source is a shallow copy of ``doc`` extended with a
    ``completion`` value (the title as the only input, weighted by
    ``int(popularity * 100)``). Working on a copy keeps the caller's document
    untouched, so re-running this cell or indexing the same documents into a
    differently mapped index later does not carry a stray ``completion`` key.

    Args:
        doc: mapping with at least 'id', 'title' and a numeric 'popularity'.

    Returns:
        dict: action with ``_index``, ``_id`` and ``_source``.
    """
    source = dict(doc)
    source["completion"] = {
        "input": [doc["title"]],
        "weight": int(doc["popularity"]*100)
    }
    action = {
        "_index": "tmdb",
        "_id": doc['id'],
        "_source": source
        }
    return action

def index_movies():
    """Send every entry of the global ``movies`` to Elasticsearch in bulk.

    Returns:
        list: detail items for the actions ``helpers.streaming_bulk`` flagged
        as not successful; empty if there were none.
    """
    failed = []
    bulk_stream = helpers.streaming_bulk(es, (format_doc(entry) for entry in movies))
    for succeeded, detail in bulk_stream:
        if succeeded:
            continue
        failed.append(detail)
    return failed

In [141]:
# Re-index the movies into the freshly created index; `results` keeps what
# index_movies() returns (details of actions reported as unsuccessful).
results = index_movies()

In [142]:

        
# Ask the completion suggester, via the "completion" field, for entries
# matching the typed text. The request carries only a "suggest" section.
text = "war"
suggest_dictionary = {
    "my-entity-suggest": dict(
        text=text,
        completion=dict(field="completion"),
    ),
}
query_dictionary = dict(suggest=suggest_dictionary)

es.search(index="tmdb", body=query_dictionary)

  del sys.path[0]


{u'_shards': {u'failed': 0, u'skipped': 0, u'successful': 1, u'total': 1},
 u'hits': {u'hits': [],
  u'max_score': None,
  u'total': {u'relation': u'eq', u'value': 0}},
 u'suggest': {u'my-entity-suggest': [{u'length': 3,
    u'offset': 0,
    u'options': [{u'_id': u'72190',
      u'_ignored': [u'overview.keyword'],
      u'_index': u'tmdb',
      u'_score': 461.0,
      u'_source': {u'adult': False,
       u'backdrop_path': u'/xMOQVYLeIKBXenJ9KMeasj7S64y.jpg',
       u'belongs_to_collection': None,
       u'budget': 200000000,
       u'completion': {u'input': [u'World War Z'], u'weight': 461},
       u'genres': [{u'id': 28, u'name': u'Action'},
        {u'id': 18, u'name': u'Drama'},
        {u'id': 27, u'name': u'Horror'},
        {u'id': 878, u'name': u'Science Fiction'},
        {u'id': 53, u'name': u'Thriller'}],
       u'homepage': u'http://www.worldwarzmovie.com',
       u'id': 72190,
       u'imdb_id': u'tt0816711',
       u'original_language': u'en',
       u'original_title': u

# Post-Search Suggest

In [4]:
# Start from a clean slate: drop any existing "tmdb" index. The index name is
# passed by keyword, and only ordinary errors are ignored so that
# KeyboardInterrupt / SystemExit are no longer swallowed by a bare except.
try:
    es.indices.delete(index="tmdb")
except Exception:
    pass

# Index definition for post-search suggestions: "title" is analyzed with the
# english analyzer and copied into a separate "suggestion" field, and
# genres.name is left un-analyzed so that a value such as 'science fiction'
# is not split on whitespace.
# NOTE(review): this mapping nests its properties under a "movie" type and uses
# "string" fields, unlike the typeless "text" mappings used by the earlier
# index definitions in this notebook — presumably it targets an older
# Elasticsearch version; confirm before running it against the same cluster.
# TODO (carried over from the original note): maybe add a text field combining
# title and overview to search against.
body = {
    "mappings": {
        "movie": {
            "properties": {
                "genres": {
                    "properties": {
                        "name": {
                            "type": "string",
                            "index": "not_analyzed"}}},
                "title": {
                    "type": "string",
                    "analyzer": "english",
                    "copy_to": ["suggestion"]},
                "suggestion": {
                    "type": "string"}}}}}
es.indices.create(index="tmdb",body=body)

{u'acknowledged': True}

In [5]:
#doc indexer
def format_doc(doc):
    """Wrap one movie document in a bulk action for index "tmdb", type "movie".

    The document's 'id' value becomes the ``_id`` and the document object
    itself is passed through, unmodified, as ``_source``.
    """
    return dict(
        _index="tmdb",
        _type="movie",
        _id=doc['id'],
        _source=doc,
    )

def index_movies():
    """Bulk-index the global ``movies`` and collect what did not succeed.

    Returns:
        list: one detail item per action that ``helpers.streaming_bulk``
        reported with a false success flag.
    """
    outcomes = helpers.streaming_bulk(es, (format_doc(item) for item in movies))
    problems = []
    for was_ok, report in outcomes:
        if not was_ok:
            problems.append(report)
    return problems

In [6]:
# Index the movies into the index created above; `results` keeps what
# index_movies() returns (details of actions reported as unsuccessful).
results = index_movies()

In [7]:
# Ask the phrase suggester, on the "suggestion" field, for corrections of a
# misspelled phrase.
# NOTE(review): es.suggest(...) is a different client entry point from the
# es.search(...) calls used in the rest of this notebook — presumably it only
# exists in older client versions; confirm.
suggest_body = dict(
    title_suggestion=dict(
        text="star trec",
        phrase=dict(field="suggestion"),
    ),
)

es.suggest(index="tmdb", body=suggest_body)

{u'_shards': {u'failed': 0, u'successful': 5, u'total': 5},
 u'title_suggestion': [{u'length': 9,
   u'offset': 0,
   u'options': [],
   u'text': u'star trec'}]}

In [9]:
# Works in Elasticsearch 1.5
# One request that both searches titles for the misspelled phrase and asks the
# phrase suggester for corrections. "collate" is given a match_phrase query on
# title that contains the {{suggestion}} placeholder.
query_body = dict(
    fields=["title"],
    query=dict(match=dict(title="star trec")),
    suggest=dict(
        title_completion=dict(
            text="star trec",
            phrase=dict(
                field="suggestion",
                max_errors=2,
                collate=dict(
                    query=dict(
                        match_phrase=dict(title="{{suggestion}}"),
                    ),
                ),
            ),
        ),
    ),
)

es.search(index="tmdb", body=query_body, size=2)

{u'_shards': {u'failed': 0, u'successful': 5, u'total': 5},
 u'hits': {u'hits': [{u'_id': u'13475',
    u'_index': u'tmdb',
    u'_score': 0.83896446,
    u'_type': u'movie',
    u'fields': {u'title': [u'Star Trek']}},
   {u'_id': u'222935',
    u'_index': u'tmdb',
    u'_score': 0.68552226,
    u'_type': u'movie',
    u'fields': {u'title': [u'The Fault in Our Stars']}}],
  u'max_score': 0.83896446,
  u'total': 9},
 u'suggest': {u'title_completion': [{u'length': 9,
    u'offset': 0,
    u'options': [{u'score': 0.015584747, u'text': u'star trek'}],
    u'text': u'star trec'}]},
 u'timed_out': False,
 u'took': 168}

In [11]:
# Works in Elasticsearch 2.1
# Same search-plus-suggest request as the 1.5 variant; the only difference is
# that the collate query is wrapped in an "inline" object.
query_body = dict(
    fields=["title"],
    query=dict(match=dict(title="star trec")),
    suggest=dict(
        title_completion=dict(
            text="star trec",
            phrase=dict(
                field="suggestion",
                max_errors=2,
                collate=dict(
                    query=dict(
                        inline=dict(
                            match_phrase=dict(title="{{suggestion}}"),
                        ),
                    ),
                ),
            ),
        ),
    ),
)

es.search(index="tmdb", body=query_body, size=2)

{u'_shards': {u'failed': 0, u'successful': 5, u'total': 5},
 u'hits': {u'hits': [{u'_id': u'13475',
    u'_index': u'tmdb',
    u'_score': 0.83896446,
    u'_type': u'movie',
    u'fields': {u'title': [u'Star Trek']}},
   {u'_id': u'222935',
    u'_index': u'tmdb',
    u'_score': 0.68552226,
    u'_type': u'movie',
    u'fields': {u'title': [u'The Fault in Our Stars']}}],
  u'max_score': 0.83896446,
  u'total': 9},
 u'suggest': {u'title_completion': [{u'length': 9,
    u'offset': 0,
    u'options': [{u'score': 0.015584747, u'text': u'star trek'}],
    u'text': u'star trec'}]},
 u'timed_out': False,
 u'took': 35}