# Setup

In [1]:
#import
from elasticsearch import Elasticsearch
from elasticsearch import helpers
import pickle

In [2]:
# Create the Elasticsearch client that the rest of the notebook uses.
# No connection arguments are passed, so the client library's defaults apply.
es = Elasticsearch()

In [3]:
# Load the movie documents that get indexed below.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load movies_list.p if it comes from a trusted source.
# The context manager closes the file handle as soon as the data is read
# (the original left the handle open for the lifetime of the kernel).
with open("../../movies_list.p", "rb") as movies_file:
    movies = pickle.load(movies_file)

# Search Completion

## Completions from the Documents Being Searched

In [57]:
# Drop any "tmdb" index left over from a previous run so this cell can be
# re-executed. Only a 404 (index does not exist) is tolerated; every other
# failure is raised instead of being silently swallowed by a bare `except`.
es.indices.delete(index="tmdb", ignore=404)

# Index definition for completions drawn from the searched documents.
#  * settings: `completion_analyzer` runs the standard tokenizer, lowercases
#    the tokens and then applies the `shingle_2` shingle filter with unigram
#    output switched off.
#  * genres.name is not analyzed so that 'science fiction' stays a single term
#    instead of being split on white space.
#  * title is analyzed with the english analyzer and copied into `completion`,
#    which is analyzed with `completion_analyzer`.
# TODO: maybe create a text field with title and overview to search against.
body = {
    "settings": {
        "analysis": {
            "filter": {
                "shingle_2": {
                    "type": "shingle",
                    "output_unigrams": "false",
                },
            },
            "analyzer": {
                "completion_analyzer": {
                    "tokenizer": "standard",
                    "filter": ["standard", "lowercase", "shingle_2"],
                },
            },
        },
    },
    "mappings": {
        "movie": {
            "properties": {
                "genres": {
                    "properties": {
                        "name": {"type": "string", "index": "not_analyzed"},
                    },
                },
                "title": {
                    "type": "string",
                    "analyzer": "english",
                    "copy_to": ["completion"],
                },
                "completion": {
                    "type": "string",
                    "analyzer": "completion_analyzer",
                },
            },
        },
    },
}
es.indices.create("tmdb", body=body)

{u'acknowledged': True}

In [3]:
# bulk action builder
def format_doc(doc):
    """Wrap one movie document in a bulk-index action.

    The action targets the "tmdb" index with type "movie", uses doc['id'] as
    the document id and carries the document itself (the same object, not a
    copy) as `_source`.

    Raises KeyError if the document has no 'id' key.
    """
    return {
        "_index": "tmdb",
        "_type": "movie",
        "_id": doc['id'],
        "_source": doc,
    }

def index_movies():
    """Bulk-index every entry of `movies` and return the failure details.

    Each movie is turned into an action by format_doc (lazily, through a
    generator) and fed to helpers.streaming_bulk. The details of every item
    whose success flag is falsy are collected and returned as a list.
    """
    failures = []
    pending_actions = (format_doc(movie) for movie in movies)
    for ok, info in helpers.streaming_bulk(es, pending_actions):
        if not ok:
            failures.append(info)
    return failures

In [58]:
# Index all movies; `results` holds the details of any items that
# streaming_bulk reported as unsuccessful.
results = index_movies()

In [164]:
def get_completion_query(input_string):
    """Build the search body for search-as-you-type title completion.

    The returned dict always contains a `match_phrase_prefix` query on `title`
    for the text exactly as typed, returning only the `title` field. When at
    least three characters have been typed (leading whitespace excluded) a
    `suggest` terms aggregation on the `completion` field is added, restricted
    by an `include` regex to terms that start with the word being typed.

    Parameters:
        input_string: the text the user has typed so far.

    Returns:
        dict: request body suitable for `es.search(..., body=...)`.
    """
    query_body = {
    "fields": ["title"],
    "query" : {
        "match_phrase_prefix" : {
            "title" : {
                "query" : input_string}}}}

    # Leading whitespace is not typed content, so strip it *before* the
    # length check; otherwise "  a" would count as three characters.
    typed = input_string.lstrip()

    # if the input string is too short, then don't attempt completion
    if len(typed) < 3:
        return query_body

    # get the last uncompleted word
    last_space_index = typed.rfind(' ')
    prefix = typed[last_space_index+1:]

    # if the prefix is 1 or less chars then include the previous word in the prefix
    if len(prefix) <= 1:
        previous_space_index = typed[:last_space_index].rfind(' ')
        prefix = typed[previous_space_index+1:]

    # The terms of the `completion` field are lowercased at index time (this
    # assumes the completion_analyzer with its "lowercase" filter defined in
    # this notebook's index settings), so the prefix must be lowercased too or
    # input such as "Star Tr" would match no bucket.
    prefix = prefix.lower()

    # The prefix is user input that ends up inside a regular expression.
    # Backslash-escape the characters the Lucene regexp syntax reserves so a
    # stray "(" or "*" is matched literally instead of breaking the request.
    lucene_reserved = '.?+*|{}[]()"\\#@&<>~'
    escaped_prefix = ''.join(
        ('\\' + ch) if ch in lucene_reserved else ch for ch in prefix)

    query_body['aggs'] = {
        'suggest': {
            'terms': {
                'field':'completion',
                'include': '%s.*' % escaped_prefix
            }
        }
    }
    return query_body
    

# Example: build the completion request for the partially typed "star tr",
# show the generated body, then run it against the tmdb index.
query_body = get_completion_query("star tr")
# print(...) with a single argument behaves identically on Python 2 and
# Python 3; the bare print statement is a SyntaxError on Python 3.
print(query_body)
es.search(index="tmdb",doc_type="movie",body=query_body)

{'fields': ['title'], 'aggs': {'suggest': {'terms': {'field': 'completion', 'include': 'tr.*'}}}, 'query': {'match_phrase_prefix': {'title': {'query': 'star tr'}}}}


{u'_shards': {u'failed': 0, u'successful': 5, u'total': 5},
 u'aggregations': {u'suggest': {u'buckets': [{u'doc_count': 1,
     u'key': u'trek 3'},
    {u'doc_count': 1, u'key': u'trek axanar'},
    {u'doc_count': 1, u'key': u'trek first'},
    {u'doc_count': 1, u'key': u'trek generations'},
    {u'doc_count': 1, u'key': u'trek horizon'},
    {u'doc_count': 1, u'key': u'trek ii'},
    {u'doc_count': 1, u'key': u'trek iii'},
    {u'doc_count': 1, u'key': u'trek insurrection'},
    {u'doc_count': 1, u'key': u'trek into'},
    {u'doc_count': 1, u'key': u'trek iv'}],
   u'doc_count_error_upper_bound': 0,
   u'sum_other_doc_count': 4}},
 u'hits': {u'hits': [{u'_id': u'13475',
    u'_index': u'tmdb',
    u'_score': 221.89178,
    u'_type': u'movie',
    u'fields': {u'title': [u'Star Trek']}},
   {u'_id': u'152',
    u'_index': u'tmdb',
    u'_score': 211.32361,
    u'_type': u'movie',
    u'fields': {u'title': [u'Star Trek: The Motion Picture']}},
   {u'_id': u'200',
    u'_index': u'tmdb',


## Fast Completions Via Specialized Search Indices

In [44]:
# Drop any "tmdb" index left over from a previous run so this cell can be
# re-executed. Only a 404 (index does not exist) is tolerated; every other
# failure is raised instead of being silently swallowed by a bare `except`.
es.indices.delete(index="tmdb", ignore=404)

# Index definition for the completion-suggester variant.
#  * genres.name is not analyzed so that 'science fiction' stays a single term
#    instead of being split on white space.
#  * title is analyzed with the english analyzer.
#  * completion uses the dedicated "completion" field type.
# TODO: maybe create a text field with title and overview to search against.
body = {
    "mappings": {
        "movie": {
            "properties": {
                "genres": {
                    "properties": {
                        "name": {"type": "string", "index": "not_analyzed"},
                    },
                },
                "title": {
                    "type": "string",
                    "analyzer": "english",
                },
                "completion": {
                    "type": "completion",
                },
            },
        },
    },
}
es.indices.create("tmdb", body=body)

{u'acknowledged': True}

In [45]:
#doc indexer
def format_doc(doc):
    """Wrap one movie document in a bulk-index action with completion data.

    The indexed `_source` is the document plus a `completion` entry whose
    input is the movie title and whose weight is the popularity scaled by 100
    and truncated to an int.

    The caller's document is left untouched: the original version added the
    `completion` key to `doc` itself, which altered the shared entries of
    `movies` and leaked that key into any later re-indexing of the same list.

    Raises KeyError if 'id', 'title' or 'popularity' is missing.
    """
    # Shallow copy: only a top-level key is added, so nested values can be shared.
    source = dict(doc)
    source["completion"] = {
        "input": [doc["title"]],
        "weight": int(doc["popularity"]*100)
    }
    action = {
        "_index": "tmdb",
        "_type": "movie",
        "_id": doc['id'],
        "_source": source
        }
    return action

def index_movies():
    """Bulk-index every entry of `movies` and return the failure details.

    Actions are produced lazily by format_doc and streamed through
    helpers.streaming_bulk; the details of each item whose success flag is
    falsy end up in the returned list.
    """
    bulk_stream = helpers.streaming_bulk(es, (format_doc(movie) for movie in movies))
    unsuccessful = []
    for succeeded, details in bulk_stream:
        if succeeded:
            continue
        unsuccessful.append(details)
    return unsuccessful

In [46]:
# Index all movies; `results` holds the details of any items that
# streaming_bulk reported as unsuccessful.
results = index_movies()

In [54]:
# Ask the completion suggester (named "title_completion" in the response)
# for entries of the `completion` field matching the text "star".
suggest_body = {
    "title_completion": {
        "text": "star",
        "completion": {
            "field": "completion",
        },
    },
}

es.suggest(index="tmdb", body=suggest_body)

{u'_shards': {u'failed': 0, u'successful': 5, u'total': 5},
 u'title_completion': [{u'length': 4,
   u'offset': 0,
   u'options': [{u'score': 326.0,
     u'text': u'Star Wars: Episode IV - A New Hope'},
    {u'score': 273.0, u'text': u'Star Trek'},
    {u'score': 212.0,
     u'text': u'Star Wars: Episode V - The Empire Strikes Back'},
    {u'score': 202.0, u'text': u'Star Wars: Episode VI - Return of the Jedi'},
    {u'score': 194.0, u'text': u'Star Trek Into Darkness'}],
   u'text': u'star'}]}

# Post-Search Suggest

In [13]:
# Drop any "tmdb" index left over from a previous run so this cell can be
# re-executed. Only a 404 (index does not exist) is tolerated; every other
# failure is raised instead of being silently swallowed by a bare `except`.
es.indices.delete(index="tmdb", ignore=404)

# Index definition for the post-search (phrase) suggester.
#  * genres.name is not analyzed so that 'science fiction' stays a single term
#    instead of being split on white space.
#  * title is analyzed with the english analyzer and copied into `suggestion`,
#    a plain string field with no analyzer specified.
# TODO: maybe create a text field with title and overview to search against.
body = {
    "mappings": {
        "movie": {
            "properties": {
                "genres": {
                    "properties": {
                        "name": {"type": "string", "index": "not_analyzed"},
                    },
                },
                "title": {
                    "type": "string",
                    "analyzer": "english",
                    "copy_to": ["suggestion"],
                },
                "suggestion": {
                    "type": "string",
                },
            },
        },
    },
}
es.indices.create("tmdb", body=body)

{u'acknowledged': True}

In [14]:
# bulk action builder
def format_doc(doc):
    """Return the bulk-index action for a single movie document.

    The document is indexed unchanged (same object, no copy) into the "tmdb"
    index as type "movie", keyed by doc['id'].

    Raises KeyError if the document has no 'id' key.
    """
    movie_id = doc['id']
    return dict(_index="tmdb", _type="movie", _id=movie_id, _source=doc)

def index_movies():
    """Bulk-index every entry of `movies` and return the failure details.

    format_doc builds one action per movie (lazily); helpers.streaming_bulk
    yields a (success, details) pair per action, and the details of the
    unsuccessful ones are returned as a list.
    """
    errors = []
    movie_actions = (format_doc(movie) for movie in movies)
    for was_indexed, outcome in helpers.streaming_bulk(es, movie_actions):
        if not was_indexed:
            errors.append(outcome)
    return errors

In [15]:
# Index all movies; `results` holds the details of any items that
# streaming_bulk reported as unsuccessful.
results = index_movies()

In [85]:
# Ask the phrase suggester (named "title_completion" in the response) for
# corrections of the misspelled "star trec", based on the `suggestion` field.
suggest_body = {
    "title_completion": {
        "text": "star trec",
        "phrase": {
            "field": "suggestion",
        },
    },
}

es.suggest(index="tmdb", body=suggest_body)



TransportError: TransportError(500, u'{"_shards":{"total":5,"successful":0,"failed":5,"failures":[{"index":"tmdb","shard":0,"reason":"BroadcastShardOperationFailedException[[tmdb][0] ]; nested: ElasticsearchException[failed to execute suggest]; nested: ElasticsearchIllegalArgumentException[suggester[phrase] doesn\'t support field [fieldd]]; "},{"index":"tmdb","shard":1,"reason":"BroadcastShardOperationFailedException[[tmdb][1] ]; nested: ElasticsearchException[failed to execute suggest]; nested: ElasticsearchIllegalArgumentException[suggester[phrase] doesn\'t support field [fieldd]]; "},{"index":"tmdb","shard":2,"reason":"BroadcastShardOperationFailedException[[tmdb][2] ]; nested: ElasticsearchException[failed to execute suggest]; nested: ElasticsearchIllegalArgumentException[suggester[phrase] doesn\'t support field [fieldd]]; "},{"index":"tmdb","shard":3,"reason":"BroadcastShardOperationFailedException[[tmdb][3] ]; nested: ElasticsearchException[failed to execute suggest]; nested: ElasticsearchIllegalArgumentException[suggester[phrase] doesn\'t support field [fieldd]]; "},{"index":"tmdb","shard":4,"reason":"BroadcastShardOperationFailedException[[tmdb][4] ]; nested: ElasticsearchException[failed to execute suggest]; nested: ElasticsearchIllegalArgumentException[suggester[phrase] doesn\'t support field [fieldd]]; "}]}}')

In [79]:
# One request that both searches titles for the misspelled "star trec" and
# asks the phrase suggester for corrections from the `suggestion` field.
#  * max_errors is set to 2.
#  * collate supplies a match_phrase query on title templated with
#    {{suggestion}} (presumably so only suggestions that match a document are
#    kept; confirm against the phrase suggester documentation).
#  * highlight wraps the changed words of each suggestion in <b>...</b>.
query_body = {
    "fields": ["title"],
    "query": {
        "match": {"title": "star trec"},
    },
    "suggest": {
        "title_completion": {
            "text": "star trec",
            "phrase": {
                "field": "suggestion",
                "max_errors": 2,
                "collate": {
                    "query": {
                        "match_phrase": {"title": "{{suggestion}}"},
                    },
                },
                "highlight": {
                    "pre_tag": "<b>",
                    "post_tag": "</b>",
                },
            },
        },
    },
}

es.search(index="tmdb", body=query_body, size=2)

{u'_shards': {u'failed': 0, u'successful': 5, u'total': 5},
 u'hits': {u'hits': [{u'_id': u'310980',
    u'_index': u'tmdb',
    u'_score': 2.0078163,
    u'_type': u'movie',
    u'fields': {u'title': [u'Star']}},
   {u'_id': u'274871',
    u'_index': u'tmdb',
    u'_score': 1.2548852,
    u'_type': u'movie',
    u'fields': {u'title': [u'Stars in Brazil']}}],
  u'max_score': 2.0078163,
  u'total': 90},
 u'suggest': {u'title_completion': [{u'length': 9,
    u'offset': 0,
    u'options': [{u'highlighted': u'star <b>trek</b>',
      u'score': 0.0019600056,
      u'text': u'star trek'},
     {u'highlighted': u'star <b>they</b>',
      u'score': 0.0016621534,
      u'text': u'star they'}],
    u'text': u'star trec'}]},
 u'timed_out': False,
 u'took': 77}