In [None]:
import requests
import json
import os

# Some utilities for flattening the explain into something a bit more
# readable. Pass Explain JSON, get something readable (ironically this is what Solr's default output is :-p)
def flatten(l):
    """Flatten one level of nesting.

    Takes an iterable of iterables and returns a single list holding every
    inner item in order. Only the outermost level is flattened.
    """
    # The comprehension result must be returned; without the return the
    # function always produced None.
    return [item for sublist in l for item in sublist]

def simplerExplain(explainJson, depth=0):
    """Render an explain tree as indented "value, description" lines.

    explainJson is a dict with 'value' and 'description' keys and an optional
    'details' list of child dicts of the same shape. Each nesting level is
    indented by two more spaces. Returns the text, one line per node.
    """
    indent = "  " * depth
    pieces = [indent + "%s, %s\n" % (explainJson['value'], explainJson['description'])]
    # Children are rendered depth-first, directly below their parent line.
    for child in explainJson.get('details', []):
        pieces.append(simplerExplain(child, depth=depth + 1))
    return "".join(pieces)


# To speed up the pace of development, we really need to focus more heavily on the analysis and query
# settings of the search engine, rather than fiddly bits of the http interface.
#
# To that end, we're going to collapse some of the code you were introduced to in chapter 3 into more general functions,
# so we can reuse them. Largely, this is the exact same code you saw in chapter 3 with some more generality.

## Analyze
## The analyze function is a helper for accessing the _analyze endpoint like we did in chapter 3. Recall,
## given a field or analyzer, passing some text to _analyze will return the token stream that results from
## that analyzer. This token stream, if you recall, shows us exactly how the search engine translates text
## into individual tokens to be consumed by the underlying data structures. When we debug analysis, we see
## which matches we should expect.
def analyze(text, field=None, analyzer=None):
    """Print the token stream the tmdb index produces for some text.

    Calls the _analyze endpoint on the local tmdb index and prints the raw
    (YAML formatted) response.

    text     -- the text to run through analysis
    field    -- optional field name whose analyzer should be used
    analyzer -- optional analyzer name; only placed on the query string
                when no field is given

    Returns None; the response is only printed.
    """
    # Build the query string through requests' params rather than by hand.
    # Previously the selector was passed as the positional params argument,
    # so the "%s" placeholder in the URL was never filled in.
    params = {'format': 'yaml'}
    if field is not None:
        params['field'] = field
    elif analyzer is not None:
        params['analyzer'] = analyzer

    # Leave unset options out of the body instead of sending JSON nulls.
    body = {'text': text}
    if field is not None:
        body['field'] = field
    if analyzer is not None:
        body['analyzer'] = analyzer

    resp = requests.get("http://localhost:9200/tmdb/_analyze",
                        params=params, json=body)
    print(resp.text)
    
## Search
## Next we need to wrap up our execution of query DSL queries. The function 'search' will execute the passed query DSL
## query and display the results. 
## If a scoring explain is associated with the results, then it also gets displayed,
## We'll also be sure to dump the query DSL
def search(query, verbose=False):
    """Run a query DSL search against the tmdb index and print the hits.

    query   -- the complete search request body (a dict with a 'query' key)
    verbose -- when true, also print each hit's title, tagline, overview,
               id, directors, cast and characters, the simplified scoring
               explanation if the hit has one, and finally the response of
               the _validate/query?explain endpoint for query['query']

    Returns None. On a non-200 response the status and body are printed and
    nothing else is attempted.
    """
    url = 'http://localhost:9200/tmdb/movie/_search'
    httpResp = requests.get(url, json=query)
    if httpResp.status_code != 200:
        print("Search Failed <%s>" % httpResp.status_code)
        print("%s" % httpResp.text)
        # Stop here: continuing would try to read 'hits' from the error body.
        return

    searchHits = json.loads(httpResp.text)['hits']
    print("Num\tRelevance Score\t\tMovie Title")
    for idx, hit in enumerate(searchHits['hits']):
        source = hit['_source']
        print("%s\t%s\t\t%s" % (idx + 1, hit['_score'], source['title']))
        if not verbose:
            continue

        # The people lists are only needed for the verbose dump.
        castNames = [cast['name'] for cast in source['cast']]
        castCharacters = [cast['character'] for cast in source['cast']]
        directorNames = [director['name'] for director in source['directors']]

        print("%s" % source['title'])
        print("%s" % source['tagline'])
        print("%s" % source['overview'])
        print("%s" % hit['_id'])
        print("DIRS %s" % directorNames)
        print("CAST %s" % castNames)
        print("CHAR %s" % castCharacters)
        if '_explanation' in hit:
            print("%s" % simplerExplain(hit['_explanation']))
            print("*************************************")

    if verbose:
        # Ask the validate endpoint to explain the query and print its reply.
        httpResp = requests.get('http://localhost:9200' +
                                '/tmdb/movie/_validate/query?explain',
                                json={'query': query['query']})
        print(json.loads(httpResp.text))

## Reindex
## Reindex takes analyzer and field mappings, recreates the index, and then reindexes
## TMDB movies using the _bulk index API. There are other ways for modifying the configuration
## of the index besides dropping and restarting, however for convenience and because our data
## isn't truly that large, we'll just delete and start from scratch when we need to.
def reindex(analysisSettings, mappingSettings=None, movieDict={}):
    """Drop the tmdb index, recreate it, and bulk index the given movies.

    analysisSettings -- dict placed under settings.index.analysis
    mappingSettings  -- optional dict sent as the index 'mappings'
    movieDict        -- mapping of key -> movie dict; each movie must have an
                        'id' entry, which is used as the document _id

    Returns None. The HTTP status of each step is printed; the response body
    is printed as well when index creation or the bulk request does not
    return 200.
    """
    # Destroy any existing index (equiv to SQL "drop table")
    resp = requests.delete("http://localhost:9200/tmdb")
    print("Delete TMDB Index <%s>" % resp.status_code)

    # Create the index with explicit settings
    # We need to explicitly set number of shards to 1 to eliminate the impact of
    # distributed IDF on our small collection
    # See also "Relevance is Broken!"
    # http://www.elastic.co/guide/en/elasticsearch/guide/current/relevance-is-broken.html
    settings = {
        "settings": {
            "number_of_shards": 1,
            "index": {
                "analysis": analysisSettings,
            }
        }
    }
    if mappingSettings:
        settings['mappings'] = mappingSettings
    resp = requests.put("http://localhost:9200/tmdb", json=settings)
    print("Create TMDB Index <%s>" % resp.status_code)
    if resp.status_code != 200:
        print(resp.text)

    # Bulk index the movies to the movie endpoint. The payload is
    # newline-delimited JSON: an action line followed by the document line.
    print("Indexing %i movies" % len(movieDict))
    bulkLines = []
    for movie in movieDict.values():
        addCmd = {"index": {"_index": "tmdb", "_type": "movie", "_id": movie["id"]}}
        bulkLines.append(json.dumps(addCmd))
        bulkLines.append(json.dumps(movie))
    # Every line, including the last, is terminated by a newline.
    bulkMovies = "".join(line + "\n" for line in bulkLines)
    resp = requests.post("http://localhost:9200/_bulk", data=bulkMovies,
                         headers={"Content-Type": "application/x-ndjson"})
    print("Bulk Index into TMDB Index <%s>" % resp.status_code)
    if resp.status_code != 200:
        print(resp.text)

## Extract
## The major difference between our use of TMDB here and in chapter 3 is that we pull more data. Not only do we access the
## movie endpoint, we also extract the credits -- pulling in the cast (actors and such) and extracting the director.
def extract(movieIds=[], numMovies=10000):
    """Load the cached movie data from tmdb.json in the working directory.

    movieIds  -- when empty, the cache file is read; the ids themselves are
                 not used by this function
    numMovies -- accepted for compatibility; not used by this function

    Returns the parsed JSON content of tmdb.json, or an empty dict when
    movieIds is non-empty or the file cannot be opened.
    """
    if not movieIds:
        try:
            # The with-block closes the file even if parsing fails.
            with open('tmdb.json') as f:
                return json.load(f)
        except IOError:
            print("Could not read tmdb.json; no movies loaded")
    # Fall back to an empty collection instead of reading a global that may
    # not exist yet (this function is what first creates movieDict).
    return {}

## Index to ES, Chapter 5 Settings

In [None]:
movieDict = extract([])

# Analysis configuration: English analysis by default, plus a custom
# analyzer that ends in a shingle filter emitting two-word shingles only.
analysisSettings = {
    "analyzer": {
        "default": {"type": "english"},
        "english_bigrams": {
            "type": "custom",
            "tokenizer": "standard",
            "filter": ["standard", "lowercase", "porter_stem", "bigram_filter"],
        },
    },
    "filter": {
        "bigram_filter": {
            "type": "shingle",
            "max_shingle_size": 2,
            "min_shingle_size": 2,
            "output_unigrams": "false",
        },
    },
}

# Field mappings for the 'movie' type. The english analyzer is named
# explicitly on each text field because the default analyzer was observed
# not to apply. Cast and director names also get a 'bigramed' sub-field
# analyzed with english_bigrams; its norms are left at their default here.
mappingSettings = {
    'movie': {
        'properties': {
            "overview": {'type': 'text', 'analyzer': 'english'},
            "title": {'type': 'text', 'analyzer': 'english'},
            "cast": {
                'properties': {
                    'name': {
                        'type': 'text',
                        'analyzer': 'english',
                        'fields': {
                            "bigramed": {
                                "type": "text",
                                "analyzer": "english_bigrams",
                            },
                        },
                    },
                },
            },
            "directors": {
                'properties': {
                    'name': {
                        'type': 'text',
                        'analyzer': 'english',
                        'fields': {
                            "bigramed": {
                                "type": "text",
                                "analyzer": "english_bigrams",
                            },
                        },
                    },
                },
            },
        },
    },
}

reindex(analysisSettings, mappingSettings, movieDict)

# Last Query from Chapter 5

In [None]:
usersSearch = 'star trek patrick stewart william shatner'

# Clause over the bigram sub-fields of the director and cast names.
bigramNameMatch = {
    'multi_match': {
        'query': usersSearch,
        'fields': ['directors.name.bigramed', 'cast.name.bigramed'],
        'type': 'cross_fields',
    }
}

# Clause over the plain text fields and the people-name fields.
baseFieldMatch = {
    'multi_match': {
        'query': usersSearch,
        'fields': ['overview', 'title', 'directors.name', 'cast.name'],
        'type': 'cross_fields',
    }
}

query = {
    'query': {'bool': {'should': [bigramNameMatch, baseFieldMatch]}},
    'size': 5,
    'explain': True,
}
search(query, False)


# 7.2.1, Listing 1 Base Query

In [None]:
usersSearch = 'william shatner patrick stewart'

# Base query: the user's terms searched across four fields as cross_fields.
baseFieldMatch = {
    'multi_match': {
        'query': usersSearch,
        'fields': ['overview', 'title', 'directors.name', 'cast.name'],
        'type': 'cross_fields',
    }
}

query = {'query': baseFieldMatch, 'size': 5, 'explain': True}
search(query, False)

# 7.2.3, Listing 2 Boosting with An Additional Boolean Clause

In [None]:
usersSearch = 'william shatner patrick stewart'

# Base clause: the user's terms across the text and people-name fields.
baseFieldMatch = {
    'multi_match': {
        'query': usersSearch,
        'fields': ['overview', 'title', 'directors.name', 'cast.name'],
        'type': 'cross_fields',
    }
}

# Extra should-clause acting as the boost: a phrase match on the title.
titlePhraseBoost = {
    'match_phrase': {
        'title': {'query': 'star trek', 'boost': 1.0},
    }
}

query = {
    'query': {'bool': {'should': [baseFieldMatch, titlePhraseBoost]}},
    'size': 5,
    'explain': True,
}
search(query)

# 7.2.3, Adjusted Boost Weight on Boolean Query (no listing number, modification of above listing)

In [None]:
usersSearch = 'william shatner patrick stewart'

# Base clause: the user's terms across the text and people-name fields.
baseFieldMatch = {
    'multi_match': {
        'query': usersSearch,
        'fields': ['overview', 'title', 'directors.name', 'cast.name'],
        'type': 'cross_fields',
    }
}

# Same title phrase clause as the boolean boost listing, with the boost
# weight lowered to 0.1.
titlePhraseBoost = {
    'match_phrase': {
        'title': {'query': 'star trek', 'boost': 0.1},
    }
}

query = {
    'query': {'bool': {'should': [baseFieldMatch, titlePhraseBoost]}},
    'size': 5,
    'explain': True,
}
search(query)

# 7.2.5, Listing 3 -- Multiplicative Boosting on Title Star Trek match

In [None]:
usersSearch = 'william shatner patrick stewart'

# Query whose score the function_score wrapper adjusts.
baseFieldMatch = {
    'multi_match': {
        'query': usersSearch,
        'fields': ['overview', 'title', 'directors.name', 'cast.name'],
        'type': 'cross_fields',
    }
}

# Weight function restricted by a filter to titles matching 'star trek'.
starTrekTitleWeight = {
    'weight': 2.5,
    'filter': {'match': {'title': 'star trek'}},
}

query = {
    'query': {
        'function_score': {
            'query': baseFieldMatch,
            'functions': [starTrekTitleWeight],
        }
    },
    'size': 50,
    'explain': True,
}
search(query, False)

# 7.3, Listing 4 -- Using a Filter Instead of A Boost

In [None]:
usersSearch = 'william shatner patrick stewart'

# Scoring clause: the user's terms across the text and people-name fields.
baseFieldMatch = {
    'multi_match': {
        'query': usersSearch,
        'fields': ['overview', 'title', 'directors.name', 'cast.name'],
        'type': 'cross_fields',
    }
}

# Filter clause on the title, used here in place of a boost.
starTrekTitleFilter = {'match': {'title': 'star trek'}}

query = {
    'query': {
        'bool': {
            'should': [baseFieldMatch],
            'filter': [starTrekTitleFilter],
        }
    },
    'size': 5,
    'explain': True,
}
search(query)

# 7.4.2, Listings 5&7 Sentinel Tokens (includes exact name matching)

In [None]:
SENTINEL_BEGIN = 'SENTINEL_BEGIN'
SENTINEL_END = 'SENTINEL_END'


def transform(esDoc):
    """Add sentinel-wrapped exact-match fields to a movie document in place.

    Sets esDoc['title_exact_match'] to the title surrounded by the begin and
    end sentinels, and esDoc['names_exact_match'] to a list with one such
    wrapped string per cast member followed by one per director. Reads
    esDoc['title'], esDoc['cast'] and esDoc['directors']; returns None.
    """
    def wrap(text):
        # Sentinel, single space, text, single space, sentinel.
        return ' '.join([SENTINEL_BEGIN, text, SENTINEL_END])

    esDoc['title_exact_match'] = wrap(esDoc['title'])
    wrappedNames = esDoc['names_exact_match'] = []
    people = esDoc['cast'] + esDoc['directors']
    wrappedNames.extend(wrap(person['name']) for person in people)
        

# Add the sentinel-wrapped fields to every movie, then rebuild the index so
# the new fields are searchable. Only the values are iterated: the previous
# loop bound an unused variable named `id`, which shadowed the builtin for
# the rest of the notebook session.
for movie in movieDict.values():
    transform(movie)


reindex(analysisSettings, mappingSettings, movieDict)

# 7.4.2, Listing 6 -- Testing Exact Title Matching

In [None]:
usersSearch = 'Star Trek'

# Wrap the search text in the sentinels so the phrase only matches titles
# that begin and end exactly where the search text does.
exactTitlePhrase = ' '.join([SENTINEL_BEGIN, usersSearch, SENTINEL_END])

query = {
    'query': {
        'match_phrase': {
            'title_exact_match': {'query': exactTitlePhrase},
        }
    },
    'size': 5,
    'explain': True,
}
search(query, False)

# 7.4.3, Listing 8 Boolean Boost on Exact Title Matching

In [None]:
usersSearch = 'good will hunting'

# Heavily boosted clause: phrase match of the sentinel-wrapped search text
# against the exact-match title field.
exactTitleBoost = {
    'match_phrase': {
        'title_exact_match': {
            'query': ' '.join([SENTINEL_BEGIN, usersSearch, SENTINEL_END]),
            'boost': 1000,
        }
    }
}

# Base clause: the user's terms across the text and people-name fields.
baseFieldMatch = {
    'multi_match': {
        'query': usersSearch,
        'fields': ['overview', 'title', 'directors.name', 'cast.name'],
        'type': 'cross_fields',
    }
}

query = {
    'query': {
        'bool': {
            'disable_coord': True,
            'should': [exactTitleBoost, baseFieldMatch],
        }
    },
    'size': 5,
    'explain': True,
}
search(query)

## No Listing -- Adding a query mentions name boost

### First Attempt, search bigramed fields without modification

In [None]:
usersSearch = 'star trek patrick stewart'

# Boosted phrase match of the sentinel-wrapped search text on the title.
exactTitleBoost = {
    'match_phrase': {
        'title_exact_match': {
            'query': ' '.join([SENTINEL_BEGIN, usersSearch, SENTINEL_END]),
            'boost': 1000,
        }
    }
}

# Base clause: the user's terms across the text and people-name fields.
baseFieldMatch = {
    'multi_match': {
        'query': usersSearch,
        'fields': ['overview', 'title', 'directors.name', 'cast.name'],
        'type': 'cross_fields',
    }
}

# Boosted clause over the bigram sub-fields of the people names.
bigramNameBoost = {
    'multi_match': {
        'query': usersSearch,
        'fields': ['directors.name.bigramed', 'cast.name.bigramed'],
        'type': 'cross_fields',
        'boost': 100,
    }
}

query = {
    'query': {
        'bool': {
            'disable_coord': True,
            'should': [exactTitleBoost, baseFieldMatch, bigramNameBoost],
        }
    },
    'size': 20,
    'explain': True,
}
search(query)

In [None]:
# Switch norms off on the bigram sub-field of cast names, then rebuild the
# index with the changed mapping.
# NOTE(review): only cast.name.bigramed is changed here; the
# directors.name.bigramed sub-field keeps its norms -- confirm that is intended.
castNameFields = mappingSettings['movie']['properties']['cast']['properties']['name']['fields']
castNameFields['bigramed']['norms'] = {'enabled': False}

reindex(analysisSettings, mappingSettings, movieDict)

### Rerunning with Norms Off For Bigrams

In [None]:
usersSearch = 'star trek patrick stewart'

# Boosted phrase match of the sentinel-wrapped search text on the title.
exactTitleBoost = {
    'match_phrase': {
        'title_exact_match': {
            'query': ' '.join([SENTINEL_BEGIN, usersSearch, SENTINEL_END]),
            'boost': 1000,
        }
    }
}

# Base clause: the user's terms across the text and people-name fields.
baseFieldMatch = {
    'multi_match': {
        'query': usersSearch,
        'fields': ['overview', 'title', 'directors.name', 'cast.name'],
        'type': 'cross_fields',
    }
}

# Boosted clause over the bigram sub-fields of the people names.
bigramNameBoost = {
    'multi_match': {
        'query': usersSearch,
        'fields': ['directors.name.bigramed', 'cast.name.bigramed'],
        'type': 'cross_fields',
        'boost': 100,
    }
}

query = {
    'query': {
        'bool': {
            'disable_coord': True,
            'should': [exactTitleBoost, baseFieldMatch, bigramNameBoost],
        }
    },
    'size': 20,
    'explain': True,
}
search(query)

# 7.4.4.1 Exact Name Matching Function Query Skeleton using TF*IDF (no listing number)

In [None]:
# Phrase match of a sentinel-wrapped name against the exact-match names field.
exactNameMatch = {
    'match_phrase': {
        'names_exact_match': SENTINEL_BEGIN + ' william shatner ' + SENTINEL_END,
    }
}

# Skeleton only: the function list is intentionally left empty.
query = {
    'query': {
        'function_score': {
            'query': exactNameMatch,
            'functions': [],
        }
    }
}
search(query)

# 7.4.4.1 Exact Name Matching Function, Ignoring TF*IDF

In [None]:
# The exact-name phrase match is used as a filter inside constant_score, so
# every matching document gets the fixed boost value as its score.
exactNameConstant = {
    'constant_score': {
        'filter': {
            'match_phrase': {
                'names_exact_match': SENTINEL_BEGIN + ' william shatner ' + SENTINEL_END,
            }
        },
        'boost': 1000.0,
    }
}

query = {
    'query': {
        'function_score': {
            'query': exactNameConstant,
            'functions': [],
        }
    }
}
search(query)

# 7.4.4.1, Listing 9 Turning User Rating into A Signal

In [None]:
# Function that reads the vote_average field with the sqrt modifier.
ratingFactor = {
    "field_value_factor": {
        "field": "vote_average",
        "modifier": "sqrt",
    }
}

query = {
    'query': {
        'function_score': {
            'query': {'match_all': {}},
            'functions': [ratingFactor],
        }
    }
}
search(query)

# 7.4.4.2 Listing 10, Turning Recency of Release Into A Signal (first try)

In [None]:
# Gaussian decay on release_date measured from "now", with a 900 day scale.
recencyDecay = {
    "gauss": {
        "release_date": {
            "origin": "now",
            "scale": "900d",
            "decay": 0.5,
        }
    }
}

query = {
    'query': {
        'function_score': {
            'query': {'match_all': {}},
            'functions': [recencyDecay],
        }
    },
    'explain': True,
}
search(query)

# 7.4.4.2 Listing 10, Adjusted Gaussian Decay (set scale to 15 years)

In [None]:
# Same gaussian decay on release_date, with the scale widened to 5500 days
# (roughly 15 years).
recencyDecay = {
    "gauss": {
        "release_date": {
            "origin": "now",
            "scale": "5500d",
            "decay": 0.5,
        }
    }
}

query = {
    'query': {
        'function_score': {
            'query': {'match_all': {}},
            'functions': [recencyDecay],
        }
    },
    'explain': True,
}
search(query)

# 7.4.4.3 -- Complete Name Boost

In [None]:
usersSearch = 'patrick stewart'

# Fixed-score match on the sentinel-wrapped name.
exactNameConstant = {
    'constant_score': {
        'filter': {
            'match_phrase': {
                'names_exact_match': ' '.join([SENTINEL_BEGIN, usersSearch, SENTINEL_END]),
            }
        },
        'boost': 1000.0,
    }
}

# Gaussian decay on release_date measured from "now".
recencyDecay = {
    "gauss": {
        "release_date": {
            "origin": "now",
            "scale": "5500d",
            "decay": 0.5,
        }
    }
}

# Function that reads the vote_average field with the sqrt modifier.
ratingFactor = {
    "field_value_factor": {
        "field": "vote_average",
        "modifier": "sqrt",
    }
}

query = {
    'query': {
        'function_score': {
            'query': exactNameConstant,
            'functions': [recencyDecay, ratingFactor],
        }
    },
    'explain': True,
    'size': 20,
}
search(query)

# Not Shown In Chapter: The Whole Shebang

In [None]:
usersSearch = 'patrick stewart'
sentinelWrappedSearch = ' '.join([SENTINEL_BEGIN, usersSearch, SENTINEL_END])

# Boosted phrase match of the wrapped search text on the exact-match title.
exactTitleBoost = {
    'match_phrase': {
        'title_exact_match': {
            'query': sentinelWrappedSearch,
            'boost': 1000,
        }
    }
}

# Fixed-score exact name match, adjusted by release recency and user rating.
exactNameScored = {
    'function_score': {
        'query': {
            'constant_score': {
                'filter': {
                    'match_phrase': {
                        'names_exact_match': sentinelWrappedSearch,
                    }
                },
                'boost': 1000.0,
            }
        },
        'functions': [
            {
                "gauss": {
                    "release_date": {
                        "origin": "now",
                        "scale": "5500d",
                        "decay": 0.5,
                    }
                }
            },
            {
                "field_value_factor": {
                    "field": "vote_average",
                    "modifier": "sqrt",
                }
            },
        ],
    }
}

# Base clause: the user's terms across the text and people-name fields.
baseFieldMatch = {
    'multi_match': {
        'query': usersSearch,
        'fields': ['overview', 'title', 'directors.name', 'cast.name'],
        'type': 'cross_fields',
    }
}

query = {
    'query': {
        'bool': {
            'disable_coord': True,
            'should': [exactTitleBoost, exactNameScored, baseFieldMatch],
        }
    },
    'size': 5,
    'explain': True,
}
search(query)