# Boilerplate Setup

In [1]:
import requests
import json


# Optional, enable client-side caching for TMDB
# Requires: https://httpcache.readthedocs.org/en/latest/
#from httpcache import CachingHTTPAdapter
#tmdb_api.mount('https://', CachingHTTPAdapter())
#tmdb_api.mount('http://', CachingHTTPAdapter())

# Some utilities for flattening the explain into something a bit more
# readable. Pass Explain JSON, get something readable (ironically this is what Solr's default output is :-p)
def flatten(l):
    [item for sublist in l for item in sublist]

def simplerExplain(explainJson, depth=0):
    result = " " * (depth * 2) + "%s, %s\n" % (explainJson['value'], explainJson['description'])
    #print json.dumps(explainJson, indent=True)
    if 'details' in explainJson:
        for detail in explainJson['details']:
            result += simplerExplain(detail, depth=depth+1)
    return result

# 3.2.2 Indexing TMDB Movies

In [2]:
def extract():
    f = open('tmdb.json')
    if f:
         return json.loads(f.read());        
    return {}

In [3]:
def reindex(analysisSettings={}, mappingSettings={}, movieDict={}):
    settings = { #A
        "settings": {
            "number_of_shards": 1, #B
            "index": {
                "analysis" : analysisSettings, #C
            }}}

    if mappingSettings:
        settings['mappings'] = mappingSettings #C

    resp = requests.delete("http://localhost:9200/tmdb") #D
    resp = requests.put("http://localhost:9200/tmdb", 
                        data=json.dumps(settings))

    bulkMovies = ""
    print("building...")
    for id, movie in movieDict.items(): 
        addCmd = {"index": {"_index": "tmdb", #E
                            "_type": "movie",
                            "_id": movie["id"]}}
        bulkMovies += json.dumps(addCmd) + "\n" + json.dumps(movie) + "\n"

    print("indexing...")
    # https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-bulk.html
    resp = requests.post("http://localhost:9200/_bulk", data=bulkMovies, headers={"Content-Type":"application/x-ndjson"})
    print("Create index complete, resp=%s" % resp.text[:100])


In [4]:
movieDict = extract()
reindex(movieDict=movieDict)


building...
indexing...
Create index complete, resp={"took":5513,"errors":false,"items":[{"index":{"_index":"tmdb","_type":"movie","_id":"93837","_versi


# 3.2.3 Basic Searching

In [5]:
def search(query):
    url = 'http://localhost:9200/tmdb/movie/_search'
    httpResp = requests.get(url, json=query) #A
    print("Http resp=%s" % httpResp.text[:100])
    searchHits = json.loads(httpResp.text)['hits']
    print ("Num\tRelevance Score\t\tMovie Title\t\tOverview") #B
    for idx, hit in enumerate(searchHits['hits']):
            print("%s\t%s\t\t%s" % (idx + 1, hit['_score'], hit['_source']['title']))


In [6]:
usersSearch = 'basketball with cartoon aliens'
query = {
    'query': {
        'multi_match': { 
            'query': usersSearch, #A
            'fields': ['title^10', 'overview'], #B
        },
    },
    'size': '100'
}
search(query)


Http resp={"took":247,"timed_out":false,"_shards":{"total":5,"successful":5,"skipped":0,"failed":0},"hits":{"t
Num	Relevance Score		Movie Title		Overview
1	71.4475		Aliens
2	59.804768		Cowboys & Aliens
3	57.58971		The Basketball Diaries
4	57.58971		Monsters vs Aliens
5	49.727566		Friends with Kids
6	49.44959		Aliens in the Attic
7	45.1047		Aliens vs Predator: Requiem
8	43.483475		From Russia With Love
9	43.22128		Hobo with a Shotgun
10	42.45948		Dances with Wolves
11	39.850166		Friends with Benefits
12	39.850166		Fire with Fire
13	39.77869		Interview with the Vampire
14	39.77869		From Paris with Love
15	39.77869		Trouble with the Curve
16	39.77869		To Rome with Love
17	38.378033		Fun with Dick and Jane
18	37.119114		Gone with the Wind
19	37.119114		Just Go With It
20	37.119114		Sleeping with the Enemy
21	34.888588		My Week with Marilyn
22	34.7553		You Don't Mess With the Zohan
23	34.510853		The Girl Who Played with Fire
24	32.972034		Girl with a Pearl Earring
25	31.93021		The Girl with

# 2.3.1 Query Validation API

In [7]:
query = {
   'query': {
        'multi_match': { 
            'query': usersSearch,  #User's query
            'fields': ['title^10', 'overview']
        }
    }
}
httpResp = requests.get('http://localhost:9200' + 
			    '/tmdb/movie/_validate/query?explain',
			     json=query)
print(json.loads(httpResp.text))

{'_shards': {'total': 1, 'successful': 1, 'failed': 0}, 'valid': True, 'explanations': [{'index': 'tmdb', 'valid': True, 'explanation': '+((title:basketball title:with title:cartoon title:aliens)^10.0 | (overview:basketball overview:with overview:cartoon overview:aliens)) #*:*'}]}


# 2.3.3 Debugging Analysis

In [14]:
# Inner Layer of the Onion -- Why did the search engine consider these movies matches? Two sides to this
# (1) What tokens are placed in the search engine?
# (2) What did the search engine attempt to match exactly?

# Explain of what's happening when we construct these terms

#resp = requests.get(elasticSearchUrl + "/tmdb/_mapping/movie/field/title?format=yaml'
resp = requests.get('http://localhost:9200/tmdb/_analyze?format=yaml', 
                    json={"field":"title", "text":"Fire with Fire"})
print(resp.text)

---
tokens:
- token: "fire"
  start_offset: 0
  end_offset: 4
  type: "<ALPHANUM>"
  position: 0
- token: "with"
  start_offset: 5
  end_offset: 9
  type: "<ALPHANUM>"
  position: 1
- token: "fire"
  start_offset: 10
  end_offset: 14
  type: "<ALPHANUM>"
  position: 2



# 2.3.5 -- Solving The Matching Problem

In [21]:
mappingSettings = {
       'movie': {
            'properties': {
               'title': { #A
                   'type': 'string',
                   'analyzer': 'english'
               },
                'overview': {
                   'type': 'string',
                   'analyzer': 'english'
               }
            }
       }
}
reindex(mappingSettings=mappingSettings, movieDict=movieDict) 


building...
indexing...
Create index complete, resp={"took":7288,"errors":false,"items":[{"index":{"_index":"tmdb","_type":"movie","_id":"93837","_versi


In [26]:
resp = requests.get('http://localhost:9200/tmdb/_analyze?format=yaml', 
                    json={"field":"title", "text":"Fire with Fire"})
print(resp.text)

---
tokens:
- token: "fire"
  start_offset: 0
  end_offset: 4
  type: "<ALPHANUM>"
  position: 0
- token: "with"
  start_offset: 5
  end_offset: 9
  type: "<ALPHANUM>"
  position: 1
- token: "fire"
  start_offset: 10
  end_offset: 14
  type: "<ALPHANUM>"
  position: 2



## Repeat the search

In [18]:
usersSearch = 'basketball with cartoon aliens'
query = {
    'query': {
        'multi_match': { 
            'query': usersSearch, #A
            'fields': ['title^10', 'overview'], #B
        },
    },
    'size': '100'
}
search(query)


Http resp={"took":235,"timed_out":false,"_shards":{"total":5,"successful":5,"skipped":0,"failed":0},"hits":{"t
Num	Relevance Score		Movie Title		Overview
1	71.4475		Aliens
2	59.804768		Cowboys & Aliens
3	57.58971		The Basketball Diaries
4	57.58971		Monsters vs Aliens
5	49.727566		Friends with Kids
6	49.44959		Aliens in the Attic
7	45.1047		Aliens vs Predator: Requiem
8	43.483475		From Russia With Love
9	43.22128		Hobo with a Shotgun
10	42.45948		Dances with Wolves
11	39.850166		Friends with Benefits
12	39.850166		Fire with Fire
13	39.77869		Interview with the Vampire
14	39.77869		From Paris with Love
15	39.77869		Trouble with the Curve
16	39.77869		To Rome with Love
17	38.378033		Fun with Dick and Jane
18	37.119114		Gone with the Wind
19	37.119114		Just Go With It
20	37.119114		Sleeping with the Enemy
21	34.888588		My Week with Marilyn
22	34.7553		You Don't Mess With the Zohan
23	34.510853		The Girl Who Played with Fire
24	32.972034		Girl with a Pearl Earring
25	31.93021		The Girl with

# 2.4.1	Decomposing Relevance Score With Lucene’s Explain

In [None]:
query['explain'] = True
httpResp = requests.get('http://localhost:9200/tmdb/movie/_search', data=json.dumps(query))
jsonResp = json.loads(httpResp.text)
print json.dumps(jsonResp['hits']['hits'][0]['_explanation'], indent=True)
print "Explain for %s" % jsonResp['hits']['hits'][0]['_source']['title']
print simplerExplain(jsonResp['hits']['hits'][0]['_explanation'])
print "Explain for %s" % jsonResp['hits']['hits'][1]['_source']['title']
print simplerExplain(jsonResp['hits']['hits'][1]['_explanation'])
print "Explain for %s" % jsonResp['hits']['hits'][2]['_source']['title']
print simplerExplain(jsonResp['hits']['hits'][2]['_explanation'])
print "Explain for %s" % jsonResp['hits']['hits'][3]['_source']['title']
print simplerExplain(jsonResp['hits']['hits'][3]['_explanation'])
print "Explain for %s" % jsonResp['hits']['hits'][10]['_source']['title']
print simplerExplain(jsonResp['hits']['hits'][10]['_explanation'])


# 3.4.4	Fixing Space Jam vs Alien Ranking

In [None]:
query = {
    'query': {
        'multi_match': { 
            'query': usersSearch,  #User's query
            'fields': ['title^0.1', 'overview'],
        }
    },
    'explain': True
}
search(query)
