# Boilerplate Setup

In [1]:
import requests
import json


# Optional, enable client-side caching for TMDB
# Requires: https://httpcache.readthedocs.org/en/latest/
#from httpcache import CachingHTTPAdapter
#tmdb_api.mount('https://', CachingHTTPAdapter())
#tmdb_api.mount('http://', CachingHTTPAdapter())

# Some utilities for flattening the explain into something a bit more
# readable. Pass Explain JSON, get something readable (ironically this is what Solr's default output is :-p)
def flatten(l):
    """Flatten one level of nesting.

    Args:
        l: an iterable of iterables (for example a list of lists).

    Returns:
        A new list holding every item of every inner iterable, in order.
        An empty outer iterable yields an empty list.
    """
    # The comprehension result has to be returned; without the `return`
    # the function silently evaluates the list and hands back None.
    return [item for sublist in l for item in sublist]

def simplerExplain(explainJson, depth=0):
    """Render an explain tree as indented "value, description" lines.

    Args:
        explainJson: dict with 'value' and 'description' keys and, optionally,
            a 'details' list of child dicts of the same shape.
        depth: nesting level of this node; each level adds two spaces of indent.

    Returns:
        A string with one newline-terminated line for this node followed by
        the lines of all of its descendants, depth-first.
    """
    header = "%s%s, %s\n" % (" " * (depth * 2),
                             explainJson['value'],
                             explainJson['description'])
    # A node without a 'details' key is a leaf.
    children = explainJson['details'] if 'details' in explainJson else ()
    return header + "".join(simplerExplain(child, depth=depth + 1)
                            for child in children)

# 3.2.2 Indexing TMDB Movies

In [2]:
def extract(path='tmdb.json'):
    """Load the movie dump from a JSON file.

    Args:
        path: location of the JSON file; defaults to 'tmdb.json' in the
            current working directory, which is what the notebook uses.

    Returns:
        The decoded JSON document (reindex() expects a dict of movies).

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if the file does not contain valid JSON.
    """
    # `with` closes the handle even if decoding fails. The old `if f:` guard
    # could never be false because open() raises instead of returning a
    # falsy value, so the `return {}` fallback was unreachable.
    with open(path, encoding='utf-8') as f:
        return json.load(f)

In [3]:
def reindex(analysisSettings=None, mappingSettings=None, movieDict=None):
    """Recreate the ``tmdb`` index on localhost:9200 and bulk-load movies into it.

    The existing index is deleted first, so every call starts from scratch.

    Args:
        analysisSettings: dict placed under ``settings.index.analysis`` of the
            index-creation body. ``None`` (the default) means an empty dict.
        mappingSettings: dict sent as the ``mappings`` section; left out of
            the request when empty or ``None``.
        movieDict: dict whose values are movie documents. Each document must
            have an ``"id"`` key, which is used as the document ``_id``.
            ``None`` (the default) means no movies.

    Prints the index-creation response and the first 100 characters of the
    bulk response. Returns nothing.
    """
    # None sentinels instead of shared mutable `{}` defaults.
    if analysisSettings is None:
        analysisSettings = {}
    if movieDict is None:
        movieDict = {}

    settings = { #A
        "settings": {
            "number_of_shards": 1, #B
            "index": {
                "analysis" : analysisSettings, #C
            }}}

    if mappingSettings:
        settings['mappings'] = mappingSettings #C

    # The delete response is intentionally not inspected: the index may not
    # exist yet on the first run.
    requests.delete("http://localhost:9200/tmdb") #D
    resp = requests.put("http://localhost:9200/tmdb",
                        json=settings)
    print("resp=%s" % resp.text)

    print("building...")
    # Bulk body: one action line followed by one document line per movie,
    # each terminated by "\n". Collect the lines and join once rather than
    # growing a string with += inside the loop.
    bulkLines = []
    for movie in movieDict.values():
        addCmd = {"index": {"_index": "tmdb", #E
                            "_type": "movie",
                            "_id": movie["id"]}}
        bulkLines.append(json.dumps(addCmd) + "\n")
        bulkLines.append(json.dumps(movie) + "\n")
    bulkMovies = "".join(bulkLines)

    print("indexing...")
    # https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-bulk.html
    resp = requests.post("http://localhost:9200/_bulk", data=bulkMovies, headers={"Content-Type":"application/x-ndjson"})
    print("Create index complete, resp=%s" % resp.text[:100])


In [4]:
# Load the movie dump once; a later cell passes this same movieDict to
# reindex() again when the mappings change.
movieDict = extract()
# First indexing pass: no analysis settings and no explicit mappings are given.
reindex(movieDict=movieDict)


resp={"acknowledged":true,"shards_acknowledged":true,"index":"tmdb"}
building...
indexing...
Create index complete, resp={"took":3534,"errors":false,"items":[{"index":{"_index":"tmdb","_type":"movie","_id":"93837","_versi


# 3.2.3 Basic Searching

In [5]:
def search(query):
    """Run a query against the tmdb movie index and print a ranked table.

    Args:
        query: dict sent as the JSON body of the _search request.

    Prints the first 100 characters of the raw response, a header row, and
    then one tab-separated row per hit: 1-based rank, relevance score, title.
    Returns nothing.
    """
    httpResp = requests.get('http://localhost:9200/tmdb/movie/_search',
                            json=query) #A
    print("Http resp=%s" % httpResp.text[:100])
    hitsSection = json.loads(httpResp.text)['hits']
    print ("Num\tRelevance Score\t\tMovie Title") #B
    for rank, hit in enumerate(hitsSection['hits'], start=1):
        title = hit['_source']['title']
        print("%s\t%s\t\t%s" % (rank, hit['_score'], title))


In [6]:
# Free-text query, as a user would type it into a search box.
usersSearch = 'basketball with cartoon aliens'
# Request body for search(): a single multi_match clause over two fields.
query = {
    'query': {
        'multi_match': { 
            'query': usersSearch, #A
            'fields': ['title^10', 'overview'], #B  (title carries a ^10 boost)
        },
    },
    'size': '100'  # ask for up to 100 hits (passed as a string here)
}
search(query)


Http resp={"took":70,"timed_out":false,"_shards":{"total":1,"successful":1,"skipped":0,"failed":0},"hits":{"to
Num	Relevance Score		Movie Title
1	85.5693		Aliens
2	73.71077		The Basketball Diaries
3	71.3202		Cowboys & Aliens
4	61.13922		Monsters vs Aliens
5	53.501823		Aliens vs Predator: Requiem
6	53.501823		Aliens in the Attic
7	45.221092		Dances with Wolves
8	45.221092		Friends with Benefits
9	45.221092		Fire with Fire
10	45.221092		Friends with Kids
11	39.57216		Interview with the Vampire
12	39.57216		From Russia With Love
13	39.57216		Gone with the Wind
14	39.57216		Just Go With It
15	39.57216		My Week with Marilyn
16	39.57216		From Paris with Love
17	39.57216		Trouble with the Curve
18	39.57216		Sleeping with the Enemy
19	39.57216		Hobo with a Shotgun
20	39.57216		To Rome with Love
21	35.177814		Die Hard: With a Vengeance
22	35.177814		Girl with a Pearl Earring
23	35.177814		Fun with Dick and Jane
24	31.661877		The Girl with the Dragon Tattoo
25	31.661877		The Life Aquatic With St

# 3.3.1 Query Validation API

In [7]:
# Same multi_match query as the search above, without the 'size' entry.
query = {
   'query': {
        'multi_match': { 
            'query': usersSearch,  #User's query
            'fields': ['title^10', 'overview']
        }
    }
}
# Send the query to the _validate/query endpoint with ?explain and print the
# decoded JSON reply, which includes an 'explanations' entry for the index.
httpResp = requests.get('http://localhost:9200' + 
			    '/tmdb/movie/_validate/query?explain',
			     json=query)
print(json.loads(httpResp.text))

{'_shards': {'total': 1, 'successful': 1, 'failed': 0}, 'valid': True, 'explanations': [{'index': 'tmdb', 'valid': True, 'explanation': '+((title:basketball title:with title:cartoon title:aliens)^10.0 | (overview:basketball overview:with overview:cartoon overview:aliens)) #*:*'}]}


# 3.3.3 Debugging Analysis

In [8]:
# Inner Layer of the Onion -- Why did the search engine consider these movies matches? Two sides to this
# (1) What tokens are placed in the search engine?
# (2) What did the search engine attempt to match exactly?

# Explain of what's happening when we construct these terms

#resp = requests.get(elasticSearchUrl + "/tmdb/_mapping/movie/field/title?format=yaml'
# Ask the _analyze endpoint which tokens the "title" field's analyzer produces
# for the sample text; format=yaml makes the reply easier to read than JSON.
resp = requests.get('http://localhost:9200/tmdb/_analyze?format=yaml', 
                    json={"field":"title", "text":"Fire with Fire"})
print(resp.text)

---
tokens:
- token: "fire"
  start_offset: 0
  end_offset: 4
  type: "<ALPHANUM>"
  position: 0
- token: "with"
  start_offset: 5
  end_offset: 9
  type: "<ALPHANUM>"
  position: 1
- token: "fire"
  start_offset: 10
  end_offset: 14
  type: "<ALPHANUM>"
  position: 2



# 3.3.5 -- Solving The Matching Problem

In [9]:
# Explicit mapping for the "movie" type: both text fields searched by the
# notebook's queries are switched to the 'english' analyzer.
mappingSettings = {
       'movie': {
            'properties': {
               'title': { #A
                   'type': 'text',
                   'analyzer': 'english'
               },
                'overview': {
                   'type': 'text',
                   'analyzer': 'english'
               }
            }
       }
}
# reindex() deletes and recreates the index, so the new mapping applies to
# every document that gets re-added from movieDict.
reindex(mappingSettings=mappingSettings, movieDict=movieDict) 


resp={"acknowledged":true,"shards_acknowledged":true,"index":"tmdb"}
building...
indexing...
Create index complete, resp={"took":2866,"errors":false,"items":[{"index":{"_index":"tmdb","_type":"movie","_id":"93837","_versi


In [10]:
# Repeat the earlier _analyze request unchanged, now that the index has been
# rebuilt with the 'english' analyzer on "title", to compare the token lists.
resp = requests.get('http://localhost:9200/tmdb/_analyze?format=yaml', 
                    json={"field":"title", "text":"Fire with Fire"})
print(resp.text)

---
tokens:
- token: "fire"
  start_offset: 0
  end_offset: 4
  type: "<ALPHANUM>"
  position: 0
- token: "fire"
  start_offset: 10
  end_offset: 14
  type: "<ALPHANUM>"
  position: 2



## Repeat the search

In [11]:
# Same user query and request body as the first search; only the index
# mapping has changed since then.
usersSearch = 'basketball with cartoon aliens'
query = {
    'query': {
        'multi_match': { 
            'query': usersSearch, #A
            'fields': ['title^10', 'overview'], #B  (title carries a ^10 boost)
        },
    },
    'size': '100'  # ask for up to 100 hits (passed as a string here)
}
search(query)


Http resp={"took":36,"timed_out":false,"_shards":{"total":1,"successful":1,"skipped":0,"failed":0},"hits":{"to
Num	Relevance Score		Movie Title
1	78.76022		The Basketball Diaries
2	74.090744		Alien
3	74.090744		Aliens
4	74.090744		Alien³
5	59.677		Cowboys & Aliens
6	59.677		Aliens in the Attic
7	59.677		Alien: Resurrection
8	49.95806		Monsters vs Aliens
9	42.96141		Aliens vs Predator: Requiem
10	42.96141		AVP: Alien vs. Predator
11	12.882349		Space Jam
12	7.5384703		Grown Ups
13	7.4996777		Speed Racer
14	7.2440877		Semi-Pro
15	7.162643		The Flintstones
16	6.943389		Coach Carter
17	6.7653713		White Men Can't Jump
18	5.845222		Meet Dave
19	5.440302		Bedazzled
20	5.3304057		High School Musical
21	5.3242		The Thing
22	5.1603985		The Darkest Hour
23	5.1603985		Invasion of the Body Snatchers
24	5.0822196		Slither
25	5.052847		District 9
26	5.0258336		Teen Wolf
27	4.8694844		Independence Day
28	4.8694844		Dude, Where’s My Car?
29	4.8694844		Edge of Tomorrow
30	4.8612785		Escape from Planet Ea

# 3.4.1 Decomposing Relevance Score With Lucene’s Explain

In [12]:
# Re-run the previous query with explain turned on so that every hit carries
# an '_explanation' tree describing how its score was computed.
query['explain'] = True
httpResp = requests.get('http://localhost:9200/tmdb/movie/_search', json=query)
jsonResp = json.loads(httpResp.text)
explainedHits = jsonResp['hits']['hits']

# Raw explain JSON for the top hit, for comparison with the flattened form.
print(json.dumps(explainedHits[0]['_explanation'], indent=True))

# Flattened explains for the first four hits and for the hit at index 10.
for hitIndex in (0, 1, 2, 3, 10):
    explainedHit = explainedHits[hitIndex]
    print("Explain for %s" % explainedHit['_source']['title'])
    print(simplerExplain(explainedHit['_explanation']))


{
 "value": 78.76022,
 "description": "max of:",
 "details": [
  {
   "value": 78.76022,
   "description": "sum of:",
   "details": [
    {
     "value": 78.76022,
     "description": "weight(title:basketbal in 1279) [PerFieldSimilarity], result of:",
     "details": [
      {
       "value": 78.76022,
       "description": "score(doc=1279,freq=1.0 = termFreq=1.0\n), product of:",
       "details": [
        {
         "value": 10.0,
         "description": "boost",
         "details": []
        },
        {
         "value": 7.6180873,
         "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
         "details": [
          {
           "value": 1.0,
           "description": "docFreq",
           "details": []
          },
          {
           "value": 3051.0,
           "description": "docCount",
           "details": []
          }
         ]
        },
        {
         "value": 1.0338583,
         "description": "tfNorm, computed 

# 3.4.4	Fixing Space Jam vs Alien Ranking

In [13]:
# Same multi_match query, but the title weight is changed from ^10 to ^0.1.
query = {
    'query': {
        'multi_match': {
            'query': usersSearch,  #User's query
            'fields': ['title^0.1', 'overview'],
        }
    },
    'explain': True  # search() prints only rank, score and title, not the explanation
}
search(query)


Http resp={"took":38,"timed_out":false,"_shards":{"total":1,"successful":1,"skipped":0,"failed":0},"hits":{"to
Num	Relevance Score		Movie Title
1	12.882349		Space Jam
2	7.5384703		Grown Ups
3	7.4996777		Speed Racer
4	7.2440877		Semi-Pro
5	7.162643		The Flintstones
6	6.943389		Coach Carter
7	6.7653713		White Men Can't Jump
8	5.845222		Meet Dave
9	5.8005633		Aliens vs Predator: Requiem
10	5.440302		Bedazzled
