In [61]:
import requests
import json


# Optional, enable client-side caching for TMDB
# Requires: https://httpcache.readthedocs.org/en/latest/
#from httpcache import CachingHTTPAdapter
#tmdb_api.mount('https://', CachingHTTPAdapter())
#tmdb_api.mount('http://', CachingHTTPAdapter())

# Some utilities for flattening the explain into something a bit more
# readable. Pass Explain JSON, get something readable (ironically this is what Solr's default output is :-p)
def flatten(l):
    [item for sublist in l for item in sublist]

def simplerExplain(explainJson, depth=0):
    result = " " * (depth * 2) + "%s, %s\n" % (explainJson['value'], explainJson['description'])
    #print json.dumps(explainJson, indent=True)
    if 'details' in explainJson:
        for detail in explainJson['details']:
            result += simplerExplain(detail, depth=depth+1)
    return result

In [62]:
from opensearchpy import OpenSearch
import json



# Optional, enable client-side caching for TMDB
# Requires: https://httpcache.readthedocs.org/en/latest/
#from httpcache import CachingHTTPAdapter
#tmdb_api.mount('https://', CachingHTTPAdapter())
#tmdb_api.mount('http://', CachingHTTPAdapter())

# Some utilities for flattening the explain into something a bit more
# readable. Pass Explain JSON, get something readable (ironically this is what Solr's default output is :-p)
def flatten(l):
    [item for sublist in l for item in sublist]

def simplerExplain(explainJson, depth=0):
    result = " " * (depth * 2) + "%s, %s\n" % (explainJson['value'], explainJson['description'])
    #print json.dumps(explainJson, indent=True)
    if 'details' in explainJson:
        for detail in explainJson['details']:
            result += simplerExplain(detail, depth=depth+1)
    return result

In [63]:
import sys
!{sys.executable} -m pip install opensearch-dsl



In [64]:
from opensearchpy import OpenSearch
from opensearch_dsl import Search
import json

In [65]:
def extract():
    f = open('tmdb.json')
    if f:
         return json.loads(f.read());        
    return {}

In [66]:
client = OpenSearch(
    hosts = [{'host': 'localhost', 'port': 9200}],
    http_compress = True,
    http_auth = ('admin','admin'),
    use_ssl = False
)

In [67]:
def reindex(client: OpenSearch, analysisSettings={}, mappingSettings={}, movieDict={}):
    settings = { #A
        "settings": {
            "number_of_shards": 1, #B
            "number_of_replicas": 1
        }}

    if mappingSettings:
        settings['mappings'] = mappingSettings #C
    
    resp = client.indices.delete("tmdb")
    resp = client.indices.create("tmdb") 

    bulkMovies = ""
    print("building...")
    for id, movie in movieDict.items(): 
        addCmd = {"index": {"_index": "tmdb", #E
                            "_id": movie["id"]}}
        bulkMovies += json.dumps(addCmd) + "\n" + json.dumps(movie) + "\n"

    print("indexing...")
    resp = client.bulk(index='tmdb', body=bulkMovies)


In [68]:
movieDict = extract()
#movieDict

In [89]:
reindex(client, movieDict=movieDict)

building...
indexing...


# 3.2.3 Basic Searching

In [109]:
def print_results_table(searchHits):
    print("Num\tRelevance Score\t\tMovie Title\tOverview") #B
    idx = 0 
    for hit in searchHits['hits']: 
        idx = idx + 1
        score = hit['_score']
        print(f"{idx}\t{score}\t{hit['_source']['title']}\t{hit['_source']['overview']}")

In [91]:
def search(client: OpenSearch, query, explain=False):
    resp = client.search(body=query,
                        index = 'tmdb',
                        explain=explain)
    
    searchHits = resp['hits']
    return searchHits





In [136]:
usersSearch = 'basketball with cartoon aliens'
query = {
   "query" : {
      "multi_match" : {
         "query": usersSearch,
         "fields": ["title^10","overview"]
      }
   },
   "rescore" : {
      "window_size" : 50,
      "query" : {
         "rescore_query" : {
            "match_phrase" : {
               "overview" : {
                  "query" : "michael jordan",
                  "slop" : 2
               }
            }
         },
         "query_weight" : 0.7,
         "rescore_query_weight" : 2.1
      }
   }
}

    
print_results_table(search(client, query))



Num	Relevance Score		Movie Title	Overview
1	59.898502	Aliens	When Ripley's lifepod is found by a salvage crew over 50 years later, she finds that terra-formers are on the very planet they found the alien species. When the company sends a family of colonists out to investigate her story... all contact is lost with the planet and colonists. They enlist Ripley and the colonial marines to return and search for answers.
2	51.597538	The Basketball Diaries	Film adaptation of street tough Jim Carroll's epistle about his kaleidoscopic free fall into the harrowing world of drug addiction.
3	49.924137	Cowboys & Aliens	A stranger stumbles into the desert town of Absolution with no memory of his past and a futuristic shackle around his wrist. With the help of mysterious beauty Ella and the iron-fisted Colonel Dolarhyde, he finds himself leading an unlikely posse of cowboys, outlaws, and Apache warriors against a common enemy from beyond this world in an epic showdown for survival.
4	42.797455	Monst

In [21]:
usersSearch = 'basketball with cartoon aliens'
query = {
    'query': {
        'multi_match': { 
            'query': usersSearch, #A
            'fields': ['title^10', 'overview'], #B
        },
    },
    'size': '100'
}
print_results_table(search(client, query))


Num	Relevance Score		Movie Title	
1	85.56929	Aliens	
2	73.71077	The Basketball Diaries	
3	71.3202	Cowboys & Aliens	
4	61.13922	Monsters vs Aliens	
5	53.501823	Aliens vs Predator: Requiem	
6	53.501823	Aliens in the Attic	
7	45.221096	Dances with Wolves	
8	45.221096	Friends with Benefits	
9	45.221096	Fire with Fire	
10	45.221096	Friends with Kids	
11	39.572163	Interview with the Vampire	
12	39.572163	From Russia With Love	
13	39.572163	Gone with the Wind	
14	39.572163	Just Go With It	
15	39.572163	My Week with Marilyn	
16	39.572163	From Paris with Love	
17	39.572163	Trouble with the Curve	
18	39.572163	Sleeping with the Enemy	
19	39.572163	Hobo with a Shotgun	
20	39.572163	To Rome with Love	
21	35.17782	Die Hard: With a Vengeance	
22	35.17782	Girl with a Pearl Earring	
23	35.17782	Fun with Dick and Jane	
24	31.661873	The Girl with the Dragon Tattoo	
25	31.661873	The Life Aquatic With Steve Zissou	
26	31.661873	Twin Peaks: Fire Walk with Me	
27	31.661873	You Don't Mess With the Zohan	
28	

# 2.3.1 Query Validation API

In [22]:
from IPython.display import JSON

query = {
   'query': {
        'multi_match': { 
            'query': usersSearch,  #User's query
            'fields': ['title^10', 'overview']
        }
    }
}
resp = search(client, query, explain=True)
JSON(resp)


<IPython.core.display.JSON object>

# 2.3.3 Debugging Analysis

In [30]:
# Inner Layer of the Onion -- Why did the search engine consider these movies matches? Two sides to this
# (1) What tokens are placed in the search engine?
# (2) What did the search engine attempt to match exactly?

import requests
from requests.auth import HTTPBasicAuth
basic = HTTPBasicAuth('admin', 'admin')

def analyze(body, basic):
    resp = requests.get('http://localhost:9200/tmdb/_analyze?format=yaml', 
                    data=json.dumps(body), auth=basic, headers={'content-type': 'application/json'})
    return resp.text

body = { "text": "Fire with Fire", "field": "title" }
print(analyze(body, basic))

---
tokens:
- token: "fire"
  start_offset: 0
  end_offset: 4
  type: "<ALPHANUM>"
  position: 0
- token: "with"
  start_offset: 5
  end_offset: 9
  type: "<ALPHANUM>"
  position: 1
- token: "fire"
  start_offset: 10
  end_offset: 14
  type: "<ALPHANUM>"
  position: 2



# 2.3.5 -- Solving The Matching Problem

In [32]:
mappingSettings = {
       'movie': {
            'properties': {
               'title': { #A
                   'type': 'string',
                   'analyzer': 'english'
               },
            'overview': {
                   'type': 'string',
                   'analyzer': 'english'
               }
            }
       }
}
reindex(client, mappingSettings=mappingSettings, movieDict=movieDict) 


building...
indexing...


In [33]:
print(analyze(body, basic))

---
tokens:
- token: "fire"
  start_offset: 0
  end_offset: 4
  type: "<ALPHANUM>"
  position: 0
- token: "with"
  start_offset: 5
  end_offset: 9
  type: "<ALPHANUM>"
  position: 1
- token: "fire"
  start_offset: 10
  end_offset: 14
  type: "<ALPHANUM>"
  position: 2



In [None]:
## Repeat the search

In [35]:
usersSearch = 'basketball with cartoon aliens'
query = {
    'query': {
        'multi_match': { 
            'query': usersSearch, #A
            'fields': ['title^10', 'overview'], #B
        },
    },
    'size': '100'
}
JSON(search(client, query))


<IPython.core.display.JSON object>

In [None]:
# 2.4.1	Decomposing Relevance Score With Lucene’s Explain

In [53]:
query['explain'] = True
httpResp = requests.get('http://localhost:9200/tmdb/_search', 
                        data=json.dumps(query), auth=basic,
                        headers={'content-type': 'application/json'})
jsonResp = json.loads(httpResp.text)
#print(json.dumps(jsonResp['hits']['hits'][0]['_explanation'], indent=True))
print("Explain for %s" % jsonResp['hits']['hits'][0]['_source']['title'])
print(simplerExplain(jsonResp['hits']['hits'][0]['_explanation']))
print("Explain for %s" % jsonResp['hits']['hits'][1]['_source']['title'])
print(simplerExplain(jsonResp['hits']['hits'][1]['_explanation']))
print("Explain for %s" % jsonResp['hits']['hits'][2]['_source']['title'])
print(simplerExplain(jsonResp['hits']['hits'][2]['_explanation']))
print("Explain for %s" % jsonResp['hits']['hits'][3]['_source']['title'])
print(simplerExplain(jsonResp['hits']['hits'][3]['_explanation']))
print("Explain for %s" % jsonResp['hits']['hits'][9]['_source']['title'])
print(simplerExplain(jsonResp['hits']['hits'][9]['_explanation']))



Explain for Meet Dave
9.305035, max of:
  9.305035, sum of:
    1.1264597, weight(overview:with in 799) [PerFieldSimilarity], result of:
      1.1264597, score(freq=1.0), computed as boost * idf * tf from:
        2.2, boost
        0.9614616, idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:
          1166, n, number of documents containing term
          3050, N, total number of documents with field
        0.5325508, tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:
          1.0, freq, occurrences of term within document
          1.2, k1, term saturation parameter
          0.75, b, length normalization parameter
          34.0, dl, length of field
          52.963608, avgdl, average length of field
    8.178575, weight(overview:aliens in 799) [PerFieldSimilarity], result of:
      8.178575, score(freq=2.0), computed as boost * idf * tf from:
        2.2, boost
        5.3490763, idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:
          14, n, n

# 3.4.4	Fixing Space Jam vs Alien Ranking

In [55]:
query = {
    'query': {
        'multi_match': { 
            'query': usersSearch,  #User's query
            'fields': ['title^0.1', 'overview'],
        }
    },
    'explain': True
}
JSON(search(client, query))


<IPython.core.display.JSON object>