In [36]:
from opensearchpy import OpenSearch
from IPython.display import JSON
import json



# Optional, enable client-side caching for TMDB
# Requires: https://httpcache.readthedocs.org/en/latest/
#from httpcache import CachingHTTPAdapter
#tmdb_api.mount('https://', CachingHTTPAdapter())
#tmdb_api.mount('http://', CachingHTTPAdapter())

# Some utilities for flattening the explain into something a bit more
# readable. Pass Explain JSON, get something readable (ironically this is what Solr's default output is :-p)
def flatten(l):
    """Flatten one level of nesting.

    Args:
        l: an iterable of iterables (e.g. a list of lists).

    Returns:
        A new list containing every item of every sub-iterable, in order.
    """
    # The original built this list but never returned it, so callers got None.
    return [item for sublist in l for item in sublist]

def simplerExplain(explainJson, depth=0):
    """Render an explain tree as indented "value, description" lines.

    Args:
        explainJson: dict with 'value' and 'description' keys and an optional
            'details' list of child dicts of the same shape.
        depth: current nesting level; each level indents by two spaces.

    Returns:
        A string with one newline-terminated line per node, parents first.
    """
    indent = " " * (depth * 2)
    pieces = [indent + "%s, %s\n" % (explainJson['value'], explainJson['description'])]
    if 'details' in explainJson:
        # Children are rendered one level deeper, in their original order.
        pieces.extend(simplerExplain(child, depth=depth + 1)
                      for child in explainJson['details'])
    return "".join(pieces)

In [5]:
import sys
!{sys.executable} -m pip install opensearch-dsl



In [6]:
from opensearchpy import OpenSearch
from opensearch_dsl import Search
import json

In [7]:
def extract():
    """Load the movie corpus from ``tmdb.json`` in the current working directory.

    Returns:
        The parsed JSON content of the file.

    Raises:
        FileNotFoundError: if ``tmdb.json`` does not exist (``open`` raises; the
            original ``return {}`` fallback could never be reached).
        json.JSONDecodeError: if the file is not valid JSON.
    """
    # Context manager closes the handle; explicit UTF-8 avoids depending on
    # the platform's default locale encoding.
    with open('tmdb.json', encoding='utf-8') as f:
        return json.load(f)

In [8]:
# Shared OpenSearch client used by the cells below.
# NOTE(review): the credentials are hard-coded here; consider reading them
# from the environment before sharing this notebook.
client = OpenSearch(
    hosts = [{'host': 'localhost', 'port': 9200}],  # single local node
    http_compress = True,
    http_auth = ('admin','admin'),
    use_ssl = False  # plain HTTP, matching the http:// URLs used by the requests-based helpers
)

## 3.3 Indexing with OpenSearch Bulk API

In [9]:
def reindex(client: OpenSearch, analysisSettings=None, mappingSettings=None, movieDict=None):
    """Drop and recreate the ``tmdb`` index, then bulk-index every movie.

    Args:
        client: OpenSearch client used for the index-management and bulk calls.
        analysisSettings: optional analysis configuration; when non-empty it is
            placed under ``settings.analysis`` of the new index.
        mappingSettings: optional mappings; when non-empty they are sent as the
            ``mappings`` section of the index-creation body.
        movieDict: mapping whose values are movie documents. Each document must
            contain an ``"id"`` key, which becomes the document ``_id``.

    Returns:
        None. Progress is reported with ``print``.
    """
    settings = {
        "settings": {
            "number_of_shards": 1,
            "number_of_replicas": 1
        }}
    if analysisSettings:
        settings["settings"]["analysis"] = analysisSettings
    if mappingSettings:
        settings["mappings"] = mappingSettings

    index_exists = client.indices.exists(index="tmdb")
    print(index_exists)
    if index_exists:
        resp = client.indices.delete(index="tmdb")
        print(f"deleted tmdb: {resp}")
    # The original created the index without a body, so the settings and
    # mappings assembled above were silently dropped.
    client.indices.create(index="tmdb", body=settings)

    print("building...")
    bulk_lines = []
    for movie in (movieDict or {}).values():
        addCmd = {"index": {"_index": "tmdb", "_id": movie["id"]}}
        bulk_lines.append(json.dumps(addCmd))
        bulk_lines.append(json.dumps(movie))

    if not bulk_lines:
        # An empty bulk body is not a valid request; nothing to send.
        print("no movies to index")
        return

    # Action and document lines alternate; the payload ends with a newline.
    bulkMovies = "\n".join(bulk_lines) + "\n"

    print("indexing...")
    resp = client.bulk(index='tmdb', body=bulkMovies)
    if isinstance(resp, dict) and resp.get("errors"):
        # Bulk requests can partially fail without raising; surface that.
        print("bulk indexing reported errors; inspect the per-item results")


In [10]:
# Load the movie documents once; later cells pass this dict to reindex().
movieDict = extract()

In [11]:
# Rebuild the tmdb index with default settings and no explicit mappings.
reindex(client, movieDict=movieDict)

True
deleted tmdb: {'acknowledged': True}
building...
indexing...


# 3.6 Basic Searching

In [13]:
def print_results_table(searchHits):
    """Print a tab-separated ranking table for a search result.

    Args:
        searchHits: dict with a 'hits' list; each hit must provide '_score'
            and '_source' -> 'title'.

    The header still names an Overview column, but only the rank, score and
    title are printed for each row (followed by a trailing tab).
    """
    print("Num\tRelevance Score\t\tMovie Title\tOverview")
    for position, hit in enumerate(searchHits['hits'], start=1):
        relevance = hit['_score']
        title = hit['_source']['title']
        print(f"{position}\t{relevance}\t{title}\t")

In [22]:
def search(client: OpenSearch, query, explain=False, from_=0, size=10):
    """Run ``query`` against the tmdb index and return the response's 'hits' entry.

    Args:
        client: OpenSearch client whose ``search`` method is called.
        query: request body passed through unchanged.
        explain: forwarded to ``client.search`` as ``explain``.
        from_: forwarded to ``client.search`` as ``from_``.
        size: forwarded to ``client.search`` as ``size``.

    Returns:
        ``response['hits']`` from the client call.
    """
    request_kwargs = dict(
        body=query,
        index='tmdb',
        explain=explain,
        from_=from_,
        size=size,
    )
    return client.search(**request_kwargs)['hits']





In [23]:

# The raw user query, searched across title and overview.
usersSearch = 'basketball with cartoon aliens'
query = {
   "query" : {
      "multi_match" : {
         "query": usersSearch,
         # "^10" applies a boost of 10 to the title field
         "fields": ["title^10","overview"]
      }
   },
}

    
# Request 50 hits instead of search()'s default of 10.
print_results_table(search(client, query, size=50))



Num	Relevance Score		Movie Title	Overview
1	85.56929	Aliens	
2	73.71077	The Basketball Diaries	
3	71.3202	Cowboys & Aliens	
4	61.13922	Monsters vs Aliens	
5	53.501823	Aliens vs Predator: Requiem	
6	53.501823	Aliens in the Attic	
7	45.221096	Dances with Wolves	
8	45.221096	Friends with Benefits	
9	45.221096	Fire with Fire	
10	45.221096	Friends with Kids	
11	39.572163	Interview with the Vampire	
12	39.572163	From Russia With Love	
13	39.572163	Gone with the Wind	
14	39.572163	Just Go With It	
15	39.572163	My Week with Marilyn	
16	39.572163	From Paris with Love	
17	39.572163	Trouble with the Curve	
18	39.572163	Sleeping with the Enemy	
19	39.572163	Hobo with a Shotgun	
20	39.572163	To Rome with Love	
21	35.17782	Die Hard: With a Vengeance	
22	35.17782	Girl with a Pearl Earring	
23	35.17782	Fun with Dick and Jane	
24	31.661873	The Girl with the Dragon Tattoo	
25	31.661873	The Life Aquatic With Steve Zissou	
26	31.661873	Twin Peaks: Fire Walk with Me	
27	31.661873	You Don't Mess With the Zo

### Listing 3.7. Explaining the behavior of your query

In [13]:
# Re-run the current `query` with explain=True and show the hits as JSON.
# does not return the Lucene query syntax
JSON(search(client, query, explain=True))

<IPython.core.display.JSON object>

In [38]:
# Calls the _validate/query REST endpoint directly (opensearch-py also exposes it as client.indices.validate_query)
def validate(query, explain=False, from_=0, size=10):
    """Send ``query`` to the tmdb index's _validate/query endpoint and return the parsed reply.

    Args:
        query: request body; serialized to JSON before sending.
        explain: accepted but not used; the URL always carries ``?explain``.
        from_: accepted but not used.
        size: accepted but not used.

    Returns:
        The response text decoded with ``json.loads``.
    """
    import requests
    url = "http://localhost:9200/tmdb/_validate/query?explain"
    payload = json.dumps(query)
    response = requests.get(url,
                            data=payload,
                            headers={'content-type': 'application/json'})
    return json.loads(response.text)


In [39]:
# Same multi_match query as the basic search, without any extra top-level keys.
usersSearch = 'basketball with cartoon aliens'
query = {
   "query" : {
      "multi_match" : {
         "query": usersSearch,
         "fields": ["title^10","overview"]
      }
   }
}

# Show the validation response for this query as JSON.
JSON(validate(query))

<IPython.core.display.JSON object>

In [91]:
# Calls the _analyze REST endpoint directly (opensearch-py also exposes it as client.indices.analyze)
def analyze(query):
    """Send ``query`` to the tmdb index's _analyze endpoint and return the raw response text.

    Args:
        query: request body; serialized to JSON before sending.

    Returns:
        The response body as a string (not parsed).
    """
    import requests
    url = "http://localhost:9200/tmdb/_analyze"
    payload = json.dumps(query)
    response = requests.get(url,
                            data=payload,
                            headers={'content-type': 'application/json'})
    return response.text


In [97]:

# Analyze a sample title with the "standard" analyzer.
# Note: this rebinds the shared `query` name to an _analyze request body.
query = { "text": "Fire with Fire", "analyzer": "standard" }
JSON(analyze(query))

<Response [200]>


<IPython.core.display.JSON object>

#### Rescoring

In [33]:

# multi_match query plus a rescore section that re-ranks the top window
# with a phrase match on the overview field.
usersSearch = 'basketball with cartoon aliens'
query = {
   "query" : {
      "multi_match" : {
         "query": usersSearch,
         "fields": ["title^10","overview"]
      }
   },
   "rescore" : {
      "window_size" : 50,
      "query" : {
         "rescore_query" : {
            "match_phrase" : {
               "overview" : {
                  "query" : "michael jordan",
                  "slop" : 2
               }
            }
         },
         "query_weight" : 0.7,
         "rescore_query_weight" : 2.1
      }
   }
}

    
# The recorded output shows the validate endpoint rejecting this body with
# "request does not support [rescore]".
validate(query)



'{"valid":false,"error":"org.opensearch.common.ParsingException: request does not support [rescore]"}'

In [None]:
usersSearch = 'basketball with cartoon aliens'
query = {
    'query': {
        'multi_match': { 
            'query': usersSearch,  # the user's raw search string
            'fields': ['title^10', 'overview'],  # title boosted by 10
        },
    },
    # NOTE(review): size is the string '10' here, and search() also passes
    # size=10 as an argument; confirm which one is meant to apply.
    'size': '10'
}
print_results_table(search(client, query))


# 2.3.1 Query Validation API

In [None]:


# Relies on `usersSearch` having been set by an earlier cell.
query = {
   'query': {
        'multi_match': { 
            'query': usersSearch,  #User's query
            'fields': ['title^10', 'overview']
        }
    }
}
# Fetch the hits with explain=True and display them as JSON.
resp = search(client, query, explain=True)
JSON(resp)


# 2.3.3 Debugging Analysis

In [None]:
# Inner Layer of the Onion -- Why did the search engine consider these movies matches? Two sides to this
# (1) What tokens are placed in the search engine?
# (2) What did the search engine attempt to match exactly?

import requests
from requests.auth import HTTPBasicAuth
# NOTE(review): credentials are hard-coded; consider loading them from the
# environment before sharing this notebook.
basic = HTTPBasicAuth('admin', 'admin')

def analyze(body, basic, analyzer=None):
    """POST an analyze request to the tmdb index and return the YAML response text.

    Args:
        body: dict forming the _analyze request body (e.g. ``text`` and ``field``).
        basic: auth object passed to ``requests.post`` as ``auth``.
        analyzer: optional analyzer name; when given it is added to the request
            body under the ``"analyzer"`` key.

    Returns:
        The response body as text (requested with ``format=yaml``).
    """
    # Copy so the caller's dict is left untouched. The original discarded the
    # `body` argument and always sent a hard-coded request.
    payload = dict(body)
    if analyzer is not None:
        payload["analyzer"] = analyzer

    query_string = "format=yaml"
    resp = requests.post(f'http://localhost:9200/tmdb/_analyze?{query_string}', 
        data=json.dumps(payload), auth=basic, headers={'content-type': 'application/json'})
    return resp.text

# Pass the request body explicitly; the original call referenced an undefined
# global named `body`.
print(analyze({"text": "Fire with Fire", "field": "title"}, basic))

# 2.3.5 -- Solving The Matching Problem

In [None]:
# Analyze both searchable fields with the English analyzer.
# 'text' is the field type for analyzed full-text content; the legacy
# 'string' type used previously is not accepted as a field type.
mappingSettings = {
    'properties': {
       'title': {
           'type': 'text',
           'analyzer': 'english'
       },
       'overview': {
           'type': 'text',
           'analyzer': 'english'
        }
    }
}

reindex(client, mappingSettings=mappingSettings, movieDict=movieDict) 


In [None]:
## Repeat the search

In [None]:
usersSearch = 'basketball with cartoon aliens'
query = {
    'query': {
        'multi_match': { 
            'query': usersSearch,  # the user's raw search string
            'fields': ['title^10', 'overview'],  # title boosted by 10
        },
    },
    # NOTE(review): size is the string '10' here, and search() also passes
    # size=10 as an argument; confirm which one is meant to apply.
    'size': '10'
}
JSON(search(client, query))


In [None]:
# 2.4.1	Decomposing Relevance Score With Lucene’s Explain

In [None]:
# Ask for a scoring explanation with every hit. This mutates the shared
# `query` dict defined in an earlier cell.
query['explain'] = True
httpResp = requests.get('http://localhost:9200/tmdb/_search', 
                        data=json.dumps(query), auth=basic,
                        headers={'content-type': 'application/json'})
jsonResp = json.loads(httpResp.text)

# Print a flattened explanation for the top four hits and the tenth one.
explainedHits = jsonResp['hits']['hits']
for rank in (0, 1, 2, 3, 9):
    if rank >= len(explainedHits):
        # Fewer results than expected: skip instead of raising IndexError.
        continue
    explainedHit = explainedHits[rank]
    print("Explain for %s" % explainedHit['_source']['title'])
    print(simplerExplain(explainedHit['_explanation']))



# 3.4.4	Fixing Space Jam vs Alien Ranking

In [None]:
# Relies on `usersSearch` having been set by an earlier cell.
query = {
    'query': {
        'multi_match': { 
            'query': usersSearch,  #User's query
            # title boost lowered to 0.1 (earlier cells used 10)
            'fields': ['title^0.1', 'overview'],
        }
    },
    # NOTE(review): explain is requested in the body, while search() also
    # passes its own explain argument (default False); confirm which wins.
    'explain': True
}
JSON(search(client, query))
