# Setup Code (Listings 3 & 4) 5.2.1

In [None]:
import requests
import json
import os

# Some utilities for flattening the explain into something a bit more
# readable. Pass Explain JSON, get something readable (ironically this is what Solr's default output is :-p)
def flatten(l):
    """Flatten one level of nesting.

    Args:
        l: An iterable of iterables (for example a list of lists).

    Returns:
        A new list holding every item of every sublist, in order.
    """
    # The comprehension used to be evaluated and discarded, so callers
    # always received None; return it.
    return [item for sublist in l for item in sublist]

def simplerExplain(explainJson, depth=0):
    """Render an explain tree as indented "value, description" lines.

    Args:
        explainJson: A dict with 'value' and 'description' keys and,
            optionally, a 'details' list of child dicts of the same shape.
        depth: Nesting level of this node; each level indents by two spaces.

    Returns:
        A string with one newline-terminated line per node, parents before
        their children.
    """
    indent = " " * (depth * 2)
    pieces = [indent + "%s, %s\n" % (explainJson['value'], explainJson['description'])]
    if 'details' in explainJson:
        # Children are rendered one level deeper, in their original order.
        pieces.extend(simplerExplain(child, depth=depth + 1)
                      for child in explainJson['details'])
    return "".join(pieces)


# To speed up the pace of development, we really need to focus more heavily on the analysis and query
# settings of the search engine, rather than fiddly bits of the HTTP interface.
#
# To that end, we're going to collapse some of the code you were introduced to in chapter 3 into more general functions,
# so we can reuse them. Largely, this is the exact same code you saw in chapter 3, with some more generality.

## Analyze
## The analyze function is a helper for accessing the _analyze endpoint like we did in chapter 3. Recall,
## given a field or analyzer, passing some text to _analyze will return the token stream that results from
## that analyzer. This token stream, if you recall, shows us exactly how the search engine translates text
## into individual tokens to be consumed by the underlying data structures. When we debug analysis, we see
## which matches we should expect.
def analyze(text, field=None, analyzer=None):
    """Send text to the tmdb index's _analyze endpoint and print the reply.

    Args:
        text: The text to analyze.
        field: Optional field name to analyze against. It wins over
            `analyzer` in the query string when both are given.
        analyzer: Optional analyzer name to analyze with.

    Returns:
        None. The raw response body (YAML was requested) is printed.
    """
    whatToAnalyze = ''
    if field is not None:
        whatToAnalyze = "field=%s" % field
    elif analyzer is not None:
        whatToAnalyze = "analyzer=%s" % analyzer

    # The template must be %-formatted. Passing whatToAnalyze as a second
    # positional argument hands it to requests as `params` and leaves a
    # literal "%s" in the URL.
    url = "http://localhost:9200/tmdb/_analyze?%s&format=yaml" % whatToAnalyze

    # Only send the selectors that were actually supplied, so the body never
    # carries null 'field'/'analyzer' entries.
    body = {'text': text}
    if field is not None:
        body['field'] = field
    if analyzer is not None:
        body['analyzer'] = analyzer

    resp = requests.get(url, json=body)
    print(resp.text)
    
## Search
## Next we need to wrap up our execution of query DSL queries. The function 'search' will execute the passed query DSL
## query and display the results.
## If a scoring explain is associated with the results, then it also gets displayed.
## We'll also be sure to dump the query DSL.
def search(query, verbose=False):
    """Run a query DSL search against the tmdb movie index and print the hits.

    Args:
        query: Query DSL request body (a dict) sent as JSON to the movie
            _search endpoint. When `verbose` is True it must contain a
            'query' key, which is re-sent to _validate/query?explain.
        verbose: When True, also print each hit's title, tagline, overview,
            id, director names, cast names and cast characters, the
            flattened scoring explanation if the hit carries one, and
            finally the _validate/query?explain response.

    Returns:
        None. All output is printed.
    """
    url = 'http://localhost:9200/tmdb/movie/_search'
    httpResp = requests.get(url, json=query)
    if httpResp.status_code != 200:
        print("Search Failed <%s>" % httpResp.status_code)
        print("%s" % httpResp.text)
        # Stop here: the code below reads the response's 'hits' section and
        # previously went on to do so even after a failed request.
        return

    searchHits = json.loads(httpResp.text)['hits']
    print("Num\tRelevance Score\t\tMovie Title")
    for idx, hit in enumerate(searchHits['hits']):
        source = hit['_source']
        print("%s\t%s\t\t%s" % (idx + 1, hit['_score'], source['title']))
        if not verbose:
            continue

        # Cast/director details are only needed for the verbose listing.
        directorNames = [director['name'] for director in source['directors']]
        castNames = [cast['name'] for cast in source['cast']]
        castCharacters = [cast['character'] for cast in source['cast']]

        print("%s" % source['title'])
        print("%s" % source['tagline'])
        print("%s" % source['overview'])
        print("%s" % hit['_id'])
        print("DIRS %s" % directorNames)
        print("CAST %s" % castNames)
        print("CHAR %s" % castCharacters)
        if '_explanation' in hit:
            print("%s" % simplerExplain(hit['_explanation']))
            print("*************************************")

    if verbose:
        # Ask the engine to validate the query and print its explanation.
        httpResp = requests.get('http://localhost:9200' +
                                '/tmdb/movie/_validate/query?explain',
                                json={'query': query['query']})
        print(json.loads(httpResp.text))

## Reindex
## Reindex takes analyzer and field mappings, recreates the index, and then reindexes
## TMDB movies using the _bulk index API. There are other ways for modifying the configuration
## of the index besides dropping and restarting, however for convenience and because our data
## isn't truly that large, we'll just delete and start from scratch when we need to.
def reindex(analysisSettings, mappingSettings=None, movieDict=None):
    """Drop and recreate the tmdb index, then bulk index the given movies.

    Args:
        analysisSettings: Dict placed under settings.index.analysis when the
            index is created.
        mappingSettings: Optional dict sent as the index 'mappings'.
        movieDict: Mapping whose values are movie documents; each document
            must have an 'id' key, which is used as the document _id.
            Defaults to no movies.

    Returns:
        None. The status code of each HTTP step is printed, along with the
        response body when index creation or the bulk request is not a 200.
    """
    # Avoid a shared mutable default argument.
    if movieDict is None:
        movieDict = {}

    # Destroy any existing index (equiv to SQL "drop table")
    resp = requests.delete("http://localhost:9200/tmdb")
    print("Delete TMDB Index <%s>" % resp.status_code)

    # Create the index with explicit settings
    # We need to explicitly set number of shards to 1 to eliminate the impact of
    # distributed IDF on our small collection
    # See also "Relevance is Broken!"
    # http://www.elastic.co/guide/en/elasticsearch/guide/current/relevance-is-broken.html
    settings = {
        "settings": {
            "number_of_shards": 1,
            "index": {
                "analysis": analysisSettings,
            }
        }
    }
    if mappingSettings:
        settings['mappings'] = mappingSettings
    resp = requests.put("http://localhost:9200/tmdb", json=settings)
    print("Create TMDB Index <%s>" % resp.status_code)
    if resp.status_code != 200:
        print(resp.text)

    # Bulk index every movie document to the movie endpoint
    print("Indexing %i movies" % len(movieDict))
    # Collect the newline-delimited action/document pairs and join once,
    # instead of growing one big string inside the loop.
    bulkLines = []
    for movie in movieDict.values():
        addCmd = {"index": {"_index": "tmdb", "_type": "movie", "_id": movie["id"]}}
        bulkLines.append(json.dumps(addCmd) + "\n")
        bulkLines.append(json.dumps(movie) + "\n")
    bulkMovies = "".join(bulkLines)
    resp = requests.post("http://localhost:9200/_bulk", data=bulkMovies,
                         headers={"Content-Type": "application/x-ndjson"})
    print("Bulk Index into TMDB Index <%s>" % resp.status_code)
    if resp.status_code != 200:
        # Same treatment as index creation: show why the request failed.
        print(resp.text)

## Extract
## A major difference between our use of TMDB here and in chapter 3: we pull more data. Not only do we access the
## movie endpoint, we also extract the credits -- pulling in the cast (actors and such) and extracting the director.
def extract(movieIds=None, numMovies=10000):
    """Load movie documents from the local 'tmdb.json' file.

    Args:
        movieIds: Optional iterable of movie ids. When given and non-empty,
            only entries whose top-level key equals str(id) are returned.
            NOTE(review): this assumes tmdb.json is keyed by movie id --
            confirm against the file.
        numMovies: Accepted for interface compatibility; not used here.

    Returns:
        A dict parsed from tmdb.json, or an empty dict when the file cannot
        be opened or read.
    """
    try:
        # `with` closes the file handle, which was previously leaked.
        with open('tmdb.json') as f:
            movies = json.load(f)
    except IOError as err:
        # The old fall-through returned a name this function never defined.
        # Report the problem and hand back an empty collection instead.
        print("Could not read tmdb.json: %s" % err)
        return {}

    if movieIds:
        # JSON object keys are always strings, so compare as strings.
        wanted = {str(movieId) for movieId in movieIds}
        movies = {key: movie for key, movie in movies.items() if key in wanted}
    return movies

# 5.2.2 -- Listing 4, Index to ES, Search

In [None]:
# Get the movies from extract(), rebuild the index with the English analyzer
# as the default, then run the same search twice: terse, then verbose.
movieDict = extract()

analysis = {"analyzer": {"default": {"type": "english"}}}
reindex(analysisSettings=analysis, movieDict=movieDict)

usersSearch = 'basketball with cartoon aliens'
query = dict(
    query={
        'multi_match': {
            'query': usersSearch,  # the user's search text
            'fields': ['title^0.1', 'overview'],
        },
    },
    size=5,
    explain=True,
)
search(query)
print("===============")
search(query, verbose=True)

# 5.2.4 -- Listing 5 Inspecting Nested Star Trek Docs

In [None]:
# Fetch a single indexed movie document by id and pretty-print its source.
spaceJamId = 2300
httpResp = requests.get("http://localhost:9200/tmdb/movie/%s" % spaceJamId)
spaceJamDoc = json.loads(httpResp.text)
if '_source' in spaceJamDoc:
    print(json.dumps(spaceJamDoc['_source'], indent=True))
else:
    # The response carries no '_source' (for example the document is not
    # indexed); show what came back instead of failing with a KeyError.
    print("Could not fetch movie %s <%s>" % (spaceJamId, httpResp.status_code))
    print(httpResp.text)

# 5.3.1, Listing 6 Star Trek Query Using Query from Ch 3

In [None]:
# best_fields multi_match over title, overview, cast names and director
# names; run once terse and once verbose.
usersSearch = 'patrick stewart'
query = {
    'query': {
        'multi_match': {
            'query': usersSearch,  # User's query
            'fields': ['title', 'overview', 'cast.name', 'directors.name'],
            'type': 'best_fields'
        }
    },
    'size': 50,
    'explain': True
}
search(query)
# print() call instead of the Python 2 print statement, which is a
# SyntaxError under Python 3 and inconsistent with the rest of the notebook.
print("===============")
search(query, verbose=True)

# 5.3.2 -- Listing 7 -- Reducing the Impact of directors.name

In [None]:
# Same search as before, with directors.name carrying a ^0.1 boost.
usersSearch = 'patrick stewart'
query = {'query': {'multi_match': {
    'query': usersSearch,  # the user's search text
    'fields': ['title', 'overview', 'cast.name', 'directors.name^0.1'],
}}}
search(query)


# 5.3.3 -- Listings 8&9  – Analysis Extracting English Bigrams

In [None]:
# Analysis: keep English as the default analyzer and add an
# "english_bigrams" analyzer whose shingle filter emits two-word shingles
# only (output_unigrams is "false").
analysisSettings = {
    "analyzer": {
        "default": {"type": "english"},
        "english_bigrams": {
            "type": "custom",
            "tokenizer": "standard",
            "filter": ["standard", "lowercase", "porter_stem", "bigram_filter"],
        },
    },
    "filter": {
        "bigram_filter": {
            "type": "shingle",
            "max_shingle_size": 2,
            "min_shingle_size": 2,
            "output_unigrams": "false",
        },
    },
}

# From listing 9: cast.name and directors.name share one mapping -- an
# English-analyzed string with a "bigramed" sub-field using english_bigrams.
mappingSettings = {
    'movie': {
        'properties': {
            person: {
                'properties': {
                    'name': {
                        'type': 'string',
                        'analyzer': 'english',
                        'fields': {
                            "bigramed": {
                                "type": "string",
                                "analyzer": "english_bigrams",
                            },
                        },
                    },
                },
            }
            for person in ("cast", "directors")
        },
    },
}

reindex(analysisSettings, mappingSettings, movieDict)

# 5.3.3 -- Listing 10 -- Searching *.bigramed fields, reindexing

In [None]:
# Search the *.bigramed sub-fields for people names instead of the plain
# name fields.
usersSearch = 'patrick stewart'
query = {
    'query': {
        'multi_match': {
            'query': usersSearch,  # the user's search text
            'fields': [
                'title',
                'overview',
                'cast.name.bigramed',
                'directors.name.bigramed',
            ],
        },
    },
}
search(query)


# 5.3.4	Letting Losers Share The Glory (no listing number)

In [None]:
# best_fields with a tie_breaker of 0.4 and a ^5 boost on
# cast.name.bigramed; run once terse and once verbose.
usersSearch = 'star trek patrick stewart'
query = {
    'query': {
        'multi_match': {
            'query': usersSearch,  # User's query
            'fields': ['title', 'overview', 'cast.name.bigramed^5', 'directors.name.bigramed'],
            'type': 'best_fields',
            'tie_breaker': 0.4
        }
    },
    'size': 5,
    'explain': True
}
search(query)
# print() call instead of the Python 2 print statement, which is a
# SyntaxError under Python 3 and inconsistent with the rest of the notebook.
print("===============")
search(query, verbose=True)

# 5.3.5, Listing 11 Counting Multiple Signals using Most Fields 

In [None]:
# Same fields as the bigramed search, scored with type most_fields.
usersSearch = 'star trek patrick stewart'
query = {
    'query': {
        'multi_match': {
            'query': usersSearch,  # the user's search text
            'fields': [
                'title',
                'overview',
                'cast.name.bigramed',
                'directors.name.bigramed',
            ],
            'type': 'most_fields',
        },
    },
}
search(query)


# 5.3.6, Listing 12	Boosting in Most-Fields

In [None]:
# most_fields again, this time with title carrying a ^0.2 boost.
usersSearch = 'star trek patrick stewart'
query = dict(
    query={
        'multi_match': {
            'query': usersSearch,  # the user's search text
            'fields': ['title^0.2', 'overview',
                       'cast.name.bigramed', 'directors.name.bigramed'],
            'type': 'most_fields',
        },
    },
)
search(query)

# 5.3.7	When Additional Matches Don’t Matter (no listing number)

In [None]:
# most_fields search with a second actor added to the query text; top five
# hits, with scoring explanations requested.
usersSearch = 'star trek patrick stewart william shatner'
query = {
    'query': {
        'multi_match': {
            'query': usersSearch,  # the user's search text
            'fields': [
                'title',
                'overview',
                'cast.name.bigramed',
                'directors.name.bigramed',
            ],
            'type': 'most_fields',
        },
    },
    'size': 5,
    'explain': True,
}
search(query)