# Setup Code from Prev Chapters (run first)

In [1]:
import requests
import json
import os

# Some utilities for flattening the explain into something a bit more
# readable. Pass Explain JSON, get something readable (ironically this is what Solr's default output is :-p)
def flatten(l):
    """Flatten one level of nesting.

    Args:
        l: An iterable of iterables (e.g. a list of lists).

    Returns:
        A new list holding every item of every sublist, in order.
    """
    # The original built this list but never returned it, so callers got None.
    return [item for sublist in l for item in sublist]

def simplerExplain(explainJson, depth=0):
    """Render an explain tree as indented "value, description" lines.

    Each node contributes one line, indented two spaces per nesting level,
    immediately followed by the rendered lines of the nodes listed under its
    optional 'details' key.

    Args:
        explainJson: A dict with 'value' and 'description' keys and an
            optional 'details' list of dicts of the same form.
        depth: Nesting level of this node; controls the indentation.

    Returns:
        The rendered text, one newline-terminated line per node.
    """
    indent = " " * (depth * 2)
    pieces = [indent + "%s, %s\n" % (explainJson['value'], explainJson['description'])]
    children = explainJson['details'] if 'details' in explainJson else ()
    for child in children:
        pieces.append(simplerExplain(child, depth=depth + 1))
    return "".join(pieces)


# To speed up the pace of development, we really need to focus more heavily on the analysis and query
# settings of the search engine, rather than fiddly bits of the HTTP interface.
#
# To that end, we're going to collapse some of the code you were introduced to in chapter 3 into more general functions,
# so we can reuse them. Largely, this is the exact same code you saw in chapter 3, with some more generality.

## Analyze
## The analyze function is a helper for accessing the _analyze endpoint like we did in chapter 3. Recall,
## given a field or analyzer, passing some text to _analyze will return the token stream that results from
## that analyzer. This token stream, if you recall, shows us exactly how the search engine translates text
## into individual tokens to be consumed by the underlying data structures. When we debug analysis, we see
## which matches we should expect.
def analyze(text, field=None, analyzer=None):
    """Print the response of the tmdb index's _analyze endpoint for ``text``.

    Args:
        text: The text to run through analysis.
        field: Optional field name; when given it is placed in the query
            string and in the request body.
        analyzer: Optional analyzer name; placed in the request body when
            given, and in the query string only when ``field`` is None.

    Returns:
        None. The response text (requested as YAML) is printed.
    """
    whatToAnalyze = ''
    if field is not None:
        whatToAnalyze = "field=%s" % field
    elif analyzer is not None:
        whatToAnalyze = "analyzer=%s" % analyzer

    # Only send the selectors that were actually supplied; a JSON null for
    # 'field' or 'analyzer' carries no information for the endpoint.
    body = {'text': text}
    if field is not None:
        body['field'] = field
    if analyzer is not None:
        body['analyzer'] = analyzer

    # Interpolate with %; the original passed whatToAnalyze as a second
    # positional argument, which left a literal "%s" in the URL.
    url = "http://localhost:9200/tmdb/_analyze?%s&format=yaml" % whatToAnalyze
    resp = requests.get(url, json=body)
    print(resp.text)
    
## Search
## Next we need to wrap up our execution of query DSL queries. The function 'search' will execute the passed query DSL
## query and display the results.
## If a scoring explain is associated with the results, then it also gets displayed.
## We'll also be sure to dump the query DSL.
def search(query, verbose=False):
    """Execute a query DSL search against tmdb/movie and print the hits.

    Prints one "rank, score, title" row per hit. When ``verbose`` is true it
    also prints each hit's title, tagline, overview, id, director names, cast
    names and characters, the flattened scoring explanation (if the hit
    carries one), and finally the response of _validate/query?explain for
    the query.

    Args:
        query: The search request body; must contain a 'query' key when
            ``verbose`` is true.
        verbose: Whether to print per-hit details and the validate output.

    Returns:
        None. All output is printed.
    """
    url = 'http://localhost:9200/tmdb/movie/_search'
    httpResp = requests.get(url, json=query)
    if httpResp.status_code != 200:
        print("Search Failed <%s>" % httpResp.status_code)
        print("%s" % httpResp.text)
        # Stop here: the original fell through and tried to read 'hits'
        # from the failed response.
        return

    searchHits = json.loads(httpResp.text)['hits']
    print("Num\tRelevance Score\t\tMovie Title")
    for idx, hit in enumerate(searchHits['hits']):
        source = hit['_source']
        print("%s\t%s\t\t%s" % (idx + 1, hit['_score'], source['title']))
        if not verbose:
            continue

        # People details are only needed for the verbose dump.
        castNames = [cast['name'] for cast in source['cast']]
        castCharacters = [cast['character'] for cast in source['cast']]
        directorNames = [director['name'] for director in source['directors']]

        print("%s" % source['title'])
        print("%s" % source['tagline'])
        print("%s" % source['overview'])
        print("%s" % hit['_id'])
        print("DIRS %s" % directorNames)
        print("CAST %s" % castNames)
        print("CHAR %s" % castCharacters)
        if '_explanation' in hit:
            print("%s" % simplerExplain(hit['_explanation']))
            print("*************************************")

    if verbose:
        # Ask the validate API to explain the query.
        httpResp = requests.get('http://localhost:9200' +
                                '/tmdb/movie/_validate/query?explain',
                                json={'query': query['query']})
        print(json.loads(httpResp.text))

## Reindex
## Reindex takes analyzer and field mappings, recreates the index, and then reindexes
## TMDB movies using the _bulk index API. There are other ways for modifying the configuration
## of the index besides dropping and restarting, however for convenience and because our data
## isn't truly that large, we'll just delete and start from scratch when we need to.
def reindex(analysisSettings, mappingSettings=None, movieDict=None):
    """Drop the tmdb index, recreate it, and bulk index the given movies.

    Args:
        analysisSettings: Value placed under settings.index.analysis when the
            index is created.
        mappingSettings: Optional value placed under 'mappings' when the
            index is created; omitted when falsy.
        movieDict: Mapping whose values are movie documents, each with an
            "id" key used as the document _id. Defaults to no movies.

    Returns:
        None. The status code of each step is printed, and the response body
        is printed when index creation or the bulk request does not return 200.
    """
    # Avoid a shared mutable default argument.
    if movieDict is None:
        movieDict = {}

    # Destroy any existing index (equiv to SQL "drop table")
    resp = requests.delete("http://localhost:9200/tmdb")
    print("Delete TMDB Index <%s>" % resp.status_code)

    # Create the index with explicit settings.
    # We need to explicitly set number of shards to 1 to eliminate the impact of
    # distributed IDF on our small collection.
    # See also "Relevance is Broken!"
    # http://www.elastic.co/guide/en/elasticsearch/guide/current/relevance-is-broken.html
    settings = {
        "settings": {
            "number_of_shards": 1,
            "index": {
                "analysis": analysisSettings,
            }
        }
    }
    if mappingSettings:
        settings['mappings'] = mappingSettings
    resp = requests.put("http://localhost:9200/tmdb", json=settings)
    print("Create TMDB Index <%s>" % resp.status_code)
    if resp.status_code != 200:
        print(resp.text)

    # Bulk index every movie document into the movie type.
    print("Indexing %i movies" % len(movieDict))
    # Newline-delimited JSON: one action line followed by one document line,
    # each terminated by "\n". Joined once instead of repeated string +=.
    bulkMovies = "".join(
        json.dumps({"index": {"_index": "tmdb", "_type": "movie", "_id": movie["id"]}})
        + "\n" + json.dumps(movie) + "\n"
        for movie in movieDict.values()
    )
    resp = requests.post("http://localhost:9200/_bulk", data=bulkMovies,
                         headers={"Content-Type": "application/x-ndjson"})
    print("Bulk Index into TMDB Index <%s>" % resp.status_code)
    if resp.status_code != 200:
        print(resp.text)

## Extract
## major difference between our use of TMDB here and in chapter 3: pulling more data. Not only do we access the 
## movie endpoint, we also extract the credits -- pulling in the cast (actors and such) and extracting the director.
def extract(movieIds=None, numMovies=10000):
    """Load the cached movie dictionary from tmdb.json.

    Args:
        movieIds: When empty or None, the cached tmdb.json in the current
            working directory is read. When non-empty the cache is skipped;
            nothing visible in this file fetches individual ids, so an empty
            dict is returned in that case.
        numMovies: Not used by this function; kept so existing callers that
            pass it continue to work.

    Returns:
        The object parsed from tmdb.json, or an empty dict when the cache was
        skipped or could not be read.
    """
    if not movieIds:
        try:
            # The context manager closes the file even if parsing fails.
            with open('tmdb.json') as f:
                return json.load(f)
        except IOError as err:
            # Report the problem instead of silently ignoring it.
            print("Could not read tmdb.json: %s" % err)
    # The original fell through to `return movieDict`, a name never defined
    # inside this function (NameError, or a stale notebook global).
    return {}

# Index to ES, Chapter 5 Settings

In [2]:
# Load the movies, then rebuild the index with the settings below.
movieDict = extract([])

# Analysis: "english" is the default analyzer. "english_bigrams" tokenizes
# with the standard tokenizer, lowercases, Porter-stems, and then emits
# two-word shingles only (no single-word tokens).
analysisSettings = {
    "analyzer": {
        "default": {
            "type": "english"
        },
        "english_bigrams": {
            "type": "custom",
            "tokenizer": "standard",
            "filter": [
                "standard",
                "lowercase",
                "porter_stem",
                "bigram_filter"
            ]
        }
    },
    "filter": {
        "bigram_filter": {
            "type": "shingle",
            "max_shingle_size": 2,
            "min_shingle_size": 2,
            "output_unigrams": "false"
        }
    }
}

# Mapping: cast.name and directors.name are english-analyzed text fields,
# each with a "bigramed" sub-field analyzed by english_bigrams.
mappingSettings = {
    'movie': {
        'properties': {
            "cast": {
                'properties': {
                    'name': {
                        'type': 'text',
                        'analyzer': 'english',
                        'fields': {
                            "bigramed": {
                                "type": "text",
                                "analyzer": "english_bigrams"
                            }
                        }
                    }
                }
            },
            "directors": {
                'properties': {
                    'name': {
                        'type': 'text',
                        'analyzer': 'english',
                        'fields': {
                            "bigramed": {
                                "type": "text",
                                "analyzer": "english_bigrams"
                            }
                        }
                    }
                }
            }
        }
    }
}

reindex(analysisSettings, mappingSettings, movieDict)

Delete TMDB Index <200>
Create TMDB Index <200>
Indexing 3051 movies
Bulk Index into TMDB Index <200>


# 6.2.2, Listing 3 -- Most Fields undue promotion due to director AND cast member

In [3]:
# most_fields multi_match over title, overview and the bigramed name
# sub-fields; verbose output with scoring explanations.
usersSearch = 'star trek patrick stewart william shatner'
query = {
    'query': {
        'multi_match': {
            'query': usersSearch,  # User's query
            'fields': [
                'title',
                'overview',
                'cast.name.bigramed',
                'directors.name.bigramed',
            ],
            'type': 'most_fields',
        },
    },
    'size': 5,
    'explain': True,
}
search(query, True)

Num	Relevance Score		Movie Title
1	22.85533		Star Trek: Generations
Star Trek: Generations
Boldly go.
Captain Jean-Luc Picard and the crew of the Enterprise-D find themselves at odds with the renegade scientist Soran who is destroying entire star systems. Only one man can help Picard stop Soran's scheme...and he's been dead for seventy-eight years.
193
DIRS ['David Carson']
CAST ['Patrick Stewart', 'Jonathan Frakes', 'Brent Spiner', 'LeVar Burton', 'Michael Dorn', 'Gates McFadden', 'Marina Sirtis', 'William Shatner', 'James Doohan', 'Walter Koenig', 'Malcolm McDowell', 'Alan Ruck', 'Whoopi Goldberg', 'Thomas Dekker', 'Cameron Oppenheimer', 'Jenette Goldstein', 'Tim Russ']
CHAR ['Captain Jean-Luc Picard', 'Commander William T. Riker', 'Lt. Commander Data', 'Lt. Commander Geordi La Forge', 'Lt. Commander Worf', 'Dr. Beverly Crusher', 'Commander Deanna Troi', 'James T. Kirk', 'Montgomery Scott', 'Pavel Chekov', 'Dr. Tolian Soran', 'Capt. John Harriman', 'Guinan', "Picard's Kid", 'Ensign K

# 6.3.2, Listing 7 Query Parser

In [4]:
# query_string search over title, overview and the bigramed name sub-fields;
# verbose output with scoring explanations.
usersSearch = 'star trek patrick stewart william shatner'
query = {
    'query': {
        'query_string': {
            'query': usersSearch,  # User's query
            'fields': [
                'title',
                'overview',
                'cast.name.bigramed',
                'directors.name.bigramed',
            ],
        },
    },
    'size': 5,
    'explain': True,
}
search(query, True)


Num	Relevance Score		Movie Title
1	14.812269		Hannah Montana: The Movie
Hannah Montana: The Movie
She has the best of both worlds...now she has to choose just one.
When Miley Stewart (aka pop-star Hannah Montana) gets too caught up in the superstar celebrity lifestyle, her dad decides it's time for a total change of scenery. But sweet nibblets! Miley must trade in all the glitz and glamour of Hollywood for some ol' blue jeans on the family farm in Tennessee, and question if she can be both Miley Stewart and Hannah Montana. With a little help from her friends – and awesome guest stars Taylor Swift, Rascal Flatts and Vanessa Williams – will she figure out whether to choose Hannah or Miley?
18126
DIRS ['Peter Chelsom']
CAST ['Miley Cyrus', 'Billy Ray Cyrus', 'Emily Osment', 'Jason Earles', 'Taylor Swift', 'Lucas Till', 'Mitchel Musso', 'Moisés Arias', 'Vanessa Williams', 'Margo Martindale', 'Melora Hardin', 'Brooke Shields', 'Brandon Cyrus']
CHAR ['Miley Stewart/Hannah Montana', 'Billy Ra

# Listing 8 -- Searching fields that work in sync

In [5]:
# query_string search over title, overview and the plain (non-bigram)
# cast/director name fields.
usersSearch = 'star trek patrick stewart william shatner'
query = {
    'query': {
        'query_string': {
            'query': usersSearch,  # User's query
            'fields': [
                'title',
                'overview',
                'cast.name',
                'directors.name',
            ],  #A
        },
    },
    'size': 5,
    'explain': True,
}
search(query)


Num	Relevance Score		Movie Title
1	14.812269		Hannah Montana: The Movie
2	13.610828		Star Trek: Generations
3	13.400371		Star Trek V: The Final Frontier
4	9.460281		Star Trek IV: The Voyage Home
5	8.666756		Star Trek: Insurrection


# 6.3.5, Listing 9 -- Tuning Term-Centric Search

In [6]:
# Same query_string search as the previous listing, but cast.name carries
# a ^10 suffix in the field list.
usersSearch = 'star trek patrick stewart william shatner'
query = {
    'query': {
        'query_string': {
            'query': usersSearch,  # User's query
            'fields': [
                'title',
                'overview',
                'cast.name^10',
                'directors.name',
            ],  #A
        },
    },
    'size': 5,
    'explain': True,
}
search(query)

Num	Relevance Score		Movie Title
1	136.10828		Star Trek: Generations
2	85.860634		Miss Congeniality 2: Armed and Fabulous
3	83.55811		Conspiracy Theory
4	83.12719		Star Trek II: The Wrath of Khan
5	82.90338		Bill & Ted's Bogus Journey


# 6.4.1, Listings 10&11 Combining Fields into Custom All Fields

In [7]:
# Mapping with a custom combined field: cast.name and directors.name are both
# copied into people.name via copy_to. Every name field keeps its "bigramed"
# sub-field analyzed by english_bigrams.
mappingSettings = {
    'movie': {
        'properties': {
            # Because of a bug, you have to be very explicit
            # about analyzers for cross_field search
            "title": {
                'type': 'text',
                'analyzer': 'english',
            },
            "overview": {
                'type': 'text',
                'analyzer': 'english',
            },
            "people": {
                'properties': {
                    'name': {
                        'type': 'text',
                        'analyzer': 'english',
                        'fields': {
                            "bigramed": {
                                "type": "text",
                                "analyzer": "english_bigrams",
                            }
                        }
                    }
                }
            },
            "cast": {
                'properties': {
                    'name': {
                        'type': 'text',
                        'analyzer': 'english',
                        'copy_to': 'people.name',
                        'fields': {
                            "bigramed": {
                                "type": "text",
                                "analyzer": "english_bigrams"
                            }
                        }
                    }
                }
            },
            "directors": {
                'properties': {
                    'name': {
                        'type': 'text',
                        'analyzer': 'english',
                        'copy_to': 'people.name',
                        'fields': {
                            "bigramed": {
                                "type": "text",
                                "analyzer": "english_bigrams"
                            }
                        }
                    }
                }
            }
        }
    }
}

# Rebuild the index so the new mapping takes effect.
reindex(analysisSettings, mappingSettings, movieDict)

Delete TMDB Index <200>
Create TMDB Index <200>
Indexing 3051 movies
Bulk Index into TMDB Index <200>


# 6.4.1, Listing 12 -- Simple use of a custom all field

In [8]:
# Plain match query against the combined people.name field.
usersSearch = 'patrick stewart william shatner'
query = {
    'query': {
        'match': {
            'people.name': usersSearch,  # User's query
        },
    },
    'size': 5,
    'explain': True,
}
search(query)

Num	Relevance Score		Movie Title
1	13.336635		Star Trek: Generations
2	9.898848		Star Trek V: The Final Frontier
3	8.233552		Bill & Ted's Bogus Journey
4	8.11424		Miss Congeniality 2: Armed and Fabulous
5	7.828116		Star Trek II: The Wrath of Khan


# Listing 13 -- Searching _all

In [9]:
# Plain match query against the _all field.
# NOTE(review): the output recorded below lists no hits; presumably _all is
# not populated for this index -- confirm against the Elasticsearch version in use.
usersSearch = 'star trek patrick stewart william shatner'
query = {
    'query': {
        'match': {
            '_all': usersSearch,  # User's query
        },
    },
    'size': 5,
    'explain': True,
}
search(query)


Num	Relevance Score		Movie Title


# 6.4.2, Listing 14 -- Cross Field Search over useful fields

In [10]:
# cross_fields multi_match over title, overview and the plain name fields;
# verbose output with scoring explanations.
usersSearch = 'star trek patrick stewart william shatner'
query = {
    'query': {
        'multi_match': {
            'query': usersSearch,  # User's query
            'fields': [
                'title',
                'overview',
                'cast.name',
                'directors.name',
            ],
            'type': 'cross_fields',
        },
    },
    'size': 5,
    'explain': True,
}
search(query, True)

Num	Relevance Score		Movie Title
1	20.928247		Star Trek: Generations
Star Trek: Generations
Boldly go.
Captain Jean-Luc Picard and the crew of the Enterprise-D find themselves at odds with the renegade scientist Soran who is destroying entire star systems. Only one man can help Picard stop Soran's scheme...and he's been dead for seventy-eight years.
193
DIRS ['David Carson']
CAST ['Patrick Stewart', 'Jonathan Frakes', 'Brent Spiner', 'LeVar Burton', 'Michael Dorn', 'Gates McFadden', 'Marina Sirtis', 'William Shatner', 'James Doohan', 'Walter Koenig', 'Malcolm McDowell', 'Alan Ruck', 'Whoopi Goldberg', 'Thomas Dekker', 'Cameron Oppenheimer', 'Jenette Goldstein', 'Tim Russ']
CHAR ['Captain Jean-Luc Picard', 'Commander William T. Riker', 'Lt. Commander Data', 'Lt. Commander Geordi La Forge', 'Lt. Commander Worf', 'Dr. Beverly Crusher', 'Commander Deanna Troi', 'James T. Kirk', 'Montgomery Scott', 'Pavel Chekov', 'Dr. Tolian Soran', 'Capt. John Harriman', 'Guinan', "Picard's Kid", 'Ensign 

# 6.5.1 -- Listing 15 -- Our Search combining term-centric all field (people.name) w/ other fields 

In [11]:
# most_fields multi_match over title, overview and the combined
# people.name field.
usersSearch = 'star trek patrick stewart william shatner'
query = {
    'query': {
        'multi_match': {
            'query': usersSearch,  # User's query
            'fields': [
                'title',
                'overview',
                'people.name',
            ],
            'type': 'most_fields',
        },
    },
    'size': 5,
    'explain': True,
}
search(query)

Num	Relevance Score		Movie Title
1	24.762457		Star Trek: Generations
2	22.55778		Star Trek IV: The Voyage Home
3	18.088171		Star Trek: Nemesis
4	16.215742		Hannah Montana: The Movie
5	16.101814		Star Trek V: The Final Frontier


# 6.5.2, Listing 16 -- Searching two field groupings – people and text

In [12]:
# bool/should of two cross_fields searches: one over the bigramed people
# fields, one over the text fields (overview, title).
usersSearch = 'star trek patrick stewart william shatner'
query = {
    'query': {
        'bool': {
            'should': [  #A
                {
                    'multi_match': {
                        'query': usersSearch,  # User's query
                        'fields': [
                            'directors.name.bigramed',  #B
                            'cast.name.bigramed',
                        ],
                        'type': 'cross_fields',
                    },
                },
                {
                    'multi_match': {
                        'query': usersSearch,  # User's query
                        'fields': [
                            'overview',
                            'title',
                        ],
                        'type': 'cross_fields',
                    },
                },
            ],
        },
    },
    'size': 5,
    'explain': True,
}
search(query)

Num	Relevance Score		Movie Title
1	19.84193		Star Trek IV: The Voyage Home
2	18.392088		Star Trek: Generations
3	16.154388		Star Trek: Nemesis
4	14.226931		Hannah Montana: The Movie
5	13.281178		Star Trek: Insurrection


# 6.5.3 Listing 17 Greedy Term-Centric Paired With Highly Discriminating Like Fields

In [13]:
# bool/should of two cross_fields searches: one over the bigramed people
# fields, one over overview, title and the plain people name fields.
usersSearch = 'star trek patrick stewart william shatner'
query = {
    'query': {
        'bool': {
            'should': [  #A
                {
                    'multi_match': {
                        'query': usersSearch,  # User's query
                        'fields': [
                            'directors.name.bigramed',  #B
                            'cast.name.bigramed',
                        ],
                        'type': 'cross_fields',
                    },
                },
                {
                    'multi_match': {
                        'query': usersSearch,  # User's query
                        'fields': [
                            'overview',
                            'title',  #C
                            'directors.name',
                            'cast.name',
                        ],
                        'type': 'cross_fields',
                    },
                },
            ],
        },
    },
    'size': 5,
    'explain': True,
}
search(query)


Num	Relevance Score		Movie Title
1	31.752617		Star Trek: Generations
2	20.498749		Star Trek: Insurrection
3	20.33969		Star Trek: Nemesis
4	19.107498		Star Trek: First Contact
5	19.104988		Star Trek II: The Wrath of Khan
