# Setup Code (Listings 3 & 4) 5.2.1

In [1]:
import requests
import json
import os

# Some utilities for flattening the explain into something a bit more
# readable. Pass Explain JSON, get something readable (ironically this is what Solr's default output is :-p)
def flatten(l):
    """Flatten one level of nesting.

    l: an iterable of iterables (e.g. a list of lists).
    Returns a new list holding the items of every sublist, in order.
    """
    return [item for sublist in l for item in sublist]

def simplerExplain(explainJson, depth=0):
    """Render an explain tree as indented "value, description" lines.

    explainJson: dict with 'value' and 'description' keys and, optionally,
                 a 'details' list of child dicts of the same shape.
    depth:       nesting level of this node; each level indents two spaces.
    Returns the text for this node followed by the text of all its children.
    """
    indent = " " * (depth * 2)
    pieces = [indent + "%s, %s\n" % (explainJson['value'], explainJson['description'])]
    if 'details' in explainJson:
        # Children are rendered one level deeper, in the order they appear
        pieces.extend(simplerExplain(child, depth=depth + 1)
                      for child in explainJson['details'])
    return "".join(pieces)


# To speed up the pace of development, we really need to focus more heavily on the analysis and query
# settings of the search engine, rather than fiddly bits of the HTTP interface.
#
# To that end, we're going to collapse some of the code you were introduced to in chapter 3 into more general functions,
# so we can reuse them. Largely, this is the exact same code you saw in chapter 3, with some more generality.

## Analyze
## The analyze function is a helper for accessing the _analyze endpoint like we did in chapter 3. Recall,
## given a field or analyzer, passing some text to _analyze will return the token stream that results from
## that analyzer. This token stream, if you recall, shows us exactly how the search engine translates text
## into individual tokens to be consumed by the underlying data structures. When we debug analysis, we see
## which matches we should expect.
def analyze(text, field=None, analyzer=None):
    """Send text to the tmdb index's _analyze endpoint and print the reply.

    text:     the text to analyze, sent as the request body.
    field:    if given, the request asks for analysis as that field.
    analyzer: used only when field is None; names the analyzer to apply.
    The response is requested as YAML and printed as-is; nothing is returned.
    """
    # field wins over analyzer; with neither, the selector stays empty
    if field is not None:
        selector = "field=%s" % field
    elif analyzer is not None:
        selector = "analyzer=%s" % analyzer
    else:
        selector = ''
    url = "http://localhost:9200/tmdb/_analyze?%s&format=yaml" % selector
    resp = requests.get(url, data=text)
    print(resp.text)
    
## Search
## Next we need to wrap up our execution of query DSL queries. The function 'search' will execute the passed query DSL
## query and display the results. 
## If a scoring explain is associated with the results, then it also gets displayed.
## We'll also be sure to dump the query DSL
def search(query, verbose=False):
    """Run a query DSL search against the tmdb movie index and print the hits.

    query:   dict holding the query DSL request body; it must contain a
             'query' key when verbose is True.
    verbose: when True, also print each hit's title, tagline, overview, id,
             directors, cast and characters, the scoring explanation if the
             hit carries one, and finally the result of validating the query
             with the _validate/query?explain endpoint.
    Prints a numbered table of score and title. If the search request does
    not return HTTP 200, the status and response body are printed and the
    function returns without listing any hits. Nothing is returned.
    """
    url = 'http://localhost:9200/tmdb/movie/_search'
    httpResp = requests.get(url, data=json.dumps(query))
    if httpResp.status_code != 200:
        print("Search Failed <%s>" % httpResp.status_code)
        print("%s" % httpResp.text)
        # Stop here: an error body is not expected to carry a 'hits' section,
        # so carrying on would only raise while looking it up
        return
    searchHits = json.loads(httpResp.text)['hits']
    print("Num\tRelevance Score\t\tMovie Title")
    for idx, hit in enumerate(searchHits['hits']):
        source = hit['_source']
        print("%s\t%s\t\t%s" % (idx + 1, hit['_score'], source['title']))
        if not verbose:
            continue
        # The nested people lists are only needed for the detailed output
        castNames = [cast['name'] for cast in source['cast']]
        castCharacters = [cast['character'] for cast in source['cast']]
        directorNames = [director['name'] for director in source['directors']]
        print("%s" % source['title'])
        print("%s" % source['tagline'])
        print("%s" % source['overview'])
        print("%s" % hit['_id'])
        print("DIRS %s" % directorNames)
        print("CAST %s" % castNames)
        print("CHAR %s" % castCharacters)
        if '_explanation' in hit:
            print("%s" % simplerExplain(hit['_explanation']))
            print("*************************************")

    if verbose:
        # Ask the search engine to explain how it interpreted the query itself
        httpResp = requests.get('http://localhost:9200' +
                                '/tmdb/movie/_validate/query?explain',
                                data=json.dumps({'query': query['query']}))
        print(json.loads(httpResp.text))

## Reindex
## Reindex takes analyzer and field mappings, recreates the index, and then reindexes
## TMDB movies using the _bulk index API. There are other ways for modifying the configuration
## of the index besides dropping and restarting, however for convenience and because our data
## isn't truly that large, we'll just delete and start from scratch when we need to.
def reindex(analysisSettings, mappingSettings=None, movieDict=None):
    """Drop and recreate the tmdb index, then bulk index the given movies.

    analysisSettings: dict placed under settings.index.analysis of the new index.
    mappingSettings:  optional dict used as the index's 'mappings'; omitted
                      from the request when falsy.
    movieDict:        dict whose values are movie documents, each with an
                      'id' key used as the document id; defaults to no movies.
    Prints the HTTP status of the delete, create and bulk requests (plus the
    response body when the create request does not return 200). Nothing is
    returned.
    """
    if movieDict is None:
        movieDict = {}

    # Destroy any existing index (equiv to SQL "drop table")
    resp = requests.delete("http://localhost:9200/tmdb")
    print("Delete TMDB Index <%s>" % resp.status_code)

    # Create the index with explicit settings
    # We need to explicitly set number of shards to 1 to eliminate the impact of
    # distributed IDF on our small collection
    # See also "Relevance is Broken!"
    # http://www.elastic.co/guide/en/elasticsearch/guide/current/relevance-is-broken.html
    settings = {
        "settings": {
            "number_of_shards": 1,
            "index": {
                "analysis": analysisSettings,
            }
        }
    }
    if mappingSettings:
        settings['mappings'] = mappingSettings
    resp = requests.put("http://localhost:9200/tmdb", data=json.dumps(settings))
    print("Create TMDB Index <%s>" % resp.status_code)
    if resp.status_code != 200:
        print(resp.text)

    # Bulk index every movie document to the movie endpoint
    print("Indexing %i movies" % len(movieDict))
    bulkLines = []
    for movie in movieDict.values():
        addCmd = {"index": {"_index": "tmdb", "_type": "movie", "_id": movie["id"]}}
        # Each document is an action line followed by its source line
        bulkLines.append(json.dumps(addCmd))
        bulkLines.append(json.dumps(movie))
    # Every line, including the last, is terminated by a newline
    bulkMovies = "".join(line + "\n" for line in bulkLines)
    resp = requests.post("http://localhost:9200/_bulk", data=bulkMovies)
    print("Bulk Index into TMDB Index <%s>" % resp.status_code)

## Extract
## The major difference between our use of TMDB here and in chapter 3 is that we pull more data. Not only do we access the
## movie endpoint, we also extract the credits -- pulling in the cast (actors and such) and extracting the director.
def extract(movieIds=None, numMovies=10000):
    """Load the movie documents from the local 'tmdb.json' cache.

    movieIds:  when empty or None, the cache file is read; when non-empty the
               cache is skipped.
    numMovies: not used by this listing.
    Returns the parsed JSON content of 'tmdb.json'. This listing contains no
    other data source, so an empty dict is returned when the cache is skipped
    or the file cannot be opened. A cache file that is not valid JSON still
    raises ValueError.
    """
    if not movieIds:
        try:
            # The context manager closes the file even if parsing fails
            with open('tmdb.json') as f:
                return json.load(f)
        except IOError:
            # No readable cache; fall through to the empty result
            pass
    return {}

# 5.2.2 -- Listing 4, Index to ES, Search

In [2]:
# Load the movies and rebuild the index with the English analyzer as the default
movieDict = extract()

analysis = {"analyzer": {"default": {"type": "english"}}}
reindex(analysisSettings=analysis, movieDict=movieDict)

# Match the user's text against title and overview; the ^0.1 suffix is a
# per-field boost that scales down the title's contribution
usersSearch = 'basketball with cartoon aliens'
multiMatch = {
    'query': usersSearch,  # User's query
    'fields': ['title^0.1', 'overview'],
}
query = {'query': {'multi_match': multiMatch}, 'size': 5, 'explain': True}

# Print the summary table first, then the same results in full detail
search(query)
print("===============")
search(query, verbose=True)

Delete TMDB Index <200>
Create TMDB Index <200>
Indexing 3051 movies
Bulk Index into TMDB Index <200>
Num	Relevance Score		Movie Title
1	1.0018479		Space Jam
2	0.29904303		Grown Ups
3	0.27935582		Speed Racer
4	0.27935582		The Flintstones
5	0.2563226		White Men Can't Jump
Num	Relevance Score		Movie Title
1	1.0018479		Space Jam
Space Jam
Get ready to jam.
Michael Jordan agrees to help the Looney Tunes play a basketball game against alien slavers to determine their freedom.
2300
DIRS [u'Joe Pytka']
CAST [u'Michael Jordan', u'Wayne Knight', u'Billy West', u'Dee Bradley Baker', u'Theresa Randle', u'Danny DeVito', u'Brandon Hammond', u'Larry Bird', u'Bill Murray', u'Charles Barkley', u'Patrick Ewing', u'Tyrone Bogues', u'Larry Johnson', u'Shawn Bradley', u'Ahmad Rashad', u'Del Harris', u'Vlade Divac', u'Cedric Ceballos', u'Jim Rome', u'Paul Westphal', u'Danny Ainge', u'Alonzo Mourning', u'A.C. Green', u'Charles Oakley', u'Derek Harper', u'Jeff Malone', u'Anthony Miller', u'Sharone Wright']
C

# 5.2.4 -- Listing 5 Inspecting Nested Star Trek Docs

In [14]:
# Fetch one indexed document by id and pretty-print its stored source
spaceJamId = 2300
docUrl = "http://localhost:9200/tmdb/movie/%s" % spaceJamId
httpResp = requests.get(docUrl)
spaceJamDoc = json.loads(httpResp.text)
print(json.dumps(spaceJamDoc['_source'], indent=True))

{
 "poster_path": "/9T9ucCk6wO0crRBUIkBJMRAVcKp.jpg", 
 "production_countries": [
  {
   "iso_3166_1": "US", 
   "name": "United States of America"
  }
 ], 
 "revenue": 230000000, 
 "overview": "Michael Jordan agrees to help the Looney Tunes play a basketball game against alien slavers to determine their freedom.", 
 "video": false, 
 "id": 2300, 
 "genres": [
  {
   "id": 16, 
   "name": "Animation"
  }, 
  {
   "id": 35, 
   "name": "Comedy"
  }, 
  {
   "id": 18, 
   "name": "Drama"
  }, 
  {
   "id": 14, 
   "name": "Fantasy"
  }, 
  {
   "id": 10751, 
   "name": "Family"
  }
 ], 
 "title": "Space Jam", 
 "tagline": "Get ready to jam.", 
 "vote_count": 275, 
 "homepage": "", 
 "belongs_to_collection": null, 
 "original_language": "en", 
 "status": "Released", 
 "spoken_languages": [
  {
   "iso_639_1": "cs", 
   "name": "\u010cesk\u00fd"
  }, 
  {
   "iso_639_1": "fr", 
   "name": "Fran\u00e7ais"
  }, 
  {
   "iso_639_1": "pl", 
   "name": "Polski"
  }, 
  {
   "iso_639_1": "en", 


# 5.3.1, Listing 6 Star Trek Query Using Query from Ch 3

In [15]:
# Search the text fields plus the nested cast and director names, with the
# best_fields type spelled out explicitly
usersSearch = 'patrick stewart'
searchFields = ['title', 'overview', 'cast.name', 'directors.name']
multiMatch = {
    'query': usersSearch,  # User's query
    'fields': searchFields,
    'type': 'best_fields',
}
query = {'query': {'multi_match': multiMatch}, 'size': 50, 'explain': True}

# Summary table first, then the same results in full detail
search(query)
print("===============")
search(query, verbose=True)

Num	Relevance Score		Movie Title
1	1.346026		Legion
2	1.346026		Halo 4: Forward Unto Dawn
3	1.346026		Priest
4	1.346026		Dark Skies
5	1.0749499		Drive Angry
6	1.0749499		Feast
7	1.0749499		District 13: Ultimatum
8	1.0749499		The Expendables 3
9	1.0749499		Underworld: Rise of the Lycans
10	1.0749499		My Bloody Valentine
11	0.8599599		Sex and the City 2
12	0.8599599		Sex and the City
13	0.8599599		Doubt
14	0.8599599		Sinbad: Legend of the Seven Seas
15	0.46373135		Vertigo
16	0.46373135		Star Trek: Insurrection
17	0.46373135		Gnomeo & Juliet
18	0.46373135		Star Trek: First Contact
19	0.46373135		Excalibur
20	0.38982356		X-Men: Days of Future Past
21	0.38644278		Conspiracy Theory
22	0.38644278		The Wolverine
23	0.38644278		Star Trek: Nemesis
24	0.38644278		Dune
25	0.38644278		Star Trek: Generations
26	0.38644278		X-Men
27	0.38644278		Panic Room
28	0.38644278		Robin Hood: Men in Tights
29	0.35654074		Ted
30	0.3542312		One Flew Over the Cuckoo's Nest
31	0.32015914		The Bounty Hunter
32	0.316

# 5.3.2 -- Listing 7 -- Reducing the Impact of directors.name

In [16]:
usersSearch = 'patrick stewart'
# Same fields as before, but directors.name carries a 0.1 boost  #A
searchFields = ['title', 'overview', 'cast.name', 'directors.name^0.1']
multiMatch = {
    'query': usersSearch,  # User's query
    'fields': searchFields,
}
query = {'query': multiMatch and {'multi_match': multiMatch}}
search(query)


Num	Relevance Score		Movie Title
1	0.46373135		Vertigo
2	0.46373135		Star Trek: Insurrection
3	0.46373135		Gnomeo & Juliet
4	0.46373135		Star Trek: First Contact
5	0.46373135		Excalibur
6	0.38982356		X-Men: Days of Future Past
7	0.38644278		Conspiracy Theory
8	0.38644278		The Wolverine
9	0.38644278		Star Trek: Nemesis
10	0.38644278		Dune


# 5.3.3 -- Listings 8&9  – Analysis Extracting English Bigrams

In [4]:
# Shingle filter configured to emit two-word shingles only, without unigrams
bigramFilter = {
    "type": "shingle",
    "max_shingle_size": 2,
    "min_shingle_size": 2,
    "output_unigrams": "false"
}

# Custom analyzer: standard tokenizer, then the filters in the listed order,
# ending with the bigram filter defined above
englishBigramsAnalyzer = {
    "type": "custom",
    "tokenizer": "standard",
    "filter": ["standard", "lowercase", "porter_stem", "bigram_filter"]
}

analysisSettings = {
    "analyzer": {
        "default": {"type": "english"},
        "english_bigrams": englishBigramsAnalyzer
    },
    "filter": {"bigram_filter": bigramFilter}
}


# From listing 9
# cast and directors get the same 'name' mapping: an English-analyzed string
# with an extra 'bigramed' sub-field using the english_bigrams analyzer.
# The mapping literal sits inside the comprehension so each gets its own copy.
mappingSettings = {
    'movie': {
        'properties': {
            person: {
                'properties': {
                    'name': {
                        'type': 'string',
                        'analyzer': 'english',
                        'fields': {
                            "bigramed": {
                                "type": "string",
                                "analyzer": "english_bigrams"
                            }
                        }
                    }
                }
            }
            for person in ("cast", "directors")
        }
    }
}

reindex(analysisSettings, mappingSettings, movieDict)

Delete TMDB Index <200>
Create TMDB Index <200>
Indexing 3051 movies
Bulk Index into TMDB Index <200>


# 5.3.3 -- Listing 10 -- Searching *.bigramed fields, reindexing

In [5]:
# Same search as before, but the people names are matched through their
# 'bigramed' sub-fields
usersSearch = 'patrick stewart'
searchFields = ['title', 'overview',
                'cast.name.bigramed', 'directors.name.bigramed']
multiMatch = {
    'query': usersSearch,  # User's query
    'fields': searchFields,
}
query = {'query': {'multi_match': multiMatch}}
search(query)


Num	Relevance Score		Movie Title
1	0.7283101		Star Trek: Insurrection
2	0.7283101		Gnomeo & Juliet
3	0.7283101		Star Trek: First Contact
4	0.7283101		Excalibur
5	0.63727134		Conspiracy Theory
6	0.63727134		The Wolverine
7	0.63727134		Star Trek: Nemesis
8	0.63727134		Dune
9	0.63727134		Star Trek: Generations
10	0.63727134		X-Men


# 5.3.4	Letting Losers Share The Glory (no listing number)

In [46]:
# best_fields search with a tie_breaker of 0.4 and a boost of 5 on the
# bigramed cast names
usersSearch = 'star trek patrick stewart'
searchFields = ['title', 'overview', 'cast.name.bigramed^5', 'directors.name.bigramed']
multiMatch = {
    'query': usersSearch,  # User's query
    'fields': searchFields,
    'type': 'best_fields',
    'tie_breaker': 0.4,
}
query = {'query': {'multi_match': multiMatch}, 'size': 5, 'explain': True}

# Summary table first, then the same results in full detail
search(query)
print("===============")
search(query, verbose=True)

Num	Relevance Score		Movie Title
1	0.35363546		Star Trek: Insurrection
2	0.35363546		Star Trek: First Contact
3	0.34679613		Star Trek: Generations
4	0.34285474		Star Trek: Nemesis
5	0.33423716		Star Trek
Num	Relevance Score		Movie Title
1	0.35363546		Star Trek: Insurrection
Star Trek: Insurrection
The battle for paradise has begun.
When an alien race and factions within Starfleet attempt to take over a planet that has "regenerative" properties, it falls upon Captain Picard and the crew of the Enterprise to defend the planet's people as well as the very ideals upon which the Federation itself was founded.
200
DIRS [u'Jonathan Frakes']
CAST [u'Patrick Stewart', u'Jonathan Frakes', u'Brent Spiner', u'LeVar Burton', u'Gates McFadden', u'Marina Sirtis', u'F. Murray Abraham', u'Anthony Zerbe', u'Donna Murphy', u'Gregg Henry', u'Michael Dorn']
CHAR [u'Captain Jean-Luc Picard', u'Commander William T. Riker', u'Lt. Commander Data', u'Lt. Commander Geordi La Forge', u'Doctor Beverly Crusher', u'

# 5.3.5, Listing 11 Counting Multiple Signals using Most Fields 

In [20]:
# Same fields as the bigramed search, now with the most_fields type
usersSearch = 'star trek patrick stewart'
searchFields = ['title', 'overview',
                'cast.name.bigramed', 'directors.name.bigramed']
multiMatch = {
    'query': usersSearch,  # User's query
    'fields': searchFields,
    'type': 'most_fields',
}
query = {'query': {'multi_match': multiMatch}}
search(query)


Num	Relevance Score		Movie Title
1	0.57795894		Star Trek: Generations
2	0.37984636		Star Trek: Insurrection
3	0.37984636		Star Trek: First Contact
4	0.37325242		Star Trek: Nemesis
5	0.20443419		Star Trek
6	0.16354734		Star Trek: The Motion Picture
7	0.16354734		Star Trek Into Darkness
8	0.14310393		Star Trek VI: The Undiscovered Country
9	0.14310393		Star Trek V: The Final Frontier
10	0.14310393		Star Trek IV: The Voyage Home


# 5.3.6, Listing 12	Boosting in Most-Fields

In [21]:
# most_fields search again, this time with a 0.2 boost on title
usersSearch = 'star trek patrick stewart'
searchFields = ['title^0.2', 'overview',
                'cast.name.bigramed', 'directors.name.bigramed']
multiMatch = {
    'query': usersSearch,  # User's query
    'fields': searchFields,
    'type': 'most_fields',
}
query = {'query': {'multi_match': multiMatch}}
search(query)

Num	Relevance Score		Movie Title
1	0.21698442		Star Trek: Generations
2	0.13826816		Star Trek: Insurrection
3	0.13826816		Star Trek: First Contact
4	0.13055275		Star Trek: Nemesis
5	0.054496057		Hannah Montana: The Movie
6	0.04985989		Ted
7	0.047840547		Star Trek
8	0.044039465		The Beaver
9	0.038272437		Star Trek: The Motion Picture
10	0.038272437		Star Trek Into Darkness


# 5.3.7	When Additional Matches Don’t Matter (no listing number)

In [22]:
# most_fields search with a second actor's name added to the user's query
usersSearch = 'star trek patrick stewart william shatner'
searchFields = ['title', 'overview', 'cast.name.bigramed', 'directors.name.bigramed']
multiMatch = {
    'query': usersSearch,  # User's query
    'fields': searchFields,
    'type': 'most_fields',
}
query = {'query': {'multi_match': multiMatch}, 'size': 5, 'explain': True}
search(query)

Num	Relevance Score		Movie Title
1	0.5415871		Star Trek V: The Final Frontier
2	0.39785004		Star Trek: Generations
3	0.35108924		Star Trek IV: The Voyage Home
4	0.3037074		Star Trek: Nemesis
5	0.19542062		Star Trek: Insurrection
