# Setup Code (Listings 3 & 4) 5.2.1

In [107]:
import requests
import json
import os

# Some utilities for flattening the explain into something a bit more
# readable. Pass Explain JSON, get something readable (ironically this is what Solr's default output is :-p)
def flatten(l):
    [item for sublist in l for item in sublist]

def simplerExplain(explainJson, depth=0):
    result = " " * (depth * 2) + "%s, %s\n" % (explainJson['value'], explainJson['description'])
    #print json.dumps(explainJson, indent=True)
    if 'details' in explainJson:
        for detail in explainJson['details']:
            result += simplerExplain(detail, depth=depth+1)
    return result


# To speed up the pace of development, we really need to focus more heavily on the analysis and query
# settings of the search engine, rather than fidly bits of the http interface.
#
# To that end, we're going to collapse some of the code you were introduced to in chapter 3 into more general functions,
# so we can reuse them. Largely, this is the exact same code you saw in chapter 3 some more generality.

## Analyze
## The analyze function is a helper for accessing the _analyze endpoint like we did in chapter 3. Recall,
## given a field or analyzer, passing some text to _analyze will return the token stream that results from
## that analyzer. This token stream, if you recall, shows us exactly how the search engine translate text
## into individual tokens to be consumed by the underlying data structures. When we debug analysis, we see
## matches we need to expect.
def analyze(text, field=None, analyzer=None):
    whatToAnalyze = ''
    if field is not None:
        whatToAnalyze = "field=%s" % field
    elif analyzer is not None:
        whatToAnalyze = "analyzer=%s" % analyzer
    resp = requests.get("http://localhost:9200/tmdb/_analyze?%s&format=yaml" % whatToAnalyze, 
                        data=text)
    print resp.text
    
## Search
## Next we need to wrap up our execution of query DSL queries. The function 'search' will execute the passed query DSL
## query and display the results. 
## If a scoring explain is associated with the results, then it also gets displayed,
## We'll also be sure to dump the query DSL
def search(query, verbose=False):
    url = 'http://localhost:9200/tmdb/_search'
    headers = {'Content-type': 'application/json', 'Accept': 'application/json'}    

    httpResp = requests.get(url, data=json.dumps(query), headers=headers)
    if httpResp.status_code != 200:
        print "Search Failed <%s>" % httpResp.status_code
        print "%s" % httpResp.text
    searchHits = json.loads(httpResp.text)['hits']
    print "Num\tRelevance Score\t\tMovie Title"
    for idx, hit in enumerate(searchHits['hits']):
            castNames = []            
            castCharacters = []                        
            directorNames = []
            for cast in hit['_source']['cast']:
                castNames.append(cast['name'])
                castCharacters.append(cast['character'])
            for director in hit['_source']['directors']:
                directorNames.append(director['name'])
            print "%s\t%s\t\t%s" % (idx + 1, hit['_score'], hit['_source']['title'])
            if verbose:
                print "%s" % hit['_source']['title']
                print "%s" % hit['_source']['tagline']        
                print "%s" % hit['_source']['overview']        
                print "%s" % hit['_id']
                print "DIRS %s" % directorNames
                print "CAST %s" % castNames
                print "CHAR %s" % castCharacters
                if '_explanation' in hit:
                    print "%s" % simplerExplain(hit['_explanation'])
                    print "*************************************"
    
    if verbose:
        httpResp = requests.get('http://localhost:9200' + 
                    '/tmdb/movie/_validate/query?explain',
                     data=json.dumps({'query': query['query']}),
                               headers=headers)
        print json.loads(httpResp.text)

## Reindex
## Reindex takes analyzer and field mappings, recreates the index, and then reindexes
## TMDB movies using the _bulk index API. There are other ways for modifying the configuration
## of the index besides dropping and restarting, however for convenience and because our data
## isn't truly that large, we'll just delete and start from scratch when we need to.
def reindex(analysisSettings, mappingSettings=None, movieDict={}):
    # Destroy any existing index (equiv to SQL "drop table")
    headers = {'Content-type': 'application/json', 'Accept': 'application/json'}    

    resp = requests.delete("http://localhost:9200/tmdb", headers=headers)
    print "Delete TMDB Index <%s>" % resp.status_code
    
    # Create the index with explicit settings
    # We need to explicitely set number of shards to 1 to eliminate the impact of 
    # distributed IDF on our small collection
    # See also "Relavance is Broken!"
    # http://www.elastic.co/guide/en/elasticsearch/guide/current/relevance-is-broken.html
    settings = {
        "settings": {
            "number_of_shards": 1,
            "index": {
                "analysis" : analysisSettings,
            }
        }
    }
    if mappingSettings:
        settings['mappings'] = mappingSettings
    resp = requests.put("http://localhost:9200/tmdb", data=json.dumps(settings),headers=headers)
    print "Create TMDB Index <%s>" % resp.status_code
    if resp.status_code != 200:
        print resp.text
    
    # Bulk index title & overview to the movie endpoint
    print "Indexing %i movies" % len(movieDict.keys())
    bulkMovies = ""
    for id, movie in movieDict.iteritems():
        addCmd = {"index": {"_index": "tmdb", "_id": movie["id"]}}
        esDoc  = movie
        bulkMovies += json.dumps(addCmd) + "\n" + json.dumps(esDoc) + "\n"
    headers = {'Content-type': 'application/json', 'Accept': 'application/json'}    
    resp = requests.post("http://localhost:9200/_bulk", data=bulkMovies, headers= headers)
    print "Bulk Index into TMDB Index <%s>" % resp.status_code

## Extract
## major difference between our use of TMDB here and in chapter 3: pulling more data. Not only do we access the 
## movie endpoint, we also extract the credits -- pulling in the cast (actors and such) and extracting the director.
def extract(movieIds=[], numMovies=10000):
    if len(movieIds) == 0:
        try:
            f = open('tmdb.json')
            if f:
                return json.loads(f.read());
        except IOError:
            pass       
    return movieDict

# 5.2.2 -- Listing 4, Index to ES, Search

In [108]:
movieDict = extract()

analysis = {
    "analyzer" : {
      "default" : {
        "type" : "english"
        }
      }
   }

reindex(analysisSettings=analysis, movieDict=movieDict)

usersSearch = 'basketball with cartoon aliens'
query = {
    'query': {
        'multi_match': { 
            'query': usersSearch,  #User's query
            'fields': ['title^0.1', 'overview'],
         }
    },
    'size': 5,
    'explain': True
}
search(query)
print "==============="
search(query, verbose=True)

Delete TMDB Index <200>
Create TMDB Index <200>
Indexing 3051 movies
Bulk Index into TMDB Index <200>
Num	Relevance Score		Movie Title
1	3.750381		Invasion of the Body Snatchers
2	3.6913202		Slither
3	3.652607		District 9
4	3.5247955		Escape from Planet Earth
5	2.9117908		Batteries Not Included
Num	Relevance Score		Movie Title
1	3.750381		Invasion of the Body Snatchers
Invasion of the Body Snatchers
... there was nothing to hold onto - except each other.
A small-town doctor learns that the population of his community is being replaced by emotionless alien duplicates.
11549
DIRS [u'Don Siegel']
CAST [u'Kevin McCarthy', u'Dana Wynter', u'Larry Gates', u'King Donovan', u'Carolyn Jones', u'Jean Willes', u'Ralph Dumke', u'Tom Fadden', u'Guy Way', u'Kenneth Patterson', u'Virginia Christine', u'Eileen Stevens', u'Bobby Clark', u'Whit Bissell', u'Richard Deacon', u'Dabbs Greer', u'Sam Peckinpah']
CHAR [u'Dr. Miles J. Bennell', u'Becky Driscoll', u"Dr. Dan 'Danny' Kauffman", u'Jack Belicec', u"

# 5.2.4 -- Listing 5 Inspecting Nested Star Trek Docs

In [109]:
spaceJamId = 2300
httpResp = requests.get("http://localhost:9200/tmdb/movie/%s" % spaceJamId)
spaceJamDoc = json.loads(httpResp.text)
print json.dumps(spaceJamDoc['_source'], indent=True)

KeyError: '_source'

# 5.3.1, Listing 6 Star Trek Query Using Query from Ch 3

In [110]:
usersSearch = 'patrick stewart'
query = {
    'query': {
        'multi_match': { 
            'query': usersSearch,  #User's query
            'fields': ['title', 'overview', 'cast.name', 'directors.name'],      
            'type': 'best_fields'
         }
    },
    'size': 50,
    'explain': True
}
search(query)
print "==============="
search(query, verbose=True)

Num	Relevance Score		Movie Title
1	7.1845765		Hannah Montana: The Movie
2	7.1596975		Vertigo
3	6.9732013		Star Trek: Insurrection
4	6.9093037		One Flew Over the Cuckoo's Nest
5	6.906671		Legion
6	6.906671		Halo 4: Forward Unto Dawn
7	6.906671		Priest
8	6.906671		Dark Skies
9	6.71099		Star Trek: First Contact
10	6.5468693		Gnomeo & Juliet
11	6.5468693		Excalibur
12	6.465148		X-Men: Days of Future Past
13	6.3152065		Panic Room
14	6.2415876		Conspiracy Theory
15	6.228061		The Bounty Hunter
16	6.0993795		Star Trek: Nemesis
17	6.0993795		Star Trek: Generations
18	6.0993795		Robin Hood: Men in Tights
19	6.030678		The Wolverine
20	6.009052		Drive Angry
21	6.009052		Feast
22	6.009052		District 13: Ultimatum
23	6.009052		The Expendables 3
24	6.009052		Underworld: Rise of the Lycans
25	6.009052		My Bloody Valentine
26	5.833557		X-Men
27	5.7091494		Dune
28	5.6086707		Ted
29	5.589937		Save the Last Dance
30	5.589937		TMNT
31	5.475602		X2: X-Men United
32	5.475602		The Prince of Egypt
33	5.443809		

# 5.3.2 -- Listing 7 -- Reducing the Impact of directors.name

In [111]:
usersSearch = 'patrick stewart'
query = {
    'explain': True,
    'query': {
        'multi_match': { 
            'query': usersSearch,  #User's query
            'fields': ['title', 'overview',
                       'cast.name', 'directors.name^0.1'],  #A    
         }
    },
}
search(query)
print "==============="
search(query, verbose=True)


Num	Relevance Score		Movie Title
1	7.1845765		Hannah Montana: The Movie
2	7.1596975		Vertigo
3	6.9732013		Star Trek: Insurrection
4	6.9093037		One Flew Over the Cuckoo's Nest
5	6.71099		Star Trek: First Contact
6	6.5468693		Gnomeo & Juliet
7	6.5468693		Excalibur
8	6.465148		X-Men: Days of Future Past
9	6.3152065		Panic Room
10	6.2415876		Conspiracy Theory
Num	Relevance Score		Movie Title
1	7.1845765		Hannah Montana: The Movie
Hannah Montana: The Movie
She has the best of both worlds...now she has to choose just one.
When Miley Stewart (aka pop-star Hannah Montana) gets too caught up in the superstar celebrity lifestyle, her dad decides it's time for a total change of scenery. But sweet nibblets! Miley must trade in all the glitz and glamour of Hollywood for some ol' blue jeans on the family farm in Tennessee, and question if she can be both Miley Stewart and Hannah Montana. With a little help from her friends – and awesome guest stars Taylor Swift, Rascal Flatts and Vanessa Williams – 

# 5.3.3 -- Listings 8&9  – Analysis Extracting English Bigrams

In [112]:
movieDict = extract()

analysisSettings = {
   "analyzer" : {
      "default" : {
        "type" : "english"
      },
      "english_bigrams": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "porter_stem",
            "bigram_filter"
          ]
      }
    },
  "filter": {
    "bigram_filter": {
        "type": "shingle",
        "max_shingle_size":2,
        "min_shingle_size":2,
        "output_unigrams":"false"
    }
  }
}


# From listing 9
mappingSettings = {
        'properties': {
            "cast": {
               'properties': {
                  'name': {
                      'type': 'text',
                      'analyzer': 'english',
                      'fields': {
                         "bigramed": {
                            "type": "text",
                            "analyzer": "english_bigrams"
                        }     
                      }
                   }
                   
               }
            },
            "directors": {
               'properties': {
                  'name': {
                      'type': 'text',
                      'analyzer': 'english',
                      'fields': {
                         "bigramed": {
                            "type": "text",
                            "analyzer": "english_bigrams"
                        }     
                      }
                   }
                   
               }
            }            
        }
}
    

reindex(analysisSettings=analysisSettings, mappingSettings=mappingSettings, movieDict=movieDict)

Delete TMDB Index <200>
Create TMDB Index <200>
Indexing 3051 movies
Bulk Index into TMDB Index <200>


# 5.3.3 -- Listing 10 -- Searching *.bigramed fields, reindexing

In [113]:
usersSearch = 'patrick stewart'
query = {
    'query': {
        'multi_match': { 
            'query': usersSearch,  #User's query
            'fields': ['title', 'overview',
'cast.name.bigramed', 'directors.name.bigramed'],      
         }
    },
}
search(query)


Num	Relevance Score		Movie Title
1	5.4837284		Conspiracy Theory
2	5.062707		The Bounty Hunter


# 5.3.4	Letting Losers Share The Glory (no listing number)

In [114]:
usersSearch = 'star trek patrick stewart'
query = {
    'query': {
        'multi_match': { 
            'query': usersSearch,  #User's query
            'fields': ['title', 'overview', 'cast.name.bigramed^5', 'directors.name.bigramed'],      
            'type': 'best_fields',
            'tie_breaker': 0.4
         }
    },
    'size': 5,
    'explain': True
}
search(query)
print "==============="
search(query, verbose=True)

Num	Relevance Score		Movie Title
1	33.893368		Star Trek: Insurrection
2	32.65286		Star Trek: First Contact
3	31.895023		Star Trek: Generations
4	30.539837		Star Trek: Nemesis
5	28.853365		Gnomeo & Juliet
Num	Relevance Score		Movie Title
1	33.893368		Star Trek: Insurrection
Star Trek: Insurrection
The battle for paradise has begun.
When an alien race and factions within Starfleet attempt to take over a planet that has "regenerative" properties, it falls upon Captain Picard and the crew of the Enterprise to defend the planet's people as well as the very ideals upon which the Federation itself was founded.
200
DIRS [u'Jonathan Frakes']
CAST [u'Patrick Stewart', u'Jonathan Frakes', u'Brent Spiner', u'LeVar Burton', u'Gates McFadden', u'Marina Sirtis', u'F. Murray Abraham', u'Anthony Zerbe', u'Donna Murphy', u'Gregg Henry', u'Michael Dorn']
CHAR [u'Captain Jean-Luc Picard', u'Commander William T. Riker', u'Lt. Commander Data', u'Lt. Commander Geordi La Forge', u'Doctor Beverly Crusher', u'C

# 5.3.5, Listing 11 Counting Multiple Signals using Most Fields 

In [115]:
usersSearch = 'star trek patrick stewart'
query = {
    'query': {
        'multi_match': { 
            'query': usersSearch,  #User's query
            'fields': ['title', 'overview',
'cast.name.bigramed', 'directors.name.bigramed'],      
            'type': 'most_fields'
         }
    }
}
search(query)


Num	Relevance Score		Movie Title
1	17.783155		Star Trek: Generations
2	15.065897		Star Trek: Insurrection
3	14.39519		Star Trek: Nemesis
4	13.657165		Star Trek: First Contact
5	10.878148		Hannah Montana: The Movie
6	10.760254		Star Trek
7	9.007851		Star Trek Into Darkness
8	8.387256		Maps to the Stars
9	8.1140175		Dinosaur
10	8.019917		The Beaver


# 5.3.6, Listing 12	Boosting in Most-Fields

In [121]:
usersSearch = 'star trek patrick stewart'
query = {
    'query': {
        'multi_match': { 
            'query': usersSearch,  #User's query
            'fields': ['title^0.2', 'overview^0.2',
 'cast.name.bigramed', 'directors.name.bigramed'],      
            'type': 'most_fields'
         }
    },
}
search(query)

Num	Relevance Score		Movie Title
1	7.866503		Star Trek: Generations
2	7.8596163		Star Trek: Insurrection
3	7.4601283		Star Trek: First Contact
4	7.1889095		Star Trek: Nemesis
5	5.7706733		Gnomeo & Juliet
6	5.7706733		Excalibur
7	5.3873396		Conspiracy Theory
8	5.2706337		Robin Hood: Men in Tights
9	5.1588774		The Wolverine
10	5.1588774		X-Men


# 5.3.7	When Additional Matches Don’t Matter (no listing number)

In [124]:
usersSearch = 'star trek patrick stewart william shatner'
query = {
    'query': {
        'multi_match': { 
            'query': usersSearch,  #User's query
            'fields': ['title', 'overview', 'cast.name.bigramed', 'directors.name.bigramed'],      
            'type': 'most_fields'
         }
    },
    'size': 5,
    'explain': True
}
search(query, verbose=True)

Num	Relevance Score		Movie Title
1	23.29123		Star Trek: Generations
Star Trek: Generations
Boldly go.
Captain Jean-Luc Picard and the crew of the Enterprise-D find themselves at odds with the renegade scientist Soran who is destroying entire star systems. Only one man can help Picard stop Soran's scheme...and he's been dead for seventy-eight years.
193
DIRS [u'David Carson']
CAST [u'Patrick Stewart', u'Jonathan Frakes', u'Brent Spiner', u'LeVar Burton', u'Michael Dorn', u'Gates McFadden', u'Marina Sirtis', u'William Shatner', u'James Doohan', u'Walter Koenig', u'Malcolm McDowell', u'Alan Ruck', u'Whoopi Goldberg', u'Thomas Dekker', u'Cameron Oppenheimer', u'Jenette Goldstein', u'Tim Russ']
CHAR [u'Captain Jean-Luc Picard', u'Commander William T. Riker', u'Lt. Commander Data', u'Lt. Commander Geordi La Forge', u'Lt. Commander Worf', u'Dr. Beverly Crusher', u'Commander Deanna Troi', u'James T. Kirk', u'Montgomery Scott', u'Pavel Chekov', u'Dr. Tolian Soran', u'Capt. John Harriman', u'Gui