# Setup

In [1]:
from elasticsearch import (
    Elasticsearch,
    helpers
)
import pickle

In [2]:
# Create the Elasticsearch client shared by every cell below.
# No hosts are passed, so the client's default connection settings apply
# (presumably a local node -- TODO confirm for your environment).
es = Elasticsearch()

In [3]:
# Load the movie documents that get indexed below.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load a movies_list.p that comes from a trusted source.
# The `with` block guarantees the file handle is closed even if unpickling fails.
with open("../../movies_list.p", "rb") as movies_file:
    movies = pickle.load(movies_file)

IOError: [Errno 2] No such file or directory: '../../movies_list.p'

In [4]:
# Start from a clean slate: drop any existing "tmdb" index.
# ignore=404 tells the client to treat "index does not exist" as a non-error,
# while every other failure (e.g. the cluster being unreachable) now surfaces
# here instead of being hidden by a bare `except: pass`.
es.indices.delete(index="tmdb", ignore=404)

# Explicit mapping for the "movie" type:
# - genres.name is not analyzed, so a multi-word genre such as
#   'science fiction' is kept as one term instead of being split on whitespace.
# - title is analyzed with the "english" analyzer.
# TODO: maybe create a text field combining title and overview to search against.
body = {
    "mappings": {
        "movie": {
            "properties": {
                "genres": {
                    "properties": {
                        "name": {
                            "type": "string",
                            "index": "not_analyzed",
                        },
                    },
                },
                "title": {
                    "type": "string",
                    "analyzer": "english",
                },
            },
        },
    },
}
es.indices.create(index="tmdb", body=body)

{u'acknowledged': True}

In [5]:
# Bulk-indexing helpers
def format_doc(doc):
    """Wrap one movie document in a bulk action dict.

    The action targets index "tmdb" with type "movie", takes its ``_id``
    from ``doc['id']`` and carries ``doc`` itself (not a copy) as ``_source``.

    Raises:
        KeyError: if ``doc`` has no ``'id'`` key.
    """
    return dict(
        _index="tmdb",
        _type="movie",
        _id=doc['id'],
        _source=doc,
    )

def index_movies():
    """Bulk-index every document in the global ``movies`` into Elasticsearch.

    Each movie is converted lazily with ``format_doc`` and streamed through
    ``helpers.streaming_bulk`` using the global ``es`` client.

    Returns:
        list: the ``details`` entry of every item that streaming_bulk
        reported as unsuccessful (empty when nothing was reported failed).

    NOTE(review): streaming_bulk is called with its default options; if those
    defaults raise on a failed document instead of yielding it (confirm the
    ``raise_on_error`` default), this list can only ever be empty.
    """
    failures = []
    pending_actions = (format_doc(movie) for movie in movies)
    for ok, info in helpers.streaming_bulk(es, pending_actions):
        if ok:
            continue
        failures.append(info)
    return failures

In [6]:
# Index all movies; `results` holds the details of any items that the bulk
# helper reported as unsuccessful.
results = index_movies()

# Examples

In [28]:
# Basic facet example: a terms aggregation over genres.name with no query,
# requesting zero hits (size=0) so only the aggregation buckets come back.
genre_terms = {"terms": {"field": "genres.name"}}
body = {"aggs": {"genres": genre_terms}}
es.search(index="tmdb", body=body, size=0)

{u'_shards': {u'failed': 0, u'successful': 5, u'total': 5},
 u'aggregations': {u'genres': {u'buckets': [{u'doc_count': 7546,
     u'key': u'Drama'},
    {u'doc_count': 5342, u'key': u'Comedy'},
    {u'doc_count': 3878, u'key': u'Thriller'},
    {u'doc_count': 3753, u'key': u'Action'},
    {u'doc_count': 2623, u'key': u'Romance'},
    {u'doc_count': 2165, u'key': u'Adventure'},
    {u'doc_count': 1981, u'key': u'Horror'},
    {u'doc_count': 1861, u'key': u'Crime'},
    {u'doc_count': 1640, u'key': u'Family'},
    {u'doc_count': 1597, u'key': u'Science Fiction'}],
   u'doc_count_error_upper_bound': 0,
   u'sum_other_doc_count': 7479}},
 u'hits': {u'hits': [], u'max_score': 0.0, u'total': 19868},
 u'timed_out': False,
 u'took': 7}

In [4]:
# Facet with a filter in place - notice the different aggregation numbers.
# The bool/filter clause keeps only documents whose genres.name term is
# "Science Fiction"; the same genre aggregation is requested alongside it.
science_fiction_only = {"term": {"genres.name": "Science Fiction"}}
body = {
    "fields": ["title"],
    "query": {"bool": {"filter": [science_fiction_only]}},
    "aggs": {"genres": {"terms": {"field": "genres.name"}}},
}
es.search(index="tmdb", body=body, size=5)

{u'_shards': {u'failed': 0, u'successful': 1, u'total': 1},
 u'aggregations': {u'genres': {u'buckets': [],
   u'doc_count_error_upper_bound': 0,
   u'sum_other_doc_count': 0}},
 u'hits': {u'hits': [], u'max_score': None, u'total': 0},
 u'timed_out': False,
 u'took': 96}

# Scratch

In [14]:
# See how the genre doc counts sum to 209 while the original language counts
# sum to 90 (the number of docs).
# NOTE(review): this query only aggregates on genres.name -- there is no
# original-language aggregation here -- so the 209/90 figures presumably come
# from an earlier version of this cell; confirm or update.
genre_match = {"match": {"genres.name": "Science Fiction"}}
genre_buckets = {"terms": {"field": "genres.name"}}
body = {
    "fields": ["title"],
    "query": genre_match,
    "aggs": {"genres": genre_buckets},
}
es.search(index="tmdb", body=body, size=100)

{u'_shards': {u'failed': 0, u'successful': 5, u'total': 5},
 u'aggregations': {u'genres': {u'buckets': [{u'doc_count': 1597,
     u'key': u'Science Fiction'},
    {u'doc_count': 753, u'key': u'Action'},
    {u'doc_count': 502, u'key': u'Thriller'},
    {u'doc_count': 466, u'key': u'Adventure'},
    {u'doc_count': 337, u'key': u'Drama'},
    {u'doc_count': 336, u'key': u'Fantasy'},
    {u'doc_count': 327, u'key': u'Horror'},
    {u'doc_count': 299, u'key': u'Comedy'},
    {u'doc_count': 188, u'key': u'Animation'},
    {u'doc_count': 164, u'key': u'Family'}],
   u'doc_count_error_upper_bound': 0,
   u'sum_other_doc_count': 361}},
 u'hits': {u'hits': [{u'_id': u'19',
    u'_index': u'tmdb',
    u'_score': 3.5879924,
    u'_type': u'movie',
    u'fields': {u'title': [u'Metropolis']}},
   {u'_id': u'38',
    u'_index': u'tmdb',
    u'_score': 3.5879924,
    u'_type': u'movie',
    u'fields': {u'title': [u'Eternal Sunshine of the Spotless Mind']}},
   {u'_id': u'95',
    u'_index': u'tmdb',


In [16]:
# Facet with a filter in place: match "star trek" against the title, restrict
# to documents whose genres.name term is "Science Fiction", and aggregate the
# genres of what remains.
# NOTE(review): the `filtered` query is not available in newer Elasticsearch
# releases (a `bool` query with a `filter` clause replaces it) -- confirm the
# server version before reusing this cell.
title_match = {"match": {"title": "star trek"}}
genre_term = {"term": {"genres.name": "Science Fiction"}}
body = {
    "fields": ["title"],
    "query": {"filtered": {"query": title_match, "filter": genre_term}},
    "aggs": {"genres": {"terms": {"field": "genres.name"}}},
}
es.search(index="tmdb", body=body, size=100)

{u'_shards': {u'failed': 0, u'successful': 5, u'total': 5},
 u'aggregations': {u'genres': {u'buckets': [{u'doc_count': 36,
     u'key': u'Science Fiction'},
    {u'doc_count': 28, u'key': u'Adventure'},
    {u'doc_count': 27, u'key': u'Action'},
    {u'doc_count': 9, u'key': u'Thriller'},
    {u'doc_count': 6, u'key': u'Fantasy'},
    {u'doc_count': 4, u'key': u'Animation'},
    {u'doc_count': 4, u'key': u'Comedy'},
    {u'doc_count': 3, u'key': u'Drama'},
    {u'doc_count': 2, u'key': u'Family'},
    {u'doc_count': 1, u'key': u'Mystery'}],
   u'doc_count_error_upper_bound': 0,
   u'sum_other_doc_count': 1}},
 u'hits': {u'hits': [{u'_id': u'13475',
    u'_index': u'tmdb',
    u'_score': 6.032624,
    u'_type': u'movie',
    u'fields': {u'title': [u'Star Trek']}},
   {u'_id': u'152',
    u'_index': u'tmdb',
    u'_score': 5.22185,
    u'_type': u'movie',
    u'fields': {u'title': [u'Star Trek: The Motion Picture']}},
   {u'_id': u'193',
    u'_index': u'tmdb',
    u'_score': 5.0279865,


# Trash

In [25]:
# Bare request-body literal (never sent to Elasticsearch in this cell): a
# `filtered` query that keeps only the "Science Fiction" genres.name term,
# plus a terms aggregation over genres.name. The cell merely echoes the dict.
{ "query": {
    "filtered": {
      "filter": {
        "term": {
          "genres.name": "Science Fiction"}}}},
  "aggs": {
    "genres": {
      "terms": {
        "field": "genres.name"}}}}


{'aggs': {'genres': {'terms': {'field': 'genres.name'}}},
 'query': {'filtered': {'filter': {'term': {'genres.name': 'Science Fiction'}}}}}

In [27]:
# The ten genre buckets copied from the unfiltered facet output above;
# summing their doc_count values gives the number of genre assignments
# covered by those ten buckets.
x = [
    {'doc_count': doc_count, 'key': genre}
    for genre, doc_count in (
        ('Drama', 7546),
        ('Comedy', 5342),
        ('Thriller', 3878),
        ('Action', 3753),
        ('Romance', 2623),
        ('Adventure', 2165),
        ('Horror', 1981),
        ('Crime', 1861),
        ('Family', 1640),
        ('Science Fiction', 1597),
    )
]
sum(bucket['doc_count'] for bucket in x)

32386

In [5]:
# List the section names reported under 'indices' in the cluster stats response.
es.cluster.stats()['indices'].keys()

[u'count',
 u'completion',
 u'fielddata',
 u'docs',
 u'segments',
 u'shards',
 u'id_cache',
 u'filter_cache',
 u'percolate',
 u'store']