## Movie Indexing

In [1]:
import zipfile
import json
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook
from datetime import datetime
from elasticsearch import Elasticsearch
from elasticsearch import helpers

import boto3
#from botocore.vendored import requests
#import requests
from elasticsearch import RequestsHttpConnection
from requests_aws4auth import AWS4Auth

In [2]:
# Load the movie table from a CSV file given by a relative path
# (resolved against the notebook's working directory)
df_movie = pd.read_csv('panda_movies_r1.csv')

In [3]:
# Preview the first rows to check the columns that will be indexed
df_movie.head()

Unnamed: 0,title,year,rating,rank,genres,plot,directors,actors,release_date,running_time_secs,image_url
0,(500) Days of Summer,2009,7.8,269,"['Comedy', 'Drama', 'Romance']",An offbeat romantic comedy about a woman who d...,['Marc Webb'],"['Zooey Deschanel', 'Joseph Gordon-Levitt', 'G...",2009-01-17,5700,http://ia.media-imdb.com/images/M/MV5BMTk5MjM4...
1,+1,2013,5.6,401,"['Sci-Fi', 'Thriller']",Three college friends hit the biggest party of...,['Dennis Iliadis'],"['Rhys Wakefield', 'Logan Miller', 'Ashley Hin...",2013-03-10,5700,http://ia.media-imdb.com/images/M/MV5BMTQwOTA5...
2,10,1979,5.9,2862,"['Comedy', 'Romance']",A Hollywood songwriter goes through a mid-life...,['Blake Edwards'],"['Dudley Moore', 'Bo Derek', 'Julie Andrews']",1979-10-05,7320,http://ia.media-imdb.com/images/M/MV5BMTg1NDQ1...
3,10 Items or Less,2006,6.6,4401,"['Comedy', 'Drama', 'Romance']",An actor (Freeman) prepping for an upcoming ro...,['Brad Silberling'],"['Morgan Freeman', 'Paz Vega', 'Jonah Hill']",2006-09-11,4920,http://ia.media-imdb.com/images/M/MV5BMTI1MTU4...
4,10 Rillington Place,1971,7.5,2605,"['Biography', 'Crime', 'Drama', 'Horror', 'Thr...",,['Richard Fleischer'],"['Richard Attenborough', 'Judy Geeson', 'John ...",1971-02-10,6660,http://ia.media-imdb.com/images/M/MV5BMTc4MzM5...


In [4]:
# Split the movie table into five consecutive 1000-row chunks
# (rows 0-999, 1000-1999, ..., 4000-4999); each chunk gets its own index later on
df1, df2, df3, df4, df5 = (
    df_movie.iloc[start:start + 1000] for start in range(0, 5000, 1000)
)
# Summarise the last chunk (row count, dtypes, null counts)
df5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 4000 to 4999
Data columns (total 11 columns):
title                1000 non-null object
year                 1000 non-null int64
rating               1000 non-null float64
rank                 1000 non-null int64
genres               1000 non-null object
plot                 906 non-null object
directors            993 non-null object
actors               995 non-null object
release_date         1000 non-null object
running_time_secs    1000 non-null int64
image_url            944 non-null object
dtypes: float64(1), int64(3), object(7)
memory usage: 86.0+ KB


In [5]:
# Show the last rows of the final chunk
df5.tail()

Unnamed: 0,title,year,rating,rank,genres,plot,directors,actors,release_date,running_time_secs,image_url
4995,À bout de souffle,1960,7.9,3521,"['Crime', 'Drama']",A young car thief kills a policeman and tries ...,['Jean-Luc Godard'],"['Jean-Paul Belmondo', 'Jean Seberg', 'Daniel ...",1960-03-16,5400,http://ia.media-imdb.com/images/M/MV5BMTI4MDUw...
4996,À bout portant,2010,6.8,4937,"['Action', 'Crime', 'Thriller']",,['Fred Cavayé'],"['Gilles Lellouche', 'Roschdy Zem', 'Gérard La...",2010-11-04,5040,http://ia.media-imdb.com/images/M/MV5BMTcxMDA4...
4997,À l'intérieur,2007,6.8,2839,"['Horror', 'Thriller']","Four months after the death of her husband, a ...","['Alexandre Bustillo', 'Julien Maury']","['Alysson Paradis', 'Jean-Baptiste Tabourin', ...",2007-05-24,4920,http://ia.media-imdb.com/images/M/MV5BMjA2NDk4...
4998,Æon Flux,2005,5.4,1880,"['Action', 'Sci-Fi']",Aeon Flux is a mysterious assassin working for...,['Karyn Kusama'],"['Charlize Theron', 'Frances McDormand', 'Soph...",2005-12-01,5580,http://ia.media-imdb.com/images/M/MV5BMTc2Mzg0...
4999,Ôdishon,1999,7.2,3199,"['Horror', 'Thriller']",A widower takes an offer to screen girls at a ...,['Takashi Miike'],"['Ryo Ishibashi', 'Eihi Shiina', 'Tetsu Sawaki']",1999-10-06,6900,http://ia.media-imdb.com/images/M/MV5BMTQwNzQw...


In [6]:
# Collect the chunks in order so that df_list[k - 1] is the frame for index 'df<k>'
df_list = [df1, df2, df3, df4, df5]

### Elasticsearch mapping for movies

In [7]:
# Universal mapping shared by every movie index.
# - "dynamic": "false" keeps fields that are not listed below out of the mapping
#   (they are not indexed).
# - "index.mapping.ignore_malformed" lets a document be indexed even when one of
#   its typed fields (date/number) holds a value that cannot be parsed.
# - release_date uses "yyyy-MM-dd": lowercase "y" is the calendar year on both the
#   Joda-based (ES 6.x) and java.time-based (ES 7+) parsers, whereas uppercase "Y"
#   means week-based-year on java.time and mis-parses dates around New Year.
mapping = '''
{
  "settings": {
    "index.mapping.ignore_malformed": true
  },
  "mappings": {
    "movie": {
      "dynamic": "false",
      "properties": {
        "title": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        },
        "release_date": {
          "type": "date",
          "format": "yyyy-MM-dd"
        },
        "genres": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        },
        "plot": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        },
        "directors": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        },
        "actors": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        },
        "rating": {
          "type": "float"
        },
        "year": {
          "type": "integer"
        },
        "rank": {
          "type": "integer"
        },
        "running_time_secs": {
          "type": "integer"
        },
        "image_url": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        }
      }
    }
  }
}
'''

### Create Elasticsearch connection

In [None]:
# Local Elasticsearch
# Use this or else AWS blocks below
# With no arguments the client targets localhost:9200 (the HTTP/REST port);
# 9300 is the node-to-node transport port and is not what this client talks to
es = Elasticsearch()

In [9]:
# AWS Elasticsearch
# Don't run this if local version is being used
# Authorization
# If the AWS ES domain has an open access policy, this signing setup is not needed
# and http_auth must be commented out below
region = 'us-west-2'
service = 'es'
# boto3 picks up the user keys configured through the aws-cli command `aws configure`
credentials = boto3.Session().get_credentials()
if credentials is None:
    # get_credentials() returns None when no credentials can be found; fail here
    # with a clear message instead of an AttributeError on the next line
    raise RuntimeError(
        'No AWS credentials found; run `aws configure` or use the local Elasticsearch cell instead'
    )
# SigV4 request signer; session_token is passed through for temporary credentials
awsauth = AWS4Auth(credentials.access_key, credentials.secret_key, region, service, session_token=credentials.token)

# Create ES Connection with AWS
# AWS ES endpoint link
host = 'put your aws elasticsearch endpoint here'

es = Elasticsearch(
    hosts=[{'host': host, 'port': 443}],
    http_auth=awsauth,
    use_ssl=True,
    verify_certs=True,
    # requests-based transport so the AWS4Auth signer can be applied to each request
    connection_class=RequestsHttpConnection
)
# Round-trip to the cluster to confirm that the endpoint and signing work
print(es.info())

{'name': 'MJtp04b', 'cluster_name': '634195893235:movies', 'cluster_uuid': 'ymqDfLcWQkGDXG-otApe-A', 'version': {'number': '6.3.1', 'build_flavor': 'oss', 'build_type': 'zip', 'build_hash': 'eb782d0', 'build_date': '2018-09-11T14:05:25.216906Z', 'build_snapshot': False, 'lucene_version': '7.3.1', 'minimum_wire_compatibility_version': '5.6.0', 'minimum_index_compatibility_version': '5.0.0'}, 'tagline': 'You Know, for Search'}


## Check cluster and index 

In [10]:
# Check cluster health
# Returns a dict with the cluster status (green/yellow/red), node count and shard counts
es.cluster.health()

{'cluster_name': '634195893235:movies',
 'status': 'yellow',
 'timed_out': False,
 'number_of_nodes': 1,
 'number_of_data_nodes': 1,
 'active_primary_shards': 31,
 'active_shards': 31,
 'relocating_shards': 0,
 'initializing_shards': 0,
 'unassigned_shards': 30,
 'delayed_unassigned_shards': 0,
 'number_of_pending_tasks': 0,
 'number_of_in_flight_fetch': 0,
 'task_max_waiting_in_queue_millis': 0,
 'active_shards_percent_as_number': 50.81967213114754}

In [11]:
# Index info: plain-text table of the indices currently in the cluster,
# returned as a single newline-separated string
es.cat.indices()

'yellow open df1     f9yaHDIhSpqrK1a_GGX6pQ 5 1 1000 0  1.2mb  1.2mb\nyellow open df4     jnSJXJxJQKivv9eiivS9Tg 5 1 1000 0  1.3mb  1.3mb\nyellow open df3     hMg5Tg_5T8mTEL0nhq7DEg 5 1 1000 0  1.2mb  1.2mb\nyellow open df2     s29bB2JnQOi37AdW5HDbmg 5 1 1000 0  1.2mb  1.2mb\nyellow open df5     FEr4PYfsR1G8lUYxGmwU9A 5 1 1000 0  1.4mb  1.4mb\ngreen  open .kibana WbySMBDlS5qZ8rzIJPm96A 1 0    6 0 33.2kb 33.2kb\nyellow open movies  LDYQOMNNQhOgpiJ2hsN6NQ 5 1 5000 0  5.4mb  5.4mb\n'

In [15]:
# Drop the per-chunk indices df1 ... df5 when they are already present,
# so the creation step below starts from a clean slate
for chunk_no in range(1, 6):
    index_name = 'df{}'.format(chunk_no)
    if not es.indices.exists(index_name):
        continue
    es.indices.delete(index=index_name)

In [16]:
# Index info: list the indices again to confirm df1 ... df5 were removed
es.cat.indices()

'green  open .kibana WbySMBDlS5qZ8rzIJPm96A 1 0    4 0 21.6kb 21.6kb\nyellow open movies  LDYQOMNNQhOgpiJ2hsN6NQ 5 1 5000 0  5.4mb  5.4mb\n'

In [17]:
# Create the indices df1 ... df5 with the shared movie mapping.
# ignore=400 makes the client return the error body instead of raising, which is
# wanted for "index already exists" -- but a rejected mapping is also a 400, and
# silently skipping it would let the bulk load auto-create the index without the
# intended mapping. So inspect the response and only tolerate "already exists".
ALREADY_EXISTS_ERRORS = ('resource_already_exists_exception', 'index_already_exists_exception')
for i in range(1, 6):
    index_name = 'df{}'.format(i)
    response = es.indices.create(index=index_name, ignore=400, body=mapping)
    error = response.get('error') if isinstance(response, dict) else None
    if error:
        # Depending on the server version, 'error' is a dict with a 'type' key or a plain string
        error_type = error.get('type', '') if isinstance(error, dict) else str(error)
        if not any(name in error_type for name in ALREADY_EXISTS_ERRORS):
            raise RuntimeError('Could not create index {}: {}'.format(index_name, error))

In [18]:
# Index info: df1 ... df5 should now be listed again, each with 0 documents
es.cat.indices()

'yellow open df1     f9yaHDIhSpqrK1a_GGX6pQ 5 1    0 0   401b   401b\nyellow open df4     jnSJXJxJQKivv9eiivS9Tg 5 1    0 0   631b   631b\nyellow open df3     hMg5Tg_5T8mTEL0nhq7DEg 5 1    0 0   460b   460b\nyellow open df2     s29bB2JnQOi37AdW5HDbmg 5 1    0 0   460b   460b\nyellow open df5     FEr4PYfsR1G8lUYxGmwU9A 5 1    0 0   460b   460b\ngreen  open .kibana WbySMBDlS5qZ8rzIJPm96A 1 0    4 0 21.6kb 21.6kb\nyellow open movies  LDYQOMNNQhOgpiJ2hsN6NQ 5 1 5000 0  5.4mb  5.4mb\n'

In [19]:
# Bulk-load each chunk into its own index (df_list[j - 1] -> index 'df<j>').
# One streamed helpers.bulk call per index replaces one HTTP request per document.
# Failures are collected and reported instead of being swallowed by a bare except.
for j in tqdm_notebook(range(1, 6)):
    frame = df_list[j - 1]
    index_name = 'df{}'.format(j)
    # Lazy generator: one index action per row. The document id is the row's
    # position inside the chunk, so re-running overwrites rather than duplicates.
    # to_json() hands the helper a ready-made JSON string for the row.
    actions = (
        {
            "_index": index_name,
            "_type": "movie",
            "_id": i,
            "_source": frame.iloc[i].to_json(orient="index"),
        }
        for i in tqdm_notebook(range(len(frame)))
    )
    # raise_on_error / raise_on_exception = False keep the load best-effort:
    # rejected documents and failed requests come back in `errors`.
    success_count, errors = helpers.bulk(
        es,
        actions,
        raise_on_error=False,
        raise_on_exception=False,
        request_timeout=30,
    )
    if errors:
        print('{}: indexed {} documents, {} failed; first error: {}'.format(
            index_name, success_count, len(errors), errors[0]))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [22]:
# Index info: after the bulk load each of df1 ... df5 should report 1000 documents
es.cat.indices()

'yellow open df1     f9yaHDIhSpqrK1a_GGX6pQ 5 1 1000 0  1.2mb  1.2mb\nyellow open df4     jnSJXJxJQKivv9eiivS9Tg 5 1 1000 0  1.3mb  1.3mb\nyellow open df3     hMg5Tg_5T8mTEL0nhq7DEg 5 1 1000 0  1.2mb  1.2mb\nyellow open df2     s29bB2JnQOi37AdW5HDbmg 5 1 1000 0  1.2mb  1.2mb\nyellow open df5     FEr4PYfsR1G8lUYxGmwU9A 5 1 1000 0  1.4mb  1.4mb\ngreen  open .kibana WbySMBDlS5qZ8rzIJPm96A 1 0    5 0 27.6kb 27.6kb\nyellow open movies  LDYQOMNNQhOgpiJ2hsN6NQ 5 1 5000 0  5.4mb  5.4mb\n'