In [1]:
import sys
import yaml
import time
import argparse
import subprocess
import pandas as pd

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, LongType, StringType, ArrayType

import warnings

warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

%reload_ext autoreload
%autoreload 2

In [2]:
def load_config(path):
    params = dict()
    with open(path, 'r') as stream:
        params = yaml.load(stream, Loader=yaml.FullLoader)
    return params

In [3]:
params = load_config("es_dev.yaml")

In [4]:
params

{'movies_path': 's3://dmetasoul-bucket/demo/movielens/ml-1m/movies.dat',
 'ratings_path': 's3://dmetasoul-bucket/demo/movielens/ml-1m/ratings.dat',
 'users_path': 's3://dmetasoul-bucket/demo/movielens/ml-1m/users.dat',
 'imdb_path': 's3://dmetasoul-bucket/demo/movielens/ml-25m/movie_ml_imdb.csv',
 'douban_movies_path': 's3://dmetasoul-bucket/demo/datasets/moviedata-10m/movies.csv',
 'es_host': 'elastic-demo-es-http.default.svc.cluster.local',
 'es_port': 9200,
 'create_movies_index': 'movies',
 'create_movies_mapping_id': 'movie_id',
 'create_movies': {'mappings': {'properties': {'movie_id': {'type': 'integer'},
    'title': {'type': 'text'},
    'genres': {'type': 'keyword'}}}},
 'create_users_index': 'users',
 'create_users': {'mappings': {'properties': {'user_id': {'type': 'integer'}}}},
 'create_ratings_index': 'ratings',
 'create_ratings': {'mappings': {'properties': {'timestamp': {'type': 'date'},
    'user_id': {'type': 'integer'},
    'movie_id': {'type': 'integer'},
    'ratin

In [5]:
USER = "elastic"
PASS = "59Jazz5tf0l8e935xHEt1K8D"

In [6]:
def init_spark():
    spark = (SparkSession.builder
        .appName('Elastic Search')
        .master('local')
        .config("spark.executor.memory","4G")
        .config("spark.executor.instances","2")
        .config("spark.network.timeout","500")
        .config("spark.executor.memoryOverhead", "2G")
        .config("spark.jars.packages", "org.elasticsearch:elasticsearch-spark-30_2.12:8.2.3")
        .config("spark.jars.repositories", "https://maven.aliyun.com/repository/central")
        .config("spark.es.net.http.auth.user", USER)
        .config("spark.es.net.http.auth.pass", PASS)
        .config("spark.es.port", params['es_port'])
        .config("spark.es.nodes", params['es_host'])
        .config("spark.es.nodes.wan.only","true")\
        .config("spark.es.index.auto.create","true") \
        .config("spark.es.net.ssl", "false") \
        .getOrCreate())
    
    sc = spark.sparkContext
    print('Debug -- spark init')
    print('Debug -- version:', sc.version)   
    print('Debug -- applicaitonId:', sc.applicationId)
    print('Debug -- uiWebUrl:', sc.uiWebUrl)
    return spark

def stop_spark(spark):
    print('Debug -- spark stop')
    spark.sparkContext.stop()

def read_dataset(movies_path, ratings_path, users_path, imdb_path, **kwargs):
    ### read movies
    movies_schema = StructType([
            StructField("movie_id", LongType(), True),
            StructField("title", StringType(), True),
            StructField("genre", StringType(), True)
    ])

    movies = spark.read.csv(movies_path, sep='::',inferSchema=False, header=False, schema=movies_schema)

    ### read ratings
    ratings_schema = StructType([
            StructField("user_id", LongType(), True),
            StructField("movie_id", LongType(), True),
            StructField("rating", FloatType(), True),
            StructField("timestamp", LongType(), True)
    ])

    ratings = spark.read.csv(ratings_path, sep='::', inferSchema=False, header=False, schema=ratings_schema)
    
    ### read users
    users_schema = StructType([
            StructField("user_id", LongType(), True),
            StructField("gender", StringType(), True),
            StructField("age", IntegerType(), True),
            StructField("occupation", StringType(), True),
            StructField("zip", StringType(), True)
    ])

    users = spark.read.csv(users_path, sep='::', inferSchema=False, header=False, schema=users_schema)

    ### read imdb datasets
    imdb = spark.read.csv(imdb_path, sep=r'\t', inferSchema=False, header=True)
    imdb = imdb.withColumn('imdb_url', F.concat(F.lit("https://www.imdb.com/title/"), F.col("tconst"), F.lit("/")))

    return users, movies, ratings, imdb

In [7]:
spark = init_spark()

https://maven.aliyun.com/repository/central added as a remote repository with the name: repo-1


:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/spark/.ivy2/cache
The jars for the packages stored in: /home/spark/.ivy2/jars
org.elasticsearch#elasticsearch-spark-30_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-844fd020-da32-46c8-8b55-e16b709d5052;1.0
	confs: [default]
	found org.elasticsearch#elasticsearch-spark-30_2.12;8.2.3 in central
	found org.scala-lang#scala-reflect;2.12.8 in central
	found org.slf4j#slf4j-api;1.7.6 in central
	found commons-logging#commons-logging;1.1.1 in central
	found javax.xml.bind#jaxb-api;2.3.1 in central
	found com.google.protobuf#protobuf-java;2.5.0 in central
	found org.apache.spark#spark-yarn_2.12;3.2.1 in central
:: resolution report :: resolve 229ms :: artifacts dl 2ms
	:: modules in use:
	com.google.protobuf#protobuf-java;2.5.0 from central in [default]
	commons-logging#commons-logging;1.1.1 from central in [default]
	javax.xml.bind#jaxb-api;2.3.1 from central in [default]
	org.apache.spark#spark-yarn_2.12;3.2.1 fro

Debug -- spark init
Debug -- version: 3.1.2
Debug -- applicaitonId: local-1655733348528
Debug -- uiWebUrl: http://jupyter.my.nginx.test/hub/user-redirect/proxy/4040/jobs/


In [8]:
users, movies, ratings, imdb = read_dataset(**params)

                                                                                

In [9]:
users.limit(10).toPandas()

Unnamed: 0,user_id,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455
5,6,F,50,9,55117
6,7,M,35,1,6810
7,8,M,25,12,11413
8,9,M,25,17,61614
9,10,F,35,1,95370


In [10]:
movies.limit(10).toPandas()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [11]:
ratings.limit(10).toPandas()

                                                                                

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5.0,978300760
1,1,661,3.0,978302109
2,1,914,3.0,978301968
3,1,3408,4.0,978300275
4,1,2355,5.0,978824291
5,1,1197,3.0,978302268
6,1,1287,5.0,978302039
7,1,2804,5.0,978300719
8,1,594,4.0,978302268
9,1,919,4.0,978301368


In [12]:
imdb.limit(10).toPandas()

22/06/20 13:55:56 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


Unnamed: 0,movieId,title,genres_ml,year,tconst,writer,actor,director,producer,actress,cinematographer,composer,editor,production_designer,archive_footage,archive_sound,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres_imdb,directors,writers,imdb_url
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,tt0114709,nm0169505|nm0230032|nm0004056|nm0710020|nm0923736,nm0000158|nm0000741|nm0725543|nm0001815,nm0005124,\N,\N,\N,\N,\N,\N,\N,\N,movie,Toy Story,Toy Story,0,1995,\N,81,Adventure|Animation|Comedy,nm0005124,nm0005124|nm0230032|nm0004056|nm0710020|nm0923...,https://www.imdb.com/title/tt0114709/
1,2,Jumanji,Adventure|Children|Fantasy,1995,tt0113497,nm0378144|nm0852430|nm0833164|nm0885575,nm0000245|nm0404993,nm0002653,nm0472256,nm0000379|nm0001372,\N,\N,\N,\N,\N,\N,movie,Jumanji,Jumanji,0,1995,\N,104,Adventure|Comedy|Family,nm0002653,nm0378144|nm0852430|nm0833164|nm0885575,https://www.imdb.com/title/tt0113497/
2,3,Grumpier Old Men,Comedy|Romance,1995,tt0113228,nm0425756,nm0000527|nm0000493,nm0222043,nm0075828|nm0204862,nm0000268|nm0000047,nm0005714,nm0006293,\N,\N,\N,\N,movie,Grumpier Old Men,Grumpier Old Men,0,1995,\N,101,Comedy|Romance,nm0222043,nm0425756,https://www.imdb.com/title/tt0113228/
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995,tt0114885,nm0573334|nm0060103,\N,nm0001845,nm0771834|nm0842470,nm0001365|nm0000291|nm0222643|nm0005375,\N,nm0004892,\N,\N,\N,\N,movie,Waiting to Exhale,Waiting to Exhale,0,1995,\N,124,Comedy|Drama|Romance,nm0001845,nm0573334|nm0060103,https://www.imdb.com/title/tt0114885/
4,5,Father of the Bride Part II,Comedy,1995,tt0113041,nm0352443|nm0329304|nm0583600,nm0000188|nm0001737,nm0796124,\N,nm0000473|nm0931090,nm0204567,nm0006293,\N,\N,\N,\N,movie,Father of the Bride Part II,Father of the Bride Part II,0,1995,\N,106,Comedy|Family|Romance,nm0796124,nm0352443|nm0329304|nm0583600|nm0796124,https://www.imdb.com/title/tt0113041/
5,6,Heat,Action|Crime|Thriller,1995,tt0113277,\N,nm0000199|nm0000134|nm0000174|nm0000685,nm0000520,nm0513165,\N,nm0005883,nm0006106,nm0325549|nm0117779,\N,\N,\N,movie,Heat,Heat,0,1995,\N,170,Crime|Drama|Thriller,nm0000520,nm0000520,https://www.imdb.com/title/tt0113277/
6,7,Sabrina,Comedy|Romance,1995,tt0114319,nm0713128|nm0853138|nm0000697|nm0499626|nm0070660,nm0000148|nm0001427,nm0001628,\N,nm0000566|nm0545408,\N,\N,\N,\N,\N,\N,movie,Sabrina,Sabrina,0,1995,\N,127,Comedy|Drama|Romance,nm0001628,nm0853138|nm0000697|nm0499626|nm0070660|nm0713128,https://www.imdb.com/title/tt0114319/
7,8,Tom and Huck,Adventure|Children,1995,tt0112302,nm0878494|nm0814085|nm0521739,nm0001795|nm0000605|nm0734236|nm0777760,nm0382072,nm0548257|nm0049689,\N,\N,\N,\N,\N,\N,\N,movie,Tom and Huck,Tom and Huck,0,1995,\N,97,Adventure|Comedy|Drama,nm0382072,nm0878494|nm0814085|nm0521739,https://www.imdb.com/title/tt0112302/
8,9,Sudden Death,Action,1995,tt0114576,nm0049945|nm0704164,nm0000241|nm0000959|nm0000855,nm0001382,nm0049920|nm0224537,nm0942925,\N,nm0002201,\N,\N,\N,\N,movie,Sudden Death,Sudden Death,0,1995,\N,111,Action|Crime|Thriller,nm0001382,nm0049945|nm0704164,https://www.imdb.com/title/tt0114576/
9,10,GoldenEye,Action|Adventure|Thriller,1995,tt0113189,nm0001220|nm0289833|nm0128997|nm0270761,nm0000112|nm0000293,nm0132709,nm0110483,nm0001713|nm0000463,\N,\N,\N,\N,\N,\N,movie,GoldenEye,GoldenEye,0,1995,\N,130,Action|Adventure|Thriller,nm0132709,nm0001220|nm0289833|nm0128997|nm0270761,https://www.imdb.com/title/tt0113189/


In [13]:
from pyspark.sql.functions import udf

extract_genres = udf(lambda x: x.lower().split("|"), ArrayType(StringType()))
raw_movies = movies
movies = raw_movies.select("movie_id", "title", extract_genres("genre").alias("genre"))
movies.limit(10).toPandas()

                                                                                

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),"[animation, children's, comedy]"
1,2,Jumanji (1995),"[adventure, children's, fantasy]"
2,3,Grumpier Old Men (1995),"[comedy, romance]"
3,4,Waiting to Exhale (1995),"[comedy, drama]"
4,5,Father of the Bride Part II (1995),[comedy]
5,6,Heat (1995),"[action, crime, thriller]"
6,7,Sabrina (1995),"[comedy, romance]"
7,8,Tom and Huck (1995),"[adventure, children's]"
8,9,Sudden Death (1995),[action]
9,10,GoldenEye (1995),"[action, adventure, thriller]"


In [14]:
from elasticsearch import Elasticsearch
from elasticsearch_utils import *

es_uri = "http://%s:%s"%(params['es_host'], params['es_port'])
es = create_es_using_http_auth(es_uri, USER, PASS, **params)
es.info(pretty=True)

ObjectApiResponse({'name': 'elastic-demo-es-default-0', 'cluster_name': 'elastic-demo', 'cluster_uuid': 'vNY2caRURuyaaARyQV9QyQ', 'version': {'number': '8.2.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '9905bfb62a3f0b044948376b4f607f70a8a151b4', 'build_date': '2022-06-08T22:21:36.455508792Z', 'build_snapshot': False, 'lucene_version': '9.1.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [15]:
es.indices.delete(index="ratings,users,movies")

ObjectApiResponse({'acknowledged': True})

In [16]:
create_ratings = {
    # this mapping definition sets up the fields for the rating events
    "mappings": {
        "properties": {
            "timestamp": {
                "type": "date"
            },
            "user_id": {
                "type": "integer"
            },
            "movie_id": {
                "type": "integer"
            },
            "rating": {
                "type": "double"
            }
        }  
    }
}

create_users = {
    "mappings": {
        "properties": {
            "user_id": {
                "type": "integer"
            }
        }
    }
}

create_movies = {
    "mappings": {
        "properties": {
            "movie_id": {
                "type": "integer"
            },
            "title": {
                "type": "text"
            },
            "genres": {
                "type": "keyword"
            }           
        }
    }
}


In [17]:
res_ratings = es.indices.create(index="ratings", body=create_ratings)
res_users = es.indices.create(index="users", body=create_users)
res_movies = es.indices.create(index="movies", body=create_movies)

In [18]:
print(res_ratings)
print(res_users)
print(res_movies)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'ratings'}
{'acknowledged': True, 'shards_acknowledged': True, 'index': 'users'}
{'acknowledged': True, 'shards_acknowledged': True, 'index': 'movies'}


In [19]:
# write ratings data
ratings.write\
       .format("org.elasticsearch.spark.sql")\
       .mode("overwrite") \
       .save("ratings")

num_ratings_es = es.count(index="ratings")['count']
num_ratings_df = ratings.count()
# check write went ok
print("Dataframe count: {}".format(num_ratings_df))
print("ES index count:  {}".format(num_ratings_es))

[Stage 7:>                                                          (0 + 1) / 1]

Dataframe count: 1000209
ES index count:  1000209


                                                                                

In [20]:
all_hits = es.search(index="ratings", q="*", size=3)

In [21]:
for hit in all_hits['hits']['hits']:
    print(hit['_source'])

{'user_id': 1069, 'movie_id': 1233, 'rating': 5.0, 'timestamp': 974942770}
{'user_id': 1069, 'movie_id': 1236, 'rating': 5.0, 'timestamp': 974943433}
{'user_id': 1069, 'movie_id': 2977, 'rating': 1.0, 'timestamp': 974945262}


In [22]:
# write movie data, specifying the DataFrame column to use as the id mapping
movies.write\
      .format("org.elasticsearch.spark.sql")\
      .option("es.mapping.id", "movie_id") \
      .mode("overwrite") \
      .save("movies")

num_movies_df = movies.count()
num_movies_es = es.count(index="movies")['count']
# check load went ok
print("Movie DF count: {}".format(num_movies_df))
print("ES index count: {}".format(num_movies_es))

Movie DF count: 3883
ES index count: 3883


In [23]:
all_hits = es.search(index="movies", q="*", size=3)
for hit in all_hits['hits']['hits']:
    print(hit['_source'])

{'movie_id': 1, 'title': 'Toy Story (1995)', 'genre': ['animation', "children's", 'comedy']}
{'movie_id': 2, 'title': 'Jumanji (1995)', 'genre': ['adventure', "children's", 'fantasy']}
{'movie_id': 3, 'title': 'Grumpier Old Men (1995)', 'genre': ['comedy', 'romance']}


In [24]:
all_hits = es.search(index="movies", q='*Jumanji*', size=3)
for hit in all_hits['hits']['hits']:
    print(hit['_source'])

{'movie_id': 2, 'title': 'Jumanji (1995)', 'genre': ['adventure', "children's", 'fantasy']}


In [25]:
!curl --user elastic:59Jazz5tf0l8e935xHEt1K8D \
      http://elastic-demo-es-http.default.svc.cluster.local:9200/movies/_search -H "Content-Type:application/json" \
       -d '{"query" : { "match" : { "genre" : "sci-fi" }}}' | python -m json.tool

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1530  100  1483  100    47   362k  11750 --:--:-- --:--:-- --:--:--  498k
{
    "took": 0,
    "timed_out": false,
    "_shards": {
        "total": 1,
        "successful": 1,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": {
            "value": 276,
            "relation": "eq"
        },
        "max_score": 4.9710717,
        "hits": [
            {
                "_index": "movies",
                "_id": "680",
                "_score": 4.9710717,
                "_source": {
                    "movie_id": 680,
                    "title": "Alphaville (1965)",
                    "genre": [
                        "sci-fi"
                    ]
                }
            },
            {
                "_index": "movies",
                "_id": "1199",
                "_score": 4.9

In [26]:
!curl --user elastic:59Jazz5tf0l8e935xHEt1K8D \
      http://elastic-demo-es-http.default.svc.cluster.local:9200/movies/_search -H "Content-Type:application/json"\
      -d '{"query" : { "match" : { "title" : "movie" }}}' | python -m json.tool

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1691  100  1645  100    46   535k  15333 --:--:-- --:--:-- --:--:--  550k
{
    "took": 0,
    "timed_out": false,
    "_shards": {
        "total": 1,
        "successful": 1,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": {
            "value": 21,
            "relation": "eq"
        },
        "max_score": 5.81522,
        "hits": [
            {
                "_index": "movies",
                "_id": "3007",
                "_score": 5.81522,
                "_source": {
                    "movie_id": 3007,
                    "title": "American Movie (1999)",
                    "genre": [
                        "documentary"
                    ]
                }
            },
            {
                "_index": "movies",
                "_id": "3785",
                "_score

In [27]:
!curl --user elastic:59Jazz5tf0l8e935xHEt1K8D \
      http://elastic-demo-es-http.default.svc.cluster.local:9200/movies/_search -H "Content-Type:application/json" -d '\
      {"query" : {\
          "bool": {\
             "must": [\
                { "match": { "title": "movie" } },\
                { "match": { "genre": "sci-fi" } }\
             ]\
           }}\
       }' | python -m json.tool

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   720  100   518  100   202   168k  67333 --:--:-- --:--:-- --:--:--  234k
{
    "took": 0,
    "timed_out": false,
    "_shards": {
        "total": 1,
        "successful": 1,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": {
            "value": 2,
            "relation": "eq"
        },
        "max_score": 8.07361,
        "hits": [
            {
                "_index": "movies",
                "_id": "671",
                "_score": 8.07361,
                "_source": {
                    "movie_id": 671,
                    "title": "Mystery Science Theater 3000: The Movie (1996)",
                    "genre": [
                        "comedy",
                        "sci-fi"
                    ]
                }
            },
            {
                "_index": "movies",
   

In [28]:
!curl --user elastic:59Jazz5tf0l8e935xHEt1K8D \
      http://elastic-demo-es-http.default.svc.cluster.local:9200/movies/_search -H "Content-Type:application/json" -d '\
      { \
        "from": 0,\
        "size": 3,\
        "query": { "match" : { "genre" : "sci-fi" }} \
      }' | python -m json.tool

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   659  100   550  100   109   134k  27250 --:--:-- --:--:-- --:--:--  160k
{
    "took": 0,
    "timed_out": false,
    "_shards": {
        "total": 1,
        "successful": 1,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": {
            "value": 276,
            "relation": "eq"
        },
        "max_score": 4.9710717,
        "hits": [
            {
                "_index": "movies",
                "_id": "680",
                "_score": 4.9710717,
                "_source": {
                    "movie_id": 680,
                    "title": "Alphaville (1965)",
                    "genre": [
                        "sci-fi"
                    ]
                }
            },
            {
                "_index": "movies",
                "_id": "1199",
                "_score": 4.9

In [29]:
!curl --user elastic:59Jazz5tf0l8e935xHEt1K8D \
      http://elastic-demo-es-http.default.svc.cluster.local:9200/movies/_search -H "Content-Type:application/json" -d '\
      { \
        "from": 0,\
        "size": 3,\
        "query": { "match" : { "genre" : "sci-fi" }} \
      }' | python -m json.tool

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   659  100   550  100   109   179k  36333 --:--:-- --:--:-- --:--:--  214k
{
    "took": 1,
    "timed_out": false,
    "_shards": {
        "total": 1,
        "successful": 1,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": {
            "value": 276,
            "relation": "eq"
        },
        "max_score": 4.9710717,
        "hits": [
            {
                "_index": "movies",
                "_id": "680",
                "_score": 4.9710717,
                "_source": {
                    "movie_id": 680,
                    "title": "Alphaville (1965)",
                    "genre": [
                        "sci-fi"
                    ]
                }
            },
            {
                "_index": "movies",
                "_id": "1199",
                "_score": 4.9

In [30]:
!curl --user elastic:59Jazz5tf0l8e935xHEt1K8D \
      http://elastic-demo-es-http.default.svc.cluster.local:9200/movies/_search -H "Content-Type:application/json" -d '\
      {\
        "from": 0,\
        "size": 3,\
        "query": { "match" : { "genre" : "sci-fi" }}, \
        "sort" : [\
            {"movie_id" : {"order" : "desc"}}\
         ]\
      }' | python -m json.tool

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   775  100   590  100   185   144k  46250 --:--:-- --:--:-- --:--:--  252k
{
    "took": 0,
    "timed_out": false,
    "_shards": {
        "total": 1,
        "successful": 1,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": {
            "value": 276,
            "relation": "eq"
        },
        "max_score": null,
        "hits": [
            {
                "_index": "movies",
                "_id": "3937",
                "_score": null,
                "_source": {
                    "movie_id": 3937,
                    "title": "Runaway (1984)",
                    "genre": [
                        "sci-fi",
                        "thriller"
                    ]
                },
                "sort": [
                    3937
                ]
            },
            {
 

In [31]:
matchers = [("title", "movie"), ("genre", "sci-fi")]
sorters =[("movie_id", "desc")]

match_rules = generate_keyword_match_rules(matchers)
sorter_rules = generate_attribute_sorter_rules(sorters)

print(match_rules)
print(sorter_rules)

[{'match_phrase': {'title': 'movie'}}, {'match_phrase': {'genre': 'sci-fi'}}]
[{'movie_id': {'order': 'desc'}}]


In [32]:
index_name = params['create_movies_index']
index_schema = params['create_movies']

In [33]:
result = search_es_using_query_combination(es, 
                                           index_name, 
                                           must_rules=match_rules, 
                                           sorter_rules=sorter_rules,
                                           from_no=0,
                                           size=3)

In [34]:
parse_es_search_result(result)

[{'movie_id': 1205,
  'title': 'Transformers: The Movie, The (1986)',
  'genre': ['action', 'animation', "children's", 'sci-fi', 'thriller', 'war']},
 {'movie_id': 671,
  'title': 'Mystery Science Theater 3000: The Movie (1996)',
  'genre': ['comedy', 'sci-fi']}]

In [35]:
es.search(
    index="movies",
    query={"bool": {\
             "must": [\
                { "match": { "title": "movie" } },\
                { "match": { "genre": "sci-fi" } }\
             ]\
           }},
    sort=[{"movie_id" : {"order" : "desc"}}],
    size=3,
)

ObjectApiResponse({'took': 0, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 2, 'relation': 'eq'}, 'max_score': None, 'hits': [{'_index': 'movies', '_id': '1205', '_score': None, '_source': {'movie_id': 1205, 'title': 'Transformers: The Movie, The (1986)', 'genre': ['action', 'animation', "children's", 'sci-fi', 'thriller', 'war']}, 'sort': [1205]}, {'_index': 'movies', '_id': '671', '_score': None, '_source': {'movie_id': 671, 'title': 'Mystery Science Theater 3000: The Movie (1996)', 'genre': ['comedy', 'sci-fi']}, 'sort': [671]}]}})

In [36]:
spark.stop()