In [1]:
import argparse
import os
import subprocess
import sys
import time
import warnings

import pandas as pd
import yaml

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, LongType, StringType, ArrayType

# Keep notebook output free of library warnings.
warnings.filterwarnings(action='ignore')

# A value of None lifts pandas' display limits, so previews show every
# column and every row of the (small) frames rendered below.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Enable the autoreload extension in mode 2: imported modules are reloaded
# before each cell executes, so edits to local helper modules take effect
# without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
def load_config(path):
    """Load a YAML configuration file and return its parsed contents.

    Args:
        path: Filesystem path of the YAML file to read.

    Returns:
        The parsed YAML document (a dict for a mapping-style config file).
        An empty document yields an empty dict rather than ``None`` so the
        result can always be indexed or ``**``-unpacked by callers.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
    """
    # Explicit encoding: do not let the platform locale decide how the
    # config file is decoded.
    with open(path, 'r', encoding='utf-8') as stream:
        # NOTE(review): FullLoader is kept to preserve existing behavior;
        # prefer yaml.safe_load if the config can come from an untrusted source.
        params = yaml.load(stream, Loader=yaml.FullLoader)
    return params if params is not None else {}

In [3]:
# Load the demo settings (dataset paths, Elasticsearch host/port and index
# definitions) and display them for reference.
params = load_config("es_dev.yaml")
params

{'movies_path': 's3://dmetasoul-bucket/demo/movielens/ml-1m/movies.dat',
 'ratings_path': 's3://dmetasoul-bucket/demo/movielens/ml-1m/ratings.dat',
 'users_path': 's3://dmetasoul-bucket/demo/movielens/ml-1m/users.dat',
 'imdb_path': 's3://dmetasoul-bucket/demo/movielens/ml-25m/movie_ml_imdb.csv',
 'douban_movies_path': 's3://dmetasoul-bucket/demo/datasets/moviedata-10m/movies.csv',
 'es_host': 'elastic-demo-es-http.default.svc.cluster.local',
 'es_port': 9200,
 'create_movies_index': 'movies',
 'create_movies_mapping_id': 'movie_id',
 'create_movies': {'mappings': {'properties': {'movie_id': {'type': 'integer'},
    'title': {'type': 'text'},
    'genres': {'type': 'keyword'}}}},
 'create_users_index': 'users',
 'create_users': {'mappings': {'properties': {'user_id': {'type': 'integer'}}}},
 'create_ratings_index': 'ratings',
 'create_ratings': {'mappings': {'properties': {'timestamp': {'type': 'date'},
    'user_id': {'type': 'integer'},
    'movie_id': {'type': 'integer'},
    'ratin

In [4]:
# Elasticsearch credentials are taken from the environment so real secrets
# never have to be typed into (or committed with) the notebook.
# The "${...}" fallbacks are non-working placeholders kept for backward
# compatibility; the USER placeholder previously lacked its closing brace.
USER = os.environ.get("ES_USER", "${USER}")
PASS = os.environ.get("ES_PASSWORD", "${PASSWORD}")

In [5]:
def init_spark():
    """Create (or reuse) the local SparkSession used to talk to Elasticsearch.

    Reads the module-level ``params`` (keys ``es_host`` and ``es_port``) and
    the ``USER`` / ``PASS`` credentials, so those must be defined before this
    function is called. The elasticsearch-spark connector is requested through
    ``spark.jars.packages``.

    Returns:
        The SparkSession returned by ``getOrCreate()``.
    """
    # The whole builder chain lives inside one pair of parentheses, so no
    # backslash line continuations are needed.
    spark = (SparkSession.builder
        .appName('Elastic Search')
        .master('local')
        .config("spark.executor.memory", "4G")
        .config("spark.executor.instances", "2")
        .config("spark.network.timeout", "500")
        .config("spark.executor.memoryOverhead", "2G")
        .config("spark.jars.packages", "org.elasticsearch:elasticsearch-spark-30_2.12:8.2.3")
        .config("spark.jars.repositories", "https://maven.aliyun.com/repository/central")
        .config("spark.es.net.http.auth.user", USER)
        .config("spark.es.net.http.auth.pass", PASS)
        .config("spark.es.port", params['es_port'])
        .config("spark.es.nodes", params['es_host'])
        .config("spark.es.nodes.wan.only", "true")
        .config("spark.es.index.auto.create", "true")
        .config("spark.es.net.ssl", "false")
        .getOrCreate())

    sc = spark.sparkContext
    print('Debug -- spark init')
    print('Debug -- version:', sc.version)
    print('Debug -- applicationId:', sc.applicationId)
    print('Debug -- uiWebUrl:', sc.uiWebUrl)
    return spark

def stop_spark(spark):
    """Log the shutdown and stop the SparkContext behind ``spark``.

    Args:
        spark: Object exposing a ``sparkContext`` attribute with a ``stop()``
            method (a SparkSession in this notebook).
    """
    print('Debug -- spark stop')
    context = spark.sparkContext
    context.stop()

def read_dataset(douban_movies_path, **kwargs):
    """Read the Douban movies CSV into a Spark DataFrame.

    Uses the module-level ``spark`` session, which must exist before the call.

    Args:
        douban_movies_path: Location of the comma-separated file; its first
            row is treated as the header and column types are inferred.
        **kwargs: Ignored. Lets callers pass the whole config dict with ``**``.

    Returns:
        The DataFrame produced by ``spark.read.csv``.
    """
    reader = spark.read
    return reader.csv(douban_movies_path, sep=',', inferSchema=True, header=True)

In [6]:
# Create (or reuse) the Spark session configured for Elasticsearch access.
spark = init_spark()

https://maven.aliyun.com/repository/central added as a remote repository with the name: repo-1
Ivy Default Cache set to: /home/spark/.ivy2/cache
The jars for the packages stored in: /home/spark/.ivy2/jars
org.elasticsearch#elasticsearch-spark-30_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-357ab1c5-5693-4e7c-9ec5-30b6bca0306a;1.0
	confs: [default]


:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found org.elasticsearch#elasticsearch-spark-30_2.12;8.2.3 in central
	found org.scala-lang#scala-reflect;2.12.8 in central
	found org.slf4j#slf4j-api;1.7.6 in central
	found commons-logging#commons-logging;1.1.1 in central
	found javax.xml.bind#jaxb-api;2.3.1 in central
	found com.google.protobuf#protobuf-java;2.5.0 in central
	found org.apache.spark#spark-yarn_2.12;3.2.1 in central
:: resolution report :: resolve 230ms :: artifacts dl 2ms
	:: modules in use:
	com.google.protobuf#protobuf-java;2.5.0 from central in [default]
	commons-logging#commons-logging;1.1.1 from central in [default]
	javax.xml.bind#jaxb-api;2.3.1 from central in [default]
	org.apache.spark#spark-yarn_2.12;3.2.1 from central in [default]
	org.elasticsearch#elasticsearch-spark-30_2.12;8.2.3 from central in [default]
	org.scala-lang#scala-reflect;2.12.8 from central in [default]
	org.slf4j#slf4j-api;1.7.6 from central in [default]
	---------------------------------------------------------------------
	|            

Debug -- spark init
Debug -- version: 3.1.2
Debug -- applicaitonId: local-1655732884425
Debug -- uiWebUrl: http://jupyter.my.nginx.test/hub/user-redirect/proxy/4040/jobs/


In [7]:
# read_dataset picks `douban_movies_path` out of the config; every other key
# is absorbed by its **kwargs.
movies = read_dataset(**params)

                                                                                

In [8]:
# Preview the first 10 rows as a pandas DataFrame.
movies.limit(10).toPandas()

Unnamed: 0,MOVIE_ID,NAME,ALIAS,ACTORS,COVER,DIRECTORS,DOUBAN_SCORE,DOUBAN_VOTES,GENRES,IMDB_ID,LANGUAGES,MINS,OFFICIAL_SITE,REGIONS,RELEASE_DATE,SLUG,STORYLINE,TAGS,YEAR,ACTOR_IDS,DIRECTOR_IDS
0,26670818,情定河州,情定临夏天使然,王博/吴佳尼/王姬/高丽雯/郭力行/尹哲/沈丹萍/罗中旭/臧金生/罗刚/居文沛/阎青妤,,尹哲,0.0,0.0,剧情/爱情,,汉语普通话,0.0,,中国大陆,,RmNQQeyzb,电影《情定临夏天使然》讲述临夏新一代青年人发奋图强、借助国家一带一路战略励志创业的故事。,甘肃/临夏/伊斯兰/中国/2016/中国大陆/烂片/宣传伊斯兰教的电影,2049,王博:|吴佳尼:1313262|王姬:1275275|高丽雯:1325661|郭力行:135...,尹哲:1326188
1,25815002,我不是李小龙,,谷尚蔚/吴孟达/曾志伟/杜海涛,,洪金宝,0.0,0.0,动作/爱情,,汉语普通话,0.0,,中国大陆,,EZnVfiNYf,桀骜不驯的如龙武功高强，在一场比赛中，被打成重伤，被诊今生不能再用功夫。女友荆兰为激发他重新...,穿越/华语,2049,谷尚蔚:1330813|吴孟达:1016771|曾志伟:1002862|杜海涛:1313024,洪金宝:1055887
2,26392287,曼哈顿中国女孩,,,,,0.0,0.0,剧情,,英语 / 汉语普通话,0.0,,中国大陆 / 美国,,NuUvEJnzb,平民女孩李莉只身初入曼哈顿求学，在历经迷失与困惑之后，凭借努力与善良收获了事业上的成功，同时...,,2049,,
3,26695995,绿毛水怪,,,,梁栋/吴国墉,0.0,0.0,爱情,,汉语普通话,0.0,,中国大陆,,rqaqyb6ea,王小波经典中篇小说《绿毛水怪》将改编电影。《绿毛水怪》是王小波早期手稿作品，以天马行空的想象...,小波/王小波/爱情/小说改编/文学改编/剧情/中国/2017,2049,,梁栋:|吴国墉:
4,26392292,为了祖国,,,,,0.0,0.0,剧情/历史,,汉语普通话 / 韩语 / 日语,0.0,,中国大陆 / 韩国,,Inrqy7uib,1932年上海虹口爆炸案后，韩国国父金九在褚辅成、朱爱宝这些普通群众的帮助下逃到嘉兴避难。金...,,2049,,
5,6389523,天地无用,,禅婵/梁宸/张凡夕 François Chang,,张凡夕,0.0,0.0,剧情/爱情,,汉语普通话 / 法语,0.0,,中国大陆 / 法国,,7IRuUJZRe,张凡夕筹划中电影长片。,张凡夕/独立电影/人性/独立/上映,2046,禅婵:|梁宸:1314565|张凡夕 François Chang:,张凡夕:1275255
6,26266621,看不见的脸,見えない貌,,,,0.0,0.0,悬疑,,汉语普通话,0.0,,中国大陆,,AIvUfUfib,日本推理作家东野圭吾的小说在其本国不断被搬上大银幕，如今，中国电影公司也在推理小说领域跃跃欲...,悬疑/漫画小说真人版/日本/推理/日本电影/小说改编,2045,,
7,26647079,你无法抵达的时间,,,,,0.0,0.0,科幻,,汉语普通话,0.0,,中国大陆,,e62r6IBea,科幻作家潘海天创办的上海竺灿文化有限公司先后开始了三个项目，潘海天本人的《王二大爷的战争》，...,科幻/夏笳/中国,2035,,
8,26277337,黑夜之神,包公：黑夜之神,,,,0.0,0.0,动作/悬疑/古装,,汉语普通话,0.0,,中国大陆 / 中国香港,,RAYJBmrqa,曾出品《西游记之大闹天宫》的星皓影业有限公司推出了《包公：黑夜之神》讲述了包拯率一众江湖英雄...,,2035,,
9,26378809,传送点,,,,辛成江,0.0,0.0,科幻,,汉语普通话,0.0,,中国大陆,,Be6IRuMmb,在艺恩汇第二期电影创投沙龙上，还展示了四个电影项目，导演辛成江的《传送点》，讲述了寻宝游戏中...,,2035,,辛成江:1349576


In [9]:
# Base URL of the Elasticsearch HTTP endpoint, assembled from the config.
es_uri = "http://{}:{}".format(params['es_host'], params['es_port'])
es_uri

'http://elastic-demo-es-http.default.svc.cluster.local:9200'

In [10]:
!curl --user elastic:59Jazz5tf0l8e935xHEt1K8D \
     http://elastic-demo-es-http.default.svc.cluster.local:9200/_analyze -H "Content-Type:application/json" -d '\
      {\
          "text":"李小龙",\
          "analyzer":"smartcn"\
      }'

{"tokens":[{"token":"李","start_offset":0,"end_offset":1,"type":"word","position":0},{"token":"小","start_offset":1,"end_offset":2,"type":"word","position":1},{"token":"龙","start_offset":2,"end_offset":3,"type":"word","position":2}]}

In [11]:
# Import exactly the helpers this notebook uses (instead of `import *`) so it
# is clear where each name comes from.
from elasticsearch_utils import (
    create_es_using_http_auth,
    create_index,
    delete_index,
    generate_attribute_sorter_rules,
    generate_keyword_match_rules,
    generte_id_rules,  # (sic) this is the spelling used by the notebook's calls
    parse_es_search_result,
    save_df_to_es_index,
    search_es_using_id_filtering,
    search_es_using_query_combination,
)

# Build the Elasticsearch client from the endpoint URL and the credentials.
es = create_es_using_http_auth(es_uri, USER, PASS, **params)
es

<Elasticsearch(['http://elastic-demo-es-http.default.svc.cluster.local:9200'])>

In [12]:
# Index name, index definition and id-mapping field for the Douban movies
# index, all taken from the config.
index_name, index_schema, mapping_id = (
    params[key]
    for key in (
        'create_douban_movies_index',
        'create_douban_movies',
        'create_douban_movies_mapping_id',
    )
)

In [13]:
# Delete the target index so the next cell can create it afresh.
delete_index(es, index_name)

<Elasticsearch(['http://elastic-demo-es-http.default.svc.cluster.local:9200'])>

In [14]:
# Create the index using the definition read from the config.
create_index(es, index_name, index_schema)

<Elasticsearch(['http://elastic-demo-es-http.default.svc.cluster.local:9200'])>

In [15]:
# Keep the full `movies` frame intact (an earlier cell displays it) and send
# only the columns needed by the search demo to Elasticsearch.
movies_to_index = movies.select('MOVIE_ID', 'NAME', 'STORYLINE')
save_df_to_es_index(es, movies_to_index, index_name, mapping_id)

                                                                                

<Elasticsearch(['http://elastic-demo-es-http.default.svc.cluster.local:9200'])>

In [16]:
!curl --user elastic:59Jazz5tf0l8e935xHEt1K8D \
     http://elastic-demo-es-http.default.svc.cluster.local:9200/douban_movies/_search -H "Content-Type:application/json" -d '\
      {\
        "query": {\
            "ids": {\
                "values": ["26392287"]\
            }\
        }\
      }'

{"took":1,"timed_out":false,"_shards":{"total":1,"successful":1,"skipped":0,"failed":0},"hits":{"total":{"value":1,"relation":"eq"},"max_score":1.0,"hits":[{"_index":"douban_movies","_id":"26392287","_score":1.0,"_source":{"MOVIE_ID":26392287,"NAME":"曼哈顿中国女孩","STORYLINE":"平民女孩李莉只身初入曼哈顿求学，在历经迷失与困惑之后，凭借努力与善良收获了事业上的成功，同时帮助在美华侨保卫中国城、获得爱情，完成了生命中一次华丽的蜕变。"}}]}}

In [17]:
# (field, phrase) pairs that must match, and (field, direction) sort keys.
matchers = [
    ("NAME", "李小龙"),
    ("STORYLINE", "功夫"),
]
sorters = [("MOVIE_ID", "desc")]

# Turn the pairs into Elasticsearch query / sort clauses.
match_rules = generate_keyword_match_rules(matchers)
sorter_rules = generate_attribute_sorter_rules(sorters)

# One clause list per line.
print(match_rules, sorter_rules, sep='\n')

[{'match_phrase': {'NAME': '李小龙'}}, {'match_phrase': {'STORYLINE': '功夫'}}]
[{'MOVIE_ID': {'order': 'desc'}}]


In [18]:
# Keyword search: every match rule is passed as a "must" rule and the sort
# rules built above are applied. from_no / size presumably map to
# Elasticsearch's from / size paging (TODO confirm in elasticsearch_utils).
result = search_es_using_query_combination(
    es,
    index_name,
    must_rules=match_rules,
    sorter_rules=sorter_rules,
    from_no=0,
    size=20,
)

In [19]:
# Show the parsed hits of the keyword search.
parse_es_search_result(result)

[{'MOVIE_ID': 25815002,
  'NAME': '我不是李小龙',
  'STORYLINE': '桀骜不驯的如龙武功高强，在一场比赛中，被打成重伤，被诊今生不能再用功夫。女友荆兰为激发他重新振作，按照他的原型制造了一个机器人，如龙能否恢复武功，他和机器人到底哪个功夫高强，又能否在机器人中迷失自己？'},
 {'MOVIE_ID': 4930409,
  'NAME': '李小龙外传 - 电影',
  'STORYLINE': '一位李小龙崇拜者也名叫李小龙的青年，在同样是李小龙崇拜者的父亲的影响下，努力学习李小龙功夫，怀揣功夫梦、影视明星梦，在奔赴一个剧组面试主角的过程中，经历了剧组设置的个个惊险障碍，以精湛的李小龙功夫战胜光头帮，战胜面试对手，成为准男一号演员....'},
 {'MOVIE_ID': 1304134,
  'NAME': '李小龙与我',
  'STORYLINE': '丁佩以自己的角度，把功夫影帝李小龙生前一切的第一手事实，毫无保留地公诸于世，包括她如何认识李小龙，及重逢後如何发展成可以倾诉肺腑之言的红颜知己。究竟李小龙与丁佩在私生活中，是缠绵的爱情、抑或纯洁的友谊，在银幕上自有说明。而全片最珍贵的部分，当然是对李小龙死因的交代。'}]

In [20]:
# Document ids to fetch directly.
ids=[26670818, 6389523, 26392287]
# NOTE: the helper really is spelled `generte_id_rules` (sic). Per the output
# below it wraps the ids, converted to strings, in an Elasticsearch `ids` query.
id_rules = generte_id_rules(ids)
id_rules

{'ids': {'values': ['26670818', '6389523', '26392287']}}

In [21]:
# Id lookup: fetch the documents named in id_rules, reusing the sort rules
# from the keyword search. from_no / size presumably map to Elasticsearch's
# from / size paging (TODO confirm in elasticsearch_utils).
result = search_es_using_id_filtering(
    es,
    index_name,
    id_rules=id_rules,
    sorter_rules=sorter_rules,
    from_no=0,
    size=20,
)

In [22]:
# Show the parsed hits of the id lookup.
parse_es_search_result(result)

[{'MOVIE_ID': 26670818,
  'NAME': '情定河州',
  'STORYLINE': '电影《情定临夏天使然》讲述临夏新一代青年人发奋图强、借助国家一带一路战略励志创业的故事。'},
 {'MOVIE_ID': 26392287,
  'NAME': '曼哈顿中国女孩',
  'STORYLINE': '平民女孩李莉只身初入曼哈顿求学，在历经迷失与困惑之后，凭借努力与善良收获了事业上的成功，同时帮助在美华侨保卫中国城、获得爱情，完成了生命中一次华丽的蜕变。'},
 {'MOVIE_ID': 6389523, 'NAME': '天地无用', 'STORYLINE': '张凡夕筹划中电影长片。'}]

In [23]:
# Shut down the Spark session now that the demo is finished.
spark.stop()