# Example to Read / Write to Elasticsearch with Spark

Documentation: https://www.elastic.co/guide/en/elasticsearch/hadoop/current/spark.html#spark-python


In [2]:
import pyspark
from pyspark.sql import SparkSession

In [3]:
# MINIO CONFIGURATION
elastic_host = "elasticsearch"
elastic_port = "9200"

In [4]:
# Spark init
spark = SparkSession.builder \
    .master("local") \
    .appName('jupyter-pyspark') \
    .config("spark.jars.packages","org.elasticsearch:elasticsearch-spark-20_2.12:7.15.0")\
    .config("spark.es.nodes", elastic_host) \
    .config("spark.es.port",elastic_port) \
    .getOrCreate()
sc = spark.sparkContext
sc.setLogLevel("ERROR")

Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
org.elasticsearch#elasticsearch-spark-20_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-a237a115-0674-4085-b5ab-e00fcda303af;1.0
	confs: [default]


:: loading settings :: url = jar:file:/usr/local/spark-3.1.2-bin-hadoop3.2/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found org.elasticsearch#elasticsearch-spark-20_2.12;7.15.0 in central
	found org.scala-lang#scala-reflect;2.12.8 in central
	found org.slf4j#slf4j-api;1.7.6 in central
	found commons-logging#commons-logging;1.1.1 in central
	found javax.xml.bind#jaxb-api;2.3.1 in central
	found com.google.protobuf#protobuf-java;2.5.0 in central
	found org.apache.spark#spark-yarn_2.12;2.4.4 in central
downloading https://repo1.maven.org/maven2/org/elasticsearch/elasticsearch-spark-20_2.12/7.15.0/elasticsearch-spark-20_2.12-7.15.0.jar ...
	[SUCCESSFUL ] org.elasticsearch#elasticsearch-spark-20_2.12;7.15.0!elasticsearch-spark-20_2.12.jar (117ms)
:: resolution report :: resolve 4135ms :: artifacts dl 118ms
	:: modules in use:
	com.google.protobuf#protobuf-java;2.5.0 from central in [default]
	commons-logging#commons-logging;1.1.1 from central in [default]
	javax.xml.bind#jaxb-api;2.3.1 from central in [default]
	org.apache.spark#spark-yarn_2.12;2.4.4 from central in [default]
	org.elasticsearch#elasticsea

In [5]:
# read local data
df = spark.read.option("multiline","true").json("/home/jovyan/datasets/json-samples/stocks.json")
df.toPandas()

Unnamed: 0,price,symbol
0,126.82,AAPL
1,3098.12,AMZN
2,251.11,FB
3,1725.05,GOOG
4,128.39,IBM
5,212.55,MSFT
6,78.0,NET
7,497.0,NFLX
8,823.8,TSLA
9,45.11,TWTR


In [6]:
# Write to Elastic Under index stocks with default type (_doc)
df.write.mode("Overwrite").format("es").save("stocks/_doc")

In [9]:
# read back from Elasticsearch
df1 = spark.read.format("es").load("stocks/_doc")
df1.toPandas()

Unnamed: 0,price,symbol
0,126.82,AAPL
1,3098.120117,AMZN
2,251.110001,FB
3,1725.050049,GOOG
4,128.389999,IBM
5,212.550003,MSFT
6,78.0,NET
7,497.0,NFLX
8,823.799988,TSLA
9,45.110001,TWTR
