In [7]:
# Download the required jar library for elastic


In [1]:
! sudo curl https://repo1.maven.org/maven2/org/elasticsearch/elasticsearch-spark-20_2.12/7.15.0/elasticsearch-spark-20_2.12-7.15.0.jar --output /usr/local/spark/jars/elasticsearch-spark-20_2.12-7.15.0.jar

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 2052k  100 2052k    0     0  6536k      0 --:--:-- --:--:-- --:--:-- 6515k


In [2]:
import pyspark
from pyspark.sql import SparkSession

In [75]:
# Build (or reuse) a local Spark session whose Elasticsearch settings point
# at host "es01", port 9200.
# Optional extra setting, currently left disabled:
#   .config("spark.es.nodes.wan.only", "true")
spark = (
    SparkSession.builder
    .master("local")
    .appName('jupyter-pyspark')
    .config("spark.es.nodes", "es01")
    .config("spark.es.port", "9200")
    .getOrCreate()
)

# Keep a handle on the SparkContext and only log errors.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

In [76]:
# Load the customers CSV as sample data to push into Elasticsearch.
# The first row is treated as the header and column types are inferred.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/home/jovyan/datasets/customers/customers.csv")
)
df.show()

+------+----------+--------------------+------+---------------+-----------+-----+------------+---------------+---------------+
| First|      Last|               Email|Gender|Last IP Address|       City|State|Total Orders|Total Purchased|Months Customer|
+------+----------+--------------------+------+---------------+-----------+-----+------------+---------------+---------------+
|    Al|    Fresco|  afresco@dayrep.com|     M|  74.111.18.161|   Syracuse|   NY|           1|             45|              1|
|  Abby|      Kuss|     akuss@rhyta.com|     F|  23.80.125.101|    Phoenix|   AZ|           1|             25|              2|
| Arial|     Photo|   aphoto@dayrep.com|     F|     24.0.14.56|     Newark|   NJ|           1|            680|              1|
| Bette|     Alott|    balott@rhyta.com|     F| 56.216.127.219|    Raleigh|   NC|           6|            560|             18|
| Barb |    Barion|bbarion@superrito...|     F|   38.68.15.223|     Dallas|   TX|           4|           1590| 

## Geocoding

To use a map in Kibana, our data must contain geopoints. We will use a Python library to help.

In [77]:
!pip install -q  geocoder

In [78]:
import geocoder
def geoiplookup(ipaddress) -> list:
    """Look up the coordinates of an IP address via ``geocoder.ip``.

    Args:
        ipaddress: The address to look up, passed straight to ``geocoder.ip``.

    Returns:
        The ``latlng`` attribute of the lookup result, e.g.
        ``[37.4056, -122.0775]`` for ``"8.8.8.8"``.
    """
    return geocoder.ip(ipaddress).latlng

def geoiplookup_lat(ipaddress):
    """Return the latitude for an IP address, or ``None`` if it cannot be located.

    Args:
        ipaddress: The address to look up, passed straight to ``geocoder.ip``.

    Returns:
        The first element of the lookup's ``latlng`` pair, or ``None`` when the
        lookup yields no coordinates.
    """
    geo = geocoder.ip(ipaddress).latlng
    # A failed lookup gives an empty (or None) latlng. Indexing it would raise,
    # and inside a Spark UDF that aborts the whole job; return None instead so
    # the row simply gets a null latitude.
    if not geo:
        return None
    return geo[0]

def geoiplookup_lon(ipaddress):
    """Return the longitude for an IP address, or ``None`` if it cannot be located.

    Args:
        ipaddress: The address to look up, passed straight to ``geocoder.ip``.

    Returns:
        The second element of the lookup's ``latlng`` pair, or ``None`` when
        the lookup yields no coordinates.
    """
    geo = geocoder.ip(ipaddress).latlng
    # A failed lookup gives an empty (or None) latlng. Indexing it would raise,
    # and inside a Spark UDF that aborts the whole job; return None instead so
    # the row simply gets a null longitude.
    if not geo:
        return None
    return geo[1]

# Quick check of the lookup on a single address; the cell displays the
# returned coordinates.
geoiplookup("8.8.8.8")

[37.4056, -122.0775]

In [79]:
from pyspark.sql.functions import udf 

# Wrap the per-coordinate lookups as Spark UDFs so they can run on a column.
geoiplookup_lat_udf = udf(geoiplookup_lat)
geoiplookup_lon_udf = udf(geoiplookup_lon)

# Derive latitude/longitude from each customer's last IP address and cast
# the UDF results to float columns.
df = (
    df
    .withColumn("geolat", geoiplookup_lat_udf("Last IP Address").cast("float"))
    .withColumn("geolon", geoiplookup_lon_udf("Last IP Address").cast("float"))
)
df.printSchema()

root
 |-- First: string (nullable = true)
 |-- Last: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Last IP Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Total Orders: integer (nullable = true)
 |-- Total Purchased: integer (nullable = true)
 |-- Months Customer: integer (nullable = true)
 |-- geolat: float (nullable = true)
 |-- geolon: float (nullable = true)



In [80]:
# Save the DataFrame to Elasticsearch: index "customers", type "_doc",
# using "overwrite" save mode.
(
    df.write
    .format("es")
    .mode("overwrite")
    .save("customers/_doc")
)

                                                                                

In [81]:
# Load the "customers" index back out of Elasticsearch and preview it,
# to confirm the data written above round-trips.
df2 = (
    spark.read
    .format("es")
    .load("customers/_doc")
)
df2.show()

+-----------+--------------------+------+------+----------+---------------+---------------+-----+------------+---------------+----+-------+---------+
|       City|               Email| First|Gender|      Last|Last IP Address|Months Customer|State|Total Orders|Total Purchased| geo| geolat|   geolon|
+-----------+--------------------+------+------+----------+---------------+---------------+-----+------------+---------------+----+-------+---------+
|   Syracuse|  afresco@dayrep.com|    Al|     M|    Fresco|  74.111.18.161|              1|   NY|           1|             45|null|42.9317| -76.5661|
|    Phoenix|     akuss@rhyta.com|  Abby|     F|      Kuss|  23.80.125.101|              2|   AZ|           1|             25|null|34.0522|-118.2437|
|     Newark|   aphoto@dayrep.com| Arial|     F|     Photo|     24.0.14.56|              1|   NJ|           1|            680|null|40.5576| -74.2846|
|    Raleigh|    balott@rhyta.com| Bette|     F|     Alott| 56.216.127.219|             18|   NC|   