### Install and load packages

In [None]:
!pip install influxdb-client

In [None]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

from influxdb_client import InfluxDBClient, Point, WritePrecision
from influxdb_client.client.write_api import SYNCHRONOUS

import os
import json

# Maven coordinate of the Kafka source connector handed to Spark via --packages.
# NOTE(review): the 3.2.0 / Scala 2.12 suffix presumably has to match the
# installed PySpark build -- confirm against the runtime image.
kafka_connector = 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0'

# Set before the SparkSession is created further down in the notebook.
os.environ['PYSPARK_SUBMIT_ARGS'] = ' '.join(['--packages', kafka_connector, 'pyspark-shell'])

### Spark Instance

In [None]:
# Reuse an already running session if there is one, otherwise start a local
# session with master 'local[*]'.
spark = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)

# Restrict Spark's logging to ERROR level.
spark.sparkContext.setLogLevel('ERROR')

### Define an input stream

In [None]:
# Schema for the incoming records: one nullable string column per name below.

# Column names, in the order used to build the schema.
cols = [
    'Time',
    'HMI_FIT101', 'HMI_LIT101', 'HMI_AIT201', 'HMI_AIT202', 'HMI_AIT203',
    'HMI_FIT201', 'HMI_DPIT301', 'HMI_FIT301', 'HMI_LIT301', 'HMI_AIT401',
    'HMI_AIT402', 'HMI_FIT401', 'HMI_LIT401', 'HMI_AIT501', 'HMI_AIT502',
    'HMI_AIT503', 'HMI_AIT504', 'HMI_FIT501', 'HMI_FIT502', 'HMI_FIT503',
    'HMI_FIT504', 'HMI_PIT501', 'HMI_PIT502', 'HMI_PIT503', 'HMI_FIT601',
    'HMI_MV101', 'HMI_P101', 'HMI_P102', 'HMI_MV201', 'HMI_P201',
    'HMI_P202', 'HMI_P203', 'HMI_P204', 'HMI_P205', 'HMI_P206',
    'HMI_MV301', 'HMI_MV302', 'HMI_MV303', 'HMI_MV304', 'HMI_P301',
    'HMI_P302', 'HMI_P401', 'HMI_P402', 'HMI_P403', 'HMI_P404',
    'HMI_P501', 'HMI_P502', 'HMI_P601', 'HMI_P602', 'HMI_P603',
    'HMI_UV401',
]

# Every column is declared as StringType and nullable; no numeric typing is
# applied at this stage.
fields = [StructField(name, StringType(), nullable=True) for name in cols]
schema = StructType(fields)

In [None]:
# Subscribe to the "SWAT" Kafka topic and expose each message value as a
# single string column named "data".
inputStream = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("subscribe", "SWAT")
    .load()
    .select(col("value").cast("string").alias("data"))
)

# Alternative (currently disabled): parse the payload against `schema` with
#     .select(from_json(col("value").cast("string"), schema).alias("data"))
# and then flatten it with .select("data.*"). InfluxDBWriter.process passes
# row['data'] to json.loads, so it relies on "data" staying a string.
inputStream.printSchema()

### Batch Processing and Storage

In [None]:
from pyspark.sql import *

class InfluxDBWriter:
    """Row-level sink object passed to ``writeStream.foreach``.

    It exposes the ``open`` / ``process`` / ``close`` methods of the foreach
    writer contract. Connection settings are read from the environment:

    * ``INFLUXDB_TOKEN``  -- API token, required (never hard-code it here).
    * ``INFLUXDB_URL``    -- server URL, optional.
    * ``INFLUXDB_ORG``    -- organisation, optional.
    * ``INFLUXDB_BUCKET`` -- target bucket, optional.

    NOTE(review): rows are currently only converted to strings and printed;
    nothing is sent through ``write_api`` yet.
    """

    def __init__(self):
        """Collect the connection settings; no connection is opened here.

        Only plain strings are stored, so the object stays cheap to serialise.
        The client itself is created in ``open``.

        Raises:
            RuntimeError: if ``INFLUXDB_TOKEN`` is not set or is empty.
        """
        self._url = os.environ.get("INFLUXDB_URL", "https://westeurope-1.azure.cloud2.influxdata.com")
        self._org = os.environ.get("INFLUXDB_ORG", "ahr9oi@inf.elte.hu")
        self.bucket = os.environ.get("INFLUXDB_BUCKET", "ahr9oi's Bucket")

        self._token = os.environ.get("INFLUXDB_TOKEN")
        if not self._token:
            raise RuntimeError(
                "INFLUXDB_TOKEN is not set; export the InfluxDB API token "
                "before creating an InfluxDBWriter"
            )

        # Created in open(), released in close().
        self.client = None
        self.write_api = None

    def open(self, partition_id, epoch_id):
        """Create the InfluxDB client and write API for this partition/epoch.

        Returns:
            True, so that ``process`` is called for the rows that follow.
        """
        self.client = InfluxDBClient(url=self._url, token=self._token, org=self._org)
        self.write_api = self.client.write_api()
        print("Opened %d, %d" % (partition_id, epoch_id))
        return True

    def process(self, row):
        """Convert the JSON string held in ``row['data']`` for one record."""
        self._row_to_line_protocol(row['data'])

    def close(self, error):
        """Release the write API and the client, then report ``error``.

        Uses the public ``close()`` methods instead of calling ``__del__``
        directly, and tolerates being called when ``open`` never created the
        handles.
        """
        try:
            if self.write_api is not None:
                self.write_api.close()
        finally:
            # Close the client even if closing the write API failed.
            if self.client is not None:
                self.client.close()
            self.write_api = None
            self.client = None
        print("Closed with error: %s" % str(error))

    def _row_to_line_protocol(self, row):
        """Turn one JSON object string into per-key ``swat`` strings.

        Example: ``'{"Time": "15/6/2017 7:10:01.322 AM", "HMI_FIT101": "0.0"}'``
        becomes ``['swat,anomaly=false Time=15/6/2017 7:10:01.322 AM',
        'swat,anomaly=false HMI_FIT101=0.0']``.

        Args:
            row: JSON text of a single object, or None for a message that
                carries no value.

        Returns:
            The list of generated strings (also printed); an empty list when
            ``row`` is None.

        NOTE(review): values are inserted verbatim, so string values such as
        ``Time`` (which contains spaces) are not quoted the way InfluxDB line
        protocol expects -- confirm the intended format before writing these.
        """
        if row is None:
            # A message without a value has nothing to convert.
            return []
        sequence = [
            f"swat,anomaly=false {key}={val}"
            for key, val in json.loads(row).items()
        ]
        print(sequence)
        return sequence

### Read stream and process

In [None]:
# 'swat,anomaly=false Time=16/6/2017 9:17:49.444 AM,', 'swat,anomaly=false HMI_FIT101=0.0,', 'swat,anomaly=false HMI_LIT101=818.93,', 'swat,anomaly=false HMI_AIT201=178.16,', 'swat,anomaly=false HMI_AIT202=7.54,', 'swat,anomaly=false HMI_AIT203=361.83,', 'swat,anomaly=false HMI_FIT201=0.0,', 'swat,anomaly=false HMI_DPIT301=2.75,', 'swat,anomaly=false HMI_FIT301=0.0,', 'swat,anomaly=false HMI_LIT301=1009.84,', 'swat,anomaly=false HMI_AIT401=0,', 'swat,anomaly=false HMI_AIT402=172.47,', 'swat,anomaly=false HMI_FIT401=1.64,', 'swat,anomaly=false HMI_LIT401=889.75,', 'swat,anomaly=false HMI_AIT501=7.17,', 'swat,anomaly=false HMI_AIT502=203.38,', 'swat,anomaly=false HMI_AIT503=262.66,', 'swat,anomaly=false HMI_AIT504=37.11,', 'swat,anomaly=false HMI_FIT501=1.65,', 'swat,anomaly=false HMI_FIT502=1.34,', 'swat,anomaly=false HMI_FIT503=0.61,', 'swat,anomaly=false HMI_FIT504=0.32,', 'swat,anomaly=false HMI_PIT501=262.64,', 'swat,anomaly=false HMI_PIT502=1.51,', 'swat,anomaly=false HMI_PIT503=184.68,', 'swat,anomaly=false HMI_FIT601=0.0,', 'swat,anomaly=false HMI_MV101=1,', 'swat,anomaly=false HMI_P101=1,', 'swat,anomaly=false HMI_P102=1,', 'swat,anomaly=false HMI_MV201=1,', 'swat,anomaly=false HMI_P201=1,', 'swat,anomaly=false HMI_P202=1,', 'swat,anomaly=false HMI_P203=1,', 'swat,anomaly=false HMI_P204=1,', 'swat,anomaly=false HMI_P205=1,', 'swat,anomaly=false HMI_P206=1,', 'swat,anomaly=false HMI_MV301=1,', 'swat,anomaly=false HMI_MV302=1,', 'swat,anomaly=false HMI_MV303=1,', 'swat,anomaly=false HMI_MV304=1,', 'swat,anomaly=false HMI_P301=1,', 'swat,anomaly=false HMI_P302=1,', 'swat,anomaly=false HMI_P401=2,', 'swat,anomaly=false HMI_P402=1,', 'swat,anomaly=false HMI_P403=1,', 'swat,anomaly=false HMI_P404=1,', 'swat,anomaly=false HMI_P501=2,', 'swat,anomaly=false HMI_P502=1,', 'swat,anomaly=false HMI_P601=1,', 'swat,anomaly=false HMI_P602=1,', 'swat,anomaly=false HMI_P603=1,', 'swat,anomaly=false HMI_UV401=2,'

In [None]:
# Route every row of the stream through an InfluxDBWriter instance, in append
# output mode.
stream_writer = inputStream.writeStream.outputMode("append").foreach(InfluxDBWriter())

# Start the streaming query, then block this cell until it terminates.
query = stream_writer.start()
query.awaitTermination()