# Init Spark session

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from os.path import abspath
import os

# SparkSession
# Address of the standalone Spark master and the directory used as the SQL warehouse.
URL_SPARK = "spark://spark-master:7077"
warehouse_location = './spark-warehouse'

# Build (or reuse) the session. The extra jars are the Kafka connector and its
# dependencies, which the streaming cells below rely on.
spark = (
    SparkSession.builder
    .appName("spark-ml-multiVM")
    # Spark properties carry the "spark." prefix; the bare key "executor.memory"
    # is not the property Spark reads, so the 8g request previously had no effect.
    # NOTE(review): each worker must have at least 8g available for executors to launch.
    .config("spark.executor.memory", "8g")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("spark.jars", "jars/spark-sql-kafka-0-10_2.12-3.2.1.jar,jars/kafka-clients-2.1.1.jar,jars/spark-streaming-kafka-0-10-assembly_2.12-3.2.1.jar,jars/commons-pool2-2.11.1.jar")
    .master(URL_SPARK)
    .getOrCreate()
)

/usr/local/lib/python3.9/dist-packages/pyspark/bin/load-spark-env.sh: line 68: ps: command not found
23/07/31 18:36:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


# We have 2 streams from 2 producers publishing data on 2 topics. We will read both streams into Spark using Spark Structured Streaming

![Kafka to Spark streaming pipeline](./images/kafka-spark-streaming2.png)


# Stream 1: Stream raw data of vm1 from Kafka
- Here we read the stream from the Kafka topic `vm-stat-stream` (acumos server)


In [2]:
# Streaming source for vm1: subscribe to the Kafka topic and replay it from the
# earliest available offset.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:29092")
    .option("subscribe", "vm-stat-stream")
    .option("startingOffsets", "earliest")
    .load()
)

In [3]:
# Show the columns of the raw Kafka source frame (key, value, topic, partition,
# offset, timestamp, timestampType).
df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [4]:
# Kafka delivers `value` as binary; keep only that column, cast to a string, so
# the message text can be parsed.
stringDF = df.selectExpr("CAST(value AS STRING)")

In [None]:
# stringDF.writeStream.format('console').start()?

In [5]:
from pyspark.sql.types import *
from pyspark.sql.functions import unix_timestamp, from_unixtime

In [6]:
from pyspark.sql.functions import *

# Pull the timestamp and the three usage percentages out of the raw message text.
# regexp_extract returns capture group 1 as a string, so every column here is a
# string; the raw `value` column is dropped once the fields are extracted.
df_vm1 = (
    stringDF
    .withColumn('timestamp', F.regexp_extract('value', r'timestamp:\s(.*),\shostname', 1))
    .withColumn('cpu1', F.regexp_extract('value', r'used_cpu:\s(.*)\%', 1))
    .withColumn('memory1', F.regexp_extract('value', r'used_memory:\s(.*)\%,\sused_storage', 1))
    .withColumn('storage1', F.regexp_extract('value', r'used_storage:\s(.*)\%,\sused_cpu', 1))
    .drop('value')
)
df_vm1.printSchema()

root
 |-- timestamp: string (nullable = true)
 |-- cpu1: string (nullable = true)
 |-- memory1: string (nullable = true)
 |-- storage1: string (nullable = true)



In [7]:
# Convert the text timestamp (12-hour clock with AM/PM marker, e.g.
# "31-07-23 01:40:32 PM") into a TimestampType column.
# to_timestamp parses straight to a timestamp, replacing the earlier
# unix_timestamp -> from_unixtime -> cast round trip through a formatted string.
# NOTE(review): values that do not match the pattern are expected to come back as
# null (the first console batch shows null timestamps) - confirm for this Spark config.
df_vm1_2 = df_vm1.withColumn(
    'timestamp',
    F.to_timestamp('timestamp', 'dd-MM-yy hh:mm:ss a')
)

In [None]:
# Debug sink: start a streaming query that prints each micro-batch of the parsed
# vm1 stream to the console.
df_vm1_2.writeStream.format('console').start()

In [8]:
# Mark `timestamp` as the event-time column with a 10-minute watermark
# (lateness threshold) for the vm1 stream.
df_vm1_water = df_vm1_2.withWatermark('timestamp','10 minutes')

In [9]:
# Debug sink: print each micro-batch of the watermarked vm1 stream to the console.
# NOTE(review): the returned StreamingQuery handle is not kept; its console output
# keeps appearing under later cells for as long as the query runs.
df_vm1_water.writeStream.format('console').start()

23/07/31 18:37:17 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-ba85c8f3-3062-49b0-9fd4-b285d3c2e8f5. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
23/07/31 18:37:17 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


<pyspark.sql.streaming.StreamingQuery at 0x7fbcec437fd0>

# Stream 2: Stream raw data of vm2 from Kafka
- Here we read the stream from the Kafka topic `vm-stat-stream-2` (acumos server) into `stringDF2`

In [10]:
# Streaming source for vm2: same broker as the first stream, second topic,
# replayed from the earliest available offset.
df2 = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:29092")
    .option("subscribe", "vm-stat-stream-2")
    .option("startingOffsets", "earliest")
    .load()
)

[Stage 0:>                                                          (0 + 1) / 1]

In [11]:
# Keep only the Kafka `value` column of the second stream, cast to a string
# so the message text can be parsed.
stringDF2 = df2.selectExpr("CAST(value AS STRING)")

                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+---------+----+-------+--------+
|timestamp|cpu1|memory1|storage1|
+---------+----+-------+--------+
|     null|0.43|  38.86|      56|
|     null|0.51|  38.85|      56|
|     null|0.39|  38.98|      56|
|     null|0.49|  38.98|      56|
|     null|0.30|  38.98|      56|
|     null|0.95|  38.98|      56|
|     null|0.81|  38.99|      56|
|     null|0.49|  38.99|      56|
|     null|0.30|  38.99|      56|
|     null|0.18|  38.86|      56|
|     null|0.11|  39.00|      56|
|     null|0.73|  38.99|      56|
|     null|0.88|  38.99|      56|
|     null|0.82|  39.00|      56|
|     null|1.07|  38.99|      56|
|     null|1.13|  39.01|      56|
|     null|1.01|  39.00|      56|
|     null|0.61|  38.87|      56|
|     null|0.47|  38.88|      56|
|     null|0.34|  38.87|      56|
+---------+----+-------+--------+
only showing top 20 rows

-------------------------------------------
Batch: 1
--------

                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+-------------------+----+-------+--------+
|          timestamp|cpu1|memory1|storage1|
+-------------------+----+-------+--------+
|2023-07-31 13:37:29|2.73|  37.67|      60|
+-------------------+----+-------+--------+



[Stage 3:>                                                          (0 + 1) / 1]                                                                                

-------------------------------------------
Batch: 3
-------------------------------------------
+-------------------+----+-------+--------+
|          timestamp|cpu1|memory1|storage1|
+-------------------+----+-------+--------+
|2023-07-31 13:37:34|2.83|  38.16|      60|
+-------------------+----+-------+--------+



[Stage 4:>                                                          (0 + 1) / 1]                                                                                

-------------------------------------------
Batch: 4
-------------------------------------------
+-------------------+----+-------+--------+
|          timestamp|cpu1|memory1|storage1|
+-------------------+----+-------+--------+
|2023-07-31 13:37:39|2.68|  38.23|      60|
+-------------------+----+-------+--------+



[Stage 5:>                                                          (0 + 1) / 1]                                                                                

-------------------------------------------
Batch: 5
-------------------------------------------
+-------------------+----+-------+--------+
|          timestamp|cpu1|memory1|storage1|
+-------------------+----+-------+--------+
|2023-07-31 13:37:45|2.71|  38.26|      60|
+-------------------+----+-------+--------+



[Stage 6:>                                                          (0 + 1) / 1]                                                                                

-------------------------------------------
Batch: 6
-------------------------------------------
+-------------------+----+-------+--------+
|          timestamp|cpu1|memory1|storage1|
+-------------------+----+-------+--------+
|2023-07-31 13:37:50|2.57|  38.27|      60|
+-------------------+----+-------+--------+

-------------------------------------------
Batch: 7
-------------------------------------------
+-------------------+----+-------+--------+
|          timestamp|cpu1|memory1|storage1|
+-------------------+----+-------+--------+
|2023-07-31 13:37:55|2.45|  38.29|      60|
+-------------------+----+-------+--------+

-------------------------------------------
Batch: 8
-------------------------------------------
+-------------------+----+-------+--------+
|          timestamp|cpu1|memory1|storage1|
+-------------------+----+-------+--------+
|2023-07-31 13:38:00|2.44|  38.32|      60|
+-------------------+----+-------+--------+

-------------------------------------------
Ba

[Stage 11:>                                                         (0 + 1) / 1]                                                                                

-------------------------------------------
Batch: 11
-------------------------------------------
+-------------------+----+-------+--------+
|          timestamp|cpu1|memory1|storage1|
+-------------------+----+-------+--------+
|2023-07-31 13:38:16|2.49|  38.32|      60|
+-------------------+----+-------+--------+

-------------------------------------------
Batch: 12
-------------------------------------------
+-------------------+----+-------+--------+
|          timestamp|cpu1|memory1|storage1|
+-------------------+----+-------+--------+
|2023-07-31 13:38:21|2.37|  38.37|      60|
+-------------------+----+-------+--------+



[Stage 13:>                                                         (0 + 1) / 1]                                                                                

-------------------------------------------
Batch: 13
-------------------------------------------
+-------------------+----+-------+--------+
|          timestamp|cpu1|memory1|storage1|
+-------------------+----+-------+--------+
|2023-07-31 13:38:26|2.18|  38.38|      60|
+-------------------+----+-------+--------+



In [12]:
# Same field extraction as for vm1, applied to the second stream: capture group 1
# of each pattern becomes a string column (suffix 2), then the raw `value`
# column is dropped.
df_vm2 = (
    stringDF2
    .withColumn('timestamp', F.regexp_extract('value', r'timestamp:\s(.*),\shostname', 1))
    .withColumn('cpu2', F.regexp_extract('value', r'used_cpu:\s(.*)\%', 1))
    .withColumn('memory2', F.regexp_extract('value', r'used_memory:\s(.*)\%,\sused_storage', 1))
    .withColumn('storage2', F.regexp_extract('value', r'used_storage:\s(.*)\%,\sused_cpu', 1))
    .drop('value')
)

[Stage 14:>                                                         (0 + 1) / 1]                                                                                

-------------------------------------------
Batch: 14
-------------------------------------------
+-------------------+----+-------+--------+
|          timestamp|cpu1|memory1|storage1|
+-------------------+----+-------+--------+
|2023-07-31 13:38:32|2.01|  38.37|      60|
+-------------------+----+-------+--------+



[Stage 15:>                                                         (0 + 1) / 1]                                                                                

-------------------------------------------
Batch: 15
-------------------------------------------
+-------------------+----+-------+--------+
|          timestamp|cpu1|memory1|storage1|
+-------------------+----+-------+--------+
|2023-07-31 13:38:37|1.85|  38.38|      60|
+-------------------+----+-------+--------+

-------------------------------------------
Batch: 16
-------------------------------------------
+-------------------+----+-------+--------+
|          timestamp|cpu1|memory1|storage1|
+-------------------+----+-------+--------+
|2023-07-31 13:38:42|1.86|  38.36|      60|
+-------------------+----+-------+--------+



[Stage 17:>                                                         (0 + 1) / 1]                                                                                

-------------------------------------------
Batch: 17
-------------------------------------------
+-------------------+----+-------+--------+
|          timestamp|cpu1|memory1|storage1|
+-------------------+----+-------+--------+
|2023-07-31 13:38:47|1.79|  38.39|      60|
+-------------------+----+-------+--------+

-------------------------------------------
Batch: 18
-------------------------------------------
+-------------------+----+-------+--------+
|          timestamp|cpu1|memory1|storage1|
+-------------------+----+-------+--------+
|2023-07-31 13:38:53|1.81|  38.39|      60|
+-------------------+----+-------+--------+

-------------------------------------------
Batch: 19
-------------------------------------------
+-------------------+----+-------+--------+
|          timestamp|cpu1|memory1|storage1|
+-------------------+----+-------+--------+
|2023-07-31 13:38:58|1.66|  38.42|      60|
+-------------------+----+-------+--------+

-------------------------------------------

[Stage 23:>                                                         (0 + 1) / 1]                                                                                

-------------------------------------------
Batch: 23
-------------------------------------------
+-------------------+----+-------+--------+
|          timestamp|cpu1|memory1|storage1|
+-------------------+----+-------+--------+
|2023-07-31 13:39:19|1.39|  38.48|      60|
+-------------------+----+-------+--------+

-------------------------------------------
Batch: 24
-------------------------------------------
+-------------------+----+-------+--------+
|          timestamp|cpu1|memory1|storage1|
+-------------------+----+-------+--------+
|2023-07-31 13:39:24|1.28|  38.48|      60|
+-------------------+----+-------+--------+



[Stage 25:>                                                         (0 + 1) / 1]                                                                                

-------------------------------------------
Batch: 25
-------------------------------------------
+-------------------+----+-------+--------+
|          timestamp|cpu1|memory1|storage1|
+-------------------+----+-------+--------+
|2023-07-31 13:39:29|1.18|  38.48|      60|
+-------------------+----+-------+--------+



[Stage 26:>                                                         (0 + 1) / 1]                                                                                

-------------------------------------------
Batch: 26
-------------------------------------------
+-------------------+----+-------+--------+
|          timestamp|cpu1|memory1|storage1|
+-------------------+----+-------+--------+
|2023-07-31 13:39:34|1.08|  38.48|      60|
+-------------------+----+-------+--------+



In [13]:
# Convert the vm2 text timestamp (12-hour clock with AM/PM marker) into a
# TimestampType column, then rename it to `timestamp2` so it does not clash with
# the vm1 `timestamp` column when the two streams are joined.
# to_timestamp parses straight to a timestamp, replacing the earlier
# unix_timestamp -> from_unixtime -> cast round trip through a formatted string.
df_vm2_2 = (
    df_vm2
    .withColumn('timestamp', F.to_timestamp('timestamp', 'dd-MM-yy hh:mm:ss a'))
    .withColumnRenamed("timestamp", "timestamp2")
)

In [14]:
# Mark `timestamp2` as the event-time column with a 10-minute watermark
# (lateness threshold) for the vm2 stream.
df_vm2_water = df_vm2_2.withWatermark('timestamp2','10 minutes')

[Stage 27:>                                                         (0 + 1) / 1]                                                                                

-------------------------------------------
Batch: 27
-------------------------------------------
+-------------------+----+-------+--------+
|          timestamp|cpu1|memory1|storage1|
+-------------------+----+-------+--------+
|2023-07-31 13:39:40|1.08|  38.52|      60|
+-------------------+----+-------+--------+



In [15]:
# Debug sink: print each micro-batch of the watermarked vm2 stream to the console.
# NOTE(review): the returned StreamingQuery handle is not kept; its output is
# interleaved with the vm1 console query's batches.
df_vm2_water.writeStream.format('console').start()

23/07/31 18:39:41 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-824ad79b-bd18-4e3d-ab48-33b6aef492c3. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
23/07/31 18:39:41 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


<pyspark.sql.streaming.StreamingQuery at 0x7fbcdcde8e50>

-------------------------------------------
Batch: 0
-------------------------------------------
+----------+----+-------+--------+
|timestamp2|cpu2|memory2|storage2|
+----------+----+-------+--------+
|      null|0.43|  38.86|      56|
|      null|0.51|  38.85|      56|
|      null|0.39|  38.98|      56|
|      null|0.49|  38.98|      56|
|      null|0.30|  38.98|      56|
|      null|0.95|  38.98|      56|
|      null|0.81|  38.99|      56|
|      null|0.49|  38.99|      56|
|      null|0.30|  38.99|      56|
|      null|0.18|  38.86|      56|
|      null|0.11|  39.00|      56|
|      null|0.73|  38.99|      56|
|      null|0.88|  38.99|      56|
|      null|0.82|  39.00|      56|
|      null|1.07|  38.99|      56|
|      null|1.13|  39.01|      56|
|      null|1.01|  39.00|      56|
|      null|0.61|  38.87|      56|
|      null|0.47|  38.88|      56|
|      null|0.34|  38.87|      56|
+----------+----+-------+--------+
only showing top 20 rows



[Stage 29:>                                                         (0 + 1) / 1]                                                                                

-------------------------------------------
Batch: 28
-------------------------------------------
+-------------------+----+-------+--------+
|          timestamp|cpu1|memory1|storage1|
+-------------------+----+-------+--------+
|2023-07-31 13:39:45|1.39|  38.52|      60|
+-------------------+----+-------+--------+

-------------------------------------------
Batch: 1
-------------------------------------------
+-------------------+----+-------+--------+
|         timestamp2|cpu2|memory2|storage2|
+-------------------+----+-------+--------+
|2023-07-31 13:39:45|1.39|  38.52|      60|
+-------------------+----+-------+--------+

-------------------------------------------
Batch: 2
-------------------------------------------
+-------------------+----+-------+--------+
|         timestamp2|cpu2|memory2|storage2|
+-------------------+----+-------+--------+
|2023-07-31 13:39:50|1.28|  38.53|      60|
+-------------------+----+-------+--------+

-------------------------------------------
B

# Join the two data streams into one stream

In [16]:
# Left-outer stream-stream join of the two watermarked streams on event time.
# NOTE(review): the equality `timestamp = timestamp2` already implies both range
# predicates that follow it, so they look redundant - confirm whether they are
# kept deliberately as the join's time-range constraint.
df_join_water = df_vm1_water.join(df_vm2_water,expr("""
    timestamp = timestamp2 AND
    timestamp2 >= timestamp AND
    timestamp2 <= timestamp + interval 1 hour
    """),"leftOuter")

-------------------------------------------
Batch: 32
-------------------------------------------
-------------------------------------------
Batch: 5
-------------------------------------------
+-------------------+----+-------+--------+
|          timestamp|cpu1|memory1|storage1|
+-------------------+----+-------+--------+
|2023-07-31 13:40:06|1.13|  38.55|      60|
+-------------------+----+-------+--------+

+-------------------+----+-------+--------+
|         timestamp2|cpu2|memory2|storage2|
+-------------------+----+-------+--------+
|2023-07-31 13:40:06|1.13|  38.55|      60|
+-------------------+----+-------+--------+



In [None]:
# Debug sink: print each micro-batch of the watermarked join to the console.
df_join_water.writeStream.format('console').start()

In [17]:
# Join the two parsed streams on the shared string `timestamp` column (default
# join type).
# NOTE(review): this uses df_vm1/df_vm2, which carry no watermark, rather than
# the *_water frames - join state presumably cannot be aged out; confirm intended.
df_join = df_vm1.join(df_vm2, 'timestamp' )

[Stage 39:>                 (0 + 1) / 1][Stage 40:>                 (0 + 1) / 1]                                                                                

-------------------------------------------
Batch: 33
-------------------------------------------
-------------------------------------------
Batch: 6
-------------------------------------------
+-------------------+----+-------+--------+
|         timestamp2|cpu2|memory2|storage2|
+-------------------+----+-------+--------+
|2023-07-31 13:40:11|1.04|  38.58|      60|
+-------------------+----+-------+--------+

+-------------------+----+-------+--------+
|          timestamp|cpu1|memory1|storage1|
+-------------------+----+-------+--------+
|2023-07-31 13:40:11|1.04|  38.58|      60|
+-------------------+----+-------+--------+



In [18]:
# Debug sink: print each micro-batch of the joined stream to the console.
df_join.writeStream.format('console').start()

23/07/31 18:40:16 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-f4390a75-52b8-4183-ada8-365b72caa84e. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
23/07/31 18:40:16 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


<pyspark.sql.streaming.StreamingQuery at 0x7fbcdcdd4610>

[Stage 41:>                 (0 + 1) / 1][Stage 42:>                 (0 + 1) / 1]                                                                                

-------------------------------------------
Batch: 7
-------------------------------------------
-------------------------------------------
Batch: 34
-------------------------------------------
+-------------------+----+-------+--------+
|         timestamp2|cpu2|memory2|storage2|
+-------------------+----+-------+--------+
|2023-07-31 13:40:16|0.95|  38.58|      60|
+-------------------+----+-------+--------+

+-------------------+----+-------+--------+
|          timestamp|cpu1|memory1|storage1|
+-------------------+----+-------+--------+
|2023-07-31 13:40:16|0.95|  38.58|      60|
+-------------------+----+-------+--------+



[Stage 43:>                 (0 + 1) / 1][Stage 44:>                 (0 + 1) / 1]                                                                                

-------------------------------------------
Batch: 35
-------------------------------------------
-------------------------------------------
Batch: 8
-------------------------------------------
+-------------------+----+-------+--------+
|          timestamp|cpu1|memory1|storage1|
+-------------------+----+-------+--------+
|2023-07-31 13:40:21|0.88|  38.61|      60|
+-------------------+----+-------+--------+

+-------------------+----+-------+--------+
|         timestamp2|cpu2|memory2|storage2|
+-------------------+----+-------+--------+
|2023-07-31 13:40:21|0.88|  38.61|      60|
+-------------------+----+-------+--------+



[Stage 47:>                                                       (0 + 4) / 200]

# Publish the joined stream data to topic 3 ('output-join-stat') on the Kafka broker

In [19]:
# Bundle the columns to publish (the storage columns are left out) into a single
# struct column expression.
nested_struct = struct(
    *[df_join[field] for field in ('timestamp', 'cpu1', 'memory1', 'cpu2', 'memory2')]
)

[Stage 47:>(50 + 4) / 200][Stage 48:>   (0 + 0) / 1][Stage 49:>   (0 + 0) / 1]0]

In [20]:
# Add a `value` column holding the struct serialised as a JSON string; this is
# the column later written to Kafka.
df_out = df_join.withColumn('value', to_json(nested_struct))

                                                                                

-------------------------------------------
Batch: 9
-------------------------------------------
+-------------------+----+-------+--------+
|         timestamp2|cpu2|memory2|storage2|
+-------------------+----+-------+--------+
|2023-07-31 13:40:27|1.53|  39.23|      60|
+-------------------+----+-------+--------+

-------------------------------------------
Batch: 36
-------------------------------------------
+-------------------+----+-------+--------+
|          timestamp|cpu1|memory1|storage1|
+-------------------+----+-------+--------+
|2023-07-31 13:40:27|1.53|  39.23|      60|
+-------------------+----+-------+--------+



In [21]:
# Publish the JSON `value` column of the joined stream to the output Kafka topic.
# A checkpoint location is configured explicitly for this query.
(
    df_out
    .selectExpr("CAST(value AS STRING)")
    .writeStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:29092")
    .option("checkpointLocation", "./spark-warehouse/join-stream-kafka/checkpoint")
    .option("topic", "output-join-stat")
    .start()
)

-------------------------------------------
Batch: 37
-------------------------------------------
-------------------------------------------
Batch: 10
-------------------------------------------
+-------------------+----+-------+--------+
|          timestamp|cpu1|memory1|storage1|
+-------------------+----+-------+--------+
|2023-07-31 13:40:32|2.61|  39.62|      60|
+-------------------+----+-------+--------+

+-------------------+----+-------+--------+
|         timestamp2|cpu2|memory2|storage2|
+-------------------+----+-------+--------+
|2023-07-31 13:40:32|2.61|  39.62|      60|
+-------------------+----+-------+--------+



[Stage 47:(102 + 2) / 200][Stage 50:>   (0 + 1) / 1][Stage 51:>   (0 + 1) / 1]                                                                                23/07/31 18:40:33 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


<pyspark.sql.streaming.StreamingQuery at 0x7fbcec444460>

                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------


[Stage 54:>(18 + 4) / 200][Stage 55:>   (0 + 0) / 1][Stage 56:>   (0 + 0) / 1]

+--------------------+----+-------+--------+----+-------+--------+
|           timestamp|cpu1|memory1|storage1|cpu2|memory2|storage2|
+--------------------+----+-------+--------+----+-------+--------+
|31-07-23 01:18:55 PM|0.40|  38.85|      59|0.40|  38.85|      59|
|31-07-23 01:36:31 PM|5.86|  33.55|      60|5.86|  33.55|      60|
|31-07-23 01:28:43 PM|7.75|  40.87|      60|7.75|  40.87|      60|
|31-07-23 01:19:05 PM|0.50|  38.89|      59|0.50|  38.89|      59|
|31-07-23 01:21:05 PM|2.25|  39.90|      59|2.25|  39.90|      59|
|31-07-23 01:22:13 PM|5.17|  40.32|      59|5.17|  40.32|      59|
|31-07-23 01:23:10 PM|5.73|  40.37|      59|5.73|  40.37|      59|
|31-07-23 01:34:27 PM|7.40|  41.17|      60|7.40|  41.17|      60|
|31-07-23 01:15:57 PM|0.71|  36.53|      59|0.71|  36.53|      59|
|31-07-23 01:34:21 PM|7.70|  41.16|      60|7.70|  41.16|      60|
|31-07-23 01:19:00 PM|0.36|  38.86|      59|0.36|  38.86|      59|
|31-07-23 01:32:32 PM|7.91|  41.03|      60|7.91|  41.03|     

                                                                                

-------------------------------------------
Batch: 38
-------------------------------------------
+-------------------+----+-------+--------+
|          timestamp|cpu1|memory1|storage1|
+-------------------+----+-------+--------+
|2023-07-31 13:40:37|3.04|  39.80|      60|
+-------------------+----+-------+--------+

-------------------------------------------
Batch: 11
-------------------------------------------
+-------------------+----+-------+--------+
|         timestamp2|cpu2|memory2|storage2|
+-------------------+----+-------+--------+
|2023-07-31 13:40:37|3.04|  39.80|      60|
+-------------------+----+-------+--------+

-------------------------------------------
Batch: 39
-------------------------------------------
+-------------------+----+-------+--------+
|          timestamp|cpu1|memory1|storage1|
+-------------------+----+-------+--------+
|2023-07-31 13:40:42|3.43|  40.06|      60|
|2023-07-31 13:40:47|3.64|  40.15|      60|
+-------------------+----+-------+--------+


                                                                                

-------------------------------------------
Batch: 13
-------------------------------------------
-------------------------------------------
Batch: 40
-------------------------------------------
+-------------------+----+-------+--------+
|          timestamp|cpu1|memory1|storage1|
+-------------------+----+-------+--------+
|2023-07-31 13:40:53|4.07|  40.26|      60|
+-------------------+----+-------+--------+

+-------------------+----+-------+--------+
|         timestamp2|cpu2|memory2|storage2|
+-------------------+----+-------+--------+
|2023-07-31 13:40:53|4.07|  40.26|      60|
+-------------------+----+-------+--------+



                                                                                

-------------------------------------------
Batch: 14
-------------------------------------------
+-------------------+----+-------+--------+
|         timestamp2|cpu2|memory2|storage2|
+-------------------+----+-------+--------+
|2023-07-31 13:40:58|4.95|  40.43|      60|
+-------------------+----+-------+--------+

-------------------------------------------
Batch: 41
-------------------------------------------
+-------------------+----+-------+--------+
|          timestamp|cpu1|memory1|storage1|
+-------------------+----+-------+--------+
|2023-07-31 13:40:58|4.95|  40.43|      60|
+-------------------+----+-------+--------+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+--------------------+----+-------+--------+----+-------+--------+
|           timestamp|cpu1|memory1|storage1|cpu2|memory2|storage2|
+--------------------+----+-------+--------+----+-------+--------+
|31-07-23 01:40:32 PM|2.61|  39.62|      60|2.61|  39.62|      60|
|31-07-23 01:40:37 PM|3.04|  39.80|      60|3.04|  39.80|      60|
|31-07-23 01:40:27 PM|1.53|  39.23|      60|1.53|  39.23|      60|
|31-07-23 01:40:21 PM|0.88|  38.61|      60|0.88|  38.61|      60|
+--------------------+----+-------+--------+----+-------+--------+



[Stage 66:>(96 + 4) / 200][Stage 69:>   (0 + 0) / 1][Stage 70:>   (0 + 0) / 1]0]

# Create a consumer to read the joined DataFrame from topic 3 and make predictions using the latest stream data
![Kafka predictions pipeline](./images/kafka-predictions1.png)


In [None]:
# import pyspark
# import pyspark.pandas as ps
# import pandas as pd

# #convert spark dataframe to pandas for more visualization
# n_vm = 2
# df_dict={}
# df_dict['vm1'] =  df.toPandas()
# df_dict['vm2'] = df2.toPandas() 

In [None]:
# # rename columns of two dataframe since now they have the same column names
# for i in range(0,n_vm):
#     df_dict['vm'+str(i+1)] = df_dict['vm'+str(i+1)].rename(columns={"cpu": "cpu_vm"+str(i+1), "memory": "memory_vm"+str(i+1),"storage": "storage_vm"+str(i+1)})
#     df_dict['vm'+str(i+1)]['timestamp'] = pd.to_datetime(df_dict['vm'+str(i+1)]['timestamp'],format='%d-%m-%y %I:%M:%S %p').dt.strftime('%Y-%m-%d %H:%M:%S')
#     df_dict['vm'+str(i+1)]['timestamp']= pd.to_datetime(df_dict['vm'+str(i+1)]['timestamp'])
#     df_dict['vm'+str(i+1)].set_index('timestamp',inplace=True)

In [None]:
# join two time series using time stamp index union and sort the index of combined data frame according to time stamp
# combined_df = df_dict['vm1'].join(df_dict['vm2'],how='outer')

In [None]:
# combined_df = combined_df.sort_index()

In [None]:
# combined_df = combined_df.apply(pd.to_numeric, errors='ignore')
# filled_df = combined_df.interpolate(method='ffill').interpolate(method='bfill')

In [None]:
# cols=[]
# for i in range(n_vm):
#     cols.append('storage_vm'+str(i+1))
# clean_df = filled_df.drop(columns=cols)
# clean_df.head()

In [None]:
# print('total number of missing values in clean dataframe:',clean_df.isna().sum())
# minute_df = clean_df.resample('1T').mean()
# nan_count = minute_df.isna().sum()
# print('total number of missing values in reampled dataframe:',nan_count)
# minute_df = minute_df.fillna(method='ffill')
# nan_count = minute_df.isna().sum()
# print('total number of missing values in filled reampled dataframe:',nan_count)

In [None]:
# test_df = minute_df[-40:]

# Make prediction
- The registered model is already deployed, and the served model can be reached at 'http://mlflowserve:5000/invocations'.
- We build a REST API call with the Python `requests` package, sending the input X and retrieving the predicted y as follows

In this example:
- X must be an array of shape (n, input_steps, features); with 2 VMs the number of features is 4
- The request body must be serialized to JSON with `json.dumps`, with the input data under the 'inputs' field

In [None]:
# import numpy as np
# test_df_np = np.array(test_df)
# test_input_np = np.expand_dims(test_df_np[0:30],axis=0)
# print(test_input_np.shape)
# test_input_list = test_input_np.tolist()
# test_label_np = np.expand_dims(test_df_np[30:,[0,2]],axis=0)
# print('test label shape:',test_label_np.shape)

In [None]:
# import json
# import requests

# url = 'http://mlflowserve:5000/invocations'

# headers = {'Content-Type': 'application/json'}
# request_data = json.dumps({"inputs": test_input_list})
# response = requests.post(url,request_data, headers=headers)

In [None]:
# json_response = json.loads(response.content)
# json_response['predictions']

In [None]:
# import matplotlib.pyplot as plt
# max_subplots = 2
# plot_col = 'cpu'
# max_n = max_subplots
# shift = 10
# predictions = np.array(json_response['predictions'])
# print(predictions.shape)
# label_indices = np.arange(predictions.shape[1])
# for n in range(max_n):
#     plt.subplot(max_n, 1, n+1)
#     plt.ylabel(f'{plot_col}')
#     plt.plot(label_indices, test_label_np[0, :, n],
#                 marker='^', label='Labels vm'+str(n+1))
#     plt.plot(label_indices,  predictions[0, :, n],
#                 label='prediction vm'+str(n+1), marker='x')
#     plt.legend()