# Init Spark session

In [54]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from os.path import abspath
import os

# SparkSession
URL_SPARK = "spark://spark-master:7077"
warehouse_location = './spark-warehouse'

spark = (
    SparkSession.builder
    .appName("spark-ml-multiVM")
    .config("executor.memory", "8g")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("spark.jars", "jars/spark-sql-kafka-0-10_2.12-3.2.1.jar,jars/kafka-clients-2.1.1.jar,jars/spark-streaming-kafka-0-10-assembly_2.12-3.2.1.jar,jars/commons-pool2-2.11.1.jar")
    .master(URL_SPARK)
    .getOrCreate()
)

# Stream 1: Stream raw data of vm1 from kafka 
- Here we read the stream from kafka topic vm-stat-stream (acumos server) into stringDF and right away write this stream to a parquet format in the bronze domain at location ./spark-warehouse/stream_bronze_vm1
- Then we transform the raw stream into cleaner version df_vm1 and write this stream to a parquet format in the silver domain at location  ./spark-warehouse/stream_silver_vm1

In [55]:
df1 = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:29092") \
    .option("subscribe", "vm-stat-stream") \
    .option("startingOffsets", "earliest") \
    .load()

In [56]:
df1.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [57]:
stringDF = df1.selectExpr("CAST(value AS STRING)")

In [58]:
# Syntax to write stream to parquet

# stringDF.writeStream.format("parquet") \
#    .outputMode("append") \
#    .option("checkpointLocation", "./spark-warehouse/stream_bronze_vm1/checkpoint") \
#    .option("path", "/usr/bin/spark-3.2.4-bin-hadoop2.7/spark-warehouse/1/stream_bronze_vm1/data") \
#    .start()

In [59]:
from pyspark.sql.functions import *

df_vm1 = stringDF.withColumn('timestamp', regexp_extract('value', r'timestamp:\s(.*),\sused_memory', 1)) \
        .withColumn('cpu1', regexp_extract('value', r'used_cpu:\s(.*)\%', 1)) \
        .withColumn('memory1', regexp_extract('value', r'used_memory:\s(.*)\%,\sused_storage', 1)) \
        .withColumn('storage1', regexp_extract('value', r'used_storage:\s(.*)\%,\sused_cpu', 1))

df_vm1 = df_vm1.drop('value')
df_vm1.printSchema()

root
 |-- timestamp: string (nullable = true)
 |-- cpu1: string (nullable = true)
 |-- memory1: string (nullable = true)
 |-- storage1: string (nullable = true)



In [60]:
# df_vm1.writeStream.format("parquet") \
#    .outputMode("append") \
#    .option("checkpointLocation", "./spark-warehouse/stream_silver_vm1/checkpoint") \
#    .option("path", "/usr/bin/spark-3.2.4-bin-hadoop2.7/spark-warehouse/1/stream_silver_vm1/data") \
#    .start()

# Stream 2: Stream raw data of vm1 from kafka 
- Here we read the stream from kafka topic vm-stat-stream-2 (acumos server) into stringDF2 and right away write this stream to a parquet format in the bronze domain at location ./spark-warehouse/stream_bronze_vm2
- Then we transform the raw stream into cleaner version df_vm2 and write this stream to a parquet format in the silver domain at location  ./spark-warehouse/stream_silver_vm2

In [61]:
df2 = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:29092") \
    .option("subscribe", "vm-stat-stream-2") \
    .option("startingOffsets", "earliest") \
    .load()

In [62]:
stringDF2 = df2.selectExpr("CAST(value AS STRING)")

In [63]:
# Syntax to write stream to parquet

# stringDF2.writeStream.format("parquet") \
#    .outputMode("append") \
#    .option("checkpointLocation", "./spark-warehouse/stream_bronze_vm2/checkpoint") \
#    .option("path", "/usr/bin/spark-3.2.4-bin-hadoop2.7/spark-warehouse/1/stream_bronze_vm2/data") \
#    .start()

In [64]:
df_vm2 = stringDF2.withColumn('timestamp', regexp_extract('value', r'timestamp:\s(.*),\sused_memory', 1)) \
        .withColumn('cpu2', regexp_extract('value', r'used_cpu:\s(.*)\%', 1)) \
        .withColumn('memory2', regexp_extract('value', r'used_memory:\s(.*)\%,\sused_storage', 1)) \
        .withColumn('storage2', regexp_extract('value', r'used_storage:\s(.*)\%,\sused_cpu', 1))

df_vm2 = df_vm2.drop('value')

In [65]:
# df_vm2.writeStream.format("parquet") \
#    .outputMode("append") \
#    .option("checkpointLocation", "./spark-warehouse/stream_silver_vm2/checkpoint") \
#    .option("path", "/usr/bin/spark-3.2.4-bin-hadoop2.7/spark-warehouse/1/stream_silver_vm2/data") \
#    .start()

In [66]:
df_join = df_vm1.join(df_vm2, 'timestamp')

In [67]:
df_join.writeStream.format("parquet") \
   .outputMode("append") \
   .option("checkpointLocation", "./spark-warehouse/join_stream/checkpoint") \
   .option("path", "/usr/bin/spark-3.2.4-bin-hadoop2.7/spark-warehouse/3/join_stream/data") \
   .start()

df_join.printSchema()

root
 |-- timestamp: string (nullable = true)
 |-- cpu1: string (nullable = true)
 |-- memory1: string (nullable = true)
 |-- storage1: string (nullable = true)
 |-- cpu2: string (nullable = true)
 |-- memory2: string (nullable = true)
 |-- storage2: string (nullable = true)



23/06/29 04:45:00 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


In [68]:
# from pyspark.sql.functions import *
# path2 = './spark-warehouse/mondb.db/bronze_vmlog_maas_controller'
# df2 = spark.read.format('parquet').options(header=True,inferSchema=True).load(path)

In [69]:
from pyspark.sql.types import *

silverSchema = StructType([ \
    StructField("timestamp",StringType(),True), \
    StructField("cpu1",StringType(),True), \
    StructField("memory1",StringType(),True), \
    StructField("storage1", StringType(), True), \
    StructField("cpu2",StringType(),True), \
    StructField("memory2",StringType(),True), \
    StructField("storage2", StringType(), True), \
  ])

In [70]:
df3 = spark.readStream.schema(silverSchema).load("/usr/bin/spark-3.2.4-bin-hadoop2.7/spark-warehouse/3/join_stream/data/")

In [71]:
# df3.writeStream.format('console').start()

In [72]:
df3.selectExpr("CAST(timestamp AS STRING)", "CAST(cpu1 AS STRING)", "CAST(memory1 AS STRING)", "CAST(storage1 AS STRING)", "CAST(cpu2 AS STRING)", "CAST(memory2 AS STRING)", "CAST(storage2 AS STRING)") \
  .writeStream \
  .format("kafka") \
  .option("kafka.bootstrap.servers", "kafka:29092") \
  .option("topic", "output-join-") \
  .start()

23/06/29 04:45:00 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


<pyspark.sql.streaming.StreamingQuery at 0x7fa7fce5d9d0>

In [73]:
# df_join.show()

# Join data of multiVM, fill missing values and clean the dataframe (drop storage features)

In [74]:
# import pyspark
# import pyspark.pandas as ps
# import pandas as pd

# #convert spark dataframe to pandas for more visualization
# n_vm = 2
# df_dict={}
# df_dict['vm1'] =  df.toPandas()
# df_dict['vm2'] = df2.toPandas() 

In [75]:
# # rename columns of two dataframe since now they have the same column names
# for i in range(0,n_vm):
#     df_dict['vm'+str(i+1)] = df_dict['vm'+str(i+1)].rename(columns={"cpu": "cpu_vm"+str(i+1), "memory": "memory_vm"+str(i+1),"storage": "storage_vm"+str(i+1)})
#     df_dict['vm'+str(i+1)]['timestamp'] = pd.to_datetime(df_dict['vm'+str(i+1)]['timestamp'],format='%d-%m-%y %I:%M:%S %p').dt.strftime('%Y-%m-%d %H:%M:%S')
#     df_dict['vm'+str(i+1)]['timestamp']= pd.to_datetime(df_dict['vm'+str(i+1)]['timestamp'])
#     df_dict['vm'+str(i+1)].set_index('timestamp',inplace=True)

In [76]:
# join two time series using time stamp index union and sort the index of combined data frame according to time stamp
# combined_df = df_dict['vm1'].join(df_dict['vm2'],how='outer')

23/06/29 04:45:00 ERROR MicroBatchExecution: Query [id = b51fc1b4-4973-47ef-b829-64413c807941, runId = b47e4446-ffd5-4dbf-bcb3-d2a30b8ed535] terminated with error
org.apache.spark.sql.AnalysisException: Required attribute 'value' not found
	at org.apache.spark.sql.kafka010.KafkaWriter$.validateQuery(KafkaWriter.scala:59)
	at org.apache.spark.sql.kafka010.KafkaStreamingWrite.<init>(KafkaStreamingWrite.scala:42)
	at org.apache.spark.sql.kafka010.KafkaWrite.toStreaming(KafkaWrite.scala:39)
	at org.apache.spark.sql.execution.streaming.StreamExecution.createStreamingWrite(StreamExecution.scala:607)
	at org.apache.spark.sql.execution.streaming.MicroBatchExecution.logicalPlan$lzycompute(MicroBatchExecution.scala:140)
	at org.apache.spark.sql.execution.streaming.MicroBatchExecution.logicalPlan(MicroBatchExecution.scala:59)
	at org.apache.spark.sql.execution.streaming.StreamExecution.$anonfun$runStream$1(StreamExecution.scala:295)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$s

In [77]:
# combined_df = combined_df.sort_index()

In [78]:
# combined_df = combined_df.apply(pd.to_numeric, errors='ignore')
# filled_df = combined_df.interpolate(method='ffill').interpolate(method='bfill')

In [79]:
# cols=[]
# for i in range(n_vm):
#     cols.append('storage_vm'+str(i+1))
# clean_df = filled_df.drop(columns=cols)
# clean_df.head()

# Resample and aggregrate the data by time index

In [80]:
# print('total number of missing values in clean dataframe:',clean_df.isna().sum())
# minute_df = clean_df.resample('1T').mean()
# nan_count = minute_df.isna().sum()
# print('total number of missing values in reampled dataframe:',nan_count)
# minute_df = minute_df.fillna(method='ffill')
# nan_count = minute_df.isna().sum()
# print('total number of missing values in filled reampled dataframe:',nan_count)

In [81]:
# test_df = minute_df[-40:]

# Make prediction
- Registered model is ready deployed and the url to access the serve model is 'http://mlflowserve:5000/invocations'.
- We construct a REST API call by using package requests of python to send the input X to retrieve the predicted y as follow

In this example:
- X must be an array which contains (n,input_steps,features) where number of features for the case of 2 VMs are 4
- body data must be converted to json using json dumps with the fields 'inputs'

In [82]:
# import numpy as np
# test_df_np = np.array(test_df)
# test_input_np = np.expand_dims(test_df_np[0:30],axis=0)
# print(test_input_np.shape)
# test_input_list = test_input_np.tolist()
# test_label_np = np.expand_dims(test_df_np[30:,[0,2]],axis=0)
# print('test label shape:',test_label_np.shape)

In [83]:
# import json
# import requests

# url = 'http://mlflowserve:5000/invocations'

# headers = {'Content-Type': 'application/json'}
# request_data = json.dumps({"inputs": test_input_list})
# response = requests.post(url,request_data, headers=headers)

In [84]:
# json_response = json.loads(response.content)
# json_response['predictions']

In [85]:
# import matplotlib.pyplot as plt
# max_subplots = 2
# plot_col = 'cpu'
# max_n = max_subplots
# shift = 10
# predictions = np.array(json_response['predictions'])
# print(predictions.shape)
# label_indices = np.arange(predictions.shape[1])
# for n in range(max_n):
#     plt.subplot(max_n, 1, n+1)
#     plt.ylabel(f'{plot_col}')
#     plt.plot(label_indices, test_label_np[0, :, n],
#                 marker='^', label='Labels vm'+str(n+1))
#     plt.plot(label_indices,  predictions[0, :, n],
#                 label='prediction vm'+str(n+1), marker='x')
#     plt.legend()

23/06/29 04:45:02 WARN TaskSetManager: Lost task 1.0 in stage 11.0 (TID 216) (172.24.0.11 executor 0): java.lang.IllegalStateException: Error reading delta file file:/opt/workspace/ML_workload_prediction_multiVM/spark-warehouse/join_stream/checkpoint/state/0/1/left-keyToNumValues/1.delta of HDFSStateStoreProvider[id = (op=0,part=1),dir = file:/opt/workspace/ML_workload_prediction_multiVM/spark-warehouse/join_stream/checkpoint/state/0/1/left-keyToNumValues]: file:/opt/workspace/ML_workload_prediction_multiVM/spark-warehouse/join_stream/checkpoint/state/0/1/left-keyToNumValues/1.delta does not exist
	at org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider.updateFromDeltaFile(HDFSBackedStateStoreProvider.scala:461)
	at org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider.$anonfun$loadMap$4(HDFSBackedStateStoreProvider.scala:417)
	at scala.runtime.java8.JFunction1$mcVJ$sp.apply(JFunction1$mcVJ$sp.java:23)
	at scala.collection.immutable.NumericR