# Init

Create a Spark session by connecting to the Spark master.

In [6]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from os.path import abspath

# SparkSession configuration
URL_SPARK = "spark://spark-master:7077"
# Resolve the warehouse directory to an absolute path (relative to the
# notebook's current working directory) so managed tables have a fixed location.
warehouse_location = abspath('spark-warehouse')

spark = (
    SparkSession.builder
    .appName("spark-ml")
    .master(URL_SPARK)
    # Spark properties need the "spark." prefix. The previous bare key
    # "executor.memory" is not a recognised setting, so the 8g request was
    # never applied and executors ran with Spark's default memory.
    # NOTE(review): each worker must be able to offer 8g, otherwise no
    # executor can be scheduled -- confirm against the cluster's worker size.
    .config("spark.executor.memory", "8g")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .getOrCreate()
)

# Raw data 

Load the monitoring log at "./raw_mon_data/mon-vm.log". Each line of the file records:
- Timestamp
- Memory
- CPU
- Storage

The file is read into a DataFrame.

In [7]:
# Read the monitoring log as plain text: every line of the file becomes one
# row in a single string column named `value`.
path = "./raw_mon_data/mon-vm.log"
df = spark.read.format("text").load(path)
df.printSchema()

root
 |-- value: string (nullable = true)



In [8]:
# Split each raw log line into one column per metric.
#
# The functions are reached through the `F` alias imported in the first cell
# rather than `from pyspark.sql.functions import *`: the wildcard import
# silently shadows Python builtins (sum, min, max, round, abs, ...) for the
# rest of the notebook.
#
# regexp_extract returns capture group 1 of each pattern, or an empty string
# when the pattern does not match, so every extracted column is a string.
df1 = (
    df
    .withColumn('timestamp', F.regexp_extract('value', r'timestamp:\s(.*),\sused_memory', 1))
    .withColumn('cpu', F.regexp_extract('value', r'used_cpu:\s(.*)\%', 1))
    .withColumn('memory', F.regexp_extract('value', r'used_memory:\s(.*)\%\s+,\sused_storage', 1))
    .withColumn('storage', F.regexp_extract('value', r'used_storage:\s(.*)\%\s+,\sused_cpu', 1))
)

# The raw line is redundant once its fields have been extracted.
df2 = df1.drop('value')
df2.printSchema()
df2.count()

root
 |-- timestamp: string (nullable = true)
 |-- cpu: string (nullable = true)
 |-- memory: string (nullable = true)
 |-- storage: string (nullable = true)



                                                                                

1381

[Stage 3:>                                                          (0 + 1) / 1]

In [14]:
# Create the target database; IF NOT EXISTS keeps the cell safe to re-run.
spark.sql("CREATE DATABASE IF NOT EXISTS mondb1")

DataFrame[]

In [15]:
# Persist the parsed metrics as table mondb1.silver_mon; 'overwrite' mode
# replaces whatever the table held from a previous run.
df2.write.mode('overwrite').saveAsTable("mondb1.silver_mon")

[Stage 3:>                                                          (0 + 1) / 1]

In [16]:
# Read the saved table back through Spark SQL to confirm the write.
my_df = spark.sql('SELECT * FROM mondb1.silver_mon')

In [17]:
# Preview the stored rows (show() prints the first 20 rows by default).
my_df.show()

+--------------------+----+------+-------+
|           timestamp| cpu|memory|storage|
+--------------------+----+------+-------+
|06-06-23 07:52:17 PM|0.12| 36.06|     25|
|06-06-23 07:52:22 PM|0.11| 35.99|     25|
|06-06-23 07:52:27 PM|0.10| 35.99|     25|
|06-06-23 07:52:40 PM|0.09| 35.98|     25|
|06-06-23 07:52:45 PM|0.07| 35.98|     25|
|06-06-23 07:52:50 PM|0.07| 35.97|     25|
|06-06-23 07:52:55 PM|0.06| 36.03|     25|
|06-06-23 07:53:00 PM|0.06| 35.99|     25|
|06-06-23 07:53:06 PM|0.05| 35.99|     25|
|06-06-23 07:53:11 PM|0.05| 35.99|     25|
|06-06-23 07:53:16 PM|0.04| 35.99|     25|
|06-06-23 07:53:21 PM|0.04| 35.99|     25|
|06-06-23 07:53:26 PM|0.04| 35.99|     25|
|06-06-23 07:53:32 PM|0.03| 35.99|     25|
|06-06-23 07:53:37 PM|0.03| 35.99|     25|
|06-06-23 07:53:42 PM|0.03| 35.99|     25|
|06-06-23 07:53:47 PM|0.02| 35.99|     25|
|06-06-23 07:53:52 PM|0.02| 35.99|     25|
|06-06-23 07:53:58 PM|0.02| 35.99|     25|
|06-06-23 07:54:03 PM|0.02| 35.99|     25|
+----------

[Stage 3:>                                                          (0 + 1) / 1]

In [19]:
# Stop the SparkSession so the application releases its executors on the
# standalone master. This replaces exit(), which shuts the notebook kernel
# down without explicitly stopping Spark and makes automated runs
# (Run All / nbconvert) end with a dead kernel.
spark.stop()