In [1]:
import os
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from datetime import datetime
from operator import add
from operator import sub
from pyspark.sql.functions import *

In [2]:
# Check the installed PySpark version: it must match the Spark version of the
# Kafka connector package requested via --packages (2.4.5 in this notebook).
import pyspark
pyspark.__version__

'2.4.5'

In [3]:
# Kafka connector pulled in through --packages. Its Scala (2.11) and Spark
# (2.4.5) versions must match the installed PySpark.
# Alternative connector for the legacy DStream API (pyspark.streaming.kafka):
#packages = "org.apache.spark:spark-streaming-kafka-0-8_2.11:2.4.5"
packages = "org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5"

# Must be set before the SparkSession is created: PySpark reads this variable
# when it launches the JVM, so changing it afterwards has no effect.
os.environ["PYSPARK_SUBMIT_ARGS"] = f"--packages {packages} pyspark-shell"

# Uncomment and adapt the line below only if JAVA_HOME is not already set
# or several JVM versions are installed on the system.
# os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

# OPTIONAL: Check setup of environment variables
print("PYSPARK_SUBMIT_ARGS = ",os.environ["PYSPARK_SUBMIT_ARGS"],"\n")
# JAVA_HOME may legitimately be unset; report that instead of raising KeyError.
print("JAVA_HOME = ", os.environ.get("JAVA_HOME", "<not set>"))

PYSPARK_SUBMIT_ARGS =  --packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5 pyspark-shell 

JAVA_HOME =  /Library/Java/JavaVirtualMachines/jdk1.8.0_31.jdk/Contents/Home


In [4]:
# Create the notebook's Spark session, or reuse the one already running in
# this kernel, under the application name "StructuredStreamingUCI".
spark = SparkSession.builder.appName("StructuredStreamingUCI").getOrCreate()

# Bare last expression so the notebook renders the session summary.
spark

In [5]:
# Subscribe to the Kafka topic "test" on the local broker as a streaming
# DataFrame. The Kafka source always yields the fixed key/value/metadata
# schema; the CSV payload inside `value` is parsed in a later cell, so no
# CSV reader options (such as "sep") belong here.
df = (spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("subscribe", "test")
      .load())

In [6]:
# Show the schema of the raw Kafka source frame (binary key/value plus
# topic, partition, offset and timestamp metadata).
df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [7]:
# Kafka delivers the payload as binary. Decode it into a new frame instead of
# overwriting `df`, so `df` keeps the raw Kafka schema and this cell can be
# re-run safely.
df_value = df.selectExpr('CAST(value AS STRING)')

# Each message is one comma-separated record: build the split expression once
# and pick the fields by position.
fields = split(df_value["value"], ',')

# NOTE(review): Light is cast to integer; if the producer sends fractional
# readings they will not be preserved — confirm against the data.
df_data = df_value.select(
    fields[0].cast(StringType()).alias("row"),
    fields[1].cast(TimestampType()).alias("date"),
    fields[2].cast(DoubleType()).alias("Temperature"),
    fields[3].cast(DoubleType()).alias("Humidity"),
    fields[4].cast(IntegerType()).alias("Light"),
    fields[5].cast(DoubleType()).alias("CO2"),
    fields[6].cast(DoubleType()).alias("HumidityRatio"),
    fields[7].cast(IntegerType()).alias("Occupancy"))

In [8]:
# Show the typed schema of the parsed sensor records.
df_data.printSchema()

root
 |-- row: string (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- Temperature: double (nullable = true)
 |-- Humidity: double (nullable = true)
 |-- Light: integer (nullable = true)
 |-- CO2: double (nullable = true)
 |-- HumidityRatio: double (nullable = true)
 |-- Occupancy: integer (nullable = true)



Ejercicio 1: Calcular el promedio de valores de Temperatura, humedad relativa y concentración de CO2 para cada micro-batch y el promedio de dichos valores desde el arranque

In [33]:

def show_microbatch_averages(batch_df, batch_id):
    """Print the average Temperature, Humidity and CO2 of one micro-batch.

    Used as a ``foreachBatch`` callback: ``batch_df`` is a static DataFrame
    holding only the rows that arrived in micro-batch ``batch_id``, so the
    aggregation below covers that batch alone.
    """
    print(f"Batch: {batch_id}")
    batch_df.agg(
        avg(col("Temperature")).alias('MB-AVG Temperature'),
        avg(col("Humidity")).alias('MB-AVG Humidity'),
        avg(col("CO2")).alias('MB-AVG CO2')
    ).show(truncate=False)


# A streaming agg() without grouping is a running aggregate over the whole
# stream (whatever the output mode), i.e. the average since start-up.
# To get one average per micro-batch, aggregate each batch separately
# through foreachBatch (available since Spark 2.4).
result_1_1 = (df_data
              .writeStream
              .foreachBatch(show_microbatch_averages)
              .trigger(processingTime='5 seconds')
              .start())


In [35]:
# Running averages since the query started: an ungrouped streaming
# aggregation keeps its state across micro-batches, and "complete" mode
# re-emits the full (single-row) result table on every trigger.
running_averages = df_data.agg(
    avg(col("Temperature")).alias('AVG Temperature'),
    avg(col("Humidity")).alias('AVG Humidity'),
    avg(col("CO2")).alias('AVG CO2'),
)

result_1_2 = (
    running_averages
    .writeStream
    .format('console')
    .trigger(processingTime='5 seconds')
    .outputMode("complete")
    .start()
)

Ejercicio 2: Calcular el promedio de luminosidad en la estancia en ventanas deslizantes de tamaño 45
segundos, con un valor de deslizamiento de 15 segundos entre ventanas consecutivas.

In [15]:
# Average Light per sliding event-time window on the `date` column:
# windows are 45 seconds long and a new one starts every 15 seconds.
# NOTE(review): no watermark is defined, so state for old windows is kept
# for the lifetime of the query.
light_by_window = (
    df_data
    .groupBy(window(col("date"), "45 seconds", "15 seconds"))
    .avg("Light")
)

# "update" mode prints only the windows whose average changed in the trigger.
result_2 = (
    light_by_window
    .writeStream
    .format('console')
    .trigger(processingTime='5 seconds')
    .outputMode("update")
    .start()
)


In [20]:
# Reference only (not executed here): a tumbling-window sum written in Scala
# val sums = levels.
#  groupBy(window($"time", "5 seconds")).
#  agg(sum("level") as "level_sum").
#  select("window.start", "window.end", "level_sum")

## 5.6 Stop session

In [39]:
# Stop every streaming query that is still running before tearing down the
# session, so that no query is left active on a stopped SparkContext.
for active_query in spark.streams.active:
    active_query.stop()

spark.stop()

In [21]:
# Stop the sliding-window query started for Exercise 2.
# NOTE(review): in file order this cell follows the spark.stop() cell even
# though its execution count (21) is lower than that cell's (39); it is
# meant to run before the session is stopped.
result_2.stop()