In [1]:
import os
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql.types import *
import pyspark.sql.functions as func
from datetime import datetime
from operator import add
from operator import sub
from pyspark.sql.functions import *

In [2]:
# Check pyspark version, it should be "latest"
import pyspark
pyspark.__version__

'2.4.5'

In [3]:
#packages = "org.apache.spark:spark-streaming-kafka-0-8_2.11:2.4.5"
packages = "org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5"
os.environ["PYSPARK_SUBMIT_ARGS"] = (f"--packages {packages} pyspark-shell")
# THIS IS COMPULSORY
# Comment the line below if JAVA_HOME is already set up or you
# only have a single JVM version in your system
# os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

# OPTIONAL: Check setup of environment variables
print("PYSPARK_SUBMIT_ARGS = ",os.environ["PYSPARK_SUBMIT_ARGS"],"\n")
print("JAVA_HOME = ", os.environ["JAVA_HOME"])

PYSPARK_SUBMIT_ARGS =  --packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5 pyspark-shell 

JAVA_HOME =  /Library/Java/JavaVirtualMachines/jdk1.8.0_31.jdk/Contents/Home


In [4]:
spark = (SparkSession
    .builder
    .appName("StructuredStreamingUCI")
    .getOrCreate())

spark

In [5]:
df = (spark
        .readStream
        .format("kafka")
        .option("kafka.bootstrap.servers", "localhost:9092")
        .option("sep", ",")
        .option("subscribe", "test")
        .load())

In [6]:
df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [7]:
df = df.selectExpr('CAST(value AS STRING)')

df_data = df.select(
        split(df.value, ',')[0].alias("row").cast(StringType()),
        split(df.value, ',')[1].alias("date").cast(StringType()),
        split(df.value, ',')[2].alias("Temperature").cast(DoubleType()),
        split(df.value, ',')[3].alias("Humidity").cast(DoubleType()),
        split(df.value, ',')[4].alias("Light").cast(DoubleType()),
        split(df.value, ',')[5].alias("CO2").cast(DoubleType()),
        split(df.value, ',')[6].alias("HumidityRatio").cast(DoubleType()),
        split(df.value, ',')[7].alias("Occupancy").cast(BooleanType()))

In [8]:
df_data = df_data.withColumn("date", regexp_replace(col("date"), '"', ''))
df_data = df_data.withColumn('date',to_timestamp("date", "yyyy-MM-dd HH:mm:ss"))

In [9]:
df_data = df_data.withColumn("row", regexp_replace(col("row"), '"', ''))
df_data = df_data.withColumn("row",df_data["row"].cast(IntegerType()))

In [10]:
df_data.printSchema()

root
 |-- row: integer (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- Temperature: double (nullable = true)
 |-- Humidity: double (nullable = true)
 |-- Light: double (nullable = true)
 |-- CO2: double (nullable = true)
 |-- HumidityRatio: double (nullable = true)
 |-- Occupancy: boolean (nullable = true)



Ejercicio 1: Calcular el promedio de valores de Temperatura, humedad relativa y concentración de CO2 para cada micro-batch y el promedio de dichos valores desde el arranque

In [9]:

result_1_1 = (df_data.agg(
                        avg(col("Temperature")).alias('MB-AVG Temperature'),
                        avg(col("Humidity")).alias('MB-AVG Humidity'),
                        avg(col("CO2")).alias('MB-AVG CO2')
                        )
                     .writeStream
                     .format('console')
                     .trigger(processingTime= '5 seconds')
                     .outputMode("update")
                     .start())
#result_1_1.awaitTermination()


In [35]:
result_1_2 = (df_data.agg(
                        avg(col("Temperature")).alias('AVG Temperature'),
                        avg(col("Humidity")).alias('AVG Humidity'),
                        avg(col("CO2")).alias('AVG CO2')
                        )
                 .writeStream
                 .format('console')
                 .trigger(processingTime= '5 seconds')
                 .outputMode("complete")
                 .start())

Ejercicio 2: Calcular el promedio de luminosidad en la estancia en ventanas deslizantes de tamaño 45
segundos, con un valor de deslizamiento de 15 segundos entre ventanas consecutivas.

In [10]:
result_2 = (df_data.groupBy(window(col("date"), "45 seconds", "15 seconds"))
                   .agg(avg('Light').alias('Light_avg'))
                   .writeStream\
                   .format('console')\
                   .trigger(processingTime= '5 seconds')\
                   .outputMode("update")\
                   .start())
#.avg("Light")

Examinando los datos, podemos apreciar que el intervalo entre muestras originales no es exactamente de 1 minuto en muchos casos. Calcular el número de parejas de muestras consecutivas en cada micro-batch entre las cuales el intervalo de separación no es exactamente de 1 minuto.

In [18]:
df1_ww = (df_data.withColumn("row", col("row") + 1)
              .selectExpr("row AS row_id", "date AS endTime") 
              .withWatermark("endTime", "1 minutes")
          )

df2_ww = (df_data.selectExpr("row", "date AS startTime")       
                 .withWatermark("startTime", "1 minutes")
          )


# Join with event-time constraints
df = (df1_ww
      .join(df2_ww, expr(""" 
                         row_id = row AND 
                         startTime > endTime AND
                         startTime < endTime + interval 1 minutes
                         """))
      .groupBy("startTime")
      .count()
                 .writeStream
                 .format('console')
                 .trigger(processingTime= '5 seconds')
                 .outputMode("append")
                 .start()
)


In [17]:
df.stop()

Opcion con Spark streaming 

In [3]:
packages = "org.apache.spark:spark-streaming-kafka-0-8_2.11:2.4.5"
os.environ["PYSPARK_SUBMIT_ARGS"] = (
    "--packages {0} pyspark-shell".format(packages)
)
print("PYSPARK_SUBMIT_ARGS = ",os.environ["PYSPARK_SUBMIT_ARGS"],"\n")
print("JAVA_HOME = ", os.environ["JAVA_HOME"])

PYSPARK_SUBMIT_ARGS =  --packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.4.5 pyspark-shell 

JAVA_HOME =  /Library/Java/JavaVirtualMachines/jdk1.8.0_31.jdk/Contents/Home


In [4]:
spark = (SparkSession
    .builder
    .appName("StructuredStreamingUCI")
    .getOrCreate())

spark

In [5]:
sc = SparkContext.getOrCreate()
ssc = StreamingContext(sc, 5)

In [6]:
kafkaParams = {"metadata.broker.list": "localhost:9092"}
stream = KafkaUtils.createDirectStream(ssc, ["test"], kafkaParams)
stream = stream.map(lambda o: str(o[1]))

In [7]:

def parseString(line):
      print(line)
      line = line.replace('"', '')
      s = line.split(",")
      try:
          return [{"row": int(s[0]),
                   "date": datetime.strptime(s[1], "%Y-%m-%d %H:%M:%S")}]
      except Exception as err:
          print("Wrong line format (%s): " % line)
          return []

In [9]:
orders = stream.flatMap(parseString)

### Defining the window 
Windowspec = Window.orderBy("date")

### Calculating lag of price at each day level
prev_date= orders.withColumn('lag_date',
                        func.lag(dfu['row'])
                                .over(Windowspec))

### Calculating the average                                  
result = prev_date.withColumn('daily_return', 
          (prev_day_price['price'] - prev_day_price['prev_day_price']) / 
prev_day_price['price'] )

AttributeError: 'TransformedDStream' object has no attribute 'withColumn'

AttributeError: 'TransformedDStream' object has no attribute 'withColumn'

In [11]:
ssc.start()

-------------------------------------------
Time: 2020-07-13 18:29:50
-------------------------------------------
{'row': 907, 'date': datetime.datetime(2015, 2, 5, 8, 57)}
{'row': 908, 'date': datetime.datetime(2015, 2, 5, 8, 57, 59)}
{'row': 909, 'date': datetime.datetime(2015, 2, 5, 8, 58, 59)}
{'row': 910, 'date': datetime.datetime(2015, 2, 5, 9, 0)}
{'row': 911, 'date': datetime.datetime(2015, 2, 5, 9, 1)}
{'row': 912, 'date': datetime.datetime(2015, 2, 5, 9, 2)}
{'row': 913, 'date': datetime.datetime(2015, 2, 5, 9, 3)}
{'row': 914, 'date': datetime.datetime(2015, 2, 5, 9, 4)}
{'row': 915, 'date': datetime.datetime(2015, 2, 5, 9, 4, 59)}
{'row': 916, 'date': datetime.datetime(2015, 2, 5, 9, 6)}
...

-------------------------------------------
Time: 2020-07-13 18:29:55
-------------------------------------------
{'row': 1061, 'date': datetime.datetime(2015, 2, 5, 11, 31)}
{'row': 1062, 'date': datetime.datetime(2015, 2, 5, 11, 31, 59)}
{'row': 1063, 'date': datetime.datetime(2015, 

In [17]:
ssc.stop()

## 5.6 Stop session

In [13]:
spark.stop()

In [19]:
df.stop()