### Structured Streaming


#### Exemplo de lectura de ficheiros


https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#overview

In [None]:
# Lectura de ficheiros CSV con dous campos: nome;idade

In [None]:
from pyspark.sql.types import StructType

In [None]:
# Schema of the streaming DataFrame: a string "name" column and an integer "age" column
userSchema = (
    StructType()
    .add("name", "string")
    .add("age", "integer")
)

# Streaming DataFrame that reads the flow of CSV files from the input path;
# the fields of each record are separated by ";"
csvDF = (
    spark.readStream
    .schema(userSchema)
    .option("sep", ";")
    .csv("file:///home/hduser/code/spark/csv")
)

In [None]:
# Start the query that runs the read over the input stream.
# Output mode "update": only the changes are shown on each new read.
# Format "console": the data is printed to the console.
query = (
    csvDF.writeStream
    .outputMode("update")
    .format("console")
    .start()
)

# Block until the query terminates. Interrupting the cell only raises in the
# Python process; the streaming query would otherwise keep running in the
# background while later cells start new queries, so stop it explicitly.
try:
    query.awaitTermination()
finally:
    query.stop()

In [None]:
# Podemos limitar os resultados mostrados
# Por exemplo: mostrar unicamente os maiores de idade

In [None]:
# Schema of the incoming CSV records: name (string) and age (integer)
userSchema = StructType().add("name", "string").add("age", "integer")

# Streaming DataFrame over the ";"-separated CSV files in the input path
csvDF = (
    spark.readStream
    .option("sep", ";")
    .schema(userSchema)
    .csv("file:///home/hduser/code/spark/csv")
)

# Keep only the adults (age of 18 or more)
df_adultos = csvDF.where("age >= 18")

# Print the filtered rows to the console on each new read
query = (
    df_adultos.writeStream
    .outputMode("update")
    .format("console")
    .start()
)

# Block until the query terminates. Interrupting the cell only raises in the
# Python process; without an explicit stop the streaming query would keep
# running in the background alongside queries started by later cells.
try:
    query.awaitTermination()
finally:
    query.stop()

In [None]:
# Podemos realizar diferentes consultas, non só mostrar os datos

In [None]:
# Schema of the incoming CSV records: name (string) and age (integer)
userSchema = StructType().add("name", "string").add("age", "integer")

# Streaming DataFrame over the ";"-separated CSV files in the input path
csvDF = (
    spark.readStream
    .option("sep", ";")
    .schema(userSchema)
    .csv("file:///home/hduser/code/spark/csv")
)

# Aggregate over the whole stream: maximum value of the "age" column
df_vellos = csvDF.agg({'age':'max'})

# Print the aggregate to the console; "update" mode shows only the changes
query = (
    df_vellos.writeStream
    .outputMode("update")
    .format("console")
    .start()
)

# Block until the query terminates. Interrupting the cell only raises in the
# Python process; without an explicit stop the streaming query would keep
# running in the background alongside queries started by later cells.
try:
    query.awaitTermination()
finally:
    query.stop()

In [None]:
# En moitas ocasións será cómodo expresar a consulta con linguaxe SQL

In [None]:
# Schema of the incoming CSV records: name (string) and age (integer)
userSchema = StructType().add("name", "string").add("age", "integer")

# Streaming DataFrame over the ";"-separated CSV files in the input path
csvDF = (
    spark.readStream
    .option("sep", ";")
    .schema(userSchema)
    .csv("file:///home/hduser/code/spark/csv")
)

# Register the stream as a temporary view so it can be queried with SQL
csvDF.createOrReplaceTempView("people")
# Alternative query (maximum age per name), left disabled:
#df_sql = spark.sql("SELECT name, MAX(age) FROM people GROUP BY name")
df_sql = spark.sql("SELECT * FROM people WHERE age >=50")

# Print the result of the SQL query to the console on each new read
query = (
    df_sql.writeStream
    .outputMode("update")
    .format("console")
    .start()
)

# Block until the query terminates. Interrupting the cell only raises in the
# Python process; without an explicit stop the streaming query would keep
# running in the background alongside queries started by later cells.
try:
    query.awaitTermination()
finally:
    query.stop()