# SNCF - OPEN DATA - API TRANSILIEN - "PROCHAINS DEPARTS"

In [None]:
import os
import json
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window

In [None]:
# Raise the JVM-side root logger threshold to ERROR so that only errors are
# logged by log4j.
# NOTE(review): `sc` is not defined anywhere in this notebook; presumably the
# kernel injects a SparkContext under that name -- confirm.
log4j = sc._jvm.org.apache.log4j
root_logger = log4j.LogManager.getRootLogger()
root_logger.setLevel(log4j.Level.ERROR)

In [None]:
# Session used by the Kafka consumer side of the notebook.
spark = (
    SparkSession.builder
    .appName("MS-SIO-HADOOP-PROJECT-KAFKA-CONSUMER")
    .getOrCreate()
)

In [None]:
# Set the number of shuffle partitions used by Spark SQL to 5 for this session.
spark.conf.set('spark.sql.shuffle.partitions', 5)

In [None]:
# Name of the Kafka topic the streaming reader below subscribes to.
topic = "transilien-02"

In [None]:
# Streaming DataFrame over the Kafka topic, read from the earliest available
# offsets.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "sandbox-hdp.hortonworks.com:6667")
    .option("subscribe", topic)
    .option("startingOffsets", "earliest")
    .load()
)

In [None]:
# Print the schema of the raw Kafka stream.
df.printSchema()

In [None]:
# Schema of the JSON document carried in each Kafka message value; all three
# fields are nullable.
schema = (
    StructType()
    .add("station", IntegerType(), True)
    .add("timestamp", TimestampType(), True)
    .add("train", StringType(), True)
)

In [None]:
# Options handed to from_json() when decoding the Kafka message value.
# The fractional part must use 'SSS' (milliseconds): lowercase 's' is the
# seconds pattern letter, so the previous '.sss' re-read the fractional digits
# as seconds and could shift the parsed timestamp.
# NOTE(review): the quoted 'Z' is matched as a literal, not as a zone
# designator -- confirm the time zone the producer's timestamps are meant in.
json_options = {"timestampFormat": "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"}

In [None]:
# Stage 1: decode the Kafka value as JSON and keep the three departure fields.
departure = from_json(col("value").cast("string"), schema, json_options)
departures = (
    df.select(departure.alias("departure"))
    .select("departure.*")
    .select('station', 'train', 'timestamp')
)

# Stage 2: drop repeated (train, timestamp) pairs, with a 1-minute watermark
# on the event time.
unique_departures = (
    departures
    .withWatermark('timestamp', '1 minutes')
    .dropDuplicates(['train', 'timestamp'])
)

# Stage 3: per station and per 60-minute window sliding every 2 minutes,
# count the trains ('nt') and derive 'awt' as 60 / nt formatted to 2 decimals.
per_station = (
    unique_departures
    .groupBy('station', window('timestamp', '60 minutes', '2 minutes'))
    .agg(
        count('train').alias('nt'),
        format_number((60. / count('train')), 2).alias('awt'),
    )
)

# Stage 4: keep only the windows that start no earlier than 3720 s before the
# current time ('oma') and end no later than 60 s before it ('now').
df0 = (
    per_station
    .withColumn('oma', unix_timestamp(current_timestamp()) - 3720)
    .withColumn('now', unix_timestamp(current_timestamp()) - 60)
    .withColumn('wstart', unix_timestamp('window.start'))
    .withColumn('wend', unix_timestamp('window.end'))
    .where((col('oma') <= col('wstart')) & (col('wend') <= col('now')))
)

In [None]:
# Print the schema of the aggregated stream.
df0.printSchema()

In [None]:
# Hive-enabled session used for writing the aggregates to a table.
# NOTE(review): getOrCreate() returns an already-active session when one
# exists, and `spark` was created earlier in this notebook -- confirm that the
# master / warehouse / metastore settings below actually take effect.
hive_builder = (
    SparkSession.builder
    .master("yarn")
    .appName("MS-SIO-HADOOP-PROJECT-SPARK-SQL")
    .config("spark.sql.warehouse.dir", "hdfs://sandbox-hdp.hortonworks.com:8020/api-transilien")
    .config("hive.metastore.uris", "thrift://sandbox-hdp.hortonworks.com:9083")
    .enableHiveSupport()
)
spark2hive = hive_builder.getOrCreate()

In [None]:
# Make `transilien` the current database for subsequent unqualified table names.
spark2hive.sql('USE transilien')

In [None]:
def for_each_micro_batch(df, epoch_id):
    """Save one micro-batch to the ``averageWaitingTime`` table.

    The table is written in ``overwrite`` mode on every call. Any exception
    raised while writing is caught and printed, so it never propagates to
    the caller.

    Args:
        df: The micro-batch; must expose ``write.mode(...).saveAsTable(...)``.
        epoch_id: Identifier of the micro-batch, used only in the messages.
    """
    try:
        print(f"doing something with micro-batch #{epoch_id}")
        table_writer = df.write.mode('overwrite')
        table_writer.saveAsTable("averageWaitingTime")
    except Exception as error:
        print(f"failed to do something with batch #{epoch_id}")
        print(error)

In [None]:
# Console sink: print the full aggregate, ordered by station, once a minute.
console_view = df0.select('station', 'window', 'nt', 'awt').orderBy('station')
q0 = (
    console_view.writeStream
    .trigger(processingTime='1 minutes')
    .outputMode('complete')
    .format('console')
    .option('truncate', False)
    .start()
)

In [None]:
# Table sink: hand every micro-batch of (station, awt) to for_each_micro_batch.
waiting_times = df0.select('station', 'awt')
q1 = (
    waiting_times.writeStream
    .queryName("averageWaitingTime")
    .foreachBatch(for_each_micro_batch)
    .outputMode('complete')
    .start()
)

doing something with micro-batch #91
doing something with micro-batch #92
doing something with micro-batch #93
doing something with micro-batch #94
doing something with micro-batch #95
doing something with micro-batch #96
doing something with micro-batch #97
doing something with micro-batch #98
doing something with micro-batch #99
doing something with micro-batch #100
doing something with micro-batch #101
doing something with micro-batch #102
doing something with micro-batch #103
doing something with micro-batch #104
doing something with micro-batch #105
doing something with micro-batch #106
doing something with micro-batch #107
doing something with micro-batch #108
doing something with micro-batch #109
doing something with micro-batch #110
doing something with micro-batch #111
doing something with micro-batch #112
doing something with micro-batch #113
doing something with micro-batch #114
doing something with micro-batch #115
doing something with micro-batch #116
doing something with 

In [None]:
# Stop both streaming queries, console sink first.
for streaming_query in (q0, q1):
    streaming_query.stop()

In [None]:
# Evaluate the session's list of active streaming queries (a bare expression,
# so its value is only visible as the notebook cell's result).
spark.streams.active

In [None]:
# Shut down the sessions, consumer session first.
for session in (spark, spark2hive):
    session.stop()