# SNCF - OPEN DATA - API TRANSILIEN - "PROCHAINS DEPARTS"

In [None]:
import os
import json
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window

In [None]:
# Raise the JVM-side log4j root logger threshold to ERROR.
# NOTE(review): `sc` is not defined anywhere in this file; presumably it is the
# SparkContext pre-created by the notebook kernel -- confirm.
log4j = sc._jvm.org.apache.log4j
log4j.LogManager.getRootLogger().setLevel(log4j.Level.ERROR)

In [None]:
# Get (or create) the Spark session that is used below to read the Kafka stream.
spark4kafka = SparkSession.builder.appName("MS-SIO-HADOOP-PROJECT-KAFKA-CONSUMER").getOrCreate()

In [None]:
# Set the number of partitions used for SQL shuffles in this session to 5.
# NOTE(review): presumably lowered because the data volume is small -- confirm.
spark4kafka.conf.set('spark.sql.shuffle.partitions', 5)

In [None]:
# Name of the Kafka topic the streaming reader subscribes to.
topic = "transilien-02"

In [None]:
# Streaming DataFrame over the Kafka topic, starting from the earliest
# available offsets.
df = (
    spark4kafka.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "sandbox-hdp.hortonworks.com:6667")
    .option("subscribe", topic)
    .option("startingOffsets", "earliest")
    .load()
)

In [None]:
# Print the schema of the raw Kafka stream.
df.printSchema()

In [None]:
# Schema applied to the JSON document carried in the Kafka message value.
# All three fields are declared nullable.
schema = (
    StructType()
    .add("station", IntegerType(), True)
    .add("timestamp", TimestampType(), True)
    .add("train", StringType(), True)
)

In [None]:
# Options handed to from_json(): the pattern used to parse the "timestamp"
# field. The fractional part is written `SSS` (fraction of second); a
# lowercase `sss` would denote the seconds field a second time, so the
# fraction would be misread as seconds.
json_options = {"timestampFormat": "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"}

In [None]:
# Per-station statistics over sliding 60-minute windows (one every 2 minutes).
#   nt  : number of non-null `train` values counted in the window
#   awt : 60 / nt rounded to 2 decimals, i.e. the mean number of minutes per
#         counted train in a 60-minute window
# Only windows that start no earlier than (now - 3720 s) and end no later than
# (now - 60 s) are kept.
df0 = (
    df
    # Decode the Kafka value as JSON using `schema` and flatten the struct.
    .select(from_json(col("value").cast("string"), schema, json_options).alias("departure"))
    .select("departure.*")
    .select('station', 'train', 'timestamp')
    # The watermark bounds the state kept for de-duplication.
    .withWatermark('timestamp', '1 minutes')
    .dropDuplicates(['train', 'timestamp'])
    .groupBy('station', window('timestamp', '60 minutes', '2 minutes'))
    .agg(
        count('train').alias('nt'),
        # bround() rounds HALF_EVEN like format_number() but keeps the value
        # numeric, avoiding a number -> grouped string -> double round trip.
        bround(60. / count('train'), 2).alias('awt'),
    )
    # Helper columns, in epoch seconds, used by the window filter below.
    .withColumn('oma', unix_timestamp(current_timestamp()) - 3720)
    .withColumn('now', unix_timestamp(current_timestamp()) - 60)
    .withColumn('wstart', unix_timestamp('window.start'))
    .withColumn('wend', unix_timestamp('window.end'))
    .where((col('oma') <= col('wstart')) & (col('wend') <= col('now')))
)

In [None]:
# Print the schema of the aggregated stream.
df0.printSchema()

In [None]:
# Hive-enabled session used for the Spark SQL statements that follow.
# NOTE(review): getOrCreate() returns an already-running session when one
# exists, so this may be the very same session as `spark4kafka`, and the
# master / warehouse / Hive settings requested here may not all take effect
# -- confirm.
spark4hive = (
    SparkSession.builder
    .master("yarn")
    .appName("MS-SIO-HADOOP-PROJECT-SPARK-SQL")
    .config("spark.sql.warehouse.dir", "hdfs://sandbox-hdp.hortonworks.com:8020/api-transilien")
    .config("hive.metastore.uris", "thrift://sandbox-hdp.hortonworks.com:9083")
    .enableHiveSupport()
    .getOrCreate()
)

In [None]:
# HDFS location used for the `transilien` database created below.
db_location = "hdfs://sandbox-hdp.hortonworks.com:8020/api-transilien"

In [None]:
# Create the `transilien` database at db_location unless it already exists.
spark4hive.sql(f'create database if not exists transilien location "{db_location}"')

In [None]:
# Make `transilien` the current database for subsequent statements.
spark4hive.sql('use transilien')

In [None]:
import time

In [None]:
def for_each_micro_batch(df, epoch_id):
    """Overwrite the Hive table ``averageWaitingTime`` with one micro-batch.

    Used as the ``foreachBatch`` callback of a streaming query.

    Args:
        df: DataFrame holding the micro-batch to persist.
        epoch_id: Identifier of the micro-batch; only used in the printed
            status messages.

    Returns:
        None. Any exception raised by the write is caught and printed, so it
        does not propagate to the caller.
    """
    try:
        # Replace the whole table content with this batch.
        df.write.mode('overwrite').saveAsTable("averageWaitingTime")
    except Exception as e:
        # Best effort: report the failure and carry on.
        print(f"failed to update hive table with batch #{epoch_id}")
        print(e)
    else:
        print(f"updated hive table #{epoch_id}")

In [None]:
# Console sink: once a minute, print the complete result (station, window,
# nt, awt) ordered by station, without truncating column values.
q0 = (
    df0
    .select('station', 'window', 'nt', 'awt')
    .orderBy('station')
    .writeStream
    .trigger(processingTime='1 minutes')
    .outputMode('complete')
    .format('console')
    .option('truncate', False)
    .start()
)

In [None]:
# Hive sink: once a minute, hand the complete (station, awt) result, ordered
# by station, to for_each_micro_batch.
q1 = (
    df0
    .select('station', 'awt')
    .orderBy('station')
    .writeStream
    .trigger(processingTime='1 minutes')
    .foreachBatch(for_each_micro_batch)
    .outputMode('complete')
    .start()
)

updated hive table #0
updated hive table #1
updated hive table #2
updated hive table #3
updated hive table #4
updated hive table #5
updated hive table #6
updated hive table #7
updated hive table #8
updated hive table #9
updated hive table #10
updated hive table #11
updated hive table #12


In [None]:
# Show the streaming queries currently active on the session.
spark4kafka.streams.active

In [None]:
# Stop both streaming queries (console sink and Hive sink).
q0.stop()
q1.stop()

In [None]:
# Shut down both session handles.
# NOTE(review): both names may refer to the same underlying session because
# each was obtained through getOrCreate() -- confirm.
spark4kafka.stop()
spark4hive.stop()