# SNCF - OPEN DATA - API TRANSILIEN - "PROCHAINS DEPARTS"

In [4]:
import os
import json
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window

In [41]:
# Raise the root log4j logger threshold to ERROR through the JVM gateway so
# that lower-severity Spark log output is suppressed.
# NOTE(review): `sc` is not defined anywhere in this notebook — presumably the
# kernel injects a SparkContext under that name; confirm, otherwise this cell
# fails on a fresh kernel.
log4j = sc._jvm.org.apache.log4j
log4j.LogManager.getRootLogger().setLevel(log4j.Level.ERROR)

In [5]:
# Reuse the active SparkSession if one exists, otherwise create one under this
# application name.
spark = SparkSession.builder.appName("MS-SIO-HADOOP-PROJECT-KAFKA-CONSUMER").getOrCreate()

In [6]:
# Set the number of partitions Spark SQL uses when shuffling data to 5 for
# this session.
spark.conf.set('spark.sql.shuffle.partitions', 5)

In [7]:
# Kafka topic the streaming reader subscribes to.
topic = "transilien-02"

In [8]:
# Open a streaming reader on the Kafka topic. Each record arrives with a binary
# key and value plus topic / partition / offset / timestamp metadata columns.
# "earliest" asks the source to begin at the earliest available offset.
# (The former `.option("", "")` call was removed: an empty option name
# configures nothing.)
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "sandbox-hdp.hortonworks.com:6667")
    .option("subscribe", topic)
    .option("startingOffsets", "earliest")
    .load()
)

In [9]:
# Print the column layout of the raw Kafka source DataFrame.
df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [10]:
# Layout expected for the JSON document carried in each Kafka message value:
# three nullable fields (station, timestamp, train).
schema = (
    StructType()
    .add("station", IntegerType(), True)
    .add("timestamp", TimestampType(), True)
    .add("train", StringType(), True)
)

In [11]:
# Parsing options passed to from_json when decoding the message value.
# In Java date patterns lowercase "s" means second-of-minute and uppercase "S"
# means the millisecond fraction, so the fractional part must be spelled "SSS";
# writing "sss" makes the parser read the fraction as seconds a second time.
# The quoted 'Z' is matched as a literal character, not as a zone offset.
json_options = {"timestampFormat": "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"}

In [58]:
# Epoch-second expressions used below to keep only windows that lie between
# 62 minutes (3720 s) ago and one minute (60 s) ago.
batch_epoch = unix_timestamp(current_timestamp())
win_start_epoch = unix_timestamp('window.start')
win_end_epoch = unix_timestamp('window.end')

df0 = (
    df
    # Decode the Kafka value as JSON and flatten the resulting struct.
    .select(from_json(col("value").cast("string"), schema, json_options).alias("departure"))
    .select("departure.*")
    .select('station', 'train', 'timestamp')
    # Declare a one-minute event-time watermark, then drop repeated
    # (train, timestamp) pairs.
    .withWatermark('timestamp', '1 minutes')
    .dropDuplicates(['train', 'timestamp'])
    # Per station, one-hour windows sliding every two minutes.
    .groupBy('station', window('timestamp', '60 minutes', '2 minutes'))
    .agg(
        # nt: number of trains in the window.
        count('train').alias('nt'),
        # awt: 60 / nt, formatted as a string with two decimals.
        format_number((60. / count('train')), 2).alias('awt'),
    )
    .where(
        (batch_epoch - 3720 <= win_start_epoch)
        & (win_end_epoch <= batch_epoch - 60)
    )
    .select('station', 'window', 'nt', 'awt')
    .orderBy('station')
)

In [59]:
# Print the column layout of the aggregated per-station result.
df0.printSchema()

root
 |-- station: integer (nullable = true)
 |-- window: struct (nullable = true)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- nt: long (nullable = false)
 |-- awt: string (nullable = true)



In [60]:
# Start a streaming query that prints the complete result table to the console
# on a one-minute processing-time trigger, without truncating cell contents.
q0 = (
    df0.writeStream
    .format('console')
    .outputMode('complete')
    .option('truncate', False)
    .trigger(processingTime='1 minutes')
    .start()
)

In [62]:
# Start a second streaming query that writes the complete result table to the
# memory sink under the query name "averageWaiting", refreshed on a one-minute
# processing-time trigger.
q1 = (
    df0.writeStream
    .format('memory')
    .queryName("averageWaiting")
    .outputMode('complete')
    .trigger(processingTime='1 minutes')
    .start()
)

In [65]:
# Query the averageWaiting table and display up to 100 rows with untruncated
# cell contents.
average_waiting = spark.sql("SELECT * FROM averageWaiting")
average_waiting.show(n=100, truncate=False)

+--------+------------------------------------------+---+-----+
|station |window                                    |nt |awt  |
+--------+------------------------------------------+---+-----+
|87334482|[2019-02-25 15:40:00, 2019-02-25 16:40:00]|12 |5.00 |
|87366922|[2019-02-25 15:40:00, 2019-02-25 16:40:00]|1  |60.00|
|87381111|[2019-02-25 15:40:00, 2019-02-25 16:40:00]|21 |2.86 |
|87381129|[2019-02-25 15:40:00, 2019-02-25 16:40:00]|21 |2.86 |
|87381137|[2019-02-25 15:40:00, 2019-02-25 16:40:00]|31 |1.94 |
|87381459|[2019-02-25 15:40:00, 2019-02-25 16:40:00]|15 |4.00 |
|87381657|[2019-02-25 15:40:00, 2019-02-25 16:40:00]|11 |5.45 |
|87381905|[2019-02-25 15:40:00, 2019-02-25 16:40:00]|13 |4.62 |
|87382002|[2019-02-25 15:40:00, 2019-02-25 16:40:00]|27 |2.22 |
|87382200|[2019-02-25 15:40:00, 2019-02-25 16:40:00]|10 |6.00 |
|87382218|[2019-02-25 15:40:00, 2019-02-25 16:40:00]|21 |2.86 |
|87382259|[2019-02-25 15:40:00, 2019-02-25 16:40:00]|10 |6.00 |
|87382267|[2019-02-25 15:40:00, 2019-02-

In [57]:
# Stop every streaming query still running on this session. Iterating over the
# session's active queries (instead of calling q0.stop() / q1.stop() by name)
# also stops queries whose handle was overwritten when a start cell was re-run,
# and does not raise NameError if a start cell was never executed.
for active_query in spark.streams.active:
    active_query.stop()

In [14]:
# List the streaming queries currently active on this session.
spark.streams.active

[<pyspark.sql.streaming.StreamingQuery at 0x7fb174138940>]

In [3]:
# Stop the Spark session (and its underlying SparkContext).
spark.stop()