In [513]:
import time
import requests
import datetime as dt
from multiprocessing import Process

import findspark
findspark.init()
from pathlib import Path
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, LongType, TimestampType
from pyspark.sql.functions import col, regexp_replace, row_number, when, sum, count, round, date_format, lit, first
from pyspark.sql import SparkSession, Window
findspark.init('/opt/spark')

In [514]:
# Make sure curl is installed; it is used in a later cell to download the PostgreSQL JDBC driver.
! apt-get install -y curl

Reading package lists... Done
Building dependency tree       
Reading state information... Done
curl is already the newest version (7.58.0-2ubuntu3.24).
0 upgraded, 0 newly installed, 0 to remove and 0 not upgraded.


In [515]:
! curl -O https://jdbc.postgresql.org/download/postgresql-42.6.0.jar

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1056k  100 1056k    0     0   599k      0  0:00:01  0:00:01 --:--:--  599k


In [516]:
# The PostgreSQL JDBC driver jar is expected in the current working directory;
# it is registered both as a Spark jar and on the driver class path.
jdbc_jar_path = f"{str(Path.cwd())}/postgresql-42.6.0.jar"

spark = (
    SparkSession.builder
    .appName("tp9_test1")
    .config("spark.jars", jdbc_jar_path)
    .config("spark.driver.extraClassPath", jdbc_jar_path)
    .getOrCreate()
)

In [517]:
# Ensure the HDFS directory /user/root/bus/batch exists (-p also creates missing parents).
! hadoop fs -mkdir -p /user/root/bus/batch

In [518]:
# Ensure the HDFS directory that call_bus_api writes its snapshots to exists (-p also creates missing parents).
! hadoop fs -mkdir -p /user/root/bus/stream

In [519]:
# List the snapshot folders currently present in the stream directory.
! hadoop fs -ls /user/root/bus/stream

Found 2 items
drwxr-xr-x   - root supergroup          0 2023-09-16 21:35 /user/root/bus/stream/20230916_183539
drwxr-xr-x   - root supergroup          0 2023-09-16 21:36 /user/root/bus/stream/20230916_183642


In [537]:
# Remove every snapshot folder under the stream directory.
# -skipTrash bypasses the HDFS trash, so the deletion cannot be undone.
! hadoop fs -rm -skipTrash -r /user/root/bus/stream/*

Deleted /user/root/bus/stream/20230916_183828
Deleted /user/root/bus/stream/20230916_183931


In [538]:
def shape(df):
    """Return ``(row_count, column_count)`` for a DataFrame.

    The row count comes from ``df.count()`` and the column count from the
    length of ``df.columns``.
    """
    n_rows = df.count()
    n_cols = len(df.columns)
    return (n_rows, n_cols)

In [539]:
# Query template for the bus GPS endpoint; the start/end date and time
# placeholders are filled in by call_bus_api.
BASE_URL = (
    "https://dados.mobilidade.rio/gps/sppo?"
    "dataInicial={dt_inicial}+{hora_inicial}&dataFinal={dt_final}+{hora_final}"
)

In [540]:
def call_bus_api(interval=60, timeout=30):
    """Fetch the latest window of bus GPS records and store it as a snapshot in HDFS.

    The query window ends at the current clock time shifted back three hours
    and starts ``interval`` seconds earlier. The response body is parsed as
    JSON into a Spark DataFrame and written as JSON to
    ``/user/root/bus/stream/<YYYYmmdd_HHMMSS>``, named after the window end.

    Args:
        interval: Length of the query window, in seconds.
        timeout: Seconds to wait for the HTTP response before giving up.

    Raises:
        requests.exceptions.RequestException: On connection failure, timeout,
            or a non-success HTTP status.
    """
    # NOTE(review): the fixed 3 h shift presumably converts a UTC host clock to
    # the API's local time (UTC-3) — confirm; it would be off on a host whose
    # clock is not in UTC.
    offset = dt.timedelta(hours=3)
    dtf = dt.datetime.now() - offset
    dti = dtf - dt.timedelta(seconds=interval)

    url = BASE_URL.format(
        dt_inicial=dti.strftime("%Y-%m-%d"),
        dt_final=dtf.strftime("%Y-%m-%d"),
        hora_inicial=dti.strftime("%H:%M:%S"),
        hora_final=dtf.strftime("%H:%M:%S"),
    )

    # requests waits indefinitely by default; bound the wait so a stalled
    # connection cannot freeze the polling loop.
    ret = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error rather than persisting the error body as if
    # it were GPS data.
    ret.raise_for_status()

    rdd = spark.sparkContext.parallelize([ret.text])
    df = spark.read.json(rdd)

    # A payload with no records yields a DataFrame without columns; there is
    # nothing to store, so skip the write.
    if not df.columns:
        print("no records returned - snapshot skipped")
        return

    df.write.json(f"/user/root/bus/stream/{dtf.strftime('%Y%m%d_%H%M%S')}")

In [541]:
# PostgreSQL tables that create_table appends to.
# Not enabled: "percent_above_60km", "qty_bus_running_by_company", "qty_bus_running_by_line".
tables = ["traffic_vs_garage"]

In [542]:
def create_table(df):
    """Append ``df`` to every PostgreSQL table named in ``tables``.

    Rows are written over JDBC in ``append`` mode, with each table name wrapped
    in double quotes. In append mode an existing table must already contain
    every column of ``df``; the recorded run in this notebook failed because
    the target table had no ``hora`` column.

    Credentials are read from the ``PGUSER`` / ``PGPASSWORD`` environment
    variables and fall back to ``postgres`` / ``postgres`` when unset.

    Args:
        df: DataFrame to append.
    """
    import os  # local import keeps this cell runnable on its own

    # Connection settings do not depend on the table, so build them once.
    jdbc_url = "jdbc:postgresql://pg-data:5433/postgres"
    properties = {
        "user": os.environ.get("PGUSER", "postgres"),
        "password": os.environ.get("PGPASSWORD", "postgres"),
        "driver": "org.postgresql.Driver",
    }

    for table in tables:
        database_table = f"\"{table}\""
        df.write.jdbc(url=jdbc_url, table=database_table, mode="append", properties=properties)

In [543]:
def calc_traffic(df, dtt):
    """Compute the garage vs. in-traffic share of the rows in ``df`` and persist it.

    A row counts as "garage" when its ``linha`` column equals ``"GARAGEM"``.
    The result is a single row with the columns ``hora``, ``percent_garage``
    and ``percent_traffic`` (percentages rounded to two decimals). It is
    printed with ``show()`` and handed to ``create_table``.

    Args:
        df: DataFrame with a ``linha`` column.
        dtt: Value stored in the ``hora`` column, cast to string.
    """
    # Only whole-frame aggregates are needed, so rows are not numbered or
    # ordered; a row_number() over an unpartitioned window would pull every row
    # into a single partition without affecting the result.
    flagged = df.withColumn(
        "garage_line", when(col("linha") == "GARAGEM", 1).otherwise(0)
    )

    # `sum`, `count` and `round` are the pyspark.sql.functions versions
    # imported at the top of the notebook, not the Python builtins.
    total_rows = count("*")
    garage_rows = sum("garage_line")

    result = flagged.agg(
        round((garage_rows / total_rows * 100), 2).alias("percent_garage"),
        round(((total_rows - garage_rows) / total_rows * 100), 2).alias("percent_traffic"),
    ).withColumn(
        "hora", lit(dtt).cast(StringType())
    ).select(
        "hora", "percent_garage", "percent_traffic"
    )

    result.show()

    create_table(result)
    

In [544]:
def handle_stream(bdf, ctrl):
    """Process one micro-batch: stamp it with the shifted clock time and run calc_traffic.

    Args:
        bdf: DataFrame holding the rows of the current micro-batch.
        ctrl: Second argument supplied by ``foreachBatch``; not used here.
    """
    shifted_now = dt.datetime.now() - dt.timedelta(hours=3)
    calc_traffic(bdf, shifted_now.strftime('%H:%M:%S'))
    

In [545]:
# Schema applied when reading the stream folder: five nullable string fields.
# NOTE(review): these names match the aggregate output rather than the raw GPS
# records (ordem, latitude, longitude, datahora, velocidade, linha,
# datahoraenvio, datahoraservidor); calc_traffic only reads `linha` — confirm
# this is intended.
schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in ("idx", "linha", "percent_garage", "percent_traffic", "hora")
])

In [546]:
def loop(max_iter=2, interval=60):
    """Call the bus API ``max_iter`` times, pausing ``interval`` seconds between calls.

    The same ``interval`` is used as the API query window, so consecutive
    calls cover back-to-back windows. There is no pause after the last call.

    Args:
        max_iter: Number of calls to make. A value below 1 never matches the
            counter, so polling continues until the process is stopped.
        interval: Seconds between calls, also passed to ``call_bus_api``.
    """
    idx = 1
    while True:
        print(f"{idx} - fetching API")
        call_bus_api(interval=interval)
        if idx == max_iter:
            break
        time.sleep(interval)
        idx += 1

In [547]:
# Streaming reader over every file inside every snapshot folder written by call_bus_api.
sdf = (
    spark.readStream
    .schema(schema)
    .json("/user/root/bus/stream/*/*")
)

In [548]:
# Start the streaming query: once a minute, each new micro-batch is passed to handle_stream.
stream_handle = (
    sdf.writeStream
    .outputMode("append")
    .foreachBatch(handle_stream)
    .trigger(processingTime="1 minute")
    .start()
)

In [549]:
# Check whether the streaming query is still running.
stream_handle.isActive

True

In [552]:
# Inspect the query status; after a failure its message carries the exception text.
stream_handle.status

{'message': 'Terminated with exception: An exception was raised by the Python Proxy. Return Message: Traceback (most recent call last):\n  File "/opt/spark/python/pyspark/sql/utils.py", line 63, in deco\n    return f(*a, **kw)\n  File "/opt/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value\n    format(target_id, ".", name), value)\npy4j.protocol.Py4JJavaError: An error occurred while calling o5297.jdbc.\n: org.apache.spark.sql.AnalysisException: Column "hora" not found in schema Some(StructType(StructField(percent_garage,DoubleType,true), StructField(percent_traffic,DoubleType,true)));\n\tat org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$4$$anonfun$6.apply(JdbcUtils.scala:147)\n\tat org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$4$$anonfun$6.apply(JdbcUtils.scala:147)\n\tat scala.Option.getOrElse(Option.scala:121)\n\tat org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$4.apply(JdbcUtils.scala:1

In [534]:
# Show the most recent progress report of the streaming query (batch id, input row counts, source and sink).
stream_handle.lastProgress

{'id': '44d1e108-7f64-4e6b-a1d7-4ca614028968',
 'runId': '91ed356e-fad5-4703-82f2-ae0188d8414e',
 'name': None,
 'timestamp': '2023-09-16T21:38:23.761Z',
 'batchId': 0,
 'numInputRows': 0,
 'processedRowsPerSecond': 0.0,
 'durationMs': {'getOffset': 1, 'triggerExecution': 3},
 'stateOperators': [],
 'sources': [{'description': 'FileStreamSource[hdfs://node-master:9000/user/root/bus/stream/*/*]',
   'startOffset': None,
   'endOffset': None,
   'numInputRows': 0,
   'processedRowsPerSecond': 0.0}],
 'sink': {'description': 'ForeachBatchSink'}}

In [462]:
# Stop the streaming query.
stream_handle.stop()

In [551]:
# Run the API polling loop in a separate process.
# NOTE(review): the child process uses the `spark` session created in this
# kernel — confirm that sharing it across processes is safe in this setup.
p = Process(target=loop)
p.start()

1 - fetching API
+--------+--------------+---------------+
|    hora|percent_garage|percent_traffic|
+--------+--------------+---------------+
|18:41:00|           4.4|           95.6|
+--------+--------------+---------------+

2 - fetching API


In [284]:
# Read back the parquet snapshot stored at dest_folder.
dest_folder = "/user/root/bus/test.parquet"
df = spark.read.parquet(dest_folder)

# Display the loaded DataFrame
df.show()


+------+---------+---------+-------------+----------+-----+-------------+----------------+
| ordem| latitude|longitude|     datahora|velocidade|linha|datahoraenvio|datahoraservidor|
+------+---------+---------+-------------+----------+-----+-------------+----------------+
|D13078|-22.87882|-43.46885|1694718086000|         0|  794|1694718101000|   1694718109000|
|C30341|-22.99545|-43.36563|1694718090000|        13|  565|1694718101000|   1694718109000|
|C47464|-22.98373|-43.21799|1694718084000|         0|  553|1694718101000|   1694718109000|
|D13284|-23.01256|-43.31864|1694718060000|        10|  878|1694718101000|   1694718109000|
|C47755|-22.92302|-43.23248|1694718090000|         9|  600|1694718101000|   1694718109000|
|D13310|-22.89362|-43.45436|1694718082000|        17|  794|1694718101000|   1694718109000|
|C47672|-22.97637|-43.41434|1694718083000|         0|  348|1694718101000|   1694718109000|
|C47603|-22.91503|-43.38401|1694718088000|        27|  600|1694718101000|   1694718109000|

In [280]:
# Glob patterns covering every file inside every snapshot folder of the
# batch and stream areas under /user/root/onibus.
dir_batch = "/user/root/onibus/batch/*/*"
dir_stream = "/user/root/onibus/stream/*/*"

In [281]:
# Batch-read the stream files as JSON using the explicit schema.
# NOTE(review): the recorded output is all nulls — presumably the schema does
# not match the stored files; confirm.
dfc = spark.read.schema(schema).json(dir_stream)

In [282]:
# Preview the rows read from the stream files.
dfc.show()

+-----+--------+---------+--------+----------+-----+-------------+----------------+
|ordem|latitude|longitude|datahora|velocidade|linha|datahoraenvio|datahoraservidor|
+-----+--------+---------+--------+----------+-----+-------------+----------------+
| null|    null|     null|    null|      null| null|         null|            null|
| null|    null|     null|    null|      null| null|         null|            null|
| null|    null|     null|    null|      null| null|         null|            null|
| null|    null|     null|    null|      null| null|         null|            null|
| null|    null|     null|    null|      null| null|         null|            null|
| null|    null|     null|    null|      null| null|         null|            null|
| null|    null|     null|    null|      null| null|         null|            null|
| null|    null|     null|    null|      null| null|         null|            null|
| null|    null|     null|    null|      null| null|         null|          

In [30]:
# Row and column counts of the stream DataFrame.
shape(dfc)

(2875, 8)

In [31]:
# Read the batch files as CSV with default options: no header row is used,
# so the columns come out as _c0.._c7, all typed as string.
dfc2 = spark.read.csv(dir_batch)

In [32]:
# Preview the rows read from the batch files.
dfc2.show()

+------+---------+---------+-------------+---+------+-------------+-------------+
|   _c0|      _c1|      _c2|          _c3|_c4|   _c5|          _c6|          _c7|
+------+---------+---------+-------------+---+------+-------------+-------------+
|B28555| -22.7981|-43.19024|1694717767000|  0|   329|1694717772000|1694717774000|
|B28738|-22.82059|-43.17902|1694717357000| 16|  2343|1694717772000|1694717774000|
|B28733|  -22.897|-43.18719|1694717766000| 11|  2343|1694717772000|1694717774000|
|B28702|-22.90718| -43.1739|1694717760000|  9|  2344|1694717772000|1694717774000|
|B28738|-22.82079|-43.17818|1694717373000| 23|  2343|1694717772000|1694717774000|
|B28702|-22.90713|-43.17395|1694717766000|  7|  2344|1694717772000|1694717774000|
|B28602|-22.89278|-43.19396|1694717764000| 72|   492|1694717772000|1694717774000|
|B28738| -22.8215|-43.17757|1694717403000| 18|  2343|1694717772000|1694717774000|
|B28738|-22.82335|-43.17601|1694717433000| 28|  2343|1694717772000|1694717774000|
|B28522|-22.7955

In [39]:
# Row and column counts of the batch DataFrame.
shape(dfc2)

(94758, 8)

In [40]:
# Print the column names and types of the batch DataFrame.
dfc2.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)

