In [170]:
import time
import requests
import datetime as dt
from multiprocessing import Process

import os
import findspark
import pandas as pd
findspark.init()
from pathlib import Path
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, LongType, TimestampType
from pyspark.sql.functions import col, regexp_replace, row_number, when, sum, count, round, date_format, lit, first
from pyspark.sql import SparkSession, Window
findspark.init('/opt/spark')

In [2]:
! apt-get install -y curl

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  libcurl4 libnghttp2-14 librtmp1
The following NEW packages will be installed:
  curl libcurl4 libnghttp2-14 librtmp1
0 upgraded, 4 newly installed, 0 to remove and 0 not upgraded.
Need to get 511 kB of archives.
After this operation, 1399 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/main amd64 libnghttp2-14 amd64 1.30.0-1ubuntu1 [77.8 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/main amd64 librtmp1 amd64 2.4+20151223.gitfa8646d.1-1 [54.2 kB]
Get:3 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 libcurl4 amd64 7.58.0-2ubuntu3.24 [221 kB]
Get:4 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 curl amd64 7.58.0-2ubuntu3.24 [159 kB]
Fetched 511 kB in 6s (86.1 kB/s)
debconf: delaying package configuration, since apt-utils is not installed
Selecting previously unselected pack

In [3]:
! curl -O https://jdbc.postgresql.org/download/postgresql-42.6.0.jar

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1056k  100 1056k    0     0   327k      0  0:00:03  0:00:03 --:--:--  327k


In [4]:
# Path of the PostgreSQL JDBC driver jar in the current working directory; the
# same path is registered under both `spark.jars` and the driver classpath.
jdbc_jar = f"{Path.cwd()}/postgresql-42.6.0.jar"

spark = (
    SparkSession.builder
    .appName("tp9_teste1")
    .config("spark.jars", jdbc_jar)
    .config("spark.driver.extraClassPath", jdbc_jar)
    .getOrCreate()
)

In [5]:
! hadoop fs -mkdir -p /user/root/bus/batch

In [6]:
! hadoop fs -mkdir -p /user/root/bus/stream

In [171]:
! hadoop fs -ls /user/root/bus/batch/calc_traffic

Found 5 items
drwxr-xr-x   - root supergroup          0 2023-09-17 18:29 /user/root/bus/batch/calc_traffic/20230917_152900
drwxr-xr-x   - root supergroup          0 2023-09-17 18:29 /user/root/bus/batch/calc_traffic/20230917_152920
drwxr-xr-x   - root supergroup          0 2023-09-17 18:29 /user/root/bus/batch/calc_traffic/20230917_152940
drwxr-xr-x   - root supergroup          0 2023-09-17 18:30 /user/root/bus/batch/calc_traffic/20230917_153020
drwxr-xr-x   - root supergroup          0 2023-09-17 18:30 /user/root/bus/batch/calc_traffic/20230917_153040


In [191]:
! hadoop fs -rm -skipTrash -r /user/root/bus/stream/*

Deleted /user/root/bus/stream/20230917_162108
Deleted /user/root/bus/stream/20230917_162135
Deleted /user/root/bus/stream/20230917_162202
Deleted /user/root/bus/stream/20230917_162229
Deleted /user/root/bus/stream/20230917_162256


In [176]:
def shape(df):
    """Return ``(row_count, column_count)`` for a DataFrame-like object.

    The row count comes from ``df.count()`` and the column count from the
    length of ``df.columns``.
    """
    n_rows = df.count()
    n_cols = len(df.columns)
    return (n_rows, n_cols)

In [177]:
# Query template for the GPS endpoint; the four placeholders are filled with
# ``str.format`` using the start/end date and time of the requested window.
BASE_URL = (
    "https://dados.mobilidade.rio/gps/sppo?"
    "dataInicial={dt_inicial}+{hora_inicial}&dataFinal={dt_final}+{hora_final}"
)

In [178]:
def call_bus_api(interval=20, timeout=30):
    """Fetch the last ``interval`` seconds of bus GPS data and land it as JSON.

    The query window ends at the current time expressed in UTC-3 and starts
    ``interval`` seconds earlier. The response body is parsed into a Spark
    DataFrame, tagged with a constant ``hora`` column (window end, formatted
    ``HH:MM:SS``) and written to ``/user/root/bus/stream/<YYYYmmdd_HHMMSS>``.

    Args:
        interval: Length of the query window in seconds.
        timeout: Seconds to wait for the HTTP request before giving up.

    Raises:
        requests.exceptions.RequestException: On connection errors, timeouts,
            or a 4xx/5xx HTTP status.
    """
    # Use an explicit UTC-3 timezone so the window does not depend on whatever
    # local timezone the host happens to be configured with.
    utc_minus_3 = dt.timezone(-dt.timedelta(hours=3))
    dtf = dt.datetime.now(utc_minus_3)
    dti = dtf - dt.timedelta(seconds=interval)
    hora_final = dtf.strftime("%H:%M:%S")

    url = BASE_URL.format(
        dt_inicial=dti.strftime("%Y-%m-%d"),
        dt_final=dtf.strftime("%Y-%m-%d"),
        hora_inicial=dti.strftime("%H:%M:%S"),
        hora_final=hora_final,
    )

    # Without a timeout a stalled connection would block the caller forever.
    ret = requests.get(url, timeout=timeout)
    # Fail loudly instead of writing an HTTP error body into the stream folder.
    ret.raise_for_status()

    rdd = spark.sparkContext.parallelize([ret.text])
    df = spark.read.json(rdd).withColumn("hora", lit(hora_final))
    df.write.json(f"/user/root/bus/stream/{dtf.strftime('%Y%m%d_%H%M%S')}")

In [179]:
# Names of the tables that `create_table` writes to. Only the first one is
# active; the remaining names are kept here, disabled:
#   "percent_above_60km", "qty_bus_running_by_company", "qty_bus_running_by_line"
tables = ["traffic_vs_garage"]

In [180]:
def create_table(df, table_names=None):
    """Append ``df`` to each target PostgreSQL table over JDBC.

    The user and password are read from the ``PG_USER`` and ``PG_PASSWORD``
    environment variables, falling back to ``"postgres"`` when they are unset
    so existing setups keep working.

    Args:
        df: DataFrame to write; the same frame is appended to every table.
        table_names: Optional iterable of table names. When omitted, the
            module-level ``tables`` list is used.
    """
    if table_names is None:
        table_names = tables

    # Connection settings do not depend on the table, so build them once.
    jdbc_url = "jdbc:postgresql://pg-data:5433/postgres"
    properties = {
        "user": os.environ.get("PG_USER", "postgres"),
        "password": os.environ.get("PG_PASSWORD", "postgres"),
        "driver": "org.postgresql.Driver",
    }

    for table in table_names:
        # The table name is wrapped in double quotes before reaching the writer.
        database_table = f'"{table}"'
        df.write.jdbc(url=jdbc_url, table=database_table, mode="append", properties=properties)

In [194]:
def calc_traffic(df, dtt):
    """Compute garage vs. traffic percentages per ``hora`` and store them.

    Rows whose ``linha`` equals ``"GARAGEM"`` are flagged as garage rows; every
    other row counts as traffic. For each ``hora`` value both shares are
    expressed as percentages rounded to two decimals. The result is printed,
    passed to ``create_table`` and written as Parquet to
    ``/user/root/bus/batch/calc_traffic/<dtt>``.

    Args:
        df: Input DataFrame; must provide ``linha`` and ``hora`` columns.
        dtt: Name of the output folder under the ``calc_traffic`` directory.
    """
    flagged = df.withColumn(
        "is_garage", when(col("linha") == "GARAGEM", 1).otherwise(0)
    )

    # `sum`, `count` and `round` are the pyspark.sql.functions versions imported
    # at the top of the file, not the Python builtins.
    result = flagged.groupBy("hora").agg(
        round((sum("is_garage") / count("*") * 100), 2).alias("percent_garage"),
        round(((count("*") - sum("is_garage")) / count("*") * 100), 2).alias("percent_traffic")
    )

    # Three separate actions consume the result (show, JDBC append, Parquet
    # write); persisting it avoids recomputing the aggregation for each one.
    result.persist()
    try:
        result.show()
        create_table(result)
        result.write.parquet(f"/user/root/bus/batch/calc_traffic/{dtt}")
    finally:
        result.unpersist()
    

In [195]:
def handle_stream(bdf, ctrl):
    """``foreachBatch`` callback: timestamp the micro-batch and run ``calc_traffic``.

    Args:
        bdf: DataFrame handed over by ``foreachBatch``.
        ctrl: Second argument supplied by ``foreachBatch``; not used here.
    """
    # Take the wall-clock time in an explicit UTC-3 timezone so the folder name
    # does not depend on the host's local timezone configuration.
    utc_minus_3 = dt.timezone(-dt.timedelta(hours=3))
    dtt = dt.datetime.now(utc_minus_3).strftime('%Y%m%d_%H%M%S')
    calc_traffic(bdf, dtt)
        

In [196]:
# Explicit schema for the JSON reads: every field is a nullable string.
# NOTE(review): `percent_garage` / `percent_traffic` are also the names of the
# columns computed by `calc_traffic`; confirm they are expected in the input.
schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in ("linha", "hora", "percent_garage", "percent_traffic")
])

In [197]:
def loop(max_iter=5, sleep_seconds=25):
    """Call ``call_bus_api`` ``max_iter`` times, pausing between calls.

    A progress line is printed before each call. No pause happens after the
    final call. When ``max_iter`` is less than 1 nothing is fetched and the
    function returns immediately (previously such values looped forever).

    Args:
        max_iter: Number of API calls to perform.
        sleep_seconds: Pause between consecutive calls, in seconds.
    """
    for idx in range(1, max_iter + 1):
        print(f"{idx} - fetching API")
        call_bus_api()
        if idx < max_iter:
            time.sleep(sleep_seconds)

In [198]:
# Streaming JSON source over the stream landing folder, read with the explicit
# `schema` defined earlier.
sdf = (
    spark.readStream
    .schema(schema)
    .json("/user/root/bus/stream/*")
)

In [199]:
# Start the streaming query: append mode, a 20-second processing-time trigger,
# and each batch handed to `handle_stream`.
stream_handle = (
    sdf.writeStream
    .outputMode("append")
    .foreachBatch(handle_stream)
    .trigger(processingTime="20 seconds")
    .start()
)

In [200]:
stream_handle.isActive

True

In [201]:
stream_handle.status

{'message': 'Waiting for next trigger',
 'isDataAvailable': False,
 'isTriggerActive': False}

In [202]:
stream_handle.lastProgress

{'id': '5b3fe05d-ac86-4adb-884d-80e62b5b0133',
 'runId': '1cd0f46c-d35f-4938-9835-5be339bbba01',
 'name': None,
 'timestamp': '2023-09-17T19:25:32.967Z',
 'batchId': 0,
 'numInputRows': 0,
 'processedRowsPerSecond': 0.0,
 'durationMs': {'getOffset': 1, 'triggerExecution': 2},
 'stateOperators': [],
 'sources': [{'description': 'FileStreamSource[hdfs://node-master:9000/user/root/bus/stream/*]',
   'startOffset': None,
   'endOffset': None,
   'numInputRows': 0,
   'processedRowsPerSecond': 0.0}],
 'sink': {'description': 'ForeachBatchSink'}}

In [203]:
# Run `loop` (with its default arguments) in a separate process; the handle is
# kept in `p` so the process can be terminated later.
p = Process(target=loop)
p.start()

1 - fetching API
+--------+--------------+---------------+
|    hora|percent_garage|percent_traffic|
+--------+--------------+---------------+
|16:25:34|         15.25|          84.75|
+--------+--------------+---------------+

2 - fetching API
3 - fetching API
4 - fetching API
5 - fetching API


In [192]:
# Terminate the process held in `p` (the one created with `Process(target=loop)`).
p.terminate()

In [193]:
# Stop the streaming query held in `stream_handle`.
stream_handle.stop()

In [169]:
diretorio = "/user/root/bus/batch/calc_traffic"

# The run folder is named explicitly: `os.listdir` only sees the local
# filesystem, so it cannot enumerate this HDFS directory.
dest_folder = f"{diretorio}/20230917_152900"
df = spark.read.parquet(dest_folder)

# Display the DataFrame that was read
df.show()


+--------------+---------------+
|percent_garage|percent_traffic|
+--------------+---------------+
|         11.47|          88.53|
+--------------+---------------+



In [166]:
# Glob patterns used by the ad-hoc reads below.
# NOTE(review): these point at `/user/root/onibus/...` while the pipeline cells
# use `/user/root/bus/...`; confirm which location is intended.
dir_batch = "/user/root/onibus/batch/*/*"
dir_stream = "/user/root/onibus/stream/*/*"

In [167]:
# Batch (non-streaming) JSON read of `dir_stream` using the explicit `schema`.
dfc = spark.read.schema(schema).json(dir_stream)

AnalysisException: 'Path does not exist: hdfs://node-master:9000/user/root/onibus/stream/*/*;'

In [168]:
dfc.show()

NameError: name 'dfc' is not defined

In [30]:
shape(dfc)

(2875, 8)

In [31]:
# CSV read of `dir_batch` with no schema or header options; the recorded output
# shows the columns coming back as strings named `_c0` … `_c7`.
dfc2 = spark.read.csv(dir_batch)

In [32]:
dfc2.show()

+------+---------+---------+-------------+---+------+-------------+-------------+
|   _c0|      _c1|      _c2|          _c3|_c4|   _c5|          _c6|          _c7|
+------+---------+---------+-------------+---+------+-------------+-------------+
|B28555| -22.7981|-43.19024|1694717767000|  0|   329|1694717772000|1694717774000|
|B28738|-22.82059|-43.17902|1694717357000| 16|  2343|1694717772000|1694717774000|
|B28733|  -22.897|-43.18719|1694717766000| 11|  2343|1694717772000|1694717774000|
|B28702|-22.90718| -43.1739|1694717760000|  9|  2344|1694717772000|1694717774000|
|B28738|-22.82079|-43.17818|1694717373000| 23|  2343|1694717772000|1694717774000|
|B28702|-22.90713|-43.17395|1694717766000|  7|  2344|1694717772000|1694717774000|
|B28602|-22.89278|-43.19396|1694717764000| 72|   492|1694717772000|1694717774000|
|B28738| -22.8215|-43.17757|1694717403000| 18|  2343|1694717772000|1694717774000|
|B28738|-22.82335|-43.17601|1694717433000| 28|  2343|1694717772000|1694717774000|
|B28522|-22.7955

In [39]:
shape(dfc2)

(94758, 8)

In [40]:
dfc2.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)

