In [1]:
import requests
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import *
from pyspark.sql.types import *
import time
import json
from delta import DeltaTable

# Build (or reuse) the notebook's Spark session: Hive support on an external
# metastore, with the Delta Lake SQL extension and catalog enabled.
spark = (
    SparkSession.builder
    # Name the app
    .appName('lab')
    .enableHiveSupport()
    # Set external Hive Metastore
    .config("hive.metastore.uris", "thrift://metastore:9083")
    # Turn off metastore schema verification (prevents some errors)
    .config("hive.metastore.schema.verification", "false")
    # Eager evaluation in the REPL/notebook
    .config("spark.sql.repl.eagerEval.enabled", True)
    # Delta Lake SQL extension and catalog implementation
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

In [4]:
def autenticar(token=None):
    """Log in to the Olho Vivo API and return the authenticated session.

    Args:
        token: API token. When omitted, the ``SPTRANS_API_TOKEN`` environment
            variable is used, and only then the legacy token that used to be
            embedded in the URL.

    Returns:
        The ``requests.Session`` that performed the login (left open so the
        caller can keep using it), or ``None`` when the login request does not
        answer 200 or the API answers ``false``.

    Raises:
        Whatever ``session.post`` / ``response.json`` raise; the session is
        closed before the exception propagates.
    """
    import os

    # SECURITY NOTE(review): this literal is a credential committed to the
    # notebook. It is kept only so existing `autenticar()` calls keep working;
    # rotate it and supply the new value through SPTRANS_API_TOKEN instead.
    token = (
        token
        or os.environ.get("SPTRANS_API_TOKEN")
        or "fa1ae741481d20625673b2020fdd07bcfdcf5d60f27d226a069812a94de3edd0"
    )
    api_url = f"https://api.olhovivo.sptrans.com.br/v2.1/Login/Autenticar?token={token}"

    session = requests.Session()
    try:
        # A single POST: its response is reused for both the status check and
        # the failure message (no second request just to print the status).
        response = session.post(api_url)

        if response.status_code == 200:
            # The Olho Vivo docs describe the login body as a JSON boolean;
            # an explicit `false` means the token was rejected.
            # NOTE(review): confirm against the current API documentation.
            if response.json() is not False:
                return session
            print("Falha ao autenticar na API: token rejeitado.")
        else:
            print(f"Falha ao acessar a API. Status Code: {response.status_code}")
    except Exception:
        session.close()
        raise

    # Only a session that failed to log in is closed; a successful one is
    # handed to the caller still open.
    session.close()
    return None

def callAPIGet(api_url, session):
    """GET ``api_url`` with ``session`` and return the JSON body as a list.

    Args:
        api_url: URL to request.
        session: Object exposing ``get(url)`` (e.g. the session returned by
            ``autenticar``). It is owned by the caller and is NOT closed here,
            so it can be reused for further calls.

    Returns:
        The decoded JSON when it is a list, ``[body]`` when it is a dict, and
        ``None`` when the status is not 200 or the body is any other type.
    """
    # One request only: the same response provides the status and the body.
    response = session.get(api_url)

    if response.status_code != 200:
        # Report the status of the GET that actually failed.
        print(f"Falha ao acessar a API. Status Code: {response.status_code}")
        return None

    data = response.json()
    if isinstance(data, list):
        return data
    if isinstance(data, dict):
        return [data]
    return None

def obterPrevisaoParada(stop_id, session, espera_segundos=60, max_tentativas=None):
    """Fetch the arrival forecast for one stop, retrying while throttled.

    Args:
        stop_id: Stop code sent as ``codigoParada``.
        session: Object exposing ``get(url)`` (e.g. the session returned by
            ``autenticar``).
        espera_segundos: Seconds to sleep after a 429 before retrying.
            Defaults to 60, the value previously hard-coded.
        max_tentativas: Maximum number of requests before giving up on a
            throttled stop. ``None`` (default) retries indefinitely, as before.

    Returns:
        ``{"stop_id": stop_id, **body}`` for a 200 response, or ``None`` for
        any status other than 200/429, or when ``max_tentativas`` is reached.
    """
    url = f"https://api.olhovivo.sptrans.com.br/v2.1/Previsao/Parada?codigoParada={stop_id}"
    tentativa = 0

    while True:
        tentativa += 1
        r = session.get(url)

        if r.status_code == 200:
            return {"stop_id": stop_id, **r.json()}

        if r.status_code != 429:
            return None

        # Throttled (429): give up if the attempt budget is spent, otherwise
        # back off before asking again.
        if max_tentativas is not None and tentativa >= max_tentativas:
            return None
        time.sleep(espera_segundos)

In [5]:
# Schema of one forecast record: the requested stop id, the `hr` string and the
# nested `p` struct, which carries an array `l` whose elements each carry an
# array `vs`. Every field is nullable.
# NOTE(review): field meanings follow the Olho Vivo "Previsao/Parada" payload;
# confirm them against the API documentation.
previsao_schema = (
    StructType()
    .add("stop_id", StringType(), True)
    .add("hr", StringType(), True)
    .add(
        "p",
        StructType()
        .add("cp", LongType(), True)
        .add("np", StringType(), True)
        .add("py", FloatType(), True)
        .add("px", FloatType(), True)
        .add(
            "l",
            ArrayType(
                StructType()
                .add("c", StringType(), True)
                .add("cl", IntegerType(), True)
                .add("sl", IntegerType(), True)
                .add("lt0", StringType(), True)
                .add("lt1", StringType(), True)
                .add("qv", IntegerType(), True)
                .add(
                    "vs",
                    ArrayType(
                        StructType()
                        .add("p", StringType(), True)
                        .add("a", BooleanType(), True)
                        .add("ta", StringType(), True)
                        .add("py", FloatType(), True)
                        .add("px", FloatType(), True)
                    ),
                    True,
                )
            ),
            True,
        ),
        True,
    )
)

In [34]:
# Log in once and keep the session for the API calls made in later cells.
# NOTE: autenticar() returns None when the login request does not answer 200,
# so check `session` before relying on it.
session = autenticar()


In [8]:
# Count the stops that have a description.
# NOTE(review): within this notebook `paradas` is only assigned in a later cell
# (the spark.read.csv of the GTFS stops), so this cell depends on that cell
# having been run first.
paradas.filter("stop_desc is not null").count()

20352

In [35]:
# Disabled one-off job, kept for reference: it requested the forecast for every
# stop with a description, serialized each result as a JSON string in a single
# `data` column and wrote it to s3a://raw/temp/test_paradas/ as parquet.
# paradas = spark.read.csv("s3a://raw/GTFS/paradas/*.txt", header=True)
# paradas_rdd = paradas.filter("stop_desc is not null").select("stop_id").rdd
# previsao = paradas_rdd.map(lambda x: x.asDict()["stop_id"]).map(lambda x: obterPrevisaoParada(x, session)).map(lambda x: Row(data=json.dumps(x)))
# previsao_df = spark.createDataFrame(previsao, StructType([StructField("data", StringType(), True)]))
# previsao_df.write.mode("overwrite").parquet("s3a://raw/temp/test_paradas/")

In [40]:
# Load the parquet files stored under s3a://raw/temp/test_paradas/ and print
# how many rows they contain.
previsao_df = spark.read.parquet("s3a://raw/temp/test_paradas/")
print(previsao_df.count())

20352


In [46]:
# Parse the JSON string held in `data` with previsao_schema and promote the
# parsed struct's fields to top-level columns.
parsed_previsao = from_json(col("data"), previsao_schema).alias("dd")
aa = previsao_df.select(parsed_previsao).select("dd.*")

# Spot check a single stop (260016860) without truncating the nested columns.
aa.where(col("stop_id") == '260016860').show(truncate=False)

+---------+-----+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|stop_id  |hr   |p              

----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 50158)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.10/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.10/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.10/socketserver.py", line 747, in __init__
    self.handle()
  File "/usr/local/spark/python/pyspark/accumulators.py", line 281, in handle
    poll(accum_updates)
  File "/usr/local/spark/python/pyspark/accumulators.py", line 253, in poll
    if func():
  File "/usr/local/spark/python/pyspark/accumulators.py", line 257, in accum_updates
    num_updates = read_int(self.rfile)
  File "/usr/local/spark/python/

In [14]:
# GTFS stops; only those with a description are queried.
paradas = spark.read.csv("s3a://raw/GTFS/paradas/*.txt", header=True)
paradas_rdd = paradas.filter("stop_desc is not null").select("stop_id").rdd

# Request the forecast of every selected stop. obterPrevisaoParada returns None
# when the API answers with a status other than 200/429; those results are
# dropped so that only dict payloads reach createDataFrame with previsao_schema.
previsao = (
    paradas_rdd
    .map(lambda row: obterPrevisaoParada(row["stop_id"], session))
    .filter(lambda previsao_parada: previsao_parada is not None)
)

# Cache before counting so the API is not queried again by later actions on
# this DataFrame.
previsao_df = spark.createDataFrame(previsao, previsao_schema).cache()
print(previsao_df.count())

10000


In [15]:
# Preview the fetched forecasts (show() prints the first 20 rows).
previsao_df.show()

+-------+-----+--------------------+
|stop_id|   hr|                   p|
+-------+-----+--------------------+
|  19073|20:48|                null|
| 104763|20:48|                null|
| 105268|20:48|{105268, , -23.57...|
| 105368|20:48|{105368, , -23.57...|
| 105388|20:48|{105388, , -23.57...|
| 105424|20:48|{105424, , -23.57...|
| 105431|20:48|{105431, , -23.56...|
| 105432|20:48|{105432, , -23.56...|
| 109574|20:48|{109574, , -23.55...|
| 109576|20:48|{109576, , -23.55...|
| 109594|20:48|                null|
| 109595|20:48|{109595, , -23.56...|
| 109731|20:48|{109731, , -23.56...|
| 109733|20:48|{109733, , -23.55...|
| 109738|20:48|{109738, , -23.56...|
| 109739|20:48|                null|
| 109740|20:48|                null|
| 109741|20:48|{109741, , -23.56...|
| 109751|20:48|{109751, , -23.57...|
| 109754|20:48|                null|
+-------+-----+--------------------+
only showing top 20 rows



In [6]:
# Register stage.teste2 as a Delta table over s3a://stage/teste/.
# `if not exists` keeps the cell re-runnable: without it a second execution
# fails because the table is already registered.
spark.sql("create external table if not exists stage.teste2 (id string) using delta location 's3a://stage/teste/' ").show()

++
||
++
++



In [14]:
# List the tables registered in the `stage` database.
spark.sql("show tables from stage").show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|    stage| corredor|      false|
|    stage|    teste|      false|
+---------+---------+-----------+



In [29]:
# One-row sample DataFrame (single string column `id`) used to try out Delta
# table writes.
# NOTE(review): the name `a` is reassigned to a different DataFrame in a later
# cell (the explode of df_posicao), so its meaning depends on execution order.
a = spark.createDataFrame([{"id": "aa"}])

a.show()

+---+
| id|
+---+
| aa|
+---+



In [32]:
# Remove the stage.teste_external table.
# `if exists` keeps the cell re-runnable: without it the statement fails once
# the table has already been dropped.
spark.sql("drop table if exists stage.teste_external").show()

++
||
++
++



----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 42990)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.10/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.10/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.10/socketserver.py", line 747, in __init__
    self.handle()
  File "/usr/local/spark/python/pyspark/accumulators.py", line 281, in handle
    poll(accum_updates)
  File "/usr/local/spark/python/pyspark/accumulators.py", line 253, in poll
    if func():
  File "/usr/local/spark/python/pyspark/accumulators.py", line 257, in accum_updates
    num_updates = read_int(self.rfile)
  File "/usr/local/spark/python/

In [30]:
# Save the sample DataFrame `a` as the Delta table stage.teste. The write stays
# best-effort (a failure does not stop the notebook), but the reason is now
# printed instead of being silently discarded, and KeyboardInterrupt /
# SystemExit are no longer swallowed as they were by the bare `except:`.
try:
    a.write.format("delta").saveAsTable("stage.teste")
except Exception as erro:
    print(f"Falha ao gravar a tabela stage.teste: {erro}")

In [4]:
from datetime import datetime

# Today's local date as a YYYY-MM-DD string (format spec instead of strftime).
f"{datetime.now():%Y-%m-%d}"

'2024-09-19'

In [3]:
# Partition date (dt=YYYY-MM-DD) of the raw position dump to load. The original
# f-string had no placeholder; the date is now the interpolated value, so the
# resulting path is unchanged and the day can be switched in one place.
dt_posicao = "2024-09-21"
df_posicao = spark.read.json(f"s3a://raw/olhovivo/posicao/dt={dt_posicao}/")
df_posicao

hr,l
17:17,"[{2702-10, 33682,..."
18:07,"[{2702-10, 914, M..."
18:12,"[{2702-10, 914, M..."
18:15,"[{2702-10, 914, M..."
18:13,"[{2702-10, 914, M..."
18:14,"[{2702-10, 914, M..."


In [4]:
# One row per element of the `l` array, with that element's struct fields
# promoted to top-level columns.
a = (
    df_posicao
    .select(explode(col("l")).alias("l"))
    .select("l.*")
)
a.show()

+-------+-----+--------------------+-------------------+---+---+--------------------+
|      c|   cl|                 lt0|                lt1| qv| sl|                  vs|
+-------+-----+--------------------+-------------------+---+---+--------------------+
|2702-10|33682|   METRÔ ARTUR ALVIM|      VL. AMERICANA|  5|  2|[{true, 36281, -4...|
|6016-41| 1706|        TERM. GRAJAÚ|    JD. PORTO VELHO|  2|  1|[{true, 66789, -4...|
|307C-10|34882|   METRÔ ARTUR ALVIM|CONJ. ENCOSTA NORTE|  4|  2|[{true, 32139, -4...|
|5757-10| 1288|     METRÔ CONCEIÇÃO|         CID. JÚLIA|  5|  1|[{true, 68054, -4...|
|6036-10| 1214|     TERM. CAPELINHA|      JD. MACEDÔNIA|  5|  1|[{true, 78578, -4...|
|5010-10| 1114|          STO. AMARO|          JABAQUARA|  2|  1|[{true, 68033, -4...|
|5106-31|34073|      METRÔ ANA ROSA|          JD. SELMA|  3|  2|[{true, 63594, -4...|
|6026-10|   87|    TERM. STO. AMARO|         JD. ICARAÍ|  3|  1|[{true, 61269, -4...|
|527R-10|  271|    TERM. STO. AMARO|        VL. IMPÉRI

In [None]:
# `Window` is not brought in by any of the notebook's imports, so it is
# imported here; without it this cell raises NameError.
from pyspark.sql import Window

# Keep, for each value of Empresa.a, only the row with the greatest Hora:
# number the rows of each partition by Hora descending and keep number 1.
# NOTE(review): `df_empresa` is not defined in the visible part of this
# notebook — confirm where it comes from.
window_spec = Window.partitionBy("Empresa.a").orderBy(col("Hora").desc())
df_empresa_tratado = (
    df_empresa
    .withColumn("row_num", row_number().over(window_spec))
    .filter(col("row_num") == 1)
    .drop("row_num")
)