In [29]:
import datetime
import os

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType
import pyspark.sql.functions as f

In [30]:
# Create (or reuse) the Spark session used by every cell below.
# - adaptive partition coalescing is switched on explicitly;
# - spark.jars.packages names the Maven coordinate of the PostgreSQL JDBC
#   driver that the JDBC read further down refers to as org.postgresql.Driver.
# NOTE(review): getOrCreate() may hand back an already-running session, in
# which case these settings presumably do not take effect -- confirm on a
# fresh kernel.
spark = (
    SparkSession.builder
    .appName("Credit Events Processor")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    .config("spark.jars.packages", "org.postgresql:postgresql:42.2.23")
    .getOrCreate()
)

In [31]:
# Time window for the extraction: the 24 hours ending at the moment this cell runs.
# (`datetime` is imported in the imports cell at the top of the notebook.)
# NOTE(review): datetime.now() is the naive local time of the machine running
# the kernel -- confirm it matches the time zone of card_events.event_datetime.
now = datetime.datetime.now()
lowerBound_ = now - datetime.timedelta(days=1)  # lower bound: 24 hours before `now`
upperBound_ = now
print("lowerBound:", lowerBound_)
print("upperBound:", upperBound_)

lowerBound: 2023-11-15 02:46:21.743090
upperBound: 2023-11-16 02:46:21.743090


In [32]:
# PostgreSQL connection settings.
# Every value can be overridden through the standard libpq environment
# variables so that credentials do not have to live in the notebook; the
# fallbacks are the values this notebook used originally.
host = os.environ.get("PGHOST", "172.23.0.1")
port = os.environ.get("PGPORT", "5432")
database = os.environ.get("PGDATABASE", "postgres")
user = os.environ.get("PGUSER", "postgres")
password = os.environ.get("PGPASSWORD", "postgres")
url = f"jdbc:postgresql://{host}:{port}/{database}"

# Parallel JDBC read of the card_events table, split into 30 range partitions
# on event_datetime.
# NOTE: per the Spark JDBC data source documentation, lowerBound/upperBound
# only decide the partition stride -- they do NOT filter rows. Every row of the
# table is returned; rows outside the window end up in the first or last
# partition. Restricting the data to the window needs an explicit filter.
df = (
    spark.read
    .format("jdbc")
    .options(
        url=url,
        driver="org.postgresql.Driver",
        dbtable="card_events",
        user=user,
        password=password,
    )
    .options(
        partitionColumn="event_datetime",
        # Option values are sent as strings; convert the datetime bounds
        # explicitly instead of relying on an implicit conversion.
        lowerBound=str(lowerBound_),
        upperBound=str(upperBound_),
        numPartitions="30",
    )
    .load()
)

In [33]:
# Print the column names and types of the loaded DataFrame. No schema was
# passed to the reader, so these types come from the JDBC source.
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- user_id: long (nullable = true)
 |-- cc_num: long (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- event_datetime: timestamp (nullable = true)
 |-- event_unix_time: long (nullable = true)
 |-- category: string (nullable = true)
 |-- merchant: string (nullable = true)
 |-- value: decimal(10,2) (nullable = true)
 |-- location: string (nullable = true)
 |-- lon: string (nullable = true)
 |-- lat: string (nullable = true)



In [34]:
# Number of rows in the loaded DataFrame.
# NOTE(review): the JDBC partition bounds do not filter rows (per the Spark
# JDBC docs), so this presumably counts the whole table rather than only the
# last 24 hours -- confirm.
df.count()

1273471

In [35]:
# Re-inspect the schema right before writing; no transformation has been
# applied to df since it was loaded, so it matches the load-time schema.
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- user_id: long (nullable = true)
 |-- cc_num: long (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- event_datetime: timestamp (nullable = true)
 |-- event_unix_time: long (nullable = true)
 |-- category: string (nullable = true)
 |-- merchant: string (nullable = true)
 |-- value: decimal(10,2) (nullable = true)
 |-- location: string (nullable = true)
 |-- lon: string (nullable = true)
 |-- lat: string (nullable = true)



In [None]:
# Persist the extract as Parquet files in the data lake folder, replacing
# whatever was previously written at this path ("overwrite" save mode).
df.write.parquet("/home/jovyan/work/datalake/card_events", mode="overwrite")

In [None]:
""" Exercício 2:
Coleta em batch por tempo com transformação para escrita com partições:
    1. Escrever no datalake/credit_events os eventos particionados por "event_datetime".
    2. Criar a coluna de data, do tipo string, `event_date` para utilizar na função `.partitionBy("event_date")`.
    3. Atualizar os dados que serão escritos em D-1, ou seja, sempre inserir os dados das últimas 24h. Deve ser feito um filter("event_date BETWEEN 'lower_date' AND 'upper_date'").
"""