# Gerar Silver

In [1]:
import os
# Set in its own cell so it is in place before pyspark.pandas is imported in the next cell.
# NOTE(review): presumably silences the pandas-on-Spark PyArrow timezone warning — confirm.
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"

In [2]:
import ipywidgets as widgets
import matplotlib.pyplot as plt
import pyspark.pandas as ps
import seaborn as sns

from delta import *

from matplotlib import dates
from pyspark.sql.avro.functions import *
from pyspark.sql.functions import col, to_date, date_format, current_timestamp
from pyspark.sql.types import StringType, DateType, StructType, DoubleType, IntegerType, LongType, TimestampType
#
# Spark application name.
# This notebook builds the silver layer, so the application is named accordingly
# ("GerarBronze" looked like a leftover from the bronze notebook).
# NOTE(review): presumably consumed by StartSpark.ipynb when it creates the session — confirm.
#
APP_NAME = "GerarSilver"
# Cap pandas-on-Spark displays at 10 rows.
ps.options.display.max_rows = 10

In [3]:
# Run the shared StartSpark notebook in this kernel.
# NOTE(review): later cells rely on a `spark` session; presumably it is created there — confirm.
%run StartSpark.ipynb

In [4]:
# Turn on lazy execution for the SQL magic.
# NOTE(review): presumably makes %sql return Spark DataFrames instead of materialized results — confirm.
%config SqlMagic.lazy_execution = True

In [5]:
# NOTE(review): presumably points the SQL magic at the `spark` session — confirm against the SQL extension in use.
%sql spark

# FK

In [6]:
from urllib.parse import urlparse

def delta_exists(delta_path, tier, table):
    url = urlparse(delta_path)

    match url:
        case "abfss":
            print("Blob Storage")
        case "s3":
                pass
        case "s3a":
            print("S3 Compatible")
    

In [7]:
from urllib.parse import urlparse

def delta_exists(delta_path, tier, table):
    """Classify the storage backend behind a Delta table location.

    Only the URL scheme of ``delta_path`` is inspected; nothing is read from
    storage, so this does not verify that a table actually exists there.

    Args:
        delta_path (str): URL or path of the Delta data.
        tier (str): Not used by this function.
        table (str): Not used by this function.

    Returns:
        str | None: ``"Blob Storage"`` for ``abfss``, ``"S3 Compatible"`` for
        ``s3`` and ``s3a``, and ``None`` for any other scheme.
    """
    storage_by_scheme = {
        "abfss": "Blob Storage",
        "s3": "S3 Compatible",
        "s3a": "S3 Compatible",
    }
    return storage_by_scheme.get(urlparse(delta_path).scheme)

In [8]:
def table_path(bucket:str, tier:str, table_name:str, storage:str="s3a", base_dir:str="lakehouse"):
    """Build the storage location of a lakehouse table and of its checkpoint directory.

    Args:
        bucket: Bucket (or container) name.
        tier: Layer segment of the path, e.g. "bronze" or "silver".
            An empty tier is not rejected.
        table_name: Last segment of the table path.
        storage: URL scheme placed in front of the bucket.
        base_dir: Directory directly under the bucket.

    Returns:
        A ``(table_location, checkpoint_location)`` tuple, where the checkpoint
        location is the table location followed by ``/_checkpoint/``.
    """
    root = f"{storage}://{bucket}/{base_dir}"
    table_location = f"{root}/{tier}/{table_name}"
    checkpoint_location = f"{table_location}/_checkpoint/"
    return table_location, checkpoint_location

In [9]:
# (table location, checkpoint location) for each stocks_intraday table.
stock_bronze, stock_bronze_checkpoint_dir = table_path(
    bucket="nemesys-demo1", tier="bronze", table_name="stocks_intraday"
)
stock_silver, stock_silver_checkpoint_dir = table_path(
    bucket="nemesys-demo1", tier="silver", table_name="stocks_intraday"
)
# Companion silver table: same location with a "_dup" suffix.
stock_dup_silver, stock_dup_silver_checkpoint_dir = table_path(
    bucket="nemesys-demo1", tier="silver", table_name="stocks_intraday_dup"
)

In [10]:
# Show each table location next to its checkpoint directory.
for location, checkpoint_dir in (
    (stock_bronze, stock_bronze_checkpoint_dir),
    (stock_silver, stock_silver_checkpoint_dir),
    (stock_dup_silver, stock_dup_silver_checkpoint_dir),
):
    print(location, checkpoint_dir)

s3a://nemesys-demo1/lakehouse/bronze/stocks_intraday s3a://nemesys-demo1/lakehouse/bronze/stocks_intraday/_checkpoint/
s3a://nemesys-demo1/lakehouse/silver/stocks_intraday s3a://nemesys-demo1/lakehouse/silver/stocks_intraday/_checkpoint/
s3a://nemesys-demo1/lakehouse/silver/stocks_intraday_dup s3a://nemesys-demo1/lakehouse/silver/stocks_intraday_dup/_checkpoint/


In [11]:
# spark.sql(f"""
# create table stock_intraday (
#     ticker string,
#     timestamp string,
#     open double,
#     high double,
#     low double,
#     close double,
#     volume long
# )
# using delta location '{stock_dup_silver}'
# """)

In [12]:
# Bootstrap: when no Delta table exists at the silver location yet, write an
# empty one with the silver schema, partitioned by ticker and year ("ano").
if not DeltaTable.isDeltaTable(spark, stock_silver):
    print("Criar tabela")
    schema = StructType()
    for column_name, column_type in (
        ("ticker", StringType()),
        ("ano", IntegerType()),
        ("timestamp", TimestampType()),
        ("open", DoubleType()),
        ("high", DoubleType()),
        ("low", DoubleType()),
        ("close", DoubleType()),
        ("volume", LongType()),
        ("_capture_time", TimestampType()),
        ("_capture_time_silver", TimestampType()),
    ):
        # StructType.add appends the field to `schema` in place.
        schema.add(column_name, column_type)
    emptyDF = spark.createDataFrame(spark.sparkContext.emptyRDD(), schema)
    (
        emptyDF.write
        .format('delta')
        .mode('overwrite')
        .partitionBy("ticker", "ano")
        .save(stock_silver)
    )

# Handle to the silver table; passed to config_upsert as the merge target further down.
deltaTable = DeltaTable.forPath(spark, stock_silver)

Criar tabela


In [13]:
%%time
(spark
    .readStream
    .format("delta")
    .option('startingOffsets', 'earliest')
    .load(stock_bronze)
    .withColumn("_capture_time_silver", current_timestamp())
    .writeStream
    .format('delta')
    .outputMode('append')
    .option('mergeSchema', 'true')
    .option('checkpointLocation', stock_bronze_checkpoint_dir + "silver_dup")
    .trigger(once=True)
    .start(stock_dup_silver)
    .awaitTermination()
)

CPU times: user 16.4 ms, sys: 0 ns, total: 16.4 ms
Wall time: 25.9 s


In [14]:
def config_upsert(delta):
    """Build the ``foreachBatch`` callback that upserts micro-batches into ``delta``.

    Args:
        delta: Merge target; must provide ``alias(...).merge(source, condition)``
            (a ``DeltaTable`` in this notebook).

    Returns:
        A function ``(microbatchdf, batchId) -> None`` that can be passed to
        ``DataStreamWriter.foreachBatch``.
    """
    def upsertToDelta(microbatchdf, batchId):
        """Normalize, de-duplicate and merge one micro-batch on (ticker, timestamp)."""
        print(f'Batch {batchId} com {microbatchdf.count()} linhas')

        # Cast the key columns *before* de-duplicating. Two different source
        # strings can cast to the same timestamp; de-duplicating on the raw
        # values first would keep both and hand the merge several source rows
        # for the same key.
        microbatchdf_typed = (
            microbatchdf
            .withColumn("ticker", col("ticker").cast("string"))
            .withColumn("timestamp", col("timestamp").cast("timestamp"))
        )

        # Keep a single row per key; which duplicate survives is not specified.
        microbatchdf_clean = microbatchdf_typed.dropDuplicates(["ticker", "timestamp"])

        # Upsert: update rows whose key already exists in the target, insert the rest.
        delta.alias("t").merge(
          microbatchdf_clean.alias("s"),
          "s.ticker = t.ticker and s.timestamp = t.timestamp") \
        .whenMatchedUpdateAll() \
        .whenNotMatchedInsertAll() \
        .execute()
        print(f'Exportadas {microbatchdf_clean.count()} linhas')

    return upsertToDelta

In [15]:
%%time
(spark
    .readStream
    .format("delta")
    .option('startingOffsets', 'earliest')
    .load(stock_bronze)
    .withColumn('ano', date_format('timestamp', 'yyyy').cast(IntegerType()))
    .withColumn("_capture_time_silver", current_timestamp())
    .writeStream
    .format('delta')
    .foreachBatch(config_upsert(deltaTable))
    .outputMode('update')
    .option('checkpointLocation', stock_bronze_checkpoint_dir + "silver")
    .trigger(once=True)
    .start()
    .awaitTermination()
)

Batch 0 com 4397528 linhas
Exportadas 11148 linhas
CPU times: user 48 ms, sys: 27.8 ms, total: 75.8 ms
Wall time: 1min 47s


## Avaliar processo

In [16]:
# Batch-read both outputs for comparison: the merged silver table and its "_dup" companion.
df = spark.read.load(stock_silver, format="delta")
df_dup = spark.read.load(stock_dup_silver, format="delta")

In [17]:
%%time
# Number of rows in the merged silver table.
df.count()

CPU times: user 3.18 ms, sys: 0 ns, total: 3.18 ms
Wall time: 2.79 s


11148

In [18]:
%%time
# Number of rows in the "_dup" table, which was filled by a plain append.
df_dup.count()

CPU times: user 0 ns, sys: 2.89 ms, total: 2.89 ms
Wall time: 2.59 s


4397528

In [19]:
%%time
# Sanity check: the number of distinct (ticker, timestamp) keys should equal df.count()
# if the merge left exactly one row per key.
df.dropDuplicates(["ticker", "timestamp"]).count()

CPU times: user 3.54 ms, sys: 0 ns, total: 3.54 ms
Wall time: 948 ms


11148

In [20]:
# Schema of the merged silver table.
df.printSchema()

root
 |-- ticker: string (nullable = true)
 |-- ano: integer (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- open: double (nullable = true)
 |-- high: double (nullable = true)
 |-- low: double (nullable = true)
 |-- close: double (nullable = true)
 |-- volume: long (nullable = true)
 |-- _capture_time: timestamp (nullable = true)
 |-- _capture_time_silver: timestamp (nullable = true)



In [21]:
# Schema of the "_dup" table, for comparison with the silver schema.
df_dup.printSchema()

root
 |-- ticker: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- open: double (nullable = true)
 |-- high: double (nullable = true)
 |-- low: double (nullable = true)
 |-- close: double (nullable = true)
 |-- volume: long (nullable = true)
 |-- _capture_time: timestamp (nullable = true)
 |-- _capture_time_silver: timestamp (nullable = true)



In [22]:
%%time
df_dup.where("timestamp > '2024-06-01T10:30:00-03:00'").sort("ticker","timestamp").pandas_api()

CPU times: user 15 ms, sys: 2.34 ms, total: 17.3 ms
Wall time: 74.2 ms


Unnamed: 0,ticker,timestamp,open,high,low,close,volume,_capture_time,_capture_time_silver
0,AAPL,2024-06-13T13:30:00Z,214.720001,215.949997,214.580002,215.229996,52886,2024-06-13 18:03:04.270,2024-06-23 17:40:32.034
1,AAPL,2024-06-13T13:30:00Z,214.720001,215.949997,214.580002,215.229996,52886,2024-06-13 18:04:05.380,2024-06-23 17:40:32.034
2,AAPL,2024-06-13T13:30:00Z,214.720001,215.949997,214.580002,215.229996,52886,2024-06-13 18:05:03.887,2024-06-23 17:40:32.034
3,AAPL,2024-06-13T13:30:00Z,214.720001,215.949997,214.580002,215.229996,52886,2024-06-13 18:06:03.643,2024-06-23 17:40:32.034
4,AAPL,2024-06-13T13:30:00Z,214.720001,215.949997,214.580002,215.229996,52886,2024-06-13 18:07:03.378,2024-06-23 17:40:32.034
5,AAPL,2024-06-13T13:30:00Z,214.720001,215.949997,214.580002,215.229996,52886,2024-06-13 18:08:03.947,2024-06-23 17:40:32.034
6,AAPL,2024-06-13T13:30:00Z,214.720001,215.949997,214.580002,215.229996,52886,2024-06-13 18:09:04.394,2024-06-23 17:40:32.034
7,AAPL,2024-06-13T13:30:00Z,214.720001,215.949997,214.580002,215.229996,52886,2024-06-13 18:10:04.720,2024-06-23 17:40:32.034
8,AAPL,2024-06-13T13:30:00Z,214.720001,215.949997,214.580002,215.229996,52886,2024-06-13 18:11:04.350,2024-06-23 17:40:32.034
9,AAPL,2024-06-13T13:30:00Z,214.720001,215.949997,214.580002,215.229996,52886,2024-06-13 18:12:05.015,2024-06-23 17:40:32.034


# Otimizar Camada Bronze

In [23]:
# deltaTable = DeltaTable.forPath(spark, stock_bronze)
# df = deltaTable.optimize().executeCompaction()

In [24]:
# df.show()

In [25]:
# spark.conf.set("spark.databricks.delta.retentionDurationCheck.enabled", "False")
# deltaTable.vacuum(0)