# Gerar Gold

In [1]:
import os
# pandas-on-Spark asks for this variable to be "1" when running with recent PyArrow
# versions. It lives in its own cell so it is already set when pyspark.pandas is
# imported in the next cell.
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"

In [2]:
import ipywidgets as widgets
import matplotlib.pyplot as plt
import pyspark.pandas as ps
import seaborn as sns

from delta import *

from matplotlib import dates
from pyspark.sql.avro.functions import *
from pyspark.sql.functions import col, to_date, date_format, current_timestamp, lag, to_timestamp, when, lit
from pyspark.sql.types import StringType, DateType, StructType, DoubleType, IntegerType, LongType, TimestampType
from pyspark.sql.window import Window
#
# Spark application name (presumably consumed by StartSpark.ipynb — TODO confirm).
# NOTE(review): the value says "GerarBronze" although this notebook reads the silver
# table and merges into gold; it looks like a copy-paste leftover. Confirm nothing
# depends on the current name before renaming it.
#
APP_NAME="GerarBronze"
# Show at most 10 rows whenever a pandas-on-Spark frame is displayed.
ps.options.display.max_rows = 10

In [3]:
%run StartSpark.ipynb

In [4]:
%config SqlMagic.lazy_execution = True

In [5]:
%sql spark

In [6]:
def table_path(bucket:str, tier:str, table_name:str, storage:str="s3a", base_dir:str="lakehouse"):
    """Build the location of a lakehouse table and of its checkpoint directory.

    Args:
        bucket: Bucket (authority part of the URI) that holds the lakehouse.
        tier: Lakehouse tier folder, e.g. "silver" or "gold".
        table_name: Folder name of the table inside the tier.
        storage: URI scheme placed before "://".
        base_dir: Root folder of the lakehouse inside the bucket.

    Returns:
        A ``(table_location, checkpoint_location)`` tuple of strings. The
        checkpoint location is the table location followed by "/_checkpoint/".
    """
    # Arguments are interpolated as-is; nothing is validated, so an empty
    # segment simply yields two consecutive slashes in the result.
    location = "{}://{}/{}/{}/{}".format(storage, bucket, base_dir, tier, table_name)
    checkpoint_location = location + "/_checkpoint/"
    return location, checkpoint_location

In [7]:
# Resolve, for each tier, the Delta table location and its checkpoint directory.
stock_silver, stock_silver_checkpoint_dir = table_path(
    bucket="nemesys-demo1",
    tier="silver",
    table_name="stocks_intraday",
)
stock_gold, stock_gold_checkpoint_dir = table_path(
    bucket="nemesys-demo1",
    tier="gold",
    table_name="stocks_intraday",
)

# Echo the resolved paths so the run records where it reads from and writes to.
print(stock_silver, stock_silver_checkpoint_dir)
print(stock_gold, stock_gold_checkpoint_dir)

s3a://nemesys-demo1/lakehouse/silver/stocks_intraday s3a://nemesys-demo1/lakehouse/silver/stocks_intraday/_checkpoint/
s3a://nemesys-demo1/lakehouse/gold/stocks_intraday s3a://nemesys-demo1/lakehouse/gold/stocks_intraday/_checkpoint/


In [8]:
# Load the silver Delta table; it is the only input of the gold computation.
df = spark.read.format("delta").load(stock_silver)

In [9]:
df.count()

27790

In [10]:
# Expose the silver DataFrame to Spark SQL under the name the gold query selects from.
df.createOrReplaceTempView("silver_stocks")

In [11]:
# Build the gold rows from the silver_stocks view.
#
# - osc: close minus the previous close of the same ticker (rows ordered by timestamp);
#   coalesce turns a NULL difference (e.g. the first row of each ticker) into 0.
# - osc_per: osc expressed as a percentage of that previous close, again 0 when NULL.
# - _capture_time_gold: timestamp taken when this query is evaluated.
# - _delta_*: difference in seconds (unix_timestamp) between consecutive capture stages.
#
# NOTE(review): osc_per and _delta_silver_gold refer to aliases (osc, _capture_time_gold)
# defined earlier in the same SELECT list. Unless silver_stocks already has columns with
# those names, this relies on lateral column alias resolution, which older Spark
# versions do not support — confirm the minimum Spark version of the cluster.
df_stocks = spark.sql("""
select
    ticker,
    ano,
    timestamp,
    open,
    high,
    low,
    close,
    volume,
    coalesce((close - LAG(close,1) OVER (PARTITION BY ticker ORDER BY timestamp)), 0) AS osc,
    coalesce((osc * 100.0 / LAG(close,1) OVER (PARTITION BY ticker ORDER BY timestamp)), 0) as osc_per,
    _capture_time_kafka,
    _capture_time_bronze,
    _capture_time_silver,
    current_timestamp as _capture_time_gold,
    unix_timestamp(_capture_time_bronze) - unix_timestamp(_capture_time_kafka) as _delta_kafka_bronze,
    unix_timestamp(_capture_time_silver) - unix_timestamp(_capture_time_bronze) as _delta_bronze_silver,
    unix_timestamp(_capture_time_gold) - unix_timestamp(_capture_time_silver) as _delta_silver_gold
from silver_stocks
""")

In [12]:
# First run only: when no Delta table exists at the gold location, create an empty one
# with the gold schema, partitioned by ticker and ano, so the handle below can be opened.
if not DeltaTable.isDeltaTable(spark, stock_gold):
    print("Criar tabela")

    schema = StructType()
    # Columns selected unchanged from silver_stocks.
    schema.add("ticker", StringType())
    schema.add("ano", IntegerType())
    schema.add("timestamp", TimestampType())
    schema.add("open", DoubleType())
    schema.add("high", DoubleType())
    schema.add("low", DoubleType())
    schema.add("close", DoubleType())
    schema.add("volume", LongType())
    # Columns computed by the gold query.
    schema.add("osc", DoubleType())
    schema.add("osc_per", DoubleType())
    # Capture timestamp of each pipeline stage.
    schema.add("_capture_time_kafka", TimestampType())
    schema.add("_capture_time_bronze", TimestampType())
    schema.add("_capture_time_silver", TimestampType())
    schema.add("_capture_time_gold", TimestampType())
    # Differences between consecutive capture timestamps.
    schema.add("_delta_kafka_bronze", IntegerType())
    schema.add("_delta_bronze_silver", IntegerType())
    schema.add("_delta_silver_gold", IntegerType())

    emptyDF = spark.createDataFrame(spark.sparkContext.emptyRDD(), schema)
    (
        emptyDF.write
        .format("delta")
        .mode("overwrite")
        .partitionBy("ticker", "ano")
        .save(stock_gold)
    )

# Handle on the gold table (pre-existing or just created) used as the merge target.
deltaTable = DeltaTable.forPath(spark, stock_gold)

Criar tabela


In [13]:
%%time
# Upsert the computed rows into the gold table, matching on (ticker, timestamp):
# matched rows have every column overwritten, unmatched rows are inserted.
(
    deltaTable.alias("t")
    .merge(
        df_stocks.alias("s"),
        "s.ticker = t.ticker and s.timestamp = t.timestamp",
    )
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)

CPU times: user 5.54 ms, sys: 5.33 ms, total: 10.9 ms
Wall time: 12.1 s


In [14]:
# Read the gold table back from storage to inspect the result of the merge.
df_teste = spark.read.format("delta").load(stock_gold)

In [15]:
%%time
df_teste.count()

CPU times: user 0 ns, sys: 2.84 ms, total: 2.84 ms
Wall time: 2.38 s


27790

In [16]:
%%time
# Convert to a pandas-on-Spark frame for display; the max_rows option set at the top
# of the notebook limits the display to 10 rows.
df_teste.pandas_api()

CPU times: user 16.4 ms, sys: 2.2 ms, total: 18.6 ms
Wall time: 76.8 ms


Unnamed: 0,ticker,ano,timestamp,open,high,low,close,volume,osc,osc_per,_capture_time_kafka,_capture_time_bronze,_capture_time_silver,_capture_time_gold,_delta_kafka_bronze,_delta_bronze_silver,_delta_silver_gold
0,TSLA,2024,2024-06-13 13:30:00,188.339996,188.339996,186.850006,187.660004,51969,0.0,0.0,2024-06-13 18:03:04.350,2024-06-23 17:50:09.608,2024-06-24 13:19:43.573,2024-06-30 23:03:11.602638,863225,70174,553408
1,TSLA,2024,2024-06-13 13:31:00,187.425003,187.764999,186.365005,186.570007,27054,-1.089996,-0.580836,2024-06-13 18:03:04.350,2024-06-23 17:50:09.608,2024-06-24 13:19:43.573,2024-06-30 23:03:11.602638,863225,70174,553408
2,TSLA,2024,2024-06-13 13:32:00,186.419998,187.059998,186.199997,187.059998,83557,0.48999,0.262631,2024-06-13 18:03:04.350,2024-06-23 17:50:09.608,2024-06-24 13:19:43.573,2024-06-30 23:03:11.602638,863225,70174,553408
3,TSLA,2024,2024-06-13 13:33:00,187.429993,187.699997,186.830002,186.899994,27732,-0.160004,-0.085536,2024-06-13 18:03:04.350,2024-06-23 17:50:09.608,2024-06-24 13:19:43.573,2024-06-30 23:03:11.602638,863225,70174,553408
4,TSLA,2024,2024-06-13 13:34:00,186.919998,187.309998,185.875,186.710007,25161,-0.189987,-0.101652,2024-06-13 18:03:04.350,2024-06-23 17:50:09.608,2024-06-24 13:19:43.573,2024-06-30 23:03:11.602638,863225,70174,553408
5,TSLA,2024,2024-06-13 13:35:00,186.509995,188.0,186.5,187.350006,7246,0.639999,0.342777,2024-06-13 18:03:04.350,2024-06-23 17:50:09.608,2024-06-24 13:19:43.573,2024-06-30 23:03:11.602638,863225,70174,553408
6,TSLA,2024,2024-06-13 13:36:00,187.169998,187.410004,186.649994,186.779999,11454,-0.570007,-0.304247,2024-06-13 18:03:04.351,2024-06-23 17:50:09.608,2024-06-24 13:19:43.573,2024-06-30 23:03:11.602638,863225,70174,553408
7,TSLA,2024,2024-06-13 13:37:00,186.910004,187.570007,186.910004,187.380005,9365,0.600006,0.321237,2024-06-13 18:03:04.351,2024-06-23 17:50:09.608,2024-06-24 13:19:43.573,2024-06-30 23:03:11.602638,863225,70174,553408
8,TSLA,2024,2024-06-13 13:38:00,187.360001,188.649994,187.354996,188.479996,15556,1.099991,0.587037,2024-06-13 18:03:04.351,2024-06-23 17:50:09.608,2024-06-24 13:19:43.573,2024-06-30 23:03:11.602638,863225,70174,553408
9,TSLA,2024,2024-06-13 13:39:00,188.580002,189.690002,188.580002,189.360001,15343,0.880005,0.466896,2024-06-13 18:03:04.351,2024-06-23 17:50:09.608,2024-06-24 13:19:43.573,2024-06-30 23:03:11.602638,863225,70174,553408
