# Explorar a Camada Bronze

Analisar os dados na camada bronze para compreender o cenário e identificar as transformações necessárias para criar a camada Silver

## Importar bibliotecas

In [1]:
import ipywidgets as widgets
import matplotlib.pyplot as plt
import pyspark.pandas as ps
import seaborn as sns

from matplotlib import dates
#
# Spark application name.
# NOTE(review): presumably read by StartSpark.ipynb when it creates the session — confirm.
#
APP_NAME="ExplorarBronze"
# Limit pandas-on-Spark DataFrame displays to 10 rows.
ps.options.display.max_rows = 10



## Inicializar Spark e definir funções de apoio

In [2]:
# Execute the StartSpark.ipynb notebook from this kernel; per the section heading it
# initializes Spark and defines the helper functions used by the cells that follow.
%run StartSpark.ipynb

## Carregar as tabelas da camada Bronze

In [3]:
# Load the bronze-layer stocks data and make it queryable from SQL; the table listing
# in the next section shows it registered as `bronze_stocks`.
# NOTE(review): loadAndRegister is not defined in this notebook (presumably it comes from
# StartSpark.ipynb) — confirm the meaning of each positional argument there.
# The return value is not needed here, so it is bound to `_`.
_ = loadAndRegister("stocks", "StockData", "bronze")

## Listar as tabelas e mostrar algumas informações sobre os dados

Lista de tabelas.

In [4]:
%%sql_display
SHOW TABLES

Unnamed: 0,namespace,tableName,isTemporary
0,,bronze_stocks,False


Quantidade de linhas por tipo de ação e tipo de operação

In [5]:
%%sql_display
SELECT
    ticker,
    __op,
    COUNT(*) AS qtd
FROM bronze_stocks
GROUP BY ticker, __op
ORDER BY ticker, __op

Unnamed: 0,ticker,__op,qtd
0,AAPL,c,22233
1,AAPL,r,810
2,AMZN,c,21660
3,AMZN,r,810
4,GOOG,c,20128
5,GOOG,r,795
6,MSFT,c,19842
7,MSFT,r,783
8,NU,c,19599
9,NU,r,753


Quantidade de entradas por ação

In [6]:
%%time
%%sql_display
SELECT
    ticker,
    COUNT(*) AS qtd
FROM bronze_stocks
GROUP BY ticker
ORDER BY ticker

CPU times: user 9.52 ms, sys: 0 ns, total: 9.52 ms
Wall time: 1.61 s


Unnamed: 0,ticker,qtd
0,AAPL,23043
1,AMZN,22470
2,GOOG,20923
3,MSFT,20625
4,NU,20352
5,TSLA,22748


## Gerar um gráfico de variação no valor das ações

Primeiramente definimos a query

In [7]:
df = %sql select ticker, timestamp - interval 3 hours as timestamp, close, volume from bronze_stocks where timestamp >= current_timestamp - interval 24 hours order by ticker, timestamp

Definição dos elementos da toolbar

In [8]:
# Stock selector; its current value is passed to the plotting callbacks.
ticker = widgets.Dropdown(
    description="Stock:",
    options=["AAPL", "AMZN", "GOOG", "MSFT", "TSLA", "NU"],
    value="AAPL",
)

# Refresh button. `disabled` and `button_style` are left at their defaults
# (enabled, no predefined style); the icon is a FontAwesome name without the `fa-` prefix.
button = widgets.Button(
    description='Refresh',
    tooltip='Refresh Charts',
    icon='arrows-rotate',
)

Funções para plotar os gráficos

In [9]:
def _plotTickerSeries(ticker, column, title):
    """Draw a line chart of one column over time for a single ticker.

    Shared implementation behind plotTickerValue and plotTickerVolume.

    Args:
        ticker: Ticker symbol used to filter the rows (compared against the
            ``ticker`` column).
        column: Name of the column plotted on the y axis.
        title: Chart title.
    """
    # `df` is the notebook-level Spark DataFrame produced by the chart query;
    # it is converted to pandas on every call so seaborn can consume it.
    _df = df.toPandas()

    sns.set_style("ticks", {'axes.grid': True})

    _, ax = plt.subplots()
    sns.lineplot(data=_df[_df.ticker == ticker], x="timestamp", y=column, ax=ax)
    ax.tick_params(axis="x", rotation=45)
    plt.title(title)
    # Compact date+time tick labels so the rotated labels stay readable.
    ax.xaxis.set_major_formatter(dates.DateFormatter("%y-%m-%d %H:%M"))


def plotTickerValue(ticker):
    """Plot the closing price over time for the given ticker symbol."""
    _plotTickerSeries(ticker, "close", f"Variação no Valor das Ações da {ticker}")


def plotTickerVolume(ticker):
    """Plot the traded volume over time for the given ticker symbol."""
    _plotTickerSeries(ticker, "volume", f"Variação no Volume Negociado da {ticker}")

### Renderizar a interface definida

In [10]:
# Toolbar (selector + refresh button) on top, the two charts side by side below.
# interactive_output re-runs each plot function whenever the dropdown value changes.
toolbar = widgets.HBox([ticker, button])
w1 = widgets.interactive_output(plotTickerValue, {"ticker": ticker})
w2 = widgets.interactive_output(plotTickerVolume, {"ticker": ticker})
ui = widgets.HBox([w1, w2])
display(toolbar, ui)


def refresh(b):
    """Redraw both charts for the currently selected ticker.

    Click handler for the Refresh button. Each plot function is re-run inside
    its own Output widget, replacing the previous figure, instead of
    re-displaying the whole UI (which only duplicated the widgets).

    Args:
        b: The Button instance that was clicked (unused).
    """
    for out, plot in ((w1, plotTickerValue), (w2, plotTickerVolume)):
        # wait=True keeps the old chart visible until the new one is ready.
        out.clear_output(wait=True)
        with out:
            plot(ticker.value)
            # Figures created inside a callback are not flushed automatically;
            # show them explicitly so they land in this Output widget.
            plt.show()


button.on_click(refresh)

HBox(children=(Dropdown(description='Stock:', options=('AAPL', 'AMZN', 'GOOG', 'MSFT', 'TSLA', 'NU'), value='A…

HBox(children=(Output(), Output()))

## Preparar transformações para a camada Silver

Apresentar uma amostragem dos dados

In [11]:
%%sql_display
SELECT *
FROM bronze_stocks
ORDER BY ticker, timestamp

Unnamed: 0,_id,ticker,description,timestamp,open,high,low,close,volume,__op,__collection,__ts_ms
0,65df36b58327c289d0a4d9a1,AAPL,,2024-02-01 14:30:00,183.666595,184.086105,183.606705,183.986206,35940,r,stocks,1709129206064
1,65df36b58327c289d0a4d9a2,AAPL,,2024-02-01 14:31:00,184.051102,184.180893,183.796402,184.006195,12306,r,stocks,1709129206064
2,65df36b58327c289d0a4d9a3,AAPL,,2024-02-01 14:32:00,183.966202,184.615402,183.806396,184.395706,10555,r,stocks,1709129206065
3,65df36b58327c289d0a4d9a4,AAPL,,2024-02-01 14:33:00,184.300797,184.785202,184.205902,184.7202,12731,r,stocks,1709129206065
4,65df36b58327c289d0a4d9a5,AAPL,,2024-02-01 14:34:00,184.745193,184.974899,184.585403,184.745193,15014,r,stocks,1709129206065
5,65df36b58327c289d0a4d9a6,AAPL,,2024-02-01 14:35:00,184.865097,185.019897,184.705307,184.984894,17421,r,stocks,1709129206065
6,65df36b58327c289d0a4d9a7,AAPL,,2024-02-01 14:36:00,184.795197,184.925003,184.585403,184.685303,12392,r,stocks,1709129206066
7,65df36b68327c289d0a4d9a8,AAPL,,2024-02-01 14:37:00,184.740204,184.955002,184.640305,184.785202,4108,r,stocks,1709129206066
8,65df36b68327c289d0a4d9a9,AAPL,,2024-02-01 14:38:00,184.835098,184.964905,184.655304,184.705307,4765,r,stocks,1709129206066
9,65df36b68327c289d0a4d9aa,AAPL,,2024-02-01 14:39:00,184.715302,185.084793,184.715302,185.014893,5856,r,stocks,1709129206066


### Transformações

Analisando os dados acima identificamos as seguintes transformações
1. **Particionamento de dados**: visando aumentar a performance, os dados serão particionados por _ticker_ (ação) e pelo dia (_timestamp_ formatado como YYYY-MM-DD). Para isso será adicionada uma coluna chamada _day_.
2. **Oscilação de valores**: baseando-se no campo _close_ (valor de fechamento do período atual), subtrair dele o valor de fechamento do período anterior (função _lag_), adicionando a coluna _osc_ com o resultado.
3. **Percentual de Oscilação**: baseado no campo _osc_, criar uma coluna chamada _osc_per_ com o percentual da variação.
4. **Ajustar __timestamp__**: O campo *__ts_ms*, que contém o horário em que o _kafka connector_ capturou o registro, vem em formato _unix timestamp_ (em milissegundos) e precisa ser convertido para um formato mais amigável.

In [12]:
# Prototype of the Silver-layer transformation, restricted to AAPL rows.
# Derived columns:
#   day      - `timestamp` formatted as yyyy-MM-dd.
#   osc      - `close` minus the previous row's `close` for the same ticker (ordered by
#              timestamp); NULL on a ticker's first row, where LAG has no previous row.
#   osc_per  - `osc` as a percentage of the previous `close`; it refers to the `osc`
#              alias defined in the same SELECT list.
#   __ts_ms  - epoch milliseconds converted to a timestamp, then shifted back 5 hours.
# NOTE(review): the fixed 5-hour shift looks like a UTC -> US Eastern adjustment and would
# not follow daylight saving time — confirm the intended time zone.
# The statement is one logical line joined by trailing backslashes, so SQL `--` comments
# cannot be placed inside it.
df_silver = %sql \
SELECT \
    _id, \
    ticker, \
    date_format(timestamp, "yyyy-MM-dd") as day, \
    description, \
    timestamp, \
    open, \
    high, \
    low, \
    close, \
    volume,\
    (close - LAG(close,1) OVER (PARTITION BY ticker ORDER BY timestamp)) AS osc, \
    (osc * 100.0 / LAG(close,1) OVER (PARTITION BY ticker ORDER BY timestamp)) as osc_per, \
    __op, \
    __collection, \
    (to_timestamp(__ts_ms / 1000) - interval 5 hours) as __ts_ms \
from bronze_stocks \
where ticker = "AAPL"

In [13]:
# Replace the NULL oscillation on each ticker's first row (LAG has no previous row) with 0.
# Only the derived columns are filled, so a missing price or volume stays NULL instead
# of being silently turned into 0.
d = df_silver.fillna(value=0, subset=["osc", "osc_per"])

In [14]:
# Convert the Spark DataFrame to a pandas-on-Spark DataFrame; as the cell's last
# expression it is rendered as a table.
d.pandas_api()

Unnamed: 0,_id,ticker,day,description,timestamp,open,high,low,close,volume,osc,osc_per,__op,__collection,__ts_ms
0,65df36b58327c289d0a4d9a1,AAPL,2024-02-01,,2024-02-01 14:30:00,183.666595,184.086105,183.606705,183.986206,35940,0.0,0.0,r,stocks,2024-02-28 09:06:46.064
1,65df36b58327c289d0a4d9a2,AAPL,2024-02-01,,2024-02-01 14:31:00,184.051102,184.180893,183.796402,184.006195,12306,0.019989,0.010864,r,stocks,2024-02-28 09:06:46.064
2,65df36b58327c289d0a4d9a3,AAPL,2024-02-01,,2024-02-01 14:32:00,183.966202,184.615402,183.806396,184.395706,10555,0.389511,0.211684,r,stocks,2024-02-28 09:06:46.065
3,65df36b58327c289d0a4d9a4,AAPL,2024-02-01,,2024-02-01 14:33:00,184.300797,184.785202,184.205902,184.7202,12731,0.324493,0.175977,r,stocks,2024-02-28 09:06:46.065
4,65df36b58327c289d0a4d9a5,AAPL,2024-02-01,,2024-02-01 14:34:00,184.745193,184.974899,184.585403,184.745193,15014,0.024994,0.013531,r,stocks,2024-02-28 09:06:46.065
5,65df36b58327c289d0a4d9a6,AAPL,2024-02-01,,2024-02-01 14:35:00,184.865097,185.019897,184.705307,184.984894,17421,0.2397,0.129746,r,stocks,2024-02-28 09:06:46.065
6,65df36b58327c289d0a4d9a7,AAPL,2024-02-01,,2024-02-01 14:36:00,184.795197,184.925003,184.585403,184.685303,12392,-0.299591,-0.161954,r,stocks,2024-02-28 09:06:46.066
7,65df36b68327c289d0a4d9a8,AAPL,2024-02-01,,2024-02-01 14:37:00,184.740204,184.955002,184.640305,184.785202,4108,0.099899,0.054092,r,stocks,2024-02-28 09:06:46.066
8,65df36b68327c289d0a4d9a9,AAPL,2024-02-01,,2024-02-01 14:38:00,184.835098,184.964905,184.655304,184.705307,4765,-0.079895,-0.043237,r,stocks,2024-02-28 09:06:46.066
9,65df36b68327c289d0a4d9aa,AAPL,2024-02-01,,2024-02-01 14:39:00,184.715302,185.084793,184.715302,185.014893,5856,0.309586,0.167611,r,stocks,2024-02-28 09:06:46.066
