# Explorar a Camada Bronze

Analisar os dados na camada bronze para compreender o cenário e identificar as transformações necessárias para criar a camada Silver

## Importar bibliotecas

In [1]:
import ipywidgets as widgets
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import dates
#
# Spark application name (presumably read by StartSpark.ipynb, which is run
# in the next cell — TODO confirm)
#
APP_NAME="ExplorarBronze"

## Inicializar Spark e definir funções de apoio

In [2]:
%run StartSpark.ipynb

## Carregar as tabelas da camada Bronze

In [3]:
# Load the bronze-layer data and register it for SQL access; the `show tables`
# output of the following cell lists the resulting table as `bronze_stocks`.
# NOTE(review): loadAndRegister is not defined in this notebook — presumably it
# comes from StartSpark.ipynb; confirm the meaning of its three arguments there.
_ = loadAndRegister("stocks", "StockData", "bronze")

## Listar as tabelas e mostrar algumas informações sobre os dados

Lista de tabelas.

In [4]:
%%sql_display
show tables

Unnamed: 0,namespace,tableName,isTemporary
0,,bronze_stocks,False


Quantidade de linhas por tipo de ação e tipo de operação

In [5]:
%%sql_display
select 
    ticker,
    __op,
    count(*) as qtd
from bronze_stocks
group by all
order by ticker, __op

Unnamed: 0,ticker,__op,qtd
0,AAPL,c,1181
1,AAPL,r,143
2,AMZN,c,1166
3,AMZN,r,141
4,GOOG,c,1034
5,GOOG,r,121
6,MSFT,c,1092
7,MSFT,r,142
8,NU,c,328
9,TSLA,c,1138


Quantidade de entradas por ação

In [6]:
%%time
%%sql_display 
select 
    ticker,
    count(*) as qtd
from bronze_stocks
group by all
order by ticker

CPU times: user 7.22 ms, sys: 0 ns, total: 7.22 ms
Wall time: 1.22 s


Unnamed: 0,ticker,qtd
0,AAPL,1324
1,AMZN,1307
2,GOOG,1155
3,MSFT,1234
4,NU,328
5,TSLA,1282


## Gerar um gráfico de variação no valor das ações

Primeiramente definimos a query

In [7]:
df = %sql select ticker, timestamp - interval 3 hours as timestamp, close, volume from bronze_stocks where timestamp >= current_timestamp - interval 24 hours order by ticker, timestamp

Definição dos elementos da toolbar

In [8]:
# Toolbar controls: a dropdown that selects the stock and a "Refresh" button.
ticker = widgets.Dropdown(
    description="Stock:",
    options=["AAPL", "AMZN", "GOOG", "MSFT", "TSLA", "NU"],
    value="AAPL",
    disabled=False,
)

button = widgets.Button(
    description='Refresh',
    tooltip='Refresh Charts',
    icon='arrows-rotate',  # FontAwesome icon name, written without the `fa-` prefix
    button_style='',  # accepted values: 'success', 'info', 'warning', 'danger' or ''
    disabled=False,
)

Funções para plotar os gráficos

In [9]:
def _plotTickerSeries(ticker, column, title):
    """Draw a line chart of one column of `df` over time for a single ticker.

    Shared implementation behind `plotTickerValue` and `plotTickerVolume`.

    Args:
        ticker: Ticker symbol used to filter the rows of the global `df`.
        column: Name of the `df` column plotted on the y axis.
        title: Text shown as the chart title.
    """
    # `df` is the query result created in an earlier cell; it is converted to
    # pandas on every call and then filtered down to the requested ticker.
    _df = df.toPandas()

    sns.set_style("ticks", {'axes.grid': True})

    _, ax = plt.subplots()
    g = sns.lineplot(_df[_df.ticker == ticker], x="timestamp", y=column, ax=ax)
    g.tick_params(axis="x", rotation=45)
    ax.set_title(title)
    # Two-digit year, month, day and hour:minute on the x axis ticks.
    ax.xaxis.set_major_formatter(dates.DateFormatter("%y-%m-%d %H:%M"))


def plotTickerValue(ticker):
    """Plot the `close` value over time for the given ticker.

    Args:
        ticker: Ticker symbol to plot (e.g. "AAPL").
    """
    _plotTickerSeries(ticker, "close", f"Variação no Valor das Ações da {ticker}")


def plotTickerVolume(ticker):
    """Plot the traded `volume` over time for the given ticker.

    Args:
        ticker: Ticker symbol to plot (e.g. "AAPL").
    """
    _plotTickerSeries(ticker, "volume", f"Variação no Volume Negociado da {ticker}")

### Renderizar a interface definida

In [10]:
# Layout: the toolbar (dropdown + button) above the two charts side by side.
# Each chart lives in its own Output widget and is redrawn automatically
# whenever the dropdown value changes.
toolbar = widgets.HBox([ticker, button])
w1 = widgets.interactive_output(plotTickerValue, {"ticker": ticker})
w2 = widgets.interactive_output(plotTickerVolume, {"ticker": ticker})
ui = widgets.HBox([w1, w2])
display(toolbar, ui)


def refresh(b):
    """Redraw both charts for the currently selected ticker.

    Click handler for the "Refresh" button.

    Args:
        b: The button instance that was clicked (unused).
    """
    for out, plot in ((w1, plotTickerValue), (w2, plotTickerVolume)):
        # wait=True defers the clearing until the new figure arrives, so the
        # chart does not flicker while it is being rebuilt.
        out.clear_output(wait=True)
        with out:
            plot(ticker.value)
            # Flush the figure while the Output widget is still capturing.
            plt.show()


button.on_click(refresh)

HBox(children=(Dropdown(description='Stock:', options=('AAPL', 'AMZN', 'GOOG', 'MSFT', 'TSLA', 'NU'), value='A…

HBox(children=(Output(), Output()))

## Preparar transformações para a camada Silver

Apresentar uma amostragem dos dados

In [11]:
%%sql_display
select * 
from bronze_stocks
order by ticker, timestamp

Unnamed: 0,_id,ticker,description,timestamp,open,high,low,close,volume,__op,__collection,__ts_ms
0,65c4b8f1b1a1d02887e6aea8,AAPL,Apple Inc.,2024-02-07 20:59:00,189.279999,189.404999,189.279999,189.369995,12342,c,stocks,1707794955502
1,65ca6bf9987f38773dfb0d83,AAPL,Apple Inc.,2024-02-12 18:35:00,188.0,188.020004,187.970001,188.0,1958,r,stocks,1707780766019
2,65ca6bf9987f38773dfb0d82,AAPL,Apple Inc.,2024-02-12 18:36:00,188.009995,188.059998,188.0,188.0,2529,r,stocks,1707780766019
3,65ca6bf9987f38773dfb0d81,AAPL,Apple Inc.,2024-02-12 18:37:00,187.990005,188.035004,187.990005,188.024994,1856,r,stocks,1707780766019
4,65ca6bf9987f38773dfb0d80,AAPL,Apple Inc.,2024-02-12 18:38:00,188.059998,188.089996,187.949997,188.009995,1758,r,stocks,1707780766019
5,65ca6bf9987f38773dfb0d7f,AAPL,Apple Inc.,2024-02-12 18:39:00,187.904999,187.910004,187.850006,187.880005,2157,r,stocks,1707780766018
6,65ca6bf9987f38773dfb0d7e,AAPL,Apple Inc.,2024-02-12 18:40:00,187.839996,187.850006,187.789993,187.850006,1471,r,stocks,1707780766018
7,65ca6bf9987f38773dfb0d7d,AAPL,Apple Inc.,2024-02-12 18:41:00,187.830002,187.845001,187.725006,187.755005,5641,r,stocks,1707780766018
8,65ca6bf9987f38773dfb0d7c,AAPL,Apple Inc.,2024-02-12 18:42:00,187.695007,187.75,187.660004,187.660004,1586,r,stocks,1707780766018
9,65ca6bf9987f38773dfb0d7b,AAPL,Apple Inc.,2024-02-12 18:43:00,187.660004,187.669998,187.634995,187.649994,2037,r,stocks,1707780766018


### Transformações

Analisando os dados acima identificamos as seguintes transformações
1. **Particionamento de dados**: visando aumentar a performance, os dados serão particionados por _ticker_ (ação) e pelo dia (_timestamp_ formatado como YYYY-MM-DD). Para isso será adicionada uma coluna chamada _day_.
2. **Oscilação de valores**: baseando-se no campo _close_ (valor de fechamento do período atual), subtrair dele o valor de fechamento do período anterior (obtido com a função _lag_), adicionando a coluna _osc_ com este resultado.
3. **Percentual de Oscilação**: baseado no campo _osc_, criar uma coluna chamada _osc_per_ com o percentual da variação.
4. **Ajustar __timestamp__**: O campo *__ts_ms*, que contém o horário em que o _kafka connector_ capturou o registro, vem em formato _unix timestamp_ (em milissegundos) e precisa ser convertido para um formato mais amigável.

In [12]:
df_silver = %sql \
SELECT \
    _id, \
    ticker, \
    date_format(timestamp, "yyyy-MM-dd") as day, \
    description, \
    timestamp, \
    open, \
    high, \
    low, \
    close, \
    volume,\
    (close - LAG(close,1) OVER (PARTITION BY ticker ORDER BY timestamp)) AS osc, \
    (osc * 100.0 / LAG(close,1) OVER (PARTITION BY ticker ORDER BY timestamp)) as osc_per, \
    __op, \
    __collection, \
    (to_timestamp(__ts_ms / 1000) - interval 5 hours) as __ts_ms \
from bronze_stocks \
where ticker = "AAPL"

In [13]:
# Replace nulls with 0. LAG has no previous row for the earliest timestamp of
# a ticker, so `osc` and `osc_per` are null there before this fill.
# NOTE(review): fillna is not restricted to those two columns here — confirm
# that filling every compatible column with 0 is intended.
d = df_silver.fillna(value=0)

In [14]:
d.toPandas()

Unnamed: 0,_id,ticker,day,description,timestamp,open,high,low,close,volume,osc,osc_per,__op,__collection,__ts_ms
0,65c4b8f1b1a1d02887e6aea8,AAPL,2024-02-07,Apple Inc.,2024-02-07 20:59:00,189.279999,189.404999,189.279999,189.369995,12342,0.000000,0.000000,c,stocks,2024-02-12 22:29:15.502
1,65ca6bf9987f38773dfb0d83,AAPL,2024-02-12,Apple Inc.,2024-02-12 18:35:00,188.000000,188.020004,187.970001,188.000000,1958,-1.369995,-0.723449,r,stocks,2024-02-12 18:32:46.019
2,65ca6bf9987f38773dfb0d82,AAPL,2024-02-12,Apple Inc.,2024-02-12 18:36:00,188.009995,188.059998,188.000000,188.000000,2529,0.000000,0.000000,r,stocks,2024-02-12 18:32:46.019
3,65ca6bf9987f38773dfb0d81,AAPL,2024-02-12,Apple Inc.,2024-02-12 18:37:00,187.990005,188.035004,187.990005,188.024994,1856,0.024994,0.013295,r,stocks,2024-02-12 18:32:46.019
4,65ca6bf9987f38773dfb0d80,AAPL,2024-02-12,Apple Inc.,2024-02-12 18:38:00,188.059998,188.089996,187.949997,188.009995,1758,-0.014999,-0.007977,r,stocks,2024-02-12 18:32:46.019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1321,65ce7b1a244121727fa6c185,AAPL,2024-02-15,Apple Inc.,2024-02-15 20:57:00,183.664993,184.464996,183.664993,184.350006,47967,0.680008,0.370234,c,stocks,2024-02-15 15:59:38.301
1322,65ce7b542f19bbee335bbee1,AAPL,2024-02-15,Apple Inc.,2024-02-15 20:58:00,184.330002,184.350006,184.029999,184.085007,34761,-0.264999,-0.143748,c,stocks,2024-02-15 16:00:27.967
1323,65ce7b8e8e27f01662b41591,AAPL,2024-02-15,Apple Inc.,2024-02-15 20:59:00,184.115005,184.130005,183.845001,184.005005,51636,-0.080002,-0.043459,c,stocks,2024-02-15 16:01:18.146
1324,65cf71e10162f78e5724aee5,AAPL,2024-02-16,Apple Inc.,2024-02-16 14:30:00,183.800003,183.830002,183.699997,183.710007,1786,-0.294998,-0.160321,c,stocks,2024-02-16 09:32:13.163
