### Coleta e persistência dos dados neste notebook

**tipo de ingestão**: full load (recomendado para ingestão de dados inicial no datalake)

**origem**: Yahoo Finance, Wikipedia, Fear and Greed (alternative.me)

**destino**: bronze/database/bitcoin/raw/btc.csv

**formato do dado no data lake**: .csv


In [72]:
import logging
import boto3
from botocore.exceptions import ClientError
import pandas as pd
from transformers import pipeline
import yfinance as yf
import mwclient
import time
import requests
import json
from datetime import datetime, timedelta
from statistics import mean
from io import StringIO

Extrai os dados de bitcoin do Yahoo Finance

In [3]:
# Yahoo Finance ticker handle for the BTC-USD pair.
ticker = yf.Ticker("BTC-USD")

In [18]:
# First day of the extraction window; reused for the price history, the
# Wikipedia revisions and the daily date range built later in the notebook.
data_inicio = datetime.strptime('2018-02-01', '%Y-%m-%d')

In [20]:
# Price history from data_inicio onwards (no end date is passed).
btc = ticker.history(start=data_inicio)

Formata a base de bitcoin

In [22]:
# Make the index timezone-naive so it can be matched against the
# date-indexed frames merged in later.
btc.index = pd.to_datetime(btc.index).tz_localize(None)

In [23]:
# Remove the two columns that do not appear in the final column list.
# errors="ignore" keeps the cell re-runnable: the previous `del` statements
# raised KeyError when the cell was executed a second time.
btc = btc.drop(columns=["Dividends", "Stock Splits"], errors="ignore")

In [24]:
# Lower-case every column label.
btc.columns = list(map(str.lower, btc.columns))

Extrai os dados das revisões (edições) da página Bitcoin na Wikipedia

In [47]:
# Connect to English Wikipedia and select the "Bitcoin" article.
site = mwclient.Site('en.wikipedia.org')
page = site.pages['Bitcoin']

In [48]:
# Materialise the article's revisions starting at data_inicio.
# NOTE(review): dir='newer' presumably walks forward in time from `start` — confirm against the MediaWiki API.
revs = list(page.revisions(start=data_inicio, dir='newer'))

In [49]:
# Order the revisions by their timestamp (ascending, stable sort).
revs.sort(key=lambda revision: revision["timestamp"])

Classifica o sentimento relacionado às revisões

Seleciona o modelo utilizado para a classificação das revisões

In [52]:
# Hugging Face pipeline built from the model name alone (no explicit task
# argument). find_sentiment reads the "label" and "score" of its predictions.
sentiment_pipeline = pipeline(model="distilbert-base-uncased-finetuned-sst-2-english")



transforma o score das classificações negativas em valores negativos

In [53]:
def find_sentiment(text):
    """Return a signed sentiment score for ``text``.

    Only the first 250 characters are handed to ``sentiment_pipeline``. The
    score of the single prediction is returned as-is, except when the
    predicted label is ``"NEGATIVE"``: then its sign is flipped.
    """
    prediction = sentiment_pipeline([text[:250]])[0]
    if prediction["label"] == "NEGATIVE":
        return prediction["score"] * -1
    return prediction["score"]

cria a base de sentimentos

In [54]:
# Per-day accumulator: "YYYY-MM-DD" -> {"sentiments": [...], "edit_count": n}.
edits = {}

for rev in revs:
    date = time.strftime("%Y-%m-%d", rev["timestamp"])
    # Fetch the day's bucket, creating it the first time the day is seen.
    bucket = edits.setdefault(date, dict(sentiments=list(), edit_count=0))
    bucket["edit_count"] += 1

    # Revisions without a comment are scored on the empty string.
    comment = rev.get("comment", "")
    bucket["sentiments"].append(find_sentiment(comment))

prepara e limpa a base

In [55]:
# Collapse each day's list of scores into two summary values:
#   sentiment      - mean of the signed scores
#   neg_sentiment  - share of the day's revisions with a negative score
# The cell is safe to re-run: days whose "sentiments" list was already
# consumed are skipped (the previous `del` raised KeyError on a second run).
for key in edits:
    scores = edits[key].pop("sentiments", None)
    if scores is None:
        # Already summarised by an earlier execution of this cell.
        continue

    if len(scores) > 0:
        edits[key]["sentiment"] = mean(scores)
        edits[key]["neg_sentiment"] = sum(1 for s in scores if s < 0) / len(scores)
    else:
        edits[key]["sentiment"] = 0
        edits[key]["neg_sentiment"] = 0

In [56]:
# One row per day (the dict keys become the index), one column per field.
edits_df = pd.DataFrame.from_dict(edits, orient="index")

In [57]:
# Parse the "YYYY-MM-DD" index labels into timestamps.
edits_df.index = pd.to_datetime(edits_df.index)

In [60]:
# Daily calendar from data_inicio up to the moment this cell runs.
dates = pd.date_range(start=data_inicio, end=datetime.today())

In [62]:
# Align to the full daily calendar; days without revisions get 0 in every column.
edits_df = edits_df.reindex(dates, fill_value=0)

In [63]:
# Rolling mean over 30 rows (days); min_periods=30 yields NaN until a full window exists.
rolling_edits = edits_df.rolling(30, min_periods=30).mean()

In [64]:
# Discard the rows that have no complete window.
rolling_edits = rolling_edits.dropna()

Extrai a base de Fear and Greed da Alternative.me

In [92]:
def collect_fear_greed(api_url: str, limit: int, drop_colls: list, rename_colls: dict) -> pd.DataFrame:
    """Download the Fear and Greed index and return it as a date-indexed frame.

    Args:
        api_url: Endpoint to query.
        limit: Sent to the API as the ``limit`` query parameter.
        drop_colls: Columns removed from the raw payload.
        rename_colls: ``{old_name: new_name}`` mapping applied to the columns.
            It must produce a ``date`` column, which becomes the index.

    Returns:
        DataFrame built from the ``data`` list of the JSON response.

    Raises:
        requests.HTTPError: if the API answers with an error status.
        requests.Timeout: if no answer arrives within 30 seconds.
    """
    response = requests.get(api_url, params={'limit': limit}, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error body.
    response.raise_for_status()
    payload = response.json()

    # read_json is kept (rather than DataFrame(...)) so pandas' automatic
    # dtype/date inference still applies; wrapping the text in StringIO avoids
    # the FutureWarning raised when read_json receives a literal JSON string.
    df = pd.read_json(StringIO(json.dumps(payload['data'])))
    df = df.drop(columns=drop_colls).rename(columns=rename_colls)
    return df.set_index('date')

In [93]:
api_url = 'https://api.alternative.me/fng/'
colls_to_drop = ['time_until_update']
# Raw payload field -> column name used in this notebook.
coll_names = dict(
    value='fng_index',
    value_classification='fng_classification',
    timestamp='date',
)

# NOTE(review): limit=0 appears to return the full history — confirm against the API docs.
data_fg = collect_fear_greed(
    api_url=api_url,
    limit=0,
    drop_colls=colls_to_drop,
    rename_colls=coll_names,
)

  df = pd.read_json(json.dumps(n_data['data']))


In [95]:
# Preview only: sort_index() returns a new frame and the result is not assigned here.
data_fg.sort_index()

Unnamed: 0_level_0,fng_index,fng_classification
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-02-01,30,Fear
2018-02-02,15,Extreme Fear
2018-02-03,40,Fear
2018-02-04,24,Extreme Fear
2018-02-05,11,Extreme Fear
...,...,...
2024-09-01,26,Fear
2024-09-02,26,Fear
2024-09-03,26,Fear
2024-09-04,27,Fear


In [97]:
# Keep the frame ordered by its date index.
data_fg = data_fg.sort_index()

Une as bases de bitcoin, sentimentos e fear and greed

In [79]:
# Index-on-index merge with the default (inner) join: only dates present in both frames survive.
# NOTE(review): re-running this cell merges the same columns again, so pandas suffixes the duplicates.
btc = btc.merge(rolling_edits, left_index=True, right_index=True)

In [80]:
# Index-on-index merge with the default (inner) join: only dates present in both frames survive.
# NOTE(review): re-running this cell merges the same columns again, so pandas suffixes the duplicates.
btc = btc.merge(data_fg, left_index=True, right_index=True)

Prepara a coluna target

In [82]:
# Next row's close aligned to the current row; the last row has no successor and gets NaN.
btc["tomorrow"] = btc["close"].shift(-1)

In [83]:
# 1 when the next close is higher than the current close, otherwise 0.
# NOTE(review): a NaN "tomorrow" compares as False, so a row without a successor is labelled 0.
btc["target"] = (btc["tomorrow"] > btc["close"]).astype(int)

Organiza as colunas

In [100]:
# Inspect the current column order.
btc.columns

Index(['open', 'high', 'low', 'close', 'volume', 'edit_count', 'sentiment',
       'neg_sentiment', 'fng_index', 'fng_classification', 'tomorrow',
       'target'],
      dtype='object')

In [101]:
# Final column order of the persisted dataset.
sequencia = [
    'open', 'high', 'low', 'close', 'volume',
    'edit_count', 'tomorrow', 'sentiment', 'neg_sentiment',
    'fng_index', 'fng_classification', 'target',
]
# Label-based selection: raises KeyError if any listed column is missing.
btc = btc.loc[:, sequencia]

In [102]:
# Display the final frame.
btc

Unnamed: 0,open,high,low,close,volume,edit_count,tomorrow,sentiment,neg_sentiment,fng_index,fng_classification,target
2018-03-02,10977.400391,11189.000000,10850.099609,11086.400391,7620590080,3.066667,11489.700195,-0.307225,0.541296,47,Neutral,1
2018-03-03,11101.900391,11528.200195,11002.400391,11489.700195,6690570240,3.066667,11512.599609,-0.307225,0.541296,56,Greed,1
2018-03-04,11497.400391,11512.599609,11136.099609,11512.599609,6084149760,2.933333,11573.299805,-0.258349,0.513519,44,Fear,1
2018-03-05,11532.400391,11704.099609,11443.900391,11573.299805,6468539904,2.866667,10779.900391,-0.259235,0.496852,55,Greed,0
2018-03-06,11500.099609,11500.099609,10694.299805,10779.900391,6832169984,2.666667,9965.570312,-0.249632,0.474630,59,Greed,0
...,...,...,...,...,...,...,...,...,...,...,...,...
2024-09-01,58969.800781,59062.070312,57217.824219,57325.488281,24592449997,0.166667,59112.480469,0.024292,0.066667,26,Fear,1
2024-09-02,57326.968750,59403.070312,57136.027344,59112.480469,27036454524,0.166667,57431.023438,0.024292,0.066667,26,Fear,0
2024-09-03,59106.191406,59815.058594,57425.167969,57431.023438,26666961053,0.166667,57971.539062,0.024292,0.066667,26,Fear,1
2024-09-04,57430.347656,58511.570312,55673.164062,57971.539062,35627680312,0.166667,56159.136719,0.024292,0.066667,27,Fear,0


Realiza a ingestão do dataframe no data lake

In [3]:
# boto3 handles for S3; no credentials are passed explicitly here.
s3_client = boto3.client('s3')
s3_resource = boto3.resource('s3')

In [85]:
# Serialise the dataframe to CSV text in memory, index included.
csv_buffer = StringIO()
btc.to_csv(csv_buffer, index=True)

In [5]:
# Upload destination: the object key is "<layer prefix>/<file name>" inside the bucket.
bucket_name = "dl-general-prd"
csv_file_name = 'btc.csv'
bucket_layer = 'bronze/database/bitcoin/raw'
object_name = f'{bucket_layer}/{csv_file_name}'

In [95]:
def upload_to_s3(bucket_name, object_name, csv_buffer):
    """Upload the contents of an in-memory buffer to S3.

    Args:
        bucket_name: Target bucket.
        object_name: Object key inside the bucket.
        csv_buffer: Object whose ``getvalue()`` returns the CSV content
            (text, or bytes that are sent unchanged).

    Returns:
        True when S3 answers with HTTP 200. False otherwise, including when
        boto3 raises ``ClientError`` (which is logged, not re-raised), so
        callers can detect a failed upload.
    """
    payload = csv_buffer.getvalue()
    # Encode text explicitly so the stored object is UTF-8.
    body = payload.encode('utf-8') if isinstance(payload, str) else payload

    try:
        s3_resource = boto3.resource('s3')
        req_metadata = s3_resource.Object(bucket_name, object_name).put(Body=body)
    except ClientError as e:
        logging.error(e)
        return False

    status = req_metadata['ResponseMetadata']['HTTPStatusCode']
    if status == 200:
        print('Upload sucessful')
        return True

    print(f'Upload failed, status {status}')
    return False

# Send the in-memory CSV to the configured bucket and object key.
upload_to_s3(bucket_name, object_name, csv_buffer)

ERROR:root:An error occurred (AccessDenied) when calling the PutObject operation: User: arn:aws:sts::<ACCOUNT_ID>:assumed-role/voclabs/<REDACTED_USER> is not authorized to perform: s3:PutObject on resource: "arn:aws:s3:::dl-general-prd/bronze/database/bitcoin/raw/btc.csv" with an explicit deny in an identity-based policy


In [103]:
# save the CSV locally (disabled)
#btc.to_csv('btc.csv')