# 📊 Constructor de Dataset: Blockchain de Bitcoin (BTC) / Métricas On-chain

**Fuente**: [Dataset Builder Bitcoin BTC Network On-Chain](https://www.kaggle.com/code/aleexharris/dataset-builder-bitcoin-btc-network-on-chain)

In [1]:
# =============================================================================
# LIBRERIAS
# =============================================================================

import requests
import time
import random
import warnings
import numpy as np
import pandas as pd
from tqdm import tqdm
from datetime import datetime, timedelta
from itertools import permutations
from typing import Callable
import os

In [2]:
def to_snake_case(text: str) -> str:
    """Convert a human-readable label into a snake_case identifier.

    The text is lower-cased, spaces and hyphens become underscores,
    parentheses, ``>`` and commas are removed, and every run of consecutive
    underscores is collapsed into a single underscore.

    Args:
        text: Label to normalise, e.g. ``"Market Price (USD)"``.

    Returns:
        The normalised identifier, e.g. ``"market_price_usd"``.
    """
    replacements = {' ': '_', '-': '_', '(': '', ')': '', '>': '', ',': ''}
    snake_case_text = text.lower().translate(str.maketrans(replacements))
    # Collapse until stable: a fixed pair of replace("___") / replace("__")
    # calls leaves "__" behind for runs of five or more separators.
    while "__" in snake_case_text:
        snake_case_text = snake_case_text.replace("__", "_")
    return snake_case_text

## 🔹 Scraping BlockChain.com

In [3]:
def get_months_since_genesis() -> list[int]:
    """Return Unix timestamps for the first day of each month since 2009.

    Months run from January 2009 up to the latest month whose first day is
    strictly before today's date. The timestamps come from naive
    ``datetime`` objects, so they are interpreted in the machine's local
    timezone.

    Returns:
        Integer timestamps (seconds) in chronological order.
    """
    today = datetime.now().date()
    month_starts = (
        datetime(year, month, 1)
        for year in range(2009, today.year + 1)  # up to and including the current year
        for month in range(1, 13)
    )
    return [
        int(first_day.timestamp())
        for first_day in month_starts
        if first_day.date() < today
    ]

def blockchain_dot_com_get_request(endpoint: str, month: int) -> tuple[list, str, str] | None:
    """Fetch one five-week window of a Blockchain.com chart, retrying on failure.

    Up to five attempts are made. After a failed attempt that is not the last
    one, the function sleeps a random whole number of seconds between 1 and
    the attempt number before retrying.

    Args:
        endpoint: Chart name appended to the charts base URL
            (e.g. ``"hash-rate"``).
        month: Unix timestamp sent as the ``start`` query parameter.

    Returns:
        ``(values, description, name)`` read from the JSON payload, with
        ``name`` converted by ``to_snake_case``; ``None`` if every attempt
        failed (a diagnostic message is printed in that case).
    """
    base_url = "https://api.blockchain.info/charts/"
    params = {"timespan": "5weeks", "format": "json", "start": month}
    max_attempts = 5
    for retry in range(max_attempts):
        try:
            # A timeout keeps one stalled connection from hanging the whole scrape.
            r = requests.get(base_url + endpoint, params=params, timeout=30)
            payload = r.json()  # parse the body once instead of once per field
            data = payload["values"]
            desc = payload["description"]
            name = to_snake_case(payload["name"])
            return data, desc, name
        except Exception as e:
            # Deliberately broad: network errors, invalid JSON and unexpected
            # payload shapes are all treated as "retry, then give up".
            if retry == max_attempts - 1:
                msg = f"""Blockchain.com API endpoint seems broken...\n
                baseurl: {base_url}\n
                endpoint: {endpoint}\n
                params: {params}\n
                month: {datetime.fromtimestamp(month)}\n
                exception: {type(e)}\n
                exception_args: {e.args}"""
                print(msg)
            else:
                time.sleep(random.randint(1, retry + 1))
    return None

def get_blockchain_dot_com_endpoint_data(endpoint: str) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Download the full history of one Blockchain.com chart endpoint.

    One request is made per month returned by ``get_months_since_genesis``;
    months whose request failed are skipped. The collected records are
    de-duplicated, since the five-week windows requested per month overlap.

    Args:
        endpoint: Chart name understood by ``blockchain_dot_com_get_request``.

    Returns:
        ``(column_df, describe_df)``: ``column_df`` has a ``datetime`` column
        plus one value column named after the chart; ``describe_df`` is a
        one-row frame indexed by that name with a ``description`` column.

    Raises:
        RuntimeError: If no data could be retrieved for any month.
    """
    column_data = []
    name = desc = None
    for month in get_months_since_genesis():
        response = blockchain_dot_com_get_request(endpoint, month)
        if response is None:
            continue
        data, desc, name = response
        column_data += data
    if name is None or not column_data:
        # Fail with a clear message instead of an UnboundLocalError/KeyError
        # further down when every request failed or returned no records.
        raise RuntimeError(
            f"No data could be retrieved for Blockchain.com endpoint '{endpoint}'."
        )
    column_df = pd.DataFrame(column_data).drop_duplicates().rename(columns={'x': "datetime", 'y': name})
    column_df["datetime"] = pd.to_datetime(column_df["datetime"], unit='s')
    describe_df = pd.DataFrame({"description": desc}, index=[name])
    return column_df, describe_df
    
def get_all_blockchain_dot_com_data() -> tuple[pd.DataFrame, pd.DataFrame]:
    """Scrape every configured Blockchain.com chart and combine the results.

    Returns:
        ``(timeseries, info)``: the per-endpoint series outer-merged on
        ``datetime``, and the per-endpoint description rows concatenated.
    """
    endpoints = [
        "mempool-size",
        "transactions-per-second",
        "market-cap",
        "avg-block-size",
        "market-price",
        "trade-volume",
        "avg-confirmation-time",
        "hash-rate",
        "difficulty",
        "miners-revenue",
        "transaction-fees",
    ]
    # Empty placeholders; the first non-empty result replaces them.
    merged_ts = pd.Series([], dtype=int)
    merged_info = pd.Series([], dtype=int)
    for endpoint in tqdm(endpoints, desc="Blockchain.com endpoints scraped"):
        column_df, describe_df = get_blockchain_dot_com_endpoint_data(endpoint)
        merged_ts = (
            column_df
            if merged_ts.empty
            else pd.merge(merged_ts, column_df, on='datetime', how='outer')
        )
        merged_info = (
            describe_df
            if merged_info.empty
            else pd.concat([merged_info, describe_df])
        )
    return merged_ts, merged_info

# Run the full scrape (network-bound: one request per month for each endpoint).
bdc_timeseries_df, bdc_info_df = get_all_blockchain_dot_com_data()

Blockchain.com endpoints scraped: 100%|████████████████████████████████████████████████| 11/11 [06:07<00:00, 33.38s/it]


### 🧹 Limpieza de datos de Blockchain.com

**Nota**: El primer bloque de Bitcoin (el bloque génesis) fue minado por Satoshi el 3 de enero de 2009, pero el segundo no se minó hasta el 9 de enero, 6 días después. De ahí el período sin datos para las métricas on-chain entre el 3 y el 9 de enero de 2009, que tras la limpieza aparece relleno con ceros.

In [4]:
def clean_blockchain_dot_com_data(ts_df: pd.DataFrame, info_df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Aggregate the scraped series to daily medians and tidy the descriptions.

    Args:
        ts_df: Time series frame with a ``datetime`` column and one column
            per metric.
        info_df: Frame indexed by metric name with a ``description`` column.

    Returns:
        ``(daily_data, info_data)``, both with their index reset into a
        column. ``daily_data`` holds one row per calendar day (median of
        that day's samples, missing values replaced by 0) without the last
        day; ``info_data`` holds the renamed descriptions.
    """
    rename_map = {
        "datetime": "Open time",
        "market_capitalization": "market_cap_usd",
        "usd_exchange_trade_volume": "exchange_volume_usd",
    }

    # Time series: rename, then take the per-day median of every column.
    renamed_ts = ts_df.rename(columns=rename_map)
    renamed_ts["Open time"] = pd.to_datetime(renamed_ts["Open time"])
    calendar_day = renamed_ts["Open time"].dt.date
    per_day_median = (
        renamed_ts.sort_values("Open time")
        .groupby(calendar_day)
        .median(numeric_only=False)
    )
    # Discard the final day (presumably still incomplete when scraped), fill
    # the gaps with 0 and remove the aggregated timestamp column; the
    # calendar day stays on as the index.
    without_last_day = per_day_median.iloc[:-1]
    daily_data = without_last_day.fillna(0).drop(columns="Open time")

    # Descriptions: apply the same renames and override one entry.
    confirmation_time_text = (
        "The average time taken for a transaction to be combined "
        "in a Bitcoin block with other transactions and added to the blockchain."
    )
    info_data = info_df.rename(index=rename_map)
    info_data.at["average_confirmation_time", "description"] = confirmation_time_text

    return daily_data.reset_index(), info_data.reset_index()

# Aggregate the scraped series to daily values and tidy the metric descriptions.
bdc_daily_data, bdc_info_data = clean_blockchain_dot_com_data(bdc_timeseries_df, bdc_info_df)

In [5]:
# DataFrames to save as CSV files, each paired with its output file name
# (without the ".csv" extension).
list_df = [
    (bdc_daily_data, 'bdc_daily_data'),
    (bdc_info_data, 'bdc_info_data')
]

def save_data(list_df: list[tuple[pd.DataFrame, str]], folder: str = 'data/blockchain') -> None:
    """Write each DataFrame to ``<folder>/<filename>.csv`` without the index.

    Args:
        list_df: ``(dataframe, filename)`` pairs; ``filename`` is given
            without the ``.csv`` extension.
        folder: Output directory, created (including parents) if missing.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(folder, exist_ok=True)

    for df, filename in list_df:
        full_path = os.path.join(folder, f"{filename}.csv")
        df.to_csv(full_path, index=False)

    # Report the directory actually written to (the old message named a
    # different folder, 'blockchain-data').
    print(f"All DataFrames have been successfully saved in the '{folder}' folder.")

# Persist the DataFrames listed above as CSV files.
save_data(list_df)

All DataFrames have been successfully saved in the 'blockchain-data' folder.


In [6]:
# Display the cleaned daily time series.
bdc_daily_data

Unnamed: 0,Open time,mempool_size,transaction_rate,market_cap_usd,average_block_size,market_price_usd,exchange_volume_usd,average_confirmation_time,hash_rate,difficulty,miners_revenue,total_transaction_fees
0,2009-01-03,0.00,0.000000,0.000000e+00,0.000000,0.00,0.000000e+00,0.000000,4.971027e-08,1.000000e+00,0.000000e+00,0.000000
1,2009-01-04,0.00,0.000000,0.000000e+00,0.000000,0.00,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000
2,2009-01-05,0.00,0.000000,0.000000e+00,0.000000,0.00,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000
3,2009-01-06,0.00,0.000000,0.000000e+00,0.000000,0.00,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000
4,2009-01-07,0.00,0.000000,0.000000e+00,0.000000,0.00,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
6069,2025-08-16,771281.50,2.975000,2.344190e+12,1.665970,117419.50,3.855598e+08,8.231568,9.780076e+08,1.294352e+14,5.629125e+07,2.989912
6070,2025-08-17,562122.00,2.658333,2.352079e+12,1.523959,117484.12,1.547407e+08,5.932923,9.780076e+08,1.294352e+14,5.711806e+07,2.540358
6071,2025-08-18,609931.00,3.316667,2.305944e+12,1.562829,117455.50,1.693708e+08,10.626850,9.908761e+08,1.294352e+14,5.770019e+07,3.392547
6072,2025-08-19,910637.75,3.283333,2.294257e+12,1.739986,116251.12,4.406906e+08,12.908552,8.621909e+08,1.294352e+14,4.993566e+07,3.270740


In [7]:
# Display the metric descriptions.
bdc_info_data

Unnamed: 0,index,description
0,mempool_size,The aggregate size of transactions waiting to ...
1,transaction_rate,The number of Bitcoin transactions added to th...
2,market_cap_usd,The total USD value of bitcoin supply in circu...
3,average_block_size,The average block size in MB.
4,market_price_usd,Average USD market price across major bitcoin ...
5,exchange_volume_usd,The total USD value of trading volume on major...
6,average_confirmation_time,The average time taken for a transaction to be...
7,hash_rate,The estimated number of tera hashes per second...
8,difficulty,A relative measure of how difficult it is to f...
9,miners_revenue,Total value of coinbase block rewards and tran...
