# The following is the workflow for web scrapping bitcoin indicators from "[bitinfocarts.com](https://bitinfocharts.com/comparison/bitcoin-transactions.html#alltime)"

In [86]:
# Importing Required Libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from datetime import datetime


In [87]:
# Define URLs and Column Names
urls = ['https://bitinfocharts.com/comparison/bitcoin-price.html#alltime',
        'https://bitinfocharts.com/comparison/bitcoin-transactions.html#alltime',
        'https://bitinfocharts.com/comparison/size-btc.html#alltime',
        'https://bitinfocharts.com/comparison/sentbyaddress-btc.html#alltime',
        'https://bitinfocharts.com/comparison/bitcoin-difficulty.html#alltime',

        'https://bitinfocharts.com/comparison/bitcoin-hashrate.html#alltime',
        'https://bitinfocharts.com/comparison/sentinusd-btc.html#alltime',
        'https://bitinfocharts.com/comparison/bitcoin-transactionfees.html#alltime',
        'https://bitinfocharts.com/comparison/bitcoin-median_transaction_fee.html#alltime',
        'https://bitinfocharts.com/comparison/bitcoin-confirmationtime.html#alltime',

        'https://bitinfocharts.com/comparison/transactionvalue-btc.html#alltime',
        'https://bitinfocharts.com/comparison/mediantransactionvalue-btc.html#alltime',
        'https://bitinfocharts.com/comparison/activeaddresses-btc.html#alltime',
        ]


column_names = ['btc_price', 'btc_trans_blockchain', 'avg_block_size', 'unique_sentbyaddress', 'avg_mining_diff', 
                'avg_hashrate', 'sent_in_USD', 'avg_trans_fee', 'median_trans_fee','avg_block_confirm_time_min', 
                'avg_trans_value_usd', 'median_trans_value_usd', 'unique_tofromaddress' ]


In [88]:
# Scrape Data and Create DataFrames
df_dict = {}

for idx, url in enumerate(urls):
    # Send a request to fetch the HTML content
    response = requests.get(url)
    html_content = response.text

    # Parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the data container
    data_container = soup.find_all('script')
    for script in data_container:
        if 'd = new Dygraph(document.getElementById("container")' in script.text:
            data_script = script.text
            break

    data_regex = re.compile(r"\[\[new Date\(\".*?\]\]")
    raw_data_match = data_regex.search(data_script)
    raw_data_string = raw_data_match.group(0)
    entry_regex = re.compile(r"new Date\(\"(\d{4}/\d{2}/\d{2})\"\),([0-9.E+-]+|\bnull\b)")
    entries = entry_regex.findall(raw_data_string)

    # Convert entries into a list of tuples (date, hashrate)
    parsed_data = []
    for entry in entries:
        date_str, hashrate_str = entry
        date = datetime.strptime(date_str, '%Y/%m/%d')
        hashrate = None if hashrate_str == 'null' else float(hashrate_str)
        parsed_data.append((date, hashrate))
    
    # Step 3: Create a DataFrame
    df = pd.DataFrame(parsed_data, columns=['Date', column_names[idx]])
    df_dict[column_names[idx]] = df
    
df_dict

{'btc_price':            Date   btc_price
 0    2010-07-17      0.0495
 1    2010-07-18      0.0726
 2    2010-07-19      0.0859
 3    2010-07-20      0.0783
 4    2010-07-21      0.0767
 ...         ...         ...
 5017 2024-04-11  70454.0000
 5018 2024-04-12  69556.0000
 5019 2024-04-13  66583.0000
 5020 2024-04-14  64151.0000
 5021 2024-04-15  65552.0000
 
 [5022 rows x 2 columns],
 'btc_trans_blockchain':            Date  btc_trans_blockchain
 0    2009-01-03                   NaN
 1    2009-01-04                   NaN
 2    2009-01-05                   NaN
 3    2009-01-06                   NaN
 4    2009-01-07                   NaN
 ...         ...                   ...
 5576 2024-04-10              530903.0
 5577 2024-04-11              476109.0
 5578 2024-04-12              448353.0
 5579 2024-04-13              507009.0
 5580 2024-04-14              473171.0
 
 [5581 rows x 2 columns],
 'avg_block_size':            Date  avg_block_size
 0    2009-01-03           204.0
 1    2

In [89]:
df_dict_copy = df_dict.copy()

In [90]:
# Combine All DataFrames
for idx in range(len(column_names)):
    if idx == 0:
        df_combined = df_dict_copy[column_names[idx]]
        df_combined.set_index('Date', inplace=False)
        df_combined['Date'] = pd.to_datetime(df_combined['Date'])  # Convert the 'Date' column to datetime objects
    else:
        df2 = df_dict[column_names[idx]]
        df2['Date'] = pd.to_datetime(df2['Date'])  # Convert the 'Date' column to datetime objects
        df2.set_index('Date', inplace=True)
        df_combined = df_combined.join(df2, on=['Date'], how='outer', sort=True)
df_combined

Unnamed: 0,Date,btc_price,btc_trans_blockchain,avg_block_size,unique_sentbyaddress,avg_mining_diff,avg_hashrate,sent_in_USD,avg_trans_fee,median_trans_fee,avg_block_confirm_time_min,avg_trans_value_usd,median_trans_value_usd,unique_tofromaddress
,2009-01-03,,,204.0,,1.000000e+00,,,,,,,,
,2009-01-04,,,,,,,,,,,,,
,2009-01-05,,,,,,,,,,,,,
,2009-01-06,,,,,,,,,,,,,
,2009-01-07,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5017.0,2024-04-11,70454.0,476109.0,755978.0,377904.0,8.638856e+13,6.271128e+20,1.869756e+10,15.446,8.201,10.909,120368.0,132.712,652545.0
5018.0,2024-04-12,69556.0,448353.0,730556.0,381751.0,8.638856e+13,5.282425e+20,1.994199e+10,24.391,13.849,10.588,123775.0,240.210,711174.0
5019.0,2024-04-13,66583.0,507009.0,689059.0,354343.0,8.638856e+13,6.582811e+20,1.319369e+10,13.855,6.621,8.780,83871.0,97.422,734096.0
5020.0,2024-04-14,64151.0,473171.0,686918.0,275714.0,8.638856e+13,6.941267e+20,1.077752e+10,10.326,4.558,9.172,90926.0,83.048,741201.0


In [91]:
df_combined.set_index('Date', inplace=True)
df_combined

Unnamed: 0_level_0,btc_price,btc_trans_blockchain,avg_block_size,unique_sentbyaddress,avg_mining_diff,avg_hashrate,sent_in_USD,avg_trans_fee,median_trans_fee,avg_block_confirm_time_min,avg_trans_value_usd,median_trans_value_usd,unique_tofromaddress
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2009-01-03,,,204.0,,1.000000e+00,,,,,,,,
2009-01-04,,,,,,,,,,,,,
2009-01-05,,,,,,,,,,,,,
2009-01-06,,,,,,,,,,,,,
2009-01-07,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-04-11,70454.0,476109.0,755978.0,377904.0,8.638856e+13,6.271128e+20,1.869756e+10,15.446,8.201,10.909,120368.0,132.712,652545.0
2024-04-12,69556.0,448353.0,730556.0,381751.0,8.638856e+13,5.282425e+20,1.994199e+10,24.391,13.849,10.588,123775.0,240.210,711174.0
2024-04-13,66583.0,507009.0,689059.0,354343.0,8.638856e+13,6.582811e+20,1.319369e+10,13.855,6.621,8.780,83871.0,97.422,734096.0
2024-04-14,64151.0,473171.0,686918.0,275714.0,8.638856e+13,6.941267e+20,1.077752e+10,10.326,4.558,9.172,90926.0,83.048,741201.0


In [92]:
# Save the Combined DataFrame
df_combined.to_csv('bitcoin_data_bitcoininfocharts.csv', index=True, encoding='utf-8')