<a href="https://colab.research.google.com/github/logic-language/bitinfochartscraper/blob/main/Main%20Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# User Settings

In [1]:
# Folder the results are saved in (created further down if it does not exist).
# Each coin gets its own CSV there, named after the coin.
RESULTS_FOLDER = 'results'

# Only read by the synchronous run: True writes one combined CSV covering every
# coin, False writes one CSV per coin instead. It has to be defined here,
# otherwise that cell fails with a NameError on a fresh kernel.
COMBINE_ALL = True

# Setup Notebook

In [2]:
!pip install fastcore
!pip install aiohttp




In [3]:
from fastprogress import progress_bar
import requests 
from bs4 import BeautifulSoup
import pandas as pd
import re
import humanize
from IPython.display import clear_output, display
from pathlib import Path

from fastcore.all import *

import asyncio
import aiohttp

# this is to allow it to run in google colab environment
import nest_asyncio
nest_asyncio.apply()


In [4]:
# Make sure the output folder (and any missing parent folders) exists before
# anything is written; this is a no-op when the folder is already there.
Path(RESULTS_FOLDER).mkdir(parents=True, exist_ok=True)

# Helper Functions

In [5]:
def parse_strlist(sl):
    """Flatten the text of a nested, JavaScript-style array into a list of strings.

    Square brackets, commas and all whitespace are stripped, the remainder is
    split on single/double quote characters and empty fragments are dropped.

    Example:
        '[["2009/01/03",1],["2009/01/04",null]]'
        -> ['2009/01/03', '1', '2009/01/04', 'null']

    Args:
        sl: The array text to flatten.

    Returns:
        List of the non-empty string fragments, in their original order.
    """
    # Raw strings: the backslashes are regex escapes, not (invalid) string escapes.
    clean = re.sub(r"[\[\],\s]", "", sl)
    splitted = re.split(r"['\"]", clean)
    values_only = [s for s in splitted if s != '']
    return values_only

def get_bitinfochart_graph_values(url, var_name):
  """Download one chart page and return the series it plots as a DataFrame.

  The page is searched for the <script> that builds the Dygraph chart. The last
  ``[[...]]`` literal in that script is cut out, the ``new Date(...)`` wrappers
  are removed and the text is flattened with ``parse_strlist`` into
  alternating date / value strings.

  Args:
    url: Address of the chart page to download.
    var_name: Name given to the value column of the result.

  Returns:
    DataFrame with two string columns, ``date`` and ``var_name``.

  Raises:
    ValueError: If no script on the page contains the Dygraph chart.
    requests.RequestException: If the download fails or times out.
  """
  # Without a timeout a stalled connection would block the whole scrape forever.
  response = requests.get(url, timeout=30)
  soup = BeautifulSoup(response.text, 'html.parser')

  dataList = None
  for script in soup.find_all('script'):
    if 'd = new Dygraph(document.getElementById("container")' in script.text:
      StrList = script.text
      # Keep only the last [[ ... ]] literal of the script.
      StrList = '[[' + StrList.split('[[')[-1]
      StrList = StrList.split(']]')[0] + ']]'
      StrList = StrList.replace("new Date(", '').replace(')', '')
      dataList = parse_strlist(StrList)

  if dataList is None:
    # Previously this surfaced as an UnboundLocalError further down.
    raise ValueError(f"No Dygraph chart data found at {url}")

  # Entries alternate date, value, date, value, ...
  # NOTE(review): assumes the chart holds exactly one value per date - confirm.
  date = dataList[0::2]
  value = dataList[1::2]

  df = pd.DataFrame(list(zip(date, value)), columns=["date", var_name])
  return df


In [7]:
# Takes a list of DataFrames and merges them on the `date` column with an outer join

def merge_dfs(df_list):
  """Outer-join a list of DataFrames on their shared ``date`` column.

  Args:
    df_list: DataFrames that each have a ``date`` column.

  Returns:
    A new DataFrame holding the union of all dates and every frame's columns.
    A single-element list yields a copy of that frame; an empty list yields
    None.
  """
  if not df_list:
    return None

  # Start from a copy so a one-element list gives back a usable frame
  # (it used to give None) without handing out the caller's own object.
  df_merged = df_list[0].copy()
  for df in df_list[1:]:
    df_merged = df_merged.merge(df, on='date', how='outer')

  return df_merged


# Generate List of Pages to Scrape

In [8]:
# Manually copy/pasted to use as a starting point

# Bitcoin chart pages used as templates: each entry pairs a page URL with the
# column name its series gets in the output.
# NOTE(review): the names for the transactionfees / median_transaction_fee pages
# say "size", and the sentinusd page is called "send_usd" - confirm whether these
# column names are intentional before relying on them.
chart_dict_list = [
    {'url': page_url, 'name': column_name}
    for page_url, column_name in (
        ('https://bitinfocharts.com/comparison/bitcoin-transactions.html', 'transactions'),
        ('https://bitinfocharts.com/comparison/size-btc.html', 'block_size'),
        ('https://bitinfocharts.com/comparison/sentbyaddress-btc.html', 'sent_addresses'),
        ('https://bitinfocharts.com/comparison/bitcoin-difficulty.html', 'difficulty'),
        ('https://bitinfocharts.com/comparison/bitcoin-hashrate.html', 'hashrate'),
        ('https://bitinfocharts.com/comparison/bitcoin-mining_profitability.html', 'mining_profitability'),

        ('https://bitinfocharts.com/comparison/sentinusd-btc.html', 'send_usd'),
        ('https://bitinfocharts.com/comparison/bitcoin-transactionfees.html', 'av_transaction_size'),
        ('https://bitinfocharts.com/comparison/bitcoin-median_transaction_fee.html', 'median_transaction_size'),
        ('https://bitinfocharts.com/comparison/bitcoin-confirmationtime.html', 'confirmation_time'),
        ('https://bitinfocharts.com/comparison/bitcoin-marketcap.html', 'market_cap'),
        ('https://bitinfocharts.com/comparison/transactionvalue-btc.html', 'av_transaction_value'),

        ('https://bitinfocharts.com/comparison/mediantransactionvalue-btc.html', 'median_transaction_value'),
        ('https://bitinfocharts.com/comparison/tweets-btc.html', 'tweets'),
        ('https://bitinfocharts.com/comparison/google_trends-btc.html', 'google_trends'),
        ('https://bitinfocharts.com/comparison/activeaddresses-btc.html', 'active_addresses'),
        ('https://bitinfocharts.com/comparison/top100cap-btc.html', 'top_100_percent'),
        ('https://bitinfocharts.com/comparison/fee_to_reward-btc.html', 'fee_reward'),
    )
]

In [9]:
# Get list of all available coins

url = 'https://bitinfocharts.com'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

coin_dict_list = []

for span in soup.find_all('span'):
  # Coin entries are the <span> tags whose class attribute mentions "s_coins";
  # every other span is skipped.
  if 's_coins' not in str(span.get('class')):
    continue

  # The title attribute holds the coin's full name, data-coin its ticker.
  name = span.get('title').lower()
  coin = span.get('data-coin')
  coin_dict_list.append(dict(full_name=name, coin=coin))

# Peek at the first few entries.
coin_dict_list[:3]

[{'coin': 'btc', 'full_name': 'bitcoin'},
 {'coin': 'eth', 'full_name': 'ethereum'},
 {'coin': 'xrp', 'full_name': 'xrp'}]

In [10]:
# Combine the list of coins with the list of url's to create a master dictionary with coins and url's to scrape

for coin_dict in coin_dict_list:
  # Reset first so re-running the cell does not pile up duplicate entries.
  coin_dict['scrape_details'] = []
  for chart_dict in chart_dict_list:
    # Turn the bitcoin template URL into this coin's URL: swap in the full name
    # and the ticker, then escape spaces so multi-word names stay valid.
    url = (
        chart_dict['url']
        .replace('bitcoin', coin_dict['full_name'])
        .replace('btc', coin_dict['coin'])
        .replace(' ', '%20')
    )

    # Copy of the template entry with only the URL replaced.
    temp_dict = dict(chart_dict, url=url)
    coin_dict['scrape_details'].append(temp_dict)

coin_dict_list[:2]

[{'coin': 'btc',
  'full_name': 'bitcoin',
  'scrape_details': [{'name': 'transactions',
    'url': 'https://bitinfocharts.com/comparison/bitcoin-transactions.html'},
   {'name': 'block_size',
    'url': 'https://bitinfocharts.com/comparison/size-btc.html'},
   {'name': 'sent_addresses',
    'url': 'https://bitinfocharts.com/comparison/sentbyaddress-btc.html'},
   {'name': 'difficulty',
    'url': 'https://bitinfocharts.com/comparison/bitcoin-difficulty.html'},
   {'name': 'hashrate',
    'url': 'https://bitinfocharts.com/comparison/bitcoin-hashrate.html'},
   {'name': 'mining_profitability',
    'url': 'https://bitinfocharts.com/comparison/bitcoin-mining_profitability.html'},
   {'name': 'send_usd',
    'url': 'https://bitinfocharts.com/comparison/sentinusd-btc.html'},
   {'name': 'av_transaction_size',
    'url': 'https://bitinfocharts.com/comparison/bitcoin-transactionfees.html'},
   {'name': 'median_transaction_size',
    'url': 'https://bitinfocharts.com/comparison/bitcoin-median_

# Synchronous Code Run

- Not recommended
- Takes about 20 minutes

In [None]:
# Scrape every chart page of every coin one request at a time, merge each coin's
# series into a single frame and write the results to RESULTS_FOLDER.
coin_merged_df_list = []

for coin_dict in progress_bar(coin_dict_list):
  print(f"Processing - {coin_dict['full_name']}")
  coin_df_list = []
  for page in progress_bar(coin_dict['scrape_details']):
    try:
      coin_df_list.append(get_bitinfochart_graph_values(url=page['url'], var_name=page['name']))
    except Exception as err:
      # A page that cannot be scraped becomes an empty frame that still has the
      # 'date' key, so the merge below keeps working and the column is simply
      # empty. (The old code appended the pd.DataFrame class itself, which made
      # the merge crash.) Exception instead of a bare except keeps Ctrl-C working.
      coin_df_list.append(pd.DataFrame(columns=['date', page['name']]))
      print(f"Error with {coin_dict['full_name']} - {page['name']}: {err}")

  coin_df = merge_dfs(coin_df_list)
  coin_df['full_name'] = coin_dict['full_name']
  coin_df['coin'] = coin_dict['coin']

  coin_merged_df_list.append(coin_df)

  clear_output()

  # One CSV per coin unless everything is combined into a single file below.
  if not COMBINE_ALL:
    file_path = RESULTS_FOLDER + '/' +coin_dict['full_name'] + '.csv'
    coin_df.to_csv(file_path)

if COMBINE_ALL:
  combined_df = pd.concat(coin_merged_df_list, ignore_index=True, sort=False)
  combined_df.to_csv(RESULTS_FOLDER + '/' + 'all_bitcoininfochart_data.csv')

Processing - bitcoin gold


In [24]:
# Sanity check: number of rows collected per coin in the combined frame.
combined_df['full_name'].value_counts()

litecoin    3370
xrp         3134
ethereum    1978
Name: full_name, dtype: int64

# Async Helpers

In [12]:
async def fetch(session, url, full_name, coin, var_name):
  """Download one page and return its text together with its identifying labels.

  Args:
    session: Object whose ``get(url)`` returns an async context manager yielding
      a response with an awaitable ``text()``; ``fetch_concurrent`` passes an
      ``aiohttp.ClientSession``.
    url: Address of the page to download.
    full_name: Full coin name, passed through unchanged.
    coin: Coin ticker, passed through unchanged.
    var_name: Name of the variable the page holds, passed through unchanged.

  Returns:
    Tuple ``(text, full_name, coin, var_name)``. ``text`` is the literal string
    ``'failed'`` when the request raised an exception.
  """
  try:
    async with session.get(url) as resp:
      return await resp.text(), full_name, coin, var_name
  except Exception:
    # Best effort: one bad page must not abort the whole batch. Catching
    # Exception instead of using a bare except lets task cancellation
    # (asyncio.CancelledError) and KeyboardInterrupt propagate.
    return 'failed', full_name, coin, var_name


In [13]:
async def fetch_concurrent(coin_dictionary):
  """Download every chart page of every coin concurrently.

  Args:
    coin_dictionary: Iterable of coin dicts with the keys ``full_name``,
      ``coin`` and ``scrape_details`` (a list of dicts with ``url`` and
      ``name``).

  Returns:
    List of dicts with the keys ``response_text``, ``full_name``, ``coin`` and
    ``var_name``, one per page, in the order the downloads finished.
  """
  collected = []
  loop = asyncio.get_event_loop()

  async with aiohttp.ClientSession() as session:
    # One task per (coin, page) pair; all of them are scheduled before any
    # result is awaited.
    pending = [
        loop.create_task(fetch(session,
                               page['url'],
                               coin_dict['full_name'],
                               coin_dict['coin'],
                               page['name']))
        for coin_dict in coin_dictionary
        for page in coin_dict['scrape_details']
    ]

    # Collect the results as they complete, not in submission order.
    for finished in asyncio.as_completed(pending):
      text, full_name, coin, var_name = await finished
      collected.append(dict(response_text=text,
                            full_name=full_name,
                            coin=coin,
                            var_name=var_name))

    return collected



In [63]:
def extract_info_from_response(response_dict):
  """Turn one downloaded chart page into a two-column DataFrame.

  The page text is searched for the <script> that builds the Dygraph chart. The
  last ``[[...]]`` literal in that script is cut out, the ``new Date(...)``
  wrappers are removed and the text is flattened with ``parse_strlist`` into
  alternating date / value strings.

  Args:
    response_dict: Dict with the keys ``response_text`` (page HTML, or any
      other text such as ``'failed'``) and ``var_name`` (name for the value
      column).

  Returns:
    DataFrame with string columns ``date`` and ``var_name``. It is empty (same
    two columns, no rows) when the page holds no chart or when none of the
    values parses as a number.
  """
  response_text = response_dict['response_text']
  var_name = response_dict['var_name']
  empty_df = pd.DataFrame(columns=['date', var_name])

  soup = BeautifulSoup(response_text, 'html.parser')

  dataList = None
  for script in soup.find_all('script'):
    if 'd = new Dygraph(document.getElementById("container")' in script.text:
      StrList = script.text
      # Keep only the last [[ ... ]] literal of the script.
      StrList = '[[' + StrList.split('[[')[-1]
      StrList = StrList.split(']]')[0] + ']]'
      StrList = StrList.replace("new Date(", '').replace(')', '')
      dataList = parse_strlist(StrList)

  if dataList is None:
    # Failed downloads and pages without a chart used to crash here with an
    # UnboundLocalError; they now produce the same empty frame as an
    # all-non-numeric series.
    return empty_df

  # Entries alternate date, value, date, value, ...
  # NOTE(review): assumes the chart holds exactly one value per date - confirm.
  date = dataList[0::2]
  value = dataList[1::2]

  df = pd.DataFrame(list(zip(date, value)), columns=["date", var_name])

  # str.isnumeric() is False for decimals and negative numbers, which threw
  # away series made up only of such values; to_numeric accepts them while
  # still rejecting placeholders such as 'null'.
  if pd.to_numeric(df[var_name], errors='coerce').notna().any():
    return df

  return empty_df


# Async Code Run

- Takes about 13 s for all the requests
- Then takes about 6 minutes to convert everything to DataFrames

In [71]:
%%time
# Download every page concurrently. Each entry is a dict holding the page text
# (or 'failed' when the request raised) plus its coin and variable labels.
http_responses = asyncio.run(fetch_concurrent(coin_dict_list))
# Number of responses collected, failed ones included.
len(http_responses)

CPU times: user 5.71 s, sys: 3.39 s, total: 9.09 s
Wall time: 13.4 s


In [82]:
# Convert the downloaded pages into one merged DataFrame (and one CSV) per coin,
# then stack all coins into a single combined frame.
coin_merged_df_list = []

for coin_dict in progress_bar(coin_dict_list):
  print(f"Now Processing - {coin_dict['full_name']}")

  # Parse only the responses that belong to the current coin.
  coin_df_list = []
  for response in progress_bar(http_responses):
    if coin_dict['full_name'] != response['full_name']:
      continue
    coin_df_list.append(extract_info_from_response(response))

  # Join the coin's series on date and label every row with the coin.
  coin_df = merge_dfs(coin_df_list)
  coin_df['full_name'] = coin_dict['full_name']
  coin_df['coin'] = coin_dict['coin']
  coin_merged_df_list.append(coin_df)

  # Per-coin file: <full name>__<ticker>.csv inside the results folder.
  file_name = coin_dict['full_name'] + '__' + coin_dict['coin'] + '.csv'
  file_path = RESULTS_FOLDER + '/' + file_name
  coin_df.to_csv(file_path)

  clear_output()


combined_df = pd.concat(coin_merged_df_list, sort=False, ignore_index=True)

# NOTE(review): the combined files go to the working directory, not to
# RESULTS_FOLDER - confirm that this is intended.
combined_df.to_csv('all_bitcoininfochart_data.csv')
combined_df.to_pickle('all_bitcoininfochart_data.pkl')

In [84]:
import shutil

print('Creating Zip File...')
# Build results.zip in the working directory from the results folder using the
# standard library. Unlike shelling out to `zip`, this copes with folder names
# that contain spaces, needs no external binary, and rewrites the archive
# instead of updating a stale one.
shutil.make_archive('results', 'zip', root_dir='.', base_dir=RESULTS_FOLDER)
print('Complete!')

Creating Zip File...
Complete!
