In [0]:
#install required python libraries and load them
#%run ./install_python_package
%pip install bs4
%pip install requests
%pip install wget
%pip install pyspark

In [0]:
import numpy as np
import pandas as pd
import requests
import zipfile
from bs4 import BeautifulSoup
import os
import requests
from os.path import basename
import gzip
import shutil
import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.types as st
from pyspark.sql.types import StructType,StructField, StringType

### Steps
  1) Create download folders in DRIVER (Common name for all coins) <br/>
  2) Creates coin specific folder structure in MOUNT  <br/>
  3) Gets the list of zip file download urls for coin type  <br/>
  4) Execute download (Downloaded to DRIVER)  <br/>
  5) Extract the zip files into TSV files (in DRIVER)  <br/>
  6) Move downloaded files, Extracts to coin specific folders in Mount  <br/>
  7) Load TSV files into dataframe  <br/>
  8) Save Selected columns from Dataframe into a CSV file  <br/>
  9) Copy CSV file to FILESTORE to download using Web Browser  <br/>

### Common Methods
These methods do not depend on coin specific folder structure, download things to DRIVER <br/>
1) Create download folders in Driver (Common name for all coins) <br/>
3) Gets the list of zip file download urls for coin type <br/>
4) Execute download (Downloaded to Driver) <br/>
5) Extract the zip files into TSV files (in Driver) <br/>

In [0]:
### Collect the download links of all compressed (.gz) files listed on a page
def get_download_links(rooturl, key):
    """Scrape ``rooturl`` for ``.gz`` links and return them as a Spark DataFrame.

    Parameters
    ----------
    rooturl : str
        URL of the index page. Every matching ``href`` is appended to it
        verbatim, so it is expected to end with ``/``.
    key : str
        Suffix appended to every link (callers pass a ``?key=...`` query string).

    Returns
    -------
    Spark DataFrame with a single string column ``url_link``.

    Raises
    ------
    requests.HTTPError
        If the index page answers with an error status.
    """
    extension = 'gz'
    response = requests.get(rooturl)
    # Fail loudly on 4xx/5xx instead of parsing an error page as the index.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    zip_file_urls = []
    for link in soup.find_all('a'):
        sub_url = link.get('href')
        # Anchors without an href yield None; skip them instead of crashing.
        if sub_url and sub_url.endswith(extension):
            zip_file_urls.append(rooturl + sub_url + key)

    zip_file_df = pd.DataFrame({"url_link": zip_file_urls})
    print(zip_file_df.shape[0])
    # Explicit schema so the result does not depend on inferring a type from the rows.
    url_schema = st.StructType([st.StructField('url_link', st.StringType(), True)])
    spark_df = sqlContext.createDataFrame(zip_file_df, schema=url_schema)
    return spark_df
  
    

In [0]:
### Get the compressed-file links for a coin, using the cached list when one exists
def get_file_url_links_spark(url_file_path, root_url, key):
  """Return the download-URL DataFrame for one coin, caching it as CSV.

  Parameters
  ----------
  url_file_path : str
      Folder (ending with ``/``) that holds the cached ``url_list.csv``.
  root_url : str
      Index page passed to ``get_download_links`` when the cache is unusable.
  key : str
      Suffix appended to each link by ``get_download_links``.

  Returns
  -------
  Spark DataFrame with one string column ``url_link``: the cached list when it
  can be read and is non-empty, otherwise a freshly scraped list (which is also
  written to the cache).
  """
  url_list_path = url_file_path + "url_list.csv"
  url_schema_load = st.StructType([st.StructField('url_link', st.StringType(), True)])
  try:
    # The cache is written with a header row below, so it has to be read with
    # one too; otherwise the literal "url_link" header comes back as a URL.
    cached_url_df = spark.read.csv(url_list_path, schema=url_schema_load, header=True)
    cached_count = cached_url_df.count()
  except Exception:
    print("Url file not found")
  else:
    print(cached_count)
    if cached_count > 0:
      return cached_url_df

  spark_url_df = get_download_links(root_url, key)
  # Overwrite so an existing but empty cache does not make the save fail.
  spark_url_df.coalesce(1).write.format("com.databricks.spark.csv").option("header", "true").mode("overwrite").save(url_list_path)
  return spark_url_df


In [0]:
### Download one compressed file and extract it
def downloader_new(row):
    """Download the file named by ``row["url_link"]`` and extract it.

    The response body is written to ``ZipDownloads/<file name>`` and handed to
    ``extract_TSV_File``. The outcome is appended to ``download_failures.txt``
    as ``<file name>,0`` on success or ``<file name>,1`` on failure, and the
    ``dl_success`` / ``dl_fail`` accumulators are incremented accordingly.
    Exceptions are printed and recorded, not re-raised.

    Parameters
    ----------
    row : mapping
        Must provide the key ``"url_link"``.
    """
    zip_file_name = ""
    download_success = True
    with open("/databricks/driver/ZipDownloads/download_failures.txt", "a") as log_file:
        try:
            zip_url = row["url_link"]
            resp = requests.get(zip_url)
            # Treat HTTP errors as failures instead of saving the error page as a .gz file.
            resp.raise_for_status()
            # Drop the "?key..." suffix if present. The previous find()-based slice
            # cut off the last character of the name whenever "?key" was absent.
            zip_file_name = basename(resp.url).split("?key", 1)[0]
            with open("ZipDownloads/" + zip_file_name, 'wb') as zip_file:
                zip_file.write(resp.content)
            extract_TSV_File(zip_file_name)
            dl_success.add(1)
        except Exception as ex:
            print(ex)
            download_success = False
            dl_fail.add(1)
        finally:
            log_file.write(zip_file_name + "," + ("0" if download_success else "1") + "\n")
                

In [0]:
def extract_TSV_File(ZipFileName, zip_dir="ZipDownloads/", extract_dir="ZipDownloads/Extracts/",
                     log_path="/databricks/driver/ZipDownloads/Extracts/Extract_failures.txt"):
  """Decompress one downloaded ``.gz`` file and log the outcome.

  Parameters
  ----------
  ZipFileName : str
      File name inside ``zip_dir``. Names that do not end in ``.gz`` are ignored.
  zip_dir : str, optional
      Folder holding the compressed file (default: the original hard-coded folder).
  extract_dir : str, optional
      Folder receiving the decompressed file, named like the source without ``.gz``.
  log_path : str, optional
      File to which ``<name>,0`` (success) or ``<name>,1`` (failure) is appended.

  Extraction errors are printed and logged, not raised.
  """
  block_size = 65536
  dest_file_name, ext = os.path.splitext(ZipFileName)
  if ext != ".gz":
    # Nothing to extract. The log is no longer opened here, so no handle is leaked.
    return

  extract_success = True
  try:
    with gzip.open(os.path.join(zip_dir, ZipFileName), 'rb') as s_file, \
        open(os.path.join(extract_dir, dest_file_name), 'wb') as d_file:
      shutil.copyfileobj(s_file, d_file, block_size)
  except Exception as ex:
    print("Error in extracting")
    extract_success = False
    print(ex)
  finally:
    with open(log_path, "a") as log_file:
      log_file.write(dest_file_name + "," + ("0" if extract_success else "1") + "\n")
      

In [0]:
### Extract tsv files from gz compressed files and load them into a pandas DataFrame
def extract_load_into_dataframe(file_directory, bitcastdf):
  """Decompress every ``.gz`` file in ``file_directory`` and append its rows.

  Each archive is extracted next to itself (same name without ``.gz``), the
  archive is deleted, and the extracted tab-separated file is read with pandas.
  Files are processed in sorted name order; a file that fails is reported and
  skipped.

  Parameters
  ----------
  file_directory : str
      Folder scanned for ``.gz`` files.
  bitcastdf : pandas.DataFrame or None
      Existing rows to extend. It is not modified.

  Returns
  -------
  pandas.DataFrame (or ``bitcastdf`` unchanged when nothing was loaded)
      ``bitcastdf`` followed by the rows of every successfully extracted file.
  """
  block_size = 65536
  loaded_frames = []
  for file_name in sorted(os.listdir(file_directory)):
    dest_file_name, ext = os.path.splitext(file_name)
    # This check used to sit outside the loop, so only the last file was handled.
    if ext != ".gz":
      continue
    source_path = os.path.join(file_directory, file_name)
    dest_path = os.path.join(file_directory, dest_file_name)
    try:
      with gzip.open(source_path, 'rb') as s_file, \
          open(dest_path, 'wb') as d_file:
        shutil.copyfileobj(s_file, d_file, block_size)
      # Remove the archive only after both handles are closed.
      os.remove(source_path)
      loaded_frames.append(pd.read_csv(dest_path, sep='\t', lineterminator='\n', names=None))
    except Exception as ex:
      print("Error in extracting")
      print(ex)

  if not loaded_frames:
    return bitcastdf
  if bitcastdf is not None:
    loaded_frames.insert(0, bitcastdf)
  # One concat at the end replaces the removed DataFrame.append, whose result was discarded anyway.
  return pd.concat(loaded_frames)

In [0]:
### Read extracted TSV text through the SparkContext
def Load_TSV_into_dataframe(dbfs_file_path):
  """Return ``sc.textFile(dbfs_file_path)``.

  Despite the function name, the value returned is whatever ``sc.textFile``
  produces for the given path, not a DataFrame built here.
  """
  return sc.textFile(dbfs_file_path)

In [0]:
def create_driver_folders():
  """Create the download folder and its ``Extracts`` sub-folder on the driver."""
  for driver_dir in ("file:/databricks/driver/ZipDownloads",
                     "file:/databricks/driver/ZipDownloads/Extracts"):
    dbutils.fs.mkdirs(driver_dir)

In [0]:
def cleanup_driver_location():
  """Recursively remove the driver download folder, then its ``Extracts`` sub-folder.

  Both removals are issued in that order with the recursive flag set.
  """
  for driver_dir in ("file:/databricks/driver/ZipDownloads/",
                     "file:/databricks/driver/ZipDownloads/Extracts/"):
    dbutils.fs.rm(driver_dir, True)

In [0]:
### Register the URL data frame as a temp view and preview it
def Create_Sql_Table(url_spark_df):
  """Expose ``url_spark_df`` as temp view ``BC_Block_Url`` and show two rows of it.

  The preview query selects at most 10 rows from the view; ``show(2)`` then
  prints the first two of them. Nothing is returned.
  """
  url_spark_df.createOrReplaceTempView("BC_Block_Url")
  preview_query = """select *
  from BC_Block_Url limit 10
  """
  spark.sql(preview_query).show(2)

In [0]:
### Executes download command on nodes
dl_success = sc.accumulator(0)
dl_fail = sc.accumulator(0)
dl_skip = sc.accumulator(0)

def download_files_in_url_df(url_df):
  %time url_df.foreach(lambda r: downloader_new(r))
  [dl_success.value, dl_skip.value, dl_fail.value]

In [0]:
def create_bc_block_data_folders(Coin_Mount_Loc, TSV_Mount_Loc, Zip_Mount_Loc):
  """Create the three coin-specific folders (coin root, TSV data, zip files).

  The folders are created in argument order with ``dbutils.fs.mkdirs``. Despite
  the ``bc`` in the name, nothing here is specific to one coin.
  """
  for mount_folder in (Coin_Mount_Loc, TSV_Mount_Loc, Zip_Mount_Loc):
    dbutils.fs.mkdirs(mount_folder)

In [0]:
### Reset the driver download folders and create the coin folders in the mount
def setup_folder_strucutre(Coin_Mount_Loc, TSV_Mount_Loc, Zip_Mount_Loc):
  """Prepare all folders needed for one coin's download run.

  Steps, in order: clear the driver download location, recreate the driver
  folders, then create the coin-specific folders given by the three arguments.
  """
  cleanup_driver_location()
  create_driver_folders()
  create_bc_block_data_folders(
      Coin_Mount_Loc=Coin_Mount_Loc,
      TSV_Mount_Loc=TSV_Mount_Loc,
      Zip_Mount_Loc=Zip_Mount_Loc,
  )

In [0]:
### Move downloaded compressed and TSV files from the driver to DBFS folders
def move_downloads_from_DRIVER_to_MOUNT(TSVFolderPath, ZIPFolderPath):
  """Move the driver's extracted files and downloads to the given folders.

  ``ZipDownloads/Extracts`` is moved to ``TSVFolderPath`` first, then the
  ``ZipDownloads/`` folder itself is moved to ``ZIPFolderPath``. Both moves are
  recursive.
  """
  driver_tsv_dir = "file:/databricks/driver/ZipDownloads/Extracts"
  driver_zip_dir = "file:/databricks/driver/ZipDownloads/"
  # Order matters: Extracts lives inside ZipDownloads, so it goes first.
  for source_dir, target_dir in ((driver_tsv_dir, TSVFolderPath),
                                 (driver_zip_dir, ZIPFolderPath)):
    dbutils.fs.mv(source_dir, target_dir, recurse=True)

### (1) BITCOIN-Cash

In [0]:
# DBFS locations for the Bitcoin-Cash block data: coin root, extracted TSV
# files, and downloaded compressed files.
BC_CASH_MNT_LOC = "dbfs:/mnt/BlockChain/Blocks/Bitcoin-cash/"
BC_CASH_MNT_TSV_lOC = "dbfs:/mnt/BlockChain/Blocks/Bitcoin-cash/TSVData"
BC_CASH_MNT_ZIP_lOC = "dbfs:/mnt/BlockChain/Blocks/Bitcoin-cash/ZipFIles"
# Same folder as BC_CASH_MNT_TSV_lOC with a trailing slash; not referenced by
# any live code in the visible notebook.
destlocation_tsv = "dbfs:/mnt/BlockChain/Blocks/Bitcoin-cash/TSVData/"

In [0]:
def get_bitcoin_cash_blocks_file_url_links():
  """Return the download-URL DataFrame for the Bitcoin-Cash block dumps.

  Delegates to ``get_file_url_links_spark`` with the Bitcoin-Cash cache folder,
  index URL and key suffix.
  """
  # NOTE(review): the key suffix is hard-coded in the notebook; consider loading
  # it from a secret store or environment variable instead.
  return get_file_url_links_spark(
      "/mnt/BlockChain/Blocks/Bitcoin-cash/",
      'https://gz.blockchair.com/bitcoin-cash/blocks/',
      '?key=202001ZjMvj8R3BF',
  )

In [0]:
### Load all Bitcoin-Cash TSV files into a dataframe
def Ceate_df_from_BC_Cash_TSV_files():
  """Load every Bitcoin-Cash ``*.tsv`` file into one Spark DataFrame.

  The files are read as tab-delimited CSV with a header row and schema
  inference. The row count is printed.

  Returns
  -------
  Spark DataFrame
      The loaded data. Earlier this function returned nothing, so callers that
      assigned its result received ``None``.
  """
  bc_block_tsv_rdd_Df = sqlContext.read.format('com.databricks.spark.csv').options(header='true', delimiter='\t', inferschema='true').load("/mnt/BlockChain/Blocks/Bitcoin-cash/TSVData/*.tsv")
  # Print the count (it used to be computed and discarded), as the Doge Coin loader does.
  print(bc_block_tsv_rdd_Df.count())
  return bc_block_tsv_rdd_Df

In [0]:
### Create a CSV file with a subset of columns and
### copy it to FileStore so it can be downloaded with a web browser
def Create_Export_csv_with_subset_of_columns():
  """Export selected Bitcoin-Cash block columns as CSV and copy it to FileStore.

  Reads the notebook-level variable ``bc_block_tsv_rdd_Df``, which must have
  been assigned before this is called. The selection is coalesced to one
  partition, written with a header, and then copied recursively to FileStore.
  """
  export_path = "dbfs:/mnt/BlockChain/Blocks/Bitcoin-cash/bc_block_sub_df_exported.csv"
  wanted_columns = ["ID", "time","size","median_time","guessed_miner","difficulty","fee_total", "fee_total_usd", "reward", "reward_usd"]
  subset_df = bc_block_tsv_rdd_Df[wanted_columns]
  subset_df.coalesce(1).write.option("header", "true").csv(export_path)
  dbutils.fs.cp(export_path, "dbfs:/FileStore/bc_block_sub_df_exported.csv", recurse = True)

In [0]:
### Get the URL links of the compressed files, then download and extract them.
### Download and extraction failures are logged to separate log files.
def download_bc_cash_zip_files():
  """Fetch the Bitcoin-Cash URL list and download every file in it."""
  download_files_in_url_df(get_bitcoin_cash_blocks_file_url_links())
  
### Move downloaded compressed and TSV files from the driver to the Bitcoin-Cash DBFS folders
def move_downloads_bc_block_to_dbfs():
  """Move the driver downloads into the Bitcoin-Cash TSV and zip folders."""
  move_downloads_from_DRIVER_to_MOUNT(
      TSVFolderPath=BC_CASH_MNT_TSV_lOC,
      ZIPFolderPath=BC_CASH_MNT_ZIP_lOC,
  )

#### Uncomment the code below to execute the download, extraction, creation and export of the CSV file for Bitcoin-Cash

In [0]:

#setup_folder_strucutre(BC_CASH_MNT_LOC, BC_CASH_MNT_TSV_lOC, BC_CASH_MNT_ZIP_lOC)
#download_bc_cash_zip_files()
#move_downloads_from_DRIVER_to_MOUNT(BC_CASH_MNT_TSV_lOC,BC_CASH_MNT_ZIP_lOC)
#bc_block_tsv_rdd_Df = Ceate_df_from_BC_Cash_TSV_files()
#Create_Export_csv_with_subset_of_columns()



In [0]:
## Code to debug
##df_rows = spark_url_df.limit(10).collect()
##downloader_new(df_rows[2])
###fname = df_rows[1]['url_link']
###print(fname)
###[dl_success.value, dl_skip.value, dl_fail.value]
###print(postcode_of_lat_long(df_rows[0]['Lat'],df_rows[0]['Long_']))

In [0]:
# Load the download log (lines of "<file name>,<0|1>" written by downloader_new)
# from the Bitcoin-Cash zip folder.
# NOTE(review): the variable and view are named "extract", but the file read
# here is download_failures.txt, i.e. the download log.
extract_df = spark.read.csv("dbfs:/mnt/BlockChain/Blocks/Bitcoin-cash/ZipFIles/download_failures.txt")
### Create sql table
extract_df.createOrReplaceTempView("extract_log_table")

In [0]:
# Load the Bitcoin-Cash block data and list its columns.
# NOTE(review): `.columns` only works if Ceate_df_from_BC_Cash_TSV_files returns
# the DataFrame; with a None result this line raises AttributeError.
bc_block_tsv_rdd_Df = Ceate_df_from_BC_Cash_TSV_files()
bc_block_tsv_rdd_Df.columns

### (2) For Doge Coin

In [0]:
#https://gz.blockchair.com/dogecoin/blocks/

In [0]:
# DBFS locations for the Doge Coin block data: coin root, extracted TSV files,
# and downloaded compressed files.
DC_MNT_LOC = "dbfs:/mnt/BlockChain/Blocks/Doge-Coin/"
DC_MNT_TSV_lOC = "dbfs:/mnt/BlockChain/Blocks/Doge-Coin/TSVData"
DC_MNT_ZIP_lOC = "dbfs:/mnt/BlockChain/Blocks/Doge-Coin/ZipFIles"
#destlocation_tsv = "dbfs:/mnt/BlockChain/Blocks/Bitcoin-cash/TSVData/"

In [0]:
#dbutils.fs.mkdirs("dbfs:/mnt/BlockChain/Blocks/Doge-Coin/")
#dbutils.fs.mv(DC_MNT_LOC, "dbfs:/mnt/BlockChain/Blocks/Doge-Coin/", True)

In [0]:
def get_dogecoin_blocks_file_url_links():
  """Return the download-URL DataFrame for the Doge Coin block dumps.

  Delegates to ``get_file_url_links_spark`` with the Doge Coin cache folder,
  index URL and key suffix.
  """
  # NOTE(review): the key suffix is hard-coded in the notebook; consider loading
  # it from a secret store or environment variable instead.
  return get_file_url_links_spark(
      "/mnt/BlockChain/Blocks/Doge-Coin/",
      'https://gz.blockchair.com/dogecoin/blocks/',
      '?key=202001ZjMvj8R3BF',
  )

In [0]:
### Load all Doge Coin TSV files into a dataframe
def Ceate_df_from_dogecoin_TSV_files():
  """Load every Doge Coin ``*.tsv`` file into one Spark DataFrame.

  The files are read as tab-delimited CSV with a header row and schema
  inference. The row count is printed before the DataFrame is returned.
  """
  # The RDD previously created with sc.textFile was never used and has been dropped.
  dogecoin_block_tsv_rdd_Df = sqlContext.read.format('com.databricks.spark.csv').options(header='true', delimiter='\t', inferschema='true').load("/mnt/BlockChain/Blocks/Doge-Coin/TSVData/*.tsv")
  print(dogecoin_block_tsv_rdd_Df.count())
  return dogecoin_block_tsv_rdd_Df

In [0]:
### Get the URL links of the compressed files, then download and extract them.
### Download and extraction failures are logged to separate log files.
def download_dogecoin_zip_files():
  """Fetch the Doge Coin URL list and download every file in it."""
  download_files_in_url_df(get_dogecoin_blocks_file_url_links())
  
### Move downloaded compressed and TSV files from the driver to the Doge Coin DBFS folders
def move_downloads_dogecoin_block_to_dbfs():
  """Move the driver downloads into the Doge Coin TSV and zip folders."""
  move_downloads_from_DRIVER_to_MOUNT(
      TSVFolderPath=DC_MNT_TSV_lOC,
      ZIPFolderPath=DC_MNT_ZIP_lOC,
  )

In [0]:
### Load all Doge Coin TSV files into a dataframe (no row count printed)
def Ceate_df_from_DogeCoin_TSV_files():
  """Return a Spark DataFrame over every Doge Coin ``*.tsv`` file.

  The files are read as tab-delimited CSV with a header row and schema inference.
  """
  tsv_reader = sqlContext.read.format('com.databricks.spark.csv')
  tsv_reader = tsv_reader.options(header='true', delimiter='\t', inferschema='true')
  return tsv_reader.load("/mnt/BlockChain/Blocks/Doge-Coin/TSVData/*.tsv")
  

In [0]:
def Create_Export_DogeCoin_Subset_Columns():
  """Export selected Doge Coin block columns as CSV and copy it to FileStore.

  Reads the notebook-level variable ``dogecoin_block_tsv_rdd_df``, which must
  have been assigned before this is called. The selection is coalesced to one
  partition, written with a header, and then copied recursively to FileStore.
  """
  export_path = "dbfs:/mnt/BlockChain/Blocks/Doge-Coin/dogecoin_block_tsv_rdd_df.csv"
  wanted_columns = ["ID","time","median_time","size","guessed_miner","difficulty","fee_total","fee_total_usd","reward","reward_usd","is_aux"]
  subset_df = dogecoin_block_tsv_rdd_df[wanted_columns]
  subset_df.coalesce(1).write.option("header", "true").csv(export_path)
  dbutils.fs.cp(export_path, "dbfs:/FileStore/dogecoin_block_tsv_rdd_df.csv", recurse = True)


#### Get DogeCoin Block Data

In [0]:
## Code to debug
#df_rows = spark_url_df.limit(10).collect()
#downloader_new(df_rows[2])
##move_downloads_from_DRIVER_to_MOUNT(DC_MNT_TSV_lOC, DC_MNT_ZIP_lOC)

###fname = df_rows[1]['url_link']
###print(fname)
###[dl_success.value, dl_skip.value, dl_fail.value]
##p#rint(postcode_of_lat_long(df_rows[0]['Lat'],df_rows[0]['Long_']))

In [0]:
# Preview the first 5 rows of the Doge Coin block data.
# NOTE(review): the only visible assignment of dogecoin_block_tsv_rdd_df is in a
# later, commented-out line, so this raises NameError on a fresh kernel.
dogecoin_block_tsv_rdd_df.show(5)

In [0]:
##setup_folder_strucutre(DC_MNT_LOC, DC_MNT_TSV_lOC, DC_MNT_ZIP_lOC)
#download_dogecoin_zip_files()
#[dl_success.value, dl_skip.value, dl_fail.value]
#move_downloads_from_DRIVER_to_MOUNT(DC_MNT_TSV_lOC, DC_MNT_ZIP_lOC)
#dogecoin_block_tsv_rdd_df = Ceate_df_from_DogeCoin_TSV_files()
#Create_Export_DogeCoin_Subset_Columns()

In [0]:
#len(dbutils.fs.ls("dbfs:/mnt/BlockChain/Blocks/Doge_Coin/TSVData"))
# List the column names of the Doge Coin block data (requires the DataFrame to
# have been loaded by the commented-out driver cell above).
dogecoin_block_tsv_rdd_df.columns

In [0]:
# Row count of the Doge Coin block data.
dogecoin_block_tsv_rdd_df.count()

In [0]:

##%fs rm -r "/mnt/BlockChain/Blocks/Bitcoin-cash/url_list.csv"
#dbutils.fs.ls("dbfs:/mnt/BlockChain/Blocks/Bitcoin-cash/url_list.csv")
#dbutils.fs.ls("file:/databricks/driver/ZipDownloads/")
#len(dbutils.fs.ls("dbfs:/mnt/BlockChain/Blocks/Bitcoin-cash/ZipFIles/"))
#len(dbutils.fs.ls("dbfs:/mnt/BlockChain/Blocks/Bitcoin-cash/TSVData"))
#dbutils.fs.rm("dbfs:/mnt/BlockChain/Blocks/Bitcoin-cash/bc_block_sub_df_exported.csv", recurse = True)
#bc_block_tsv_rdd_Df[["guessed_miner"]].collect().unique()["guessed_miner"]
#dbutils.fs.ls("dbfs:/mnt/BlockChain/Blocks/Bitcoin-cash/ZipFIles/download_failures.txt")
#tsv_files_path = "dbfs:/mnt/BlockChain/Blocks/Bitcoin-cash/TSVData/blockchair_bitcoin-cash_blocks_*.tsv"


### (3) Dash coin

https://gz.blockchair.com/dash/blocks/

In [0]:
### 1
# DBFS locations for the Dash block data: coin root, extracted TSV files, and
# downloaded compressed files.
D_MNT_LOC = "dbfs:/mnt/BlockChain/Blocks/Dash/"
D_MNT_TSV_lOC = "dbfs:/mnt/BlockChain/Blocks/Dash/TSVData"
D_MNT_ZIP_lOC = "dbfs:/mnt/BlockChain/Blocks/Dash/ZipFIles"
#destlocation_tsv = "dbfs:/mnt/BlockChain/Blocks/Bitcoin-cash/TSVData/"

In [0]:
### 2
def get_dash_blocks_file_url_links():
  """Return the download-URL DataFrame for the Dash block dumps.

  Delegates to ``get_file_url_links_spark`` with the Dash cache folder, index
  URL and key suffix.
  """
  # NOTE(review): the key suffix is hard-coded in the notebook; consider loading
  # it from a secret store or environment variable instead.
  return get_file_url_links_spark(
      "/mnt/BlockChain/Blocks/Dash/",
      'https://gz.blockchair.com/dash/blocks/',
      '?key=202001ZjMvj8R3BF',
  )

In [0]:
### 3
### Load all Dash TSV files into a dataframe
def Ceate_df_from_dash_TSV_files():
  """Load every Dash ``*.tsv`` file into one Spark DataFrame.

  The files are read as tab-delimited CSV with a header row and schema
  inference. The row count is printed before the DataFrame is returned.
  """
  # The path now uses "Dash", matching D_MNT_TSV_lOC where the files are moved;
  # the earlier lowercase "dash" pointed at a different folder. The unused RDD
  # created with sc.textFile has been dropped.
  dash_block_tsv_rdd_Df = sqlContext.read.format('com.databricks.spark.csv').options(header='true', delimiter='\t', inferschema='true').load("/mnt/BlockChain/Blocks/Dash/TSVData/*.tsv")
  print(dash_block_tsv_rdd_Df.count())
  return dash_block_tsv_rdd_Df

In [0]:
### 4
### Get the URL links of the compressed files, then download and extract them.
### Download and extraction failures are logged to separate log files.
def download_dash_zip_files():
  """Fetch the Dash URL list and download every file in it."""
  download_files_in_url_df(get_dash_blocks_file_url_links())
  
### Move downloaded compressed and TSV files from the driver to the Dash DBFS folders
def move_dash_block_to_dbfs():
  """Move the driver downloads into the Dash TSV and zip folders."""
  move_downloads_from_DRIVER_to_MOUNT(
      TSVFolderPath=D_MNT_TSV_lOC,
      ZIPFolderPath=D_MNT_ZIP_lOC,
  )


In [0]:
###5 
### Load all Dash TSV files into a dataframe (no row count printed)
def Ceate_df_from_Dash_TSV_files():
  """Return a Spark DataFrame over every Dash ``*.tsv`` file.

  The files are read as tab-delimited CSV with a header row and schema inference.
  """
  tsv_reader = sqlContext.read.format('com.databricks.spark.csv')
  tsv_reader = tsv_reader.options(header='true', delimiter='\t', inferschema='true')
  return tsv_reader.load("/mnt/BlockChain/Blocks/Dash/TSVData/*.tsv")
  

In [0]:
###6
def Create_Export_Dash_Subset_Columns():
  """Export selected Dash block columns as CSV and copy it to FileStore.

  Reads the notebook-level variable ``dash_block_tsv_rdd_df``, which must have
  been assigned before this is called. The selection is coalesced to one
  partition, written with a header, and then copied recursively to FileStore.
  """
  export_path = "dbfs:/mnt/BlockChain/Blocks/Dash/dash_block_tsv_rdd_df.csv"
  wanted_columns = ["ID","time","median_time","size","guessed_miner","difficulty","fee_total","fee_total_usd","reward","reward_usd"]
  subset_df = dash_block_tsv_rdd_df[wanted_columns]
  subset_df.coalesce(1).write.option("header", "true").csv(export_path)
  dbutils.fs.cp(export_path, "dbfs:/FileStore/dash_block_tsv_rdd_df.csv", recurse = True)

In [0]:
##setup_folder_strucutre(D_MNT_LOC, D_MNT_TSV_lOC, D_MNT_ZIP_lOC)
##download_dash_zip_files()
##[dl_success.value, dl_skip.value, dl_fail.value]
##move_downloads_from_DRIVER_to_MOUNT(D_MNT_TSV_lOC, D_MNT_ZIP_lOC)
##dash_block_tsv_rdd_df = Ceate_df_from_Dash_TSV_files()
# Export the Dash subset CSV.
# NOTE(review): this is the only live line in the cell; the line that assigns
# dash_block_tsv_rdd_df is commented out above, so on a fresh kernel this fails.
Create_Export_Dash_Subset_Columns()

In [0]:
# Show the download tallies as [success, skip, fail].
[dl_success.value, dl_skip.value, dl_fail.value]

In [0]:
# Row count of the Dash block data (requires dash_block_tsv_rdd_df to be loaded).
dash_block_tsv_rdd_df.count()

In [0]:
# List the column names of the Dash block data.
dash_block_tsv_rdd_df.columns

In [0]:
# Number of entries in the Dash TSV folder.
len(dbutils.fs.ls(D_MNT_TSV_lOC))

### (4) BITCOIN SV

In [0]:
##https://gz.blockchair.com/bitcoin-sv/blocks/

In [0]:
### 2
def get_bitcoin_sv_blocks_file_url_links():
  """Return the download-URL DataFrame for the Bitcoin SV block dumps.

  Delegates to ``get_file_url_links_spark`` with the Bitcoin SV cache folder,
  index URL and key suffix.
  """
  # NOTE(review): the key suffix is hard-coded in the notebook; consider loading
  # it from a secret store or environment variable instead.
  return get_file_url_links_spark(
      "/mnt/BlockChain/Blocks/BC-SV/",
      'https://gz.blockchair.com/bitcoin-sv/blocks/',
      '?key=202001ZjMvj8R3BF',
  )

In [0]:
### 3
### Load all Bitcoin SV TSV files into a dataframe
def Ceate_df_from_BS_SV_TSV_files():
  """Return a Spark DataFrame over every Bitcoin SV ``*.tsv`` file.

  The files are read as tab-delimited CSV with a header row and schema inference.
  """
  # The RDD previously created with sc.textFile was never used and has been dropped.
  bc_sv_block_tsv_rdd_df = sqlContext.read.format('com.databricks.spark.csv').options(header='true', delimiter='\t', inferschema='true').load("/mnt/BlockChain/Blocks/BC-SV/TSVData/*.tsv")
  return bc_sv_block_tsv_rdd_df

In [0]:
### 4
### Get the URL links of the compressed files, then download and extract them.
### Download and extraction failures are logged to separate log files.
def download_bc_sv_zip_files():
  """Fetch the Bitcoin SV URL list and download every file in it."""
  download_files_in_url_df(get_bitcoin_sv_blocks_file_url_links())
  

In [0]:
###5 
### Load all Bitcoin SV TSV files into a dataframe
def Ceate_df_from_bcsv_TSV_files():
  """Return a Spark DataFrame over every Bitcoin SV ``*.tsv`` file.

  The files are read as tab-delimited CSV with a header row and schema inference.
  """
  tsv_reader = sqlContext.read.format('com.databricks.spark.csv')
  tsv_reader = tsv_reader.options(header='true', delimiter='\t', inferschema='true')
  return tsv_reader.load("/mnt/BlockChain/Blocks/BC-SV/TSVData/*.tsv")

In [0]:
###6
def Create_Export_BCSV_Subset_Columns():
  """Export selected Bitcoin SV block columns as CSV and copy it to FileStore.

  Reads the notebook-level variable ``df_bcsv_block_tsv_rdd_df``, which must
  have been assigned before this is called. The selection is coalesced to one
  partition, written with a header, and then copied recursively to FileStore.
  """
  export_path = "dbfs:/mnt/BlockChain/Blocks/BC-SV/bcsv_block_tsv_rdd_df.csv"
  wanted_columns = ["ID","time","median_time","size","guessed_miner","difficulty","fee_total","fee_total_usd","reward","reward_usd"]
  subset_df = df_bcsv_block_tsv_rdd_df[wanted_columns]
  subset_df.coalesce(1).write.option("header", "true").csv(export_path)
  dbutils.fs.cp(export_path, "dbfs:/FileStore/bcsv_block_tsv_rdd_df.csv", recurse = True)

Execute below code

In [0]:
### 1
# DBFS locations for the Bitcoin SV block data: coin root, extracted TSV files,
# and downloaded compressed files. In the visible notebook they are referenced
# only by the commented-out driver cell below.
BC_SV_MNT_LOC = "dbfs:/mnt/BlockChain/Blocks/BC-SV/"
BC_SV_MNT_TSV_lOC = "dbfs:/mnt/BlockChain/Blocks/BC-SV/TSVData"
BC_SV_MNT_ZIP_lOC = "dbfs:/mnt/BlockChain/Blocks/BC-SV/ZipFIles"
#destlocation_tsv = "dbfs:/mnt/BlockChain/Blocks/Bitcoin-cash/TSVData/"

In [0]:
##setup_folder_strucutre(BC_SV_MNT_LOC, BC_SV_MNT_TSV_lOC, BC_SV_MNT_ZIP_lOC)
##download_bc_sv_zip_files()
##[dl_success.value, dl_skip.value, dl_fail.value]
##move_downloads_from_DRIVER_to_MOUNT(BC_SV_MNT_TSV_lOC, BC_SV_MNT_ZIP_lOC)
##df_bcsv_block_tsv_rdd_df = Ceate_df_from_bcsv_TSV_files()
##Create_Export_BCSV_Subset_Columns()

In [0]:
# Print the download tallies as [success, skip, fail].
print ([dl_success.value, dl_skip.value, dl_fail.value])

In [0]:
# List the column names of the Bitcoin SV block data.
# NOTE(review): df_bcsv_block_tsv_rdd_df is only assigned in a commented-out
# line above, so this raises NameError on a fresh kernel.
df_bcsv_block_tsv_rdd_df.columns

In [0]:
# Row count of the Bitcoin SV block data.
df_bcsv_block_tsv_rdd_df.count()

### (5) LITE COIN

In [0]:
def get_litecoin_blocks_file_url_links():
  """Return the download-URL DataFrame for the Litecoin block dumps.

  Delegates to ``get_file_url_links_spark`` with the Litecoin cache folder,
  index URL and key suffix.
  """
  # NOTE(review): the key suffix is hard-coded in the notebook; consider loading
  # it from a secret store or environment variable instead.
  return get_file_url_links_spark(
      "/mnt/BlockChain/Blocks/LiteCoin/",
      'https://gz.blockchair.com/litecoin/blocks/',
      '?key=202001ZjMvj8R3BF',
  )

In [0]:
### 3
### Load all Litecoin TSV files into a dataframe
def Ceate_df_from_LiteCoin_TSV_files():
  """Return a Spark DataFrame over every Litecoin ``*.tsv`` file.

  The files are read as tab-delimited CSV with a header row and schema inference.
  """
  # The RDD previously created with sc.textFile was never used and has been dropped.
  litecoin_block_tsv_rdd_df = sqlContext.read.format('com.databricks.spark.csv').options(header='true', delimiter='\t', inferschema='true').load("/mnt/BlockChain/Blocks/LiteCoin/TSVData/*.tsv")
  return litecoin_block_tsv_rdd_df

In [0]:
### 4
### Get the URL links of the compressed files, then download and extract them.
### Download and extraction failures are logged to separate log files.
def download_litecoin_zip_files():
  """Fetch the Litecoin URL list and download every file in it."""
  download_files_in_url_df(get_litecoin_blocks_file_url_links())

In [0]:
###5 
### Load all Litecoin TSV files into a dataframe
def Ceate_df_from_litecoin_TSV_files():
  """Return a Spark DataFrame over every Litecoin ``*.tsv`` file.

  The files are read as tab-delimited CSV with a header row and schema inference.
  """
  # The RDD previously created with sc.textFile was never used and has been dropped.
  litecoin_block_tsv_rdd_df = sqlContext.read.format('com.databricks.spark.csv').options(header='true', delimiter='\t', inferschema='true').load("/mnt/BlockChain/Blocks/LiteCoin/TSVData/*.tsv")
  return litecoin_block_tsv_rdd_df

In [0]:
###6
def Create_Export_litecoin_Subset_Columns(block_df):
  """Export selected columns of ``block_df`` as CSV and copy it to FileStore.

  Parameters
  ----------
  block_df : Spark DataFrame
      Block data to export. The selection is coalesced to one partition,
      written with a header, and then copied recursively to FileStore.
  """
  export_path = "dbfs:/mnt/BlockChain/Blocks/LiteCoin/block_df.csv"
  wanted_columns = ["ID","time","median_time","size","guessed_miner","difficulty","fee_total","fee_total_usd","reward","reward_usd"]
  subset_df = block_df[wanted_columns]
  subset_df.coalesce(1).write.option("header", "true").csv(export_path)
  dbutils.fs.cp(export_path, "dbfs:/FileStore/block_df.csv", recurse = True)

Execute below code

In [0]:
### 1
# DBFS locations for the Litecoin block data: coin root, extracted TSV files,
# and downloaded compressed files.
LC_MNT_LOC = "dbfs:/mnt/BlockChain/Blocks/LiteCoin/"
LC_MNT_TSV_lOC = "dbfs:/mnt/BlockChain/Blocks/LiteCoin/TSVData"
LC_MNT_ZIP_lOC = "dbfs:/mnt/BlockChain/Blocks/LiteCoin/ZipFIles"
#destlocation_tsv = "dbfs:/mnt/BlockChain/Blocks/Bitcoin-cash/TSVData/"

In [0]:
##setup_folder_strucutre(LC_MNT_LOC, LC_MNT_TSV_lOC, LC_MNT_ZIP_lOC)
##download_litecoin_zip_files()
#[dl_success.value, dl_skip.value, dl_fail.value]
##move_downloads_from_DRIVER_to_MOUNT(LC_MNT_TSV_lOC, LC_MNT_ZIP_lOC)
##litecoin_block_tsv_rdd_df = Ceate_df_from_litecoin_TSV_files()
##Create_Export_litecoin_Subset_Columns(litecoin_block_tsv_rdd_df)

In [0]:
# Row count of the Litecoin block data.
# NOTE(review): litecoin_block_tsv_rdd_df is only assigned in a commented-out
# line above, so this raises NameError on a fresh kernel.
litecoin_block_tsv_rdd_df.count()

In [0]:
# Row count of the Litecoin block data (same call as the previous cell).
litecoin_block_tsv_rdd_df.count()

In [0]:
# Show the download tallies as [success, skip, fail].
[dl_success.value, dl_skip.value, dl_fail.value]

### (6) Ethereum

In [0]:
# Zero the three download tallies before the Ethereum run.
dl_success.value = dl_skip.value = dl_fail.value = 0

In [0]:
def get_ethereum_blocks_file_url_links():
  """Return the download-URL DataFrame for the Ethereum block dumps.

  Delegates to ``get_file_url_links_spark`` with the Ethereum cache folder,
  index URL and key suffix.
  """
  # NOTE(review): the key suffix is hard-coded in the notebook; consider loading
  # it from a secret store or environment variable instead.
  return get_file_url_links_spark(
      "/mnt/BlockChain/Blocks/Ethereum/",
      'https://gz.blockchair.com/ethereum/blocks/',
      '?key=202001ZjMvj8R3BF',
  )

In [0]:
### 3
### Load all Ethereum TSV files into a dataframe
def Ceate_df_from_ethereum_TSV_files():
  """Return a Spark DataFrame over every Ethereum ``*.tsv`` file.

  The files are read as tab-delimited CSV with a header row and schema inference.
  """
  # The RDD previously created with sc.textFile was never used and has been dropped.
  ethereum_block_tsv_rdd_df = sqlContext.read.format('com.databricks.spark.csv').options(header='true', delimiter='\t', inferschema='true').load("/mnt/BlockChain/Blocks/Ethereum/TSVData/*.tsv")
  # Return the frame loaded here; this used to return litecoin_block_tsv_rdd_df.
  return ethereum_block_tsv_rdd_df

In [0]:
### 4
### Get the URL links of the compressed files, then download and extract them.
### Download and extraction failures are logged to separate log files.
def download_ethereum_zip_files():
  """Fetch the Ethereum URL list and download every file in it."""
  download_files_in_url_df(get_ethereum_blocks_file_url_links())

In [0]:
###5 
### Load all Ethereum TSV files into a dataframe
### (same name as the definition in an earlier cell; running this cell replaces it)
def Ceate_df_from_ethereum_TSV_files():
  """Return a Spark DataFrame over every Ethereum ``*.tsv`` file.

  The files are read as tab-delimited CSV with a header row and schema inference.
  """
  # The RDD previously created with sc.textFile was never used and has been dropped.
  ethereum_block_tsv_rdd_df = sqlContext.read.format('com.databricks.spark.csv').options(header='true', delimiter='\t', inferschema='true').load("/mnt/BlockChain/Blocks/Ethereum/TSVData/*.tsv")
  return ethereum_block_tsv_rdd_df

In [0]:
###6
def Create_Export_Ethereum_Subset_Columns():
  """Export selected Ethereum block columns as CSV and copy it to FileStore.

  Reads the notebook-level variable ``df_ethereum_block_tsv_rdd_df``, which
  must have been assigned before this is called. The selection is coalesced to
  one partition, written with a header, and then copied recursively to FileStore.
  """
  export_path = "dbfs:/mnt/BlockChain/Blocks/Ethereum/df_ethereum_block_tsv_rdd_df.csv"
  wanted_columns = ["ID","time","size","miner","difficulty","fee_total","fee_total_usd","reward","reward_usd"]
  subset_df = df_ethereum_block_tsv_rdd_df[wanted_columns]
  subset_df.coalesce(1).write.option("header", "true").csv(export_path)
  dbutils.fs.cp(export_path, "dbfs:/FileStore/df_ethereum_block_tsv_rdd_df.csv", recurse = True)

Execute below code to get Ethereum

In [0]:
### 1
# DBFS locations for the Ethereum block data: coin root, extracted TSV files,
# and downloaded compressed files. In the visible notebook they are referenced
# only by the commented-out driver cell below.
EC_MNT_LOC = "dbfs:/mnt/BlockChain/Blocks/Ethereum/"
EC_MNT_TSV_lOC = "dbfs:/mnt/BlockChain/Blocks/Ethereum/TSVData"
EC_MNT_ZIP_lOC = "dbfs:/mnt/BlockChain/Blocks/Ethereum/ZipFIles"
#destlocation_tsv = "dbfs:/mnt/BlockChain/Blocks/Bitcoin-cash/TSVData/"

In [0]:
##setup_folder_strucutre(EC_MNT_LOC, EC_MNT_TSV_lOC, EC_MNT_ZIP_lOC)
##download_ethereum_zip_files()
##[dl_success.value, dl_skip.value, dl_fail.value]
##move_downloads_from_DRIVER_to_MOUNT(EC_MNT_TSV_lOC, EC_MNT_ZIP_lOC)
##df_ethereum_block_tsv_rdd_df = Ceate_df_from_ethereum_TSV_files()
##Create_Export_Ethereum_Subset_Columns()

In [0]:
# Show the download tallies as [success, skip, fail].
[dl_success.value, dl_skip.value, dl_fail.value] 

In [0]:
# Preview the first 5 rows of the Ethereum block data. show() prints the rows
# itself and returns None, so the surrounding display(...) call only displayed
# None and has been removed.
df_ethereum_block_tsv_rdd_df.show(5)

In [0]:
# List the column names of the Ethereum block data.
# NOTE(review): df_ethereum_block_tsv_rdd_df is only assigned in a commented-out
# line above, so this raises NameError on a fresh kernel.
df_ethereum_block_tsv_rdd_df.columns

In [0]:
# List the contents of the driver-local folder /databricks/driver/.
dbutils.fs.ls("file:/databricks/driver/")

In [0]:
# Number of entries in a coin's TSV folder.
# NOTE(review): this cell sits in the Ethereum section but counts the Litecoin
# folder (LC_MNT_TSV_lOC); EC_MNT_TSV_lOC may have been intended. Confirm.
len(dbutils.fs.ls(LC_MNT_TSV_lOC))

### (7) BIT COIN
https://gz.blockchair.com/bitcoin/blocks/

In [0]:
# Zero the three download tallies before the Bitcoin run.
dl_success.value = dl_skip.value = dl_fail.value = 0

In [0]:
def get_bitcoin_blocks_file_url_links():
  """Return the download-URL DataFrame for the Bitcoin block dumps.

  Delegates to ``get_file_url_links_spark`` with the Bitcoin cache folder,
  index URL and key suffix.
  """
  # NOTE(review): the key suffix is hard-coded in the notebook; consider loading
  # it from a secret store or environment variable instead.
  return get_file_url_links_spark(
      "/mnt/BlockChain/Blocks/BitCoin/",
      'https://gz.blockchair.com/bitcoin/blocks/',
      '?key=202001ZjMvj8R3BF',
  )

In [0]:
### 3
### Load all Bitcoin TSV files into a dataframe
def Ceate_df_from_bitcoin_TSV_files():
  """Return a Spark DataFrame over every Bitcoin ``*.tsv`` file.

  The files are read as tab-delimited CSV with a header row and schema inference.
  """
  # The RDD previously created with sc.textFile was never used and has been dropped.
  bitcoin_block_tsv_rdd_df = sqlContext.read.format('com.databricks.spark.csv').options(header='true', delimiter='\t', inferschema='true').load("/mnt/BlockChain/Blocks/BitCoin/TSVData/*.tsv")
  return bitcoin_block_tsv_rdd_df

In [0]:
### 4
### Get the URL links of the compressed files, then download and extract them.
### Download and extraction failures are logged to separate log files.
def download_bitcoin_zip_files():
  """Fetch the Bitcoin URL list and download every file in it."""
  download_files_in_url_df(get_bitcoin_blocks_file_url_links())

In [0]:
###5 
### Load all Bitcoin TSV files into a dataframe
### (same name as the definition in an earlier cell; running this cell replaces it)
def Ceate_df_from_bitcoin_TSV_files():
  """Return a Spark DataFrame over every Bitcoin ``*.tsv`` file.

  The files are read as tab-delimited CSV with a header row and schema inference.
  """
  # The RDD previously created with sc.textFile was never used and has been dropped.
  bitcoin_block_tsv_rdd_df = sqlContext.read.format('com.databricks.spark.csv').options(header='true', delimiter='\t', inferschema='true').load("/mnt/BlockChain/Blocks/BitCoin/TSVData/*.tsv")
  return bitcoin_block_tsv_rdd_df

In [0]:
###6
def Create_Export_bitcoin_Subset_Columns():
  """Export selected Bitcoin block columns as CSV and copy it to FileStore.

  Reads the notebook-level variable ``bitcoin_block_tsv_rdd_df``, which must
  have been assigned before this is called. The selection is coalesced to one
  partition, written with a header, and then copied recursively to FileStore.
  """
  export_path = "dbfs:/mnt/BlockChain/Blocks/BitCoin/bitcoin_block_tsv_rdd_df.csv"
  wanted_columns = ["ID","time","median_time", "size","guessed_miner","difficulty","fee_total","fee_total_usd","reward","reward_usd"]
  subset_df = bitcoin_block_tsv_rdd_df[wanted_columns]
  subset_df.coalesce(1).write.option("header", "true").csv(export_path)
  dbutils.fs.cp(export_path, "dbfs:/FileStore/bitcoin_block_tsv_rdd_df.csv", recurse = True)

Execute below code for BitCoin data

In [0]:
### 1
# DBFS locations for the Bitcoin block data: coin root, extracted TSV files,
# and downloaded compressed files. In the visible notebook they are referenced
# only by the commented-out driver cell below.
BC_MNT_LOC = "dbfs:/mnt/BlockChain/Blocks/BitCoin/"
BC_MNT_TSV_lOC = "dbfs:/mnt/BlockChain/Blocks/BitCoin/TSVData"
BC_MNT_ZIP_lOC = "dbfs:/mnt/BlockChain/Blocks/BitCoin/ZipFIles"
#destlocation_tsv = "dbfs:/mnt/BlockChain/Blocks/Bitcoin-cash/TSVData/"

In [0]:
##setup_folder_strucutre(BC_MNT_LOC, BC_MNT_TSV_lOC, BC_MNT_ZIP_lOC)
##download_bitcoin_zip_files()
# Download tallies as [success, skip, fail]. This is the only live line in the
# cell, and because a comment is the last line it is evaluated without being
# the cell's final expression.
[dl_success.value, dl_skip.value, dl_fail.value]
##move_downloads_from_DRIVER_to_MOUNT(BC_MNT_TSV_lOC, BC_MNT_ZIP_lOC)
##bitcoin_block_tsv_rdd_df = Ceate_df_from_bitcoin_TSV_files()
##Create_Export_bitcoin_Subset_Columns()

In [0]:
# Row count of the Bitcoin block data.
# NOTE(review): bitcoin_block_tsv_rdd_df is only assigned in a commented-out
# line above, so this raises NameError on a fresh kernel.
bitcoin_block_tsv_rdd_df.count()

In [0]:
# List the column names of the Bitcoin block data.
bitcoin_block_tsv_rdd_df.columns

In [0]:
# Row count of the Bitcoin block data (same call as two cells earlier).
bitcoin_block_tsv_rdd_df.count()

In [0]:
# Show the download tallies as [success, skip, fail].
[dl_success.value, dl_skip.value, dl_fail.value]