In [1]:
import requests
from alchemy_api import AlchemyApi
import db_connection as db_conn
import random
from concurrent.futures import ThreadPoolExecutor, as_completed
import backoff
from requests.exceptions import RequestException
import pandas as pd
import numpy as np
import mysql.connector
from sqlalchemy import create_engine
import threading
import math
import pickle
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, lit, to_timestamp
from pyspark.sql.types import StructType, StructField, StringType, FloatType



In [16]:
# Shared AlchemyApi instance; the fetch cell below calls its helpers
# (block-to-hex conversion, API key list, API URL, request-body builders).
alchemy = AlchemyApi()

In [None]:
def api_key_rotation(api_keys):
    """Pick one API key uniformly at random.

    Args:
        api_keys: Non-empty sequence of API keys.

    Returns:
        One element of ``api_keys``.

    Raises:
        ValueError: If ``api_keys`` is empty.
    """
    # Fail with an explicit message instead of the opaque
    # "empty range" error that random index generation would produce.
    if len(api_keys) == 0:
        raise ValueError("api_keys must contain at least one key")
    return random.choice(api_keys)

def get_block_chunks(start_block, end_block, step=3):
    """Split the inclusive range [start_block, end_block] into consecutive chunks.

    Every chunk covers ``step`` blocks except possibly the last one, which is
    clamped to ``end_block``.

    Args:
        start_block: First block number (inclusive).
        end_block: Last block number (inclusive).
        step: Maximum number of blocks per chunk; must be >= 1.

    Returns:
        List of ``(first_block, last_block)`` tuples, both ends inclusive.
        Empty when ``start_block > end_block``.

    Raises:
        ValueError: If ``step`` is smaller than 1.
    """
    if step < 1:
        raise ValueError("step must be a positive integer")
    # Clamping the upper bound also covers spans shorter than one full step,
    # which previously crashed on an empty intermediate list.
    return [
        (chunk_start, min(chunk_start + step - 1, end_block))
        for chunk_start in range(start_block, end_block + 1, step)
    ]

@backoff.on_exception(backoff.expo,
                      (RequestException, KeyError),
                      max_tries=8)
def _fetch_transactions_with_retry(block_range, timeout):
    """Request the asset transfers for one block range, retried with exponential backoff.

    Returns:
        ``(transactions, needs_pagination)`` where ``transactions`` is a list of
        flattened transfer dicts and ``needs_pagination`` tells whether the
        response appears to be incomplete.

    Raises:
        RequestException, KeyError: Re-raised on every failed attempt so the
            backoff decorator can retry; propagates once retries are exhausted.
    """
    api_key = None  # bound up front so the error handler can always reference it
    try:
        start_block_hex = alchemy.convert_block_to_hex(block_range[0])
        end_block_hex = alchemy.convert_block_to_hex(block_range[1])
        api_key = api_key_rotation(alchemy.get_api_keys())
        alchemy_url = alchemy.get_api_url() + api_key
        transfers_body = alchemy.create_asset_transfers()
        transfers_body = alchemy.set_tx_block_range(transfers_body, block_from=start_block_hex, block_to=end_block_hex)
        # A timeout keeps a stalled connection from blocking a worker thread forever;
        # requests raises a RequestException subclass on timeout, so it is retried.
        response_tx = requests.post(alchemy_url, json=transfers_body, timeout=timeout)
        # Turn HTTP error statuses into a retryable RequestException before parsing the body.
        response_tx.raise_for_status()
        result = response_tx.json()["result"]
        transfers = result["transfers"]
        # NOTE(review): per the Alchemy docs a truncated response carries a "pageKey";
        # the original size check is kept as a fallback - confirm against the request body used.
        needs_pagination = bool(result.get("pageKey")) or len(transfers) > 1000
        transactions = [
            {
                "hash": tx["hash"],
                "blockNumber": tx["blockNum"],
                "from": tx["from"],
                "to": tx["to"],
                "value": tx["value"],
                "erc721TokenId": tx["erc721TokenId"],
                "erc1155Metadata": tx["erc1155Metadata"],
                "tokenId": tx["tokenId"],
                "asset": tx["asset"],
                "category": tx["category"],
                "timestamp": tx['metadata']['blockTimestamp'],
            }
            for tx in transfers
        ]
    except (RequestException, KeyError) as e:
        print(f"Request failed with exception: {e}. Retrying...")
        # Only a short suffix is shown so the full key never lands in notebook output.
        key_hint = f"...{api_key[-4:]}" if api_key else "<none selected>"
        print(f"API key used: {key_hint}")
        raise

    return transactions, needs_pagination


def fetch_transactions(block_range, timeout=30):
    """Fetch all asset transfers for one inclusive block range.

    Args:
        block_range: ``(first_block, last_block)`` tuple of block numbers.
        timeout: Timeout in seconds passed to ``requests.post``.

    Returns:
        ``(transactions, failed_ranges, ranges_with_pagination)``:
        ``transactions`` is a list of flattened transfer dicts,
        ``failed_ranges`` is ``[block_range]`` if every retry failed (else empty),
        ``ranges_with_pagination`` is ``[block_range]`` if the response looked
        truncated (else empty).
    """
    try:
        transactions, needs_pagination = _fetch_transactions_with_retry(block_range, timeout)
    except (RequestException, KeyError) as e:
        # Retries are exhausted: report the range to the caller instead of
        # raising, so one bad range does not abort the whole run.
        print(f"Giving up on block range {block_range}: {e}")
        return [], [block_range], []

    ranges_with_pagination = [block_range] if needs_pagination else []
    return transactions, [], ranges_with_pagination

# Inclusive block span to download, split into 3-block requests.
start_block = 14999390
end_block = 15053225
block_ranges = get_block_chunks(start_block, end_block, 3)

all_transactions = []
all_failed_ranges = []
all_pagination_ranges = []

lock = threading.Lock()

with ThreadPoolExecutor(max_workers=6) as executor:
    # Map each future back to its block range so a failure can be attributed.
    futures = {executor.submit(fetch_transactions, block_range): block_range for block_range in block_ranges}
    for future in as_completed(futures):
        try:
            transactions, failed_ranges, ranges_with_pagination = future.result()
        except (RequestException, KeyError) as e:
            # future.result() re-raises the worker's exception; record the range
            # instead of aborting and discarding everything fetched so far.
            print(f"Block range {futures[future]} failed: {e}")
            transactions, failed_ranges, ranges_with_pagination = [], [futures[future]], []
        # Results are only consumed in this thread, so the lock is not strictly
        # required; it is kept to leave the original structure intact.
        with lock:
            all_transactions.extend(transactions)
            all_failed_ranges.extend(failed_ranges)
            all_pagination_ranges.extend(ranges_with_pagination)

print(len(all_transactions))
transaction_blocks = set(tx['blockNumber'] for tx in all_transactions)
# Filter out the failed ranges that have block numbers in all transactions
# (only the first and last block of each range are checked).
all_failed_ranges = [
    block_range for block_range in all_failed_ranges
    if alchemy.convert_block_to_hex(block_range[0]) not in transaction_blocks
    and alchemy.convert_block_to_hex(block_range[1]) not in transaction_blocks
]

In [18]:
# Show the failed ranges and the ranges needing pagination, one list per line.
print(all_failed_ranges, all_pagination_ranges, sep='\n')

[]
[]


In [19]:
# Persist the downloaded transfers so later cells can reload them without refetching.
with open('data/all_transactions_june_3.pickle', 'wb') as pickle_file:
    pickle.dump(all_transactions, pickle_file)

In [20]:
def _pickle_to_parquet(pickle_path, parquet_path):
    """Load a pickled list of transfer records, print its row count and write it as parquet.

    Args:
        pickle_path: Path of the pickle file to read.
        parquet_path: Path of the parquet file to write.

    Returns:
        The pandas DataFrame built from the pickle.
    """
    # NOTE: unpickling can execute arbitrary code - only load pickle files
    # produced by this notebook, never files from an untrusted source.
    frame = pd.DataFrame(pd.read_pickle(pickle_path))
    print(frame.shape[0])
    frame.to_parquet(parquet_path)
    return frame


df_june1 = _pickle_to_parquet('data/all_transactions_june_1.pickle', 'data/june1_tx.parquet')
df_june2 = _pickle_to_parquet('data/all_transactions_june_2.pickle', 'data/june2_tx.parquet')
df_june3 = _pickle_to_parquet('data/all_transactions_june_3.pickle', 'data/june3_tx.parquet')

16038297


In [2]:
# Column layout of the transfer records.
# NOTE(review): `schema` is not passed to any reader in this cell.
schema = StructType([
    StructField(field_name, field_type())
    for field_name, field_type in [
        ("hash", StringType),
        ("blockNumber", StringType),
        ("from", StringType),
        ("to", StringType),
        ("value", FloatType),
        ("erc721TokenId", StringType),
        ("erc1155Metadata", StringType),
        ("tokenId", StringType),
        ("asset", StringType),
        ("category", StringType),
        ("timestamp", StringType),
    ]
])

# Start (or reuse) the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName("process_df_tx")
    .config("spark.driver.memory", "16g")
    .config("spark.executor.memory", "32g")
    .getOrCreate()
)

# Read the three parquet files and stack them into a single DataFrame.
df_tx_june_1, df_tx_june_2, df_tx_june_3 = [
    spark.read.parquet(parquet_path)
    for parquet_path in ('data/june1_tx.parquet', 'data/june2_tx.parquet', 'data/june3_tx.parquet')
]
df_tx = df_tx_june_1.union(df_tx_june_2).union(df_tx_june_3).repartition(100)

# Pull 'tokenId' and 'value' out of the first 'erc1155Metadata' entry (null when
# the column or its first entry is missing), then cast the numeric columns and
# parse the timestamp string.
first_erc1155_entry = col('erc1155Metadata').getItem(0)
has_erc1155_entry = col('erc1155Metadata').isNotNull() & (first_erc1155_entry.isNotNull())
df_tx = (
    df_tx
    .withColumn('erc1155_token_id', when(has_erc1155_entry, first_erc1155_entry.getItem('tokenId')).otherwise(lit(None)))
    .withColumn('erc1155_value', when(has_erc1155_entry, first_erc1155_entry.getItem('value')).otherwise(lit(None)))
    .drop('erc1155Metadata')
    .withColumn("value", col("value").cast("float"))
    .withColumn("erc1155_value", col("erc1155_value").cast("float"))
    .withColumn('timestamp', to_timestamp(col('timestamp'), "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"))
)

# Fix the column order, switch to snake_case names and remove exact duplicate rows.
df_tx = (
    df_tx
    .select(
        'hash', 'blockNumber', 'from', 'to', 'value', 'erc721TokenId',
        'erc1155_token_id', 'erc1155_value', 'tokenId', 'asset', 'category', 'timestamp',
    )
    .withColumnRenamed('hash', 'tx_hash')
    .withColumnRenamed('blockNumber', 'block_number')
    .withColumnRenamed('from', 'from_address')
    .withColumnRenamed('to', 'to_address')
    .withColumnRenamed('value', 'asset_value')
    .withColumnRenamed('erc721TokenId', 'erc721_token_id')
    .withColumnRenamed('tokenId', 'token_id')
    .dropDuplicates()
)

# Distinct block numbers.
df_blocks = df_tx.select('block_number').distinct()

# Distinct non-null addresses seen on either side of a transfer.
df_addresses = (
    df_tx.select('from_address')
    .union(df_tx.select('to_address'))
    .distinct()
    .dropna()
    .withColumnRenamed('from_address', 'address')
)

# Distinct transfer categories.
df_categories = (
    df_tx.select('category')
    .distinct()
    .withColumnRenamed('category', 'category_name')
)

# Transfers without a recipient address, reduced to hash, block and sender.
df_contracts = (
    df_tx.select('tx_hash', 'block_number', 'from_address', 'to_address')
    .distinct()
    .filter(col('to_address').isNull())
    .select('tx_hash', 'block_number', 'from_address')
)

23/06/17 23:00:06 WARN Utils: Your hostname, NatRng-MBP.local resolves to a loopback address: 127.0.0.1; using 100.75.44.193 instead (on interface en0)
23/06/17 23:00:06 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/06/17 23:00:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

In [4]:
db_config = db_conn.config_gsql
# Create table
cnx, cursor = db_conn.connect_db(db_config)
try:
    db_conn.create_primary_tables(cursor)
finally:
    # Always release the cursor and the connection, even if table creation fails.
    try:
        cursor.close()
    finally:
        cnx.close()

# Export the lookup frames to their tables.
db_conn.export_df_to_sql(df_blocks, 'Blocks', db_config)
db_conn.export_df_to_sql(df_addresses, 'Addresses', db_config)
db_conn.export_df_to_sql(df_categories, 'TxCategories', db_config)

23/06/17 21:02:38 WARN TaskMemoryManager: Failed to allocate a page (67108864 bytes), try again.
23/06/17 21:02:38 WARN TaskMemoryManager: Failed to allocate a page (33554432 bytes), try again.
23/06/17 21:02:38 WARN TaskMemoryManager: Failed to allocate a page (67108864 bytes), try again.
23/06/17 21:02:38 WARN TaskMemoryManager: Failed to allocate a page (33554432 bytes), try again.
23/06/17 21:02:39 WARN TaskMemoryManager: Failed to allocate a page (33554432 bytes), try again.
23/06/17 21:02:39 WARN TaskMemoryManager: Failed to allocate a page (67108864 bytes), try again.
23/06/17 21:02:39 WARN TaskMemoryManager: Failed to allocate a page (33554432 bytes), try again.
23/06/17 21:02:39 WARN TaskMemoryManager: Failed to allocate a page (33554432 bytes), try again.
23/06/17 21:02:39 WARN TaskMemoryManager: Failed to allocate a page (67108864 bytes), try again.
23/06/17 21:02:39 WARN TaskMemoryManager: Failed to allocate a page (33554432 bytes), try again.
23/06/17 21:02:40 WARN TaskMem

In [5]:
# Read the three lookup tables back over JDBC.
url = f"jdbc:mariadb://{db_config['host']}:{db_config['port']}/{db_config['database']}"
user = db_config['user']
password = db_config['password']
jdbc_common = {'url': url, 'user': user, 'password': password}
block_df = spark.read.format('jdbc').options(dbtable='Blocks', **jdbc_common).load()
address_df = spark.read.format('jdbc').options(dbtable='Addresses', **jdbc_common).load()
category_df = spark.read.format('jdbc').options(dbtable='TxCategories', **jdbc_common).load()

# Left-join each lookup table onto the transfers and keep only the id column it contributes.
df_tx = (
    df_tx
    .join(block_df, on=df_tx["block_number"] == block_df["block_number"], how='left')
    .select(df_tx["*"], block_df['block_id'])
)

df_tx = (
    df_tx
    .join(address_df.alias('from_address_df'), on=df_tx["from_address"] == col("from_address_df.address"), how='left')
    .select(df_tx["*"], col('from_address_df.address_id').alias('from_id'))
)

df_tx = (
    df_tx
    .join(address_df.alias('to_address_df'), on=df_tx["to_address"] == col("to_address_df.address"), how='left')
    .select(df_tx["*"], col('to_address_df.address_id').alias('to_id'))
)

df_tx = (
    df_tx
    .join(category_df, on=df_tx["category"] == category_df["category_name"], how='left')
    .select(df_tx["*"], category_df['category_id'])
)

# The natural-key columns are replaced by the ids joined in above.
df_tx = df_tx.drop('block_number', 'from_address', 'to_address', 'category', 'address')

In [6]:
# Put the columns into their final order, then count the rows.
df_tx = df_tx.select(
    'tx_hash', 'block_id', 'from_id', 'to_id', 'asset_value', 'erc721_token_id',
    'erc1155_token_id', 'erc1155_value', 'token_id', 'asset', 'category_id', 'timestamp',
)
df_tx.count()

                                                                                

50678698

In [5]:
# Create table with foreign keys
cnx, cursor = db_conn.connect_db(db_config)
try:
    db_conn.create_transactions_table(cursor)
finally:
    # Always release the cursor and the connection, even if table creation fails.
    try:
        cursor.close()
    finally:
        cnx.close()

db_conn.export_df_to_sql(df_tx, 'Transactions', db_config)

                                                                                

In [6]:
# Swap the sender address for its address_id.
df_contracts = (
    df_contracts
    .join(address_df, on=df_contracts["from_address"] == address_df["address"], how='left')
    .select(df_contracts["*"], address_df['address_id'])
    .drop('from_address')
)

# Read the exported transactions back to pick up their tx_id values.
tx_df = (
    spark.read.format('jdbc')
    .options(url=url, dbtable='Transactions', user=user, password=password)
    .load()
)

# Match on hash and sender id, restricted to transactions whose to_id is null;
# only the tx_id column is kept.
df_contracts = (
    df_contracts
    .join(
        tx_df,
        on=(df_contracts["tx_hash"] == tx_df["tx_hash"])
        & (df_contracts["address_id"] == tx_df["from_id"])
        & (tx_df["to_id"].isNull()),
        how='left',
    )
    .select(tx_df['tx_id'])
)


In [7]:
# Create the Contracts table, then export the contract rows into it.
cnx, cursor = db_conn.connect_db(db_config)
try:
    db_conn.create_contracts_table(cursor)
finally:
    # Always release the cursor and the connection, even if table creation fails.
    try:
        cursor.close()
    finally:
        cnx.close()
db_conn.export_df_to_sql(df_contracts, 'Contracts', db_config)

                                                                                

23/06/17 23:31:02 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 307506 ms exceeds timeout 120000 ms
23/06/17 23:31:02 WARN SparkContext: Killing executors is not supported by current scheduler.
