In [1]:
import requests
from alchemy_api import AlchemyApi
import db_connection as db_conn
import random
from concurrent.futures import ThreadPoolExecutor, as_completed
import backoff
from requests.exceptions import RequestException
import pandas as pd
import numpy as np
import threading
import pickle
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col, when, lit, to_timestamp
from pyspark.sql.types import StructType, StructField, StringType, FloatType, NullType

In [2]:
# Shared AlchemyApi client; the fetch helpers below use it to build request
# bodies, look up API keys and obtain the endpoint URL.
alchemy = AlchemyApi()

Transaction History Methods

In [3]:
def api_key_rotation(api_keys):
    """Return one randomly chosen entry of ``api_keys``.

    Args:
        api_keys: Indexable, sized collection of API keys.

    Returns:
        The key found at a randomly drawn position.

    Raises:
        ValueError: If ``api_keys`` is empty (raised by the random draw).
    """
    return api_keys[random.randrange(len(api_keys))]

def get_block_chunks(start_block, end_block, step=3):
    """Split the inclusive interval ``[start_block, end_block]`` into chunks.

    Every chunk spans ``step`` blocks except possibly the last one, which is
    clipped to ``end_block``. Intervals shorter than ``step`` yield a single
    short chunk, and an empty interval (``start_block > end_block``) yields an
    empty list instead of raising ``IndexError``.

    Args:
        start_block: First block number (inclusive).
        end_block: Last block number (inclusive).
        step: Maximum number of blocks per chunk; must be >= 1.

    Returns:
        List of ``(first_block, last_block)`` tuples, in ascending order.

    Raises:
        ValueError: If ``step`` is smaller than 1.
    """
    if step < 1:
        raise ValueError("step must be a positive integer")
    return [
        (chunk_start, min(chunk_start + step - 1, end_block))
        for chunk_start in range(start_block, end_block + 1, step)
    ]

@backoff.on_exception(backoff.expo,
                      (RequestException, KeyError),
                      max_tries=4)
def fetch_transactions(block_range):
    """Fetch every asset transfer reported for ``block_range``.

    Pages are requested until the response no longer carries a ``pageKey``,
    and the transfers of all pages are accumulated.

    Args:
        block_range: ``(first_block, last_block)`` pair; both values are
            converted with ``alchemy.convert_block_to_hex`` before the request.

    Returns:
        ``(transactions, failed_ranges)``. ``transactions`` is a list of dicts
        with the keys ``hash``, ``blockNumber``, ``from``, ``to``, ``value``,
        ``erc721TokenId``, ``erc1155Metadata``, ``tokenId``, ``asset``,
        ``category`` and ``timestamp``. ``failed_ranges`` is always empty on a
        normal return, because any failure is re-raised instead of returned.

    Raises:
        RequestException, KeyError: Re-raised after logging. The ``backoff``
            decorator then calls the function again (up to 4 tries in total),
            which restarts pagination from the first page.
    """
    transactions = []
    failed_ranges = []
    page_key = None
    api_key = None  # bound up front so the error log can never hit an unbound name
    # The block bounds are identical for every page, so convert them once.
    start_block_hex = alchemy.convert_block_to_hex(block_range[0])
    end_block_hex = alchemy.convert_block_to_hex(block_range[1])
    while True:
        try:
            api_key = api_key_rotation(alchemy.get_api_keys())
            alchemy_url = alchemy.get_api_url() + api_key
            transfers_body = alchemy.create_asset_transfers()
            transfers_body = alchemy.set_tx_block_range(transfers_body, block_from=start_block_hex, block_to=end_block_hex)
            if page_key:
                transfers_body = alchemy.set_tx_pagination(transfers_body, page_key=page_key)
            response_tx = requests.post(alchemy_url, json=transfers_body)
            result = response_tx.json()["result"]
            page_transactions = [
                {"hash": tx["hash"], "blockNumber": tx["blockNum"], "from": tx["from"], "to": tx["to"],
                 "value": tx["value"], "erc721TokenId": tx["erc721TokenId"],
                 "erc1155Metadata": tx["erc1155Metadata"], "tokenId": tx["tokenId"],
                 "asset": tx["asset"], "category": tx["category"],
                 "timestamp": tx['metadata']['blockTimestamp']}
                for tx in result["transfers"]
            ]
            # Accumulate instead of overwriting, so earlier pages are kept.
            transactions.extend(page_transactions)
            # The last page has no pageKey; reading it unconditionally would
            # raise KeyError on every successful fetch.
            page_key = result.get("pageKey")
            if not page_key:
                break
        except (RequestException, KeyError) as e:
            print(f"Request failed with exception: {e}. Retrying...")
            print(f"API key used: {api_key}")
            # Local bookkeeping only: the re-raise below discards this list.
            failed_ranges.append(block_range)
            raise

    return transactions, failed_ranges

@backoff.on_exception(backoff.expo,
                      (RequestException, KeyError),
                      max_tries=4)
def fetch_io_transactions_by_address(address, transaction_direction):
    """Fetch every asset transfer sent from, or received by, ``address``.

    Pages are requested until the response no longer carries a ``pageKey``,
    and the transfers of all pages are accumulated.

    Args:
        address: Address used as the sender or recipient filter.
        transaction_direction: ``"from"`` filters on the sender; any other
            value filters on the recipient.

    Returns:
        ``(transactions, failed_addresses)``. ``transactions`` is a list of
        dicts with the keys ``hash``, ``blockNumber``, ``from``, ``to``,
        ``value``, ``erc721TokenId``, ``erc1155Metadata``, ``tokenId``,
        ``asset``, ``category`` and ``timestamp``. ``failed_addresses`` is
        always empty on a normal return, because any failure is re-raised.

    Raises:
        RequestException, KeyError: Re-raised after logging. The ``backoff``
            decorator then calls the function again (up to 4 tries in total),
            which restarts pagination from the first page.
    """
    from_transactions = []
    failed_from_addresses = []
    page_key = None
    page_counter = 1
    api_key = None  # bound up front so the error log can never hit an unbound name
    while True:
        # Reset per page: if the request itself fails, the log below must not
        # raise UnboundLocalError or show the previous page's payload.
        response_tx_json = None
        try:
            api_key = api_key_rotation(alchemy.get_api_keys())
            alchemy_url = alchemy.get_api_url() + api_key
            transfers_body = alchemy.create_asset_transfers()
            if transaction_direction == "from":
                transfers_body = alchemy.set_tx_address(transfers_body, address_from=address, address_to=None)
            else:
                transfers_body = alchemy.set_tx_address(transfers_body, address_from=None, address_to=address)
            if page_key:
                transfers_body = alchemy.set_tx_pagination(transfers_body, page_key=page_key)
            response_tx = requests.post(alchemy_url, json=transfers_body)
            response_tx_json = response_tx.json()
            result = response_tx_json["result"]
            transactions = [{"hash": tx["hash"], "blockNumber": tx["blockNum"], "from": tx["from"], "to": tx["to"],
                            "value": tx["value"], "erc721TokenId": tx["erc721TokenId"],
                            "erc1155Metadata": tx["erc1155Metadata"], "tokenId": tx["tokenId"],
                            "asset": tx["asset"], "category": tx["category"],
                            "timestamp": tx['metadata']['blockTimestamp']} for tx in result["transfers"]]
            from_transactions.extend(transactions)

            # A missing or empty pageKey marks the last page; an empty key
            # would otherwise re-request the first page forever.
            next_page_key = result.get("pageKey")
            if not next_page_key:
                break
            page_counter += 1
            page_key = next_page_key
        except (RequestException, KeyError) as e:
            print('From {} page {}: {}'.format(address, page_counter, response_tx_json))
            print(f"Request failed with exception: {e}. Retrying...")
            print(f"API key used: {api_key}")
            # Local bookkeeping only: the re-raise below discards this list.
            failed_from_addresses.append(address)
            raise

    return from_transactions, failed_from_addresses

def fetch_transactions_by_address(address):
    """Fetch the transfers received by and sent from ``address``.

    The recipient-side query runs first, then the sender-side query.

    Returns:
        ``(to_transactions, from_transactions, failed_to_addresses,
        failed_from_addresses)`` as produced by the two calls to
        ``fetch_io_transactions_by_address``.
    """
    incoming, incoming_failures = fetch_io_transactions_by_address(address, 'to')
    outgoing, outgoing_failures = fetch_io_transactions_by_address(address, 'from')
    return incoming, outgoing, incoming_failures, outgoing_failures

Transaction History by Block Number

In [55]:
# Fetch all transfers between start_block and end_block in chunks of 3 blocks,
# using a small thread pool.
start_block = 14814062
end_block = 14881676
block_ranges = get_block_chunks(start_block, end_block, 3)

all_transactions = []
all_failed_ranges = []

# Results are consumed on the main thread only, so the lock is not strictly
# needed; it is kept so the cell's existing names stay available.
lock = threading.Lock()

with ThreadPoolExecutor(max_workers=6) as executor:
    # Map each future back to its block range so a failure can be attributed.
    futures = {executor.submit(fetch_transactions, block_range): block_range
               for block_range in block_ranges}
    for future in as_completed(futures):
        try:
            transactions, failed_ranges = future.result()
        except (RequestException, KeyError) as exc:
            # fetch_transactions re-raises once its retries are exhausted.
            # Record the range instead of aborting the run and losing every
            # result collected so far.
            print(f"Giving up on block range {futures[future]}: {exc}")
            with lock:
                all_failed_ranges.append(futures[future])
            continue
        with lock:
            all_transactions.extend(transactions)
            all_failed_ranges.extend(failed_ranges)

print(len(all_transactions))
transaction_blocks = set(tx['blockNumber'] for tx in all_transactions)
# Keep only the failed ranges whose first and last block are both absent from
# the collected transactions.
all_failed_ranges = [block_range for block_range in all_failed_ranges
                     if alchemy.convert_block_to_hex(block_range[0])
                     not in transaction_blocks and alchemy.convert_block_to_hex(block_range[1]) not in transaction_blocks]

In [None]:
# Show the block ranges that could not be fetched, then persist the transfers
# collected for this block interval.
print(all_failed_ranges)
may_3_pickle_path = 'data/pickle_files/all_transactions_may_3.pickle'
with open(may_3_pickle_path, 'wb') as pickle_file:
    pickle.dump(all_transactions, pickle_file)

[]
[]


Transaction History by Account Address

In [4]:
# Load the address list, shuffle it with a fixed seed so the split is
# reproducible, and cut it into three chunks that are fetched separately.
seed_value = 42
all_addresses = pd.read_pickle('data/pickle_files/scam_users_and_contract_creators_alt.pkl')
random.seed(seed_value)
# NOTE(review): random.shuffle works in place; assumes the pickle holds a
# mutable sequence such as a list. Confirm.
random.shuffle(all_addresses)
addresses_chunks = np.array_split(all_addresses, 3)

In [5]:
# Fetch the incoming and outgoing transfers of every address in the third
# chunk, using a small thread pool.
all_transactions_by_address = []
failed_addresses = []

# Results are consumed on the main thread only, so the lock is not strictly
# needed; it is kept so the cell's existing names stay available.
lock = threading.Lock()

with ThreadPoolExecutor(max_workers=6) as executor:
    # Map each future back to its address so a failure can be attributed.
    futures = {executor.submit(fetch_transactions_by_address, address): address
               for address in addresses_chunks[2]}
    for future in as_completed(futures):
        try:
            to_transactions, from_transactions, failed_to_addresses, failed_from_addresses = future.result()
        except (RequestException, KeyError) as exc:
            # The fetch helpers re-raise once their retries are exhausted.
            # Record the address instead of aborting the run and losing every
            # result collected so far.
            print(f"Giving up on address {futures[future]}: {exc}")
            with lock:
                failed_addresses.append(futures[future])
            continue
        with lock:
            all_transactions_by_address.extend(from_transactions)
            failed_addresses.extend(failed_from_addresses)
            all_transactions_by_address.extend(to_transactions)
            failed_addresses.extend(failed_to_addresses)

from_addresses = set(tx['from'] for tx in all_transactions_by_address)
to_addresses = set(tx['to'] for tx in all_transactions_by_address)
transaction_addresses = from_addresses.union(to_addresses)
# De-duplicate the failed addresses, preserving order. They are deliberately
# not filtered against transaction_addresses: an address whose own fetch failed
# can still appear as the counterparty in another address's transfers, and
# filtering on that would hide a real failure.
all_failed_addresses = list(dict.fromkeys(failed_addresses))

In [6]:
# Number of transfer records collected for this address chunk.
len(all_transactions_by_address)

8730326

In [7]:
# Show the addresses that could not be fetched, then persist the transfers
# collected for this address chunk.
print(all_failed_addresses)
scam_3_pickle_path = 'data/pickle_files/all_transactions_scam_3_alt.pickle'
with open(scam_3_pickle_path, 'wb') as pickle_file:
    pickle.dump(all_transactions_by_address, pickle_file)

[]


In [None]:
# df_part1 = pd.DataFrame(pd.read_pickle('data/pickle_files/all_transactions_may_1.pickle'))
# print(df_part1.shape[0])
# df_part1.to_parquet('data/parquet_files/may1_tx.parquet')
# df_part1 = None
# df_part2 = pd.DataFrame(pd.read_pickle('data/pickle_files/all_transactions_may_2.pickle'))
# print(df_part2.shape[0])
# df_part2.to_parquet('data/parquet_files/may2_tx.parquet')
# df_part2 = None
# df_part3 = pd.DataFrame(pd.read_pickle('data/pickle_files/all_transactions_may_3.pickle'))
# print(df_part3.shape[0])
# df_part3.to_parquet('data/parquet_files/may3_tx.parquet')
# df_part3 = None

20074182


In [2]:
def _pickle_to_parquet(pickle_path, parquet_path):
    """Load a pickled record list, print its row count and write it as parquet.

    Returns the intermediate pandas DataFrame.
    """
    frame = pd.DataFrame(pd.read_pickle(pickle_path))
    print(frame.shape[0])
    frame.to_parquet(parquet_path)
    return frame


# Convert the three per-chunk pickles into parquet files.
df_part1 = _pickle_to_parquet('data/pickle_files/all_transactions_scam_1_alt.pickle',
                              'data/parquet_files/scam_tx_1_alt.parquet')
df_part2 = _pickle_to_parquet('data/pickle_files/all_transactions_scam_2_alt.pickle',
                              'data/parquet_files/scam_tx_2_alt.parquet')
df_part3 = _pickle_to_parquet('data/pickle_files/all_transactions_scam_3_alt.pickle',
                              'data/parquet_files/scam_tx_3_alt.parquet')

792932
420029
8730326


In [3]:
# Create (or reuse) the Spark session for the transformation cells below,
# requesting 16g of driver memory and 32g of executor memory.
spark = (
    SparkSession.builder
    .appName("process_df_tx")
    .config("spark.driver.memory", "16g")
    .config("spark.executor.memory", "32g")
    .getOrCreate()
)

23/08/04 12:44:36 WARN Utils: Your hostname, NatRng-MBP.local resolves to a loopback address: 127.0.0.1; using 10.200.168.84 instead (on interface en0)
23/08/04 12:44:36 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/04 12:44:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Field layout of the raw transfer records. It is declared here but not passed
# to the parquet reads below.
tx_fields = [
    ("hash", StringType()),
    ("blockNumber", StringType()),
    ("from", StringType()),
    ("to", StringType()),
    ("value", FloatType()),
    ("erc721TokenId", StringType()),
    ("erc1155Metadata", StringType()),
    ("tokenId", StringType()),
    ("asset", StringType()),
    ("category", StringType()),
    ("timestamp", StringType()),
]
schema = StructType([StructField(field_name, field_type) for field_name, field_type in tx_fields])

# Read the three parquet parts and stack them into one frame.
df_tx_1 = spark.read.parquet('data/parquet_files/scam_tx_1_alt.parquet')
df_tx_2 = spark.read.parquet('data/parquet_files/scam_tx_2_alt.parquet')
df_tx_3 = spark.read.parquet('data/parquet_files/scam_tx_3_alt.parquet')
df_tx = df_tx_1.union(df_tx_2).union(df_tx_3).repartition(100)
num_rows = df_tx.filter(col("erc1155Metadata").isNotNull()).count()

# Flatten the first erc1155Metadata entry into two scalar columns. When no row
# carries metadata at all, both columns are plain nulls.
if num_rows > 0:
    first_entry = col('erc1155Metadata').getItem(0)
    has_first_entry = col('erc1155Metadata').isNotNull() & (first_entry.isNotNull())
    erc1155_token_id_col = when(has_first_entry, first_entry.getItem('tokenId')).otherwise(lit(None))
    erc1155_value_col = when(has_first_entry, first_entry.getItem('value')).otherwise(lit(None))
else:
    erc1155_token_id_col = lit(None)
    erc1155_value_col = lit(None)
df_tx = (
    df_tx
    .withColumn('erc1155_token_id', erc1155_token_id_col)
    .withColumn('erc1155_value', erc1155_value_col)
    .drop('erc1155Metadata')
)

# Normalise data types and fix the column order.
# NOTE(review): if the erc1155 'value' arrives as a hex string, the float cast
# would yield null. Confirm against the parquet schema.
df_tx = (
    df_tx
    .withColumn("value", col("value").cast("float"))
    .withColumn("erc1155_value", col("erc1155_value").cast("float"))
    .withColumn('timestamp', to_timestamp(col('timestamp'), "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"))
    .select(['hash', 'blockNumber', 'from', 'to', 'value', 'erc721TokenId', 'erc1155_token_id', 'erc1155_value', 'tokenId', 'asset', 'category', 'timestamp'])
)

# Rename the columns to snake_case.
renamed_columns = {
    'hash': 'tx_hash',
    'value': 'asset_value',
    'from': 'from_address',
    'to': 'to_address',
    'erc721TokenId': 'erc721_token_id',
    'tokenId': 'token_id',
    'blockNumber': 'block_number',
}
for old_name, new_name in renamed_columns.items():
    df_tx = df_tx.withColumnRenamed(old_name, new_name)

df_tx = df_tx.dropDuplicates()

# Lookup frames derived from the transfer frame.
df_blocks = df_tx.select('block_number').distinct()

# Every non-null address seen as sender or recipient.
df_addresses = (
    df_tx.select('from_address')
    .union(df_tx.select('to_address'))
    .distinct()
    .dropna()
    .withColumnRenamed('from_address', 'address')
)

df_categories = df_tx.select('category').distinct().withColumnRenamed('category', 'category_name')

# Transfers without a recipient address (presumably contract creations, going
# by the frame's name).
df_contracts = (
    df_tx.select('tx_hash', 'block_number', 'from_address', 'to_address')
    .distinct()
    .filter(col('to_address').isNull())
    .select('tx_hash', 'block_number', 'from_address')
)

                                                                                

In [5]:
# Target database settings. Swap in the commented-out line to use the
# alternative configuration.
# db_config = db_conn.config_gsql
db_config = db_conn.config_scam_alt_sql
url = "jdbc:mariadb://{}:{}/{}".format(db_config['host'], db_config['port'], db_config['database'])
user, password = db_config['user'], db_config['password']

In [6]:
# Create the primary tables, then export the three lookup frames.
cnx, cursor = db_conn.connect_db(db_config)
try:
    try:
        db_conn.create_primary_tables(cursor)
    finally:
        # Release the cursor even if table creation fails.
        cursor.close()
finally:
    # Release the connection even if table creation or cursor close fails.
    cnx.close()

for lookup_frame, table_name in ((df_blocks, 'Blocks'),
                                 (df_addresses, 'Addresses'),
                                 (df_categories, 'TxCategories')):
    db_conn.export_df_to_sql(lookup_frame, table_name, db_config)
    print(f'{table_name} table exported')

                                                                                

Blocks table exported


                                                                                

Addresses table exported




TxCategories table exported


                                                                                

In [7]:
def _read_db_table(table_name):
    """Load one database table into a Spark DataFrame over JDBC."""
    return spark.read.format('jdbc').options(url=url, dbtable=table_name, user=user, password=password).load()


# Read the lookup tables back from the database.
block_df = _read_db_table('Blocks')
address_df = _read_db_table('Addresses')
category_df = _read_db_table('TxCategories')

# Attach block_id.
tx_with_block = df_tx.join(block_df, df_tx["block_number"] == block_df["block_number"], 'left')
df_tx = tx_with_block.select(df_tx["*"], block_df['block_id'])

# Attach from_id and to_id. The address table is joined once per side under
# its own alias.
for side in ('from', 'to'):
    alias_name = f'{side}_address_df'
    side_addresses = address_df.alias(alias_name)
    tx_with_side = df_tx.join(side_addresses, df_tx[f'{side}_address'] == col(f'{alias_name}.address'), 'left')
    df_tx = tx_with_side.select(df_tx["*"], col(f'{alias_name}.address_id').alias(f'{side}_id'))

# Attach category_id.
tx_with_category = df_tx.join(category_df, df_tx.category == category_df.category_name, 'left')
df_tx = tx_with_category.select(df_tx["*"], category_df['category_id'])

# Drop the natural-key columns now that the ids are attached.
df_tx = df_tx.drop('block_number', 'from_address', 'to_address', 'category', 'address')

In [8]:
# Fix the column order for the export.
transaction_columns = [
    'tx_hash', 'block_id', 'from_id', 'to_id', 'asset_value',
    'erc721_token_id', 'erc1155_token_id', 'erc1155_value',
    'token_id', 'asset', 'category_id', 'timestamp',
]
df_tx = df_tx.select(transaction_columns)
# df_tx.count()

In [9]:
# Cast both erc1155 columns to strings and replace their nulls with ''.
for erc1155_column in ('erc1155_token_id', 'erc1155_value'):
    df_tx = df_tx.withColumn(erc1155_column, col(erc1155_column).cast('string'))

df_tx = df_tx.fillna({'erc1155_token_id': '', 'erc1155_value': ''})

In [10]:
# Create the transactions table, then export the transfer frame into it.
cnx, cursor = db_conn.connect_db(db_config)
try:
    try:
        db_conn.create_transactions_table(cursor)
    finally:
        # Release the cursor even if table creation fails.
        cursor.close()
finally:
    # Release the connection even if table creation or cursor close fails.
    cnx.close()

db_conn.export_df_to_sql(df_tx, 'Transactions', db_config)

                                                                                

In [11]:
# Swap the sender address of each contract row for its address_id.
contracts_with_sender = df_contracts.join(address_df, df_contracts.from_address == address_df.address, 'left')
df_contracts = contracts_with_sender.select(df_contracts["*"], address_df['address_id']).drop('from_address')

# Look up the stored transaction id for each contract row: same hash, same
# sender id, and no recipient id.
tx_df = spark.read.format('jdbc').options(url=url, dbtable='Transactions', user=user, password=password).load()
contract_tx_match = (
    (df_contracts.tx_hash == tx_df.tx_hash)
    & (df_contracts.address_id == tx_df.from_id)
    & (tx_df.to_id.isNull())
)
# NOTE(review): this is a left join, so contract rows without a matching
# transaction come out with a null tx_id. Confirm that is intended.
df_contracts = df_contracts.join(tx_df, contract_tx_match, 'left').select(tx_df['tx_id'])


In [12]:
# Create the contracts table, then export the contract frame into it.
cnx, cursor = db_conn.connect_db(db_config)
try:
    try:
        db_conn.create_contracts_table(cursor)
    finally:
        # Release the cursor even if table creation fails.
        cursor.close()
finally:
    # Release the connection even if table creation or cursor close fails.
    cnx.close()
db_conn.export_df_to_sql(df_contracts, 'Contracts', db_config)

                                                                                

In [13]:
# Start from the full scam-address list, attach the columns of the scam CSV,
# then attach the columns of the database Addresses table.
scam_df = spark.read.csv('data/hoptrail_scam_dataset.csv', header=True)
all_scam_addresses = pd.read_pickle('data/pickle_files/scam_users_and_contract_creators_alt.pkl')
print(len(all_scam_addresses))
row_schema = StructType().add("address", StringType(), True)

all_scam_addresses_rows = [Row(address=scam_address) for scam_address in all_scam_addresses]
all_scam_addresses_df = spark.createDataFrame(all_scam_addresses_rows, schema=row_schema)

addresses_df = spark.read.format('jdbc').options(url=url, dbtable='Addresses', user=user, password=password).load()
scam_df = (
    all_scam_addresses_df
    .join(scam_df, on='address', how='left')
    .join(addresses_df, on='address', how='left')
)

8666


In [14]:
# Collect every address stored in the database and record which scam addresses
# are absent from it.
addresses_in_db = addresses_df.select("address").rdd.flatMap(lambda x: x).collect()
# Look up against a set: membership tests on the collected list would scan it
# once per scam address.
known_addresses = set(addresses_in_db)
missing_addresses = [addr for addr in all_scam_addresses if addr not in known_addresses]
with open('data/pickle_files/missing_addresses_alt.pickle', 'wb') as f:
    pickle.dump(missing_addresses, f)

# Keep only the scam rows that resolved to a database address_id.
scam_df = scam_df.dropna(subset=["address_id"])

                                                                                

In [15]:
# Create the scam table, then export the scam-address frame into it.
cnx, cursor = db_conn.connect_db(db_config)
try:
    try:
        db_conn.create_scam_table(cursor)
    finally:
        # Release the cursor even if table creation fails.
        cursor.close()
finally:
    # Release the connection even if table creation or cursor close fails.
    cnx.close()
db_conn.export_df_to_sql(scam_df, 'ScamAddresses', db_config)

                                                                                