# Project Inscriptions -- Data set construction

**Johnnatan Messias**, May 2024


In [1]:
import os
import pandas as pd
import polars as pl
import web3

In [2]:
import sys
code_dir = os.path.realpath(os.path.join(os.getcwd(), "..", "src"))

sys.path.append(code_dir)

In [3]:
from utils import Utils
utils = Utils(
    zkSync_data_dir='/Users/johnnatan/matterlabs/zkSync-node-crawler/data/parquet_files/')

In [4]:
# Existing dataset dir
data_dir = '../data/'

# Existing plots dir
plots_dir = data_dir+'/plots/'
os.makedirs(data_dir, exist_ok=True)
os.makedirs(plots_dir, exist_ok=True)

In [5]:
web3.Web3.to_text(
    '0x646174613a2c7b2270223a227a72632d3230222c226f70223a226d696e74222c227469636b223a2273796e63222c22616d74223a2234227d')

'data:,{"p":"zrc-20","op":"mint","tick":"sync","amt":"4"}'

In [6]:
inscriptions_tag = '0x646174613a'

In [7]:
verified_contracts_df = pd.read_csv(data_dir+'verified-contracts.csv')
verified_contracts_map = verified_contracts_df.set_index('contractAddress')[
    'contractName'].to_dict()
verified_contracts_df.head(1)

Unnamed: 0,contractAddress,codeFormat,contractName,compilerZksolcVersion,compilerSolcVersion,optimizationUsed,optimizerMode,constructorArguments,isSystem,compilerZkvyperVersion,compilerVyperVersion
0,0x5500052b962685a86217fc37107425ef32c1ff20,solidity-single-file,Contracts/Greeter.sol:GreeterOneFour,v1.3.13,0.8.17,True,,0x00000000000000000000000000000000000000000000...,False,,


In [8]:
contracts_df = pd.read_csv(data_dir+'contracts.csv')
contracts_map = contracts_df.set_index('l2Address').to_dict(orient='index')
contracts_df.head()

Unnamed: 0,name,symbol,decimals,l2Address,l1Address
0,Ether,ETH,18,0x000000000000000000000000000000000000800a,0x0000000000000000000000000000000000000000
1,ChainLink Token,LINK,18,0x082fade8b84b18c441d506e1d3a43a387cc59d20,0x514910771af9ca656af840dff83e8264ecf986ca
2,Wrapped BTC,WBTC,8,0xbbeb516fb02a01611cbbe0453fe3c580d7281011,0x2260fac5e5542a773aa44fbcfedf7c193bc2c599
3,Matic Token,MATIC,18,0x770e221ec6f3e8a2e2e168399bb3aa56a63e397d,0x7d1afa7b718fb893db30a3abc0cfc608aacfebb0
4,Uniswap,UNI,18,0x1c6f53185061d7cc387e481c350ad00c2c876f3e,0x1f9840a85d5af5bf1d1762f925bdaddc4201f984


In [9]:
address_to_name_dict = dict(
    map(lambda x: (x[0], x[1]['name']), contracts_map.items()))

In [10]:
n_transactions = utils.get_num_transactions()
n_blocks = utils.get_num_blocks()
block_info = utils.get_min_max_blocks()
print(
    f"There are {n_transactions} transactions and {n_blocks} blocks in our dataset.")
print(
    f"Blocks range from {block_info['min_number'][0]} ({block_info['min_timestamp'][0]}) to {block_info['max_number'][0]} ({block_info['max_timestamp'][0]})")

There are 328694216 transactions and 29800000 blocks in our dataset.
Blocks range from 1 (2023-02-14 14:22:22) to 29799999 (2024-03-25 02:21:27)


## Data gathering


### Filter transactions of interest


In [11]:
# Gathering transactions that contains inscriptions
txs_df = utils.get_txs(inscriptions_tag)
txs_df.shape

(17054466, 6)

In [12]:
txs_df.shape

(17054466, 6)

In [13]:
# Gathering unique issuers' wallet addresses from transactions
wallet_addresses = txs_df['issuer'].unique()
wallet_addresses.shape

(470864,)

In [14]:
# Gathering receipts for the issuers
receipts_df = utils.get_receipts(wallet_addresses)
receipts_df.shape

(62698179, 5)

In [15]:
# Merging the transactions and receipts dataframes
inscriptions_df = txs_df.join(receipts_df, on='tx_hash', how='left').sort(
    pl.col(['block_number']))

In [16]:
inscriptions_df = inscriptions_df.with_columns(pl.col('tx_input_data').map_elements(
    Utils.decode_input_data).alias('decoded_input_data'))
print("There are {} inscriptions in our dataset.".format(
    inscriptions_df.shape[0]))

There are 17060492 inscriptions in our dataset.


In [17]:
inscriptions_df.head()

block_number,tx_hash,tx_input_data,issuer,receiver,timestamp,gas_used,gas_effective_price,fees,tx_status,decoded_input_data
i64,str,str,str,str,datetime[μs],i64,i64,f64,i64,str
6332862,"""0x2359c19cc715…","""0x646174613a2c…","""0x86f04d8f599a…","""0x86f04d8f599a…",2023-06-18 02:04:06,215530,250000000,5.4e-05,1,"""data:,dashboar…"
6351363,"""0x9fb936db1466…","""0x646174613a69…","""0x39ebe965aff2…","""0x488bd13f16a2…",2023-06-18 07:13:34,221181,250000000,5.5e-05,1,"""data:image/png…"
6351394,"""0x364528a9e973…","""0x646174613a69…","""0x39ebe965aff2…","""0x488bd13f16a2…",2023-06-18 07:14:05,148688,250000000,3.7e-05,1,"""data:image/png…"
6351410,"""0x0d746c0320ff…","""0x646174613a69…","""0x39ebe965aff2…","""0x488bd13f16a2…",2023-06-18 07:14:21,220987,250000000,5.5e-05,1,"""data:image/png…"
6351431,"""0xcd99fb93cee9…","""0x646174613a69…","""0x39ebe965aff2…","""0x488bd13f16a2…",2023-06-18 07:14:42,147604,250000000,3.7e-05,1,"""data:image/png…"


In [18]:
# Persisting inscriptions dataframe to a file
print(inscriptions_df.shape)
inscriptions_df.write_parquet(data_dir+'inscriptions_df.parquet')

(17060492, 11)


In [19]:
min_block, max_block = inscriptions_df['block_number'].min(
), inscriptions_df['block_number'].max()
q_1 = (pl.scan_parquet(utils.data_path['transactions'])
       .filter(pl.col('blockNumber').is_between(min_block, max_block))
       .select([
           pl.col('blockNumber').alias('block_number'),
           pl.col('hash').alias('tx_hash'),
           pl.col('from').eq(pl.col('to')).alias('is_self_transfer'),
           pl.col('hash').is_in(
               inscriptions_df['tx_hash'].unique()).alias('is_inscription')
       ])
       # ).with_columns(
       #     pl.col('tx_hash').is_in(
       #         inscriptions_df['tx_hash'].unique()).alias('is_inscription')
       )
q_2 = (pl.scan_parquet(utils.data_path['blocks'])
       .filter(pl.col('number').is_between(min_block, max_block))
       .select(pl.col('number').alias('block_number'), pl.from_epoch(pl.col('timestamp')))
       )
q = q_1.join(q_2, left_on='block_number', right_on='block_number', how='left')
all_txs_df = q.collect(streaming=True)

print(all_txs_df.shape)
all_txs_df.head()

(289466141, 5)


block_number,tx_hash,is_self_transfer,is_inscription,timestamp
i64,str,bool,bool,datetime[μs]
10000000,"""0x60b1dd4432b7…",False,False,2023-07-31 06:11:24
10000000,"""0x50225618b693…",False,False,2023-07-31 06:11:24
10000000,"""0xc9eca17d5877…",False,False,2023-07-31 06:11:24
10000000,"""0x56d6d32deef0…",False,False,2023-07-31 06:11:24
10000000,"""0x6dd263ae54dd…",False,False,2023-07-31 06:11:24


In [20]:
print("There are {} unique transactions in our dataset.".format(
    all_txs_df['tx_hash'].n_unique()))
print("There are {} unique blocks in our dataset.".format(
    all_txs_df['block_number'].n_unique()))
print("The average number of txs per block is {} txs.".format(round(
    all_txs_df['tx_hash'].n_unique()/all_txs_df['block_number'].n_unique(), 2)))

print("The minimum and max number of txs per block are: {} and {}.".format(
    all_txs_df['block_number'].min(), all_txs_df['block_number'].max()))

print("The minimum timestamp is {} and the maximum timestamp is {}.".format(
    all_txs_df['timestamp'].min(), all_txs_df['timestamp'].max()))

There are 289466141 unique transactions in our dataset.
There are 23070883 unique blocks in our dataset.
The average number of txs per block is 12.55 txs.
The minimum and max number of txs per block are: 6332862 and 29799866.
The minimum timestamp is 2023-06-18 02:04:06 and the maximum timestamp is 2024-03-25 02:19:14.


In [21]:
print("There are {} unique transactions in our dataset.".format(
    all_txs_df['tx_hash'].n_unique()))
print("There are {} unique blocks in our dataset.".format(
    all_txs_df['block_number'].n_unique()))
print("The average number of txs per block is {} txs.".format(round(
    all_txs_df['tx_hash'].n_unique()/all_txs_df['block_number'].n_unique(), 2)))

print("The minimum and max number of txs per block are: {} and {}.".format(
    all_txs_df['block_number'].min(), all_txs_df['block_number'].max()))

print("The minimum timestamp is {} and the maximum timestamp is {}.".format(
    all_txs_df['timestamp'].min(), all_txs_df['timestamp'].max()))

There are 289466141 unique transactions in our dataset.
There are 23070883 unique blocks in our dataset.
The average number of txs per block is 12.55 txs.
The minimum and max number of txs per block are: 6332862 and 29799866.
The minimum timestamp is 2023-06-18 02:04:06 and the maximum timestamp is 2024-03-25 02:19:14.


In [22]:
all_txs_df.write_parquet(data_dir+'inscriptions_all_txs_df.parquet')