In [1]:
import polars as pl
import pandas as pd
from tqdm.notebook import tqdm
import os

In [2]:
import sys
# Absolute path of the sibling "src" directory (one level above the working
# directory), added to the import path so modules stored there can be imported.
code_dir = os.path.realpath(os.path.join(os.getcwd(), "..", "src"))

# Re-running this cell must not stack duplicate entries on sys.path.
if code_dir not in sys.path:
    sys.path.append(code_dir)

In [3]:
import os
# Absolute path of the sibling "data" directory. The trailing separator is
# kept because file paths are built from it by plain string concatenation.
data_dir = os.path.realpath(os.path.join(os.getcwd(), os.pardir, "data"))
data_dir += os.sep
print(data_dir)

/Users/johnnatan/matterlabs/research-proj-inscriptions/data/


In [4]:
from utils import Utils

In [5]:
# Lazily scan the reference inscriptions file and collect only its first
# 10 rows; the cell then displays the (rows, columns) shape of that sample.
inscriptions_df = (
    pl.scan_parquet(data_dir + "inscriptions_df.parquet")
    .limit(10)
    .collect()
)
inscriptions_df.shape

(10, 11)

In [6]:
# Show the 10-row sample to inspect column names, dtypes and example values.
inscriptions_df

block_number,tx_hash,tx_input_data,issuer,receiver,timestamp,gas_used,gas_effective_price,fees,tx_status,decoded_input_data
i64,str,str,str,str,datetime[μs],i64,i64,f64,i64,str
6332862,"""0x2359c19cc71569daf700191b894c…","""0x646174613a2c64617368626f6172…","""0x86f04d8f599a5b56ddcd91a9689a…","""0x86f04d8f599a5b56ddcd91a9689a…",2023-06-18 02:04:06,215530,250000000,5.4e-05,1,"""data:,dashboard"""
6351363,"""0x9fb936db146658f1d43a79373426…","""0x646174613a696d6167652f706e67…","""0x39ebe965aff2f3803305db701c8e…","""0x488bd13f16a23a41344e2c11ff18…",2023-06-18 07:13:34,221181,250000000,5.5e-05,1,"""data:image/png;base64,iVBORw0K…"
6351394,"""0x364528a9e973bbc69cabc3250f99…","""0x646174613a696d6167652f706e67…","""0x39ebe965aff2f3803305db701c8e…","""0x488bd13f16a23a41344e2c11ff18…",2023-06-18 07:14:05,148688,250000000,3.7e-05,1,"""data:image/png;base64,iVBORw0K…"
6351410,"""0x0d746c0320ff460b817e974c8b3f…","""0x646174613a696d6167652f706e67…","""0x39ebe965aff2f3803305db701c8e…","""0x488bd13f16a23a41344e2c11ff18…",2023-06-18 07:14:21,220987,250000000,5.5e-05,1,"""data:image/png;base64,iVBORw0K…"
6351431,"""0xcd99fb93cee9376e985142223d52…","""0x646174613a696d6167652f706e67…","""0x39ebe965aff2f3803305db701c8e…","""0x488bd13f16a23a41344e2c11ff18…",2023-06-18 07:14:42,147604,250000000,3.7e-05,1,"""data:image/png;base64,iVBORw0K…"
6351443,"""0xa0c43eb48897c9ba8541b3fc4dc6…","""0x646174613a696d6167652f706e67…","""0x39ebe965aff2f3803305db701c8e…","""0x39ebe965aff2f3803305db701c8e…",2023-06-18 07:14:54,147716,250000000,3.7e-05,1,"""data:image/png;base64,iVBORw0K…"
6351455,"""0xee9be591cef37cb455c66b9fe12a…","""0x646174613a696d6167652f706e67…","""0x39ebe965aff2f3803305db701c8e…","""0x39ebe965aff2f3803305db701c8e…",2023-06-18 07:15:06,147681,250000000,3.7e-05,1,"""data:image/png;base64,iVBORw0K…"
6351470,"""0xee36abd218177a6961584799c1dd…","""0x646174613a696d6167652f706e67…","""0x39ebe965aff2f3803305db701c8e…","""0x39ebe965aff2f3803305db701c8e…",2023-06-18 07:15:21,220235,250000000,5.5e-05,1,"""data:image/png;base64,iVBORw0K…"
6351484,"""0x7eee16dde0c3fb5bd08df92093d4…","""0x646174613a696d6167652f706e67…","""0x39ebe965aff2f3803305db701c8e…","""0x39ebe965aff2f3803305db701c8e…",2023-06-18 07:15:35,147949,250000000,3.7e-05,1,"""data:image/png;base64,iVBORw0K…"
6351506,"""0xae6fb7f7feab5dcbdc302e018206…","""0x646174613a696d6167652f706e67…","""0x39ebe965aff2f3803305db701c8e…","""0x39ebe965aff2f3803305db701c8e…",2023-06-18 07:15:57,221269,250000000,5.5e-05,1,"""data:image/png;base64,iVBORw0K…"


In [7]:
# Take the sample's schema as the target schema used by to_polars, minus
# 'decoded_input_data' (that column is only produced after decoding).
schema = inscriptions_df.schema
del schema['decoded_input_data']
schema

OrderedDict([('block_number', Int64),
             ('tx_hash', String),
             ('tx_input_data', String),
             ('issuer', String),
             ('receiver', String),
             ('timestamp', Datetime(time_unit='us', time_zone=None)),
             ('gas_used', Int64),
             ('gas_effective_price', Int64),
             ('fees', Float64),
             ('tx_status', Int64)])

In [8]:
# Maps column names found in the raw CSV exports (keys) to the names used in
# this notebook (values). Passed to DataFrame.rename in rename_columns, so keys
# that are absent from a given frame are simply ignored.
columns_renamed = {
    'tx_hash': 'tx_hash',
    'block_number': 'block_number',
    'tx_input': 'tx_input_data',
    'tx_from_address': 'issuer',
    'tx_to_address': 'receiver',
    'receipt_gas_used': 'gas_used',
    'block_timestamp': 'timestamp',
    'effective_gas_price': 'gas_effective_price',
    'receipt_status': 'tx_status',
    'value': 'amount',
    # Second source name for the same target as 'effective_gas_price' above;
    # a frame carrying both source columns would end up with duplicate names.
    'receipt_effective_gas_price': 'gas_effective_price'
}

# Columns, in output order, kept by to_polars when converting a pandas frame.
# 'fees' is not a raw column: it is computed in preprocess_dataframe.
collumns_selected = [
    'block_number',
    'tx_hash',
    'tx_input_data',
    'issuer',
    'receiver',
    'timestamp',
    'gas_used',
    'gas_effective_price',
    'fees',
    'tx_status',
]


def rename_columns(df):
    """Return `df` with its columns renamed according to `columns_renamed`.

    If the renamed frame still has no 'gas_effective_price' column, a
    'gas_price' column (when present) is renamed to take its place.
    """
    renamed = df.rename(columns=columns_renamed)
    if 'gas_effective_price' in renamed.columns:
        return renamed
    return renamed.rename(columns={'gas_price': 'gas_effective_price'})


def preprocess_dataframe(df):
    """Add a 'fees' column and parse 'timestamp' into datetimes, in place.

    'fees' is gas_used * gas_effective_price / 1e18 (presumably converting a
    wei amount into whole native tokens -- confirm against the data source).

    Parameters:
        df: pandas DataFrame with 'gas_used', 'gas_effective_price' and
            'timestamp' columns. It is modified in place.

    Returns:
        The same DataFrame object, for call chaining.
    """
    # Multiply in float64: the product of two int64 columns wraps around
    # silently once it exceeds ~9.22e18, which corrupts large fees.
    # Below that limit the float result is identical to the integer one.
    gas_used = df['gas_used'].astype('float64')
    df['fees'] = gas_used * df['gas_effective_price'] / 1e18
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    return df


def to_polars(df):
    """Convert a pandas frame to polars, keeping only `collumns_selected`.

    The selected columns are cast to the dtypes given by the module-level
    `schema`. This goes through the public `pl.from_pandas` entry point
    rather than the private `pl.DataFrame._from_pandas` constructor, which
    is not part of polars' supported API.

    Raises:
        KeyError: (from pandas) if a selected column is missing from `df`.
    """
    selected = df[collumns_selected]
    return pl.from_pandas(selected, schema_overrides=schema)


def persitst_to_parquet(df, filepath):
    """Write `df` to `filepath` by calling its `write_parquet` method.

    Whatever `write_parquet` returns is discarded; this function returns None.
    """
    df.write_parquet(filepath)
    return None

In [9]:
# Directory holding one sub-directory per chain, with a trailing separator.
chains_path = os.path.realpath(os.path.join(data_dir, "chains")) + os.sep

# Chains left out of processing (the reason is not recorded in this notebook).
# Filtering instead of list.remove() avoids a ValueError when the directory
# for an excluded chain is not present.
excluded_chains = {'avalanche'}
chains = sorted(
    chain for chain in os.listdir(chains_path)
    if os.path.isdir(os.path.join(chains_path, chain))
    and chain not in excluded_chains
)
chains

['arbitrum', 'base', 'bsc', 'ethereum', 'fantom', 'optimism', 'zksync']

In [11]:
# For every chain: read all of its gzipped CSV exports, normalise them,
# decode the transaction input data and persist one parquet file per chain.
for chain in tqdm(chains):
    print(chain)
    output_path = os.path.join(chains_path, chain + "_inscriptions.parquet")
    # Check for an existing result first so finished chains cost nothing on
    # a re-run (the directory listing is no longer done for them).
    if os.path.exists(output_path):
        print("Already processed", chain)
        continue

    chain_dir = os.path.join(chains_path, chain)
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the output file reproducible across runs and machines.
    filenames = sorted(
        filename for filename in os.listdir(chain_dir)
        if filename.endswith(".csv.gz")
    )

    frames = []
    for filename in filenames:
        filepath = os.path.join(chain_dir, filename)
        raw_df = pd.read_csv(filepath)
        if raw_df.shape[0] == 0:
            # Nothing to contribute; skip before doing any column work.
            continue
        frames.append(preprocess_dataframe(rename_columns(raw_df)))

    if not frames:
        # pd.concat raises ValueError on an empty list; skip the chain
        # instead of aborting the whole loop.
        print("No data found for", chain)
        continue

    chain_pdf = pd.concat(frames, ignore_index=True)
    df = to_polars(chain_pdf)
    df = df.with_columns(
        pl.col('tx_input_data').map_elements(Utils.decode_input_data,
                                             return_dtype=pl.String)
    )
    # NOTE(review): the decoded values overwrite 'tx_input_data' and the
    # column is then renamed, so the raw input is not kept in the output,
    # whereas the reference sample carries both columns -- confirm intended.
    df = df.rename({'tx_input_data': 'decoded_input_data'})
    persitst_to_parquet(df, filepath=output_path)

  0%|          | 0/7 [00:00<?, ?it/s]

arbitrum
base
bsc
ethereum
fantom
optimism
zksync
