In [2]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

def _random_hex(num_digits):
    """Return "0x" followed by ``num_digits`` random lowercase hex digits."""
    return "0x" + "".join(random.choices("0123456789abcdef", k=num_digits))


def generate_transaction_data(num_transactions=10000):  # Reduced from 1,000,000
    """Generate a DataFrame of synthetic transaction records.

    Every column is sampled independently of the others (for example the
    ``is_large_transaction`` flag is not derived from ``amount_usd``), except
    ``transaction_fee`` which is computed as ``gas_price * gas_used``.

    Args:
        num_transactions: Number of rows to generate.

    Returns:
        pandas.DataFrame with one row per transaction; ``transaction_id``
        runs from 1 to ``num_transactions``.
    """

    start_date = datetime(2023, 1, 1)
    end_date = datetime.now()
    # Loop-invariant: width of the sampling window in whole seconds.
    window_seconds = int((end_date - start_date).total_seconds())

    timestamps = [
        start_date + timedelta(seconds=random.randint(0, window_seconds))
        for _ in range(num_transactions)
    ]

    data = {
        "transaction_id": range(1, num_transactions + 1),
        "timestamp": timestamps,
        "block_number": np.random.randint(1000000, 2000000, num_transactions),
        "block_hash": [_random_hex(64) for _ in range(num_transactions)],
        "from_address": [_random_hex(40) for _ in range(num_transactions)],
        "to_address": [_random_hex(40) for _ in range(num_transactions)],
        "token_address": [_random_hex(40) for _ in range(num_transactions)],
        "token_symbol": random.choices(["ETH", "USDT", "BTC", "BNB", "LTC", "XRP", "ADA", "DOGE"], k=num_transactions),
        "amount": np.random.lognormal(mean=5, sigma=2, size=num_transactions),
        "amount_usd": np.random.uniform(1000, 50000, num_transactions),
        "gas_price": np.random.exponential(scale=20, size=num_transactions),
        "gas_used": np.random.randint(21000, 500000, num_transactions),
        "transaction_type": random.choices(["deposit", "withdrawal", "transfer", "swap", "order_fill", "staking", "lending"], k=num_transactions),
        "transaction_status": random.choices(["success", "failure", "pending"], k=num_transactions, weights=[0.95, 0.03, 0.02]),
        "nonce": np.random.randint(0, 100, num_transactions),
        # Draw a byte count (0..128) and double it so the payload always has an
        # even number of hex digits; an odd count cannot decode to whole bytes.
        # The maximum length stays at 256 digits.
        "input_data": [_random_hex(2 * random.randint(0, 128)) for _ in range(num_transactions)],
        "confirmation_time": np.random.uniform(10, 600, num_transactions),
        "is_large_transaction": np.random.choice([True, False], size=num_transactions, p=[0.05, 0.95]),
        "is_suspicious_transaction": np.random.choice([True, False], size=num_transactions, p=[0.01, 0.99]),
        "network_congestion": random.choices(["low", "medium", "high"], k=num_transactions),
        "order_id": np.random.randint(10000, 99999, num_transactions),
        "order_type": random.choices(["market", "limit"], k=num_transactions),
        "order_side": random.choices(["buy", "sell"], k=num_transactions),
        "pair": random.choices(["ETH/USDT", "BTC/USDT", "BNB/USDT", "LTC/USDT"], k=num_transactions),
        "liquidity_pool_address": [_random_hex(40) for _ in range(num_transactions)],
        "liquidity_pool_fee": np.random.uniform(0.001, 0.01, num_transactions),
        "staking_pool_address": [_random_hex(40) for _ in range(num_transactions)],
        "staking_duration": np.random.randint(30, 365, num_transactions),
        "lending_pool_address": [_random_hex(40) for _ in range(num_transactions)],
        "lending_rate": np.random.uniform(0.01, 0.1, num_transactions),
        "margin_type": random.choices(["isolated", "cross"], k=num_transactions),
        "leverage": np.random.choice([1, 2, 5, 10], size=num_transactions),
        "position_type": random.choices(["long", "short"], k=num_transactions),
    }

    df = pd.DataFrame(data)
    # Fee is the only derived column: price per gas unit times gas consumed.
    df["transaction_fee"] = df["gas_price"] * df["gas_used"]
    return df

def generate_user_data(num_users=1000):  # Reduced from 10,000
    """Generate a DataFrame of synthetic user profiles.

    ``registration_date`` falls between 2020-01-01 and the current time.
    ``last_login_time`` is sampled from the user's own registration moment
    up to the current time, so a login never precedes registration. All
    other columns are sampled independently of each other.

    Args:
        num_users: Number of rows to generate.

    Returns:
        pandas.DataFrame with one row per user; ``user_id`` runs from 1 to
        ``num_users``.
    """

    start_date = datetime(2020, 1, 1)
    end_date = datetime.now()
    total_days = (end_date - start_date).days

    # Built ahead of the dict so the login column can refer to it.
    registration_dates = [
        start_date + timedelta(days=random.randint(0, total_days))
        for _ in range(num_users)
    ]

    data = {
        "user_id": range(1, num_users + 1),
        "registration_date": registration_dates,
        "kyc_status": random.choices(["verified", "unverified"], k=num_users, weights=[0.8, 0.2]),
        "country": random.choices(["US", "CN", "JP", "KR", "GB", "DE", "IN"], k=num_users),
        "device_type": random.choices(["desktop", "mobile"], k=num_users),
        "browser_type": random.choices(["Chrome", "Firefox", "Safari"], k=num_users),
        "ip_address": ["192.168." + str(random.randint(1, 255)) + "." + str(random.randint(1, 255)) for _ in range(num_users)],
        "account_balance": np.random.uniform(100, 100000, num_users),
        "trading_volume": np.random.uniform(1000, 1000000, num_users),
        "deposit_count": np.random.randint(0, 100, num_users),
        "withdrawal_count": np.random.randint(0, 50, num_users),
        "login_count": np.random.randint(1, 1000, num_users),
        # Whole-hour offset from this user's registration; the upper bound is
        # the number of full hours left until end_date, which is never negative
        # because every registration date is <= end_date.
        "last_login_time": [
            registered + timedelta(hours=random.randint(0, int((end_date - registered).total_seconds() // 3600)))
            for registered in registration_dates
        ],
        "is_active": np.random.choice([True, False], size=num_users, p=[0.7, 0.3]),
        "is_bot": np.random.choice([True, False], size=num_users, p=[0.05, 0.95]),
        "referral_id": np.random.randint(1000, 9999, num_users),
        "referral_count": np.random.randint(0, 10, num_users),
        "api_key_usage": random.choices(["yes", "no"], k=num_users),
        "average_transaction_value": np.random.uniform(100, 5000, num_users),
        "average_transaction_frequency": np.random.uniform(0.1, 10, num_users),
    }

    return pd.DataFrame(data)

def generate_market_data(num_market_data=10000):  # Reduced from 100,000
    """Generate a DataFrame of synthetic market snapshots.

    Open and close prices are drawn uniformly from [1000, 50000). The high
    is the larger of open/close scaled up by a random factor in [1.0, 1.05)
    and the low is the smaller of open/close scaled down by a random factor
    in [0.95, 1.0), so every row satisfies
    ``low_price <= min(open, close) <= max(open, close) <= high_price``.
    The remaining columns are sampled independently.

    Args:
        num_market_data: Number of rows to generate.

    Returns:
        pandas.DataFrame with one row per snapshot.
    """

    start_date = datetime(2023, 1, 1)
    end_date = datetime.now()
    window_seconds = int((end_date - start_date).total_seconds())

    open_price = np.random.uniform(1000, 50000, num_market_data)
    close_price = np.random.uniform(1000, 50000, num_market_data)
    # Derive the extremes from open/close; drawing them independently would
    # produce rows whose high is below the low or below the open.
    high_price = np.maximum(open_price, close_price) * np.random.uniform(1.0, 1.05, num_market_data)
    low_price = np.minimum(open_price, close_price) * np.random.uniform(0.95, 1.0, num_market_data)

    data = {
        "market_timestamp": [start_date + timedelta(seconds=random.randint(0, window_seconds)) for _ in range(num_market_data)],
        "pair": random.choices(["ETH/USDT", "BTC/USDT", "BNB/USDT", "LTC/USDT"], k=num_market_data),
        "open_price": open_price,
        "high_price": high_price,
        "low_price": low_price,
        "close_price": close_price,
        "volume": np.random.uniform(1000000, 100000000, num_market_data),
        "market_cap": np.random.uniform(1000000000, 100000000000, num_market_data),
        "price_change_24h": np.random.uniform(-10, 10, num_market_data),
        "price_change_7d": np.random.uniform(-30, 30, num_market_data),
        "dominance": np.random.uniform(1, 50, num_market_data),
        "fear_greed_index": np.random.randint(0, 100, num_market_data),
        "funding_rate": np.random.uniform(-0.01, 0.01, num_market_data),
    }

    return pd.DataFrame(data)

def generate_network_data(num_network_data=1000):  # Reduced from 10,000
    """Build a DataFrame of synthetic network-level metrics.

    Each row gets a random timestamp between 2023-01-01 and the current
    time plus independently sampled gas price, block time, difficulty,
    hashrate and transaction counts.

    Args:
        num_network_data: Number of rows to generate.

    Returns:
        pandas.DataFrame with one row per sample.
    """

    window_start = datetime(2023, 1, 1)
    window_end = datetime.now()
    span_seconds = int((window_end - window_start).total_seconds())

    # Columns are filled one at a time, in output order, so the random
    # draws happen in the same sequence as the column layout.
    columns = {}
    columns["network_timestamp"] = [
        window_start + timedelta(seconds=random.randint(0, span_seconds))
        for _ in range(num_network_data)
    ]
    columns["average_gas_price"] = np.random.uniform(10, 100, num_network_data)
    columns["average_block_time"] = np.random.uniform(5, 30, num_network_data)
    columns["network_difficulty"] = np.random.uniform(10000000000000, 100000000000000, num_network_data)
    columns["hashrate"] = np.random.uniform(100000000000, 1000000000000, num_network_data)
    columns["total_transactions"] = np.random.uniform(1000000, 10000000, num_network_data)
    columns["pending_transactions"] = np.random.randint(100, 10000, num_network_data)

    return pd.DataFrame(columns)

def save_data_to_parquet(df, filename):
    """Write ``df`` to ``filename`` via ``df.to_parquet`` and print a confirmation.

    Args:
        df: Object exposing ``to_parquet(path)``; a pandas DataFrame here.
        filename: Destination path handed straight to ``to_parquet``.
    """
    df.to_parquet(filename)
    confirmation = f"Data saved to {filename}"
    print(confirmation)

if __name__ == "__main__":
    # Generate all four datasets first (default sizes), keeping each frame
    # under its own name so it stays available after this block runs.
    transaction_df = generate_transaction_data()
    user_df = generate_user_data()
    market_df = generate_market_data()
    network_df = generate_network_data()

    # Then persist them, in the same order they were generated.
    for _frame, _target in (
        (transaction_df, "transactions.parquet"),
        (user_df, "users.parquet"),
        (market_df, "market.parquet"),
        (network_df, "network.parquet"),
    ):
        save_data_to_parquet(_frame, _target)

Data saved to transactions.parquet
Data saved to users.parquet
Data saved to market.parquet
Data saved to network.parquet


In [3]:
import pandas as pd

def view_parquet_files(file_list):
    """Print a preview of every Parquet file named in ``file_list``.

    For each path a banner line is printed followed by ``DataFrame.head()``.
    A missing file is reported with a "not found" message; any other
    exception raised while reading or printing is reported with its text.
    Neither case stops the loop, so the remaining files are still shown.

    Args:
        file_list: Iterable of Parquet file paths.
    """
    for filename in file_list:
        try:
            frame = pd.read_parquet(filename)
            banner = f"\n--- Contents of {filename} ---"
            print(banner)
            # head() with no argument: the first five rows
            preview = frame.head()
            print(preview)
        except FileNotFoundError:
            print(f"File '{filename}' not found.")
        except Exception as e:
            print(f"An error occurred while reading {filename}: {e}")

# Parquet files to preview
parquet_files = [
    "transactions.parquet",
    "users.parquet",
    "market.parquet",
    "network.parquet",
]

# Print the first rows of each file
view_parquet_files(parquet_files)


--- Contents of transactions.parquet ---
   transaction_id           timestamp  block_number  \
0               1 2025-03-07 08:45:30       1973209   
1               2 2023-12-20 08:42:39       1086849   
2               3 2024-07-03 21:30:03       1876541   
3               4 2024-10-22 05:03:48       1798339   
4               5 2023-01-11 11:56:57       1618455   

                                          block_hash  \
0  0xea954b6ec4e7f74ad4c5a5fc9f07f04794d09cd67dbd...   
1  0x371667934ee5c2b6560a967cb28aea384fa65da4240d...   
2  0x50a7ff9524eb5ab5a3c477d17299c5d8282d8b0bd73f...   
3  0x8bb0e8ae58b3c27ad23f5b75ff3c697f66d9139bc22b...   
4  0x82ed2b7d2f2953cad958a11b0e235a0a0538d95be35c...   

                                 from_address  \
0  0xf1a6e356f9f2370ce8091298d1c4ca569f1d8864   
1  0xf7859f41d95f27299cfdafdeb9b63863d4d96498   
2  0x38a700806628cbe591485363bb144cd516906162   
3  0x2c7b90aa885cd995999fe2709fec0490ca1b0e55   
4  0xc3baeb8b7edad3dff33fc3a2d7c745061173b987