In [0]:
raw_bucket = "crypto-databricks-1760666848-raw-trades"
sample_prefix = f"s3a://{raw_bucket}/sample-data/"

# Smoke test: list the sample-data prefix of the raw bucket and report what is there.
# Any failure (listing or formatting) is reported as a message instead of stopping the notebook.
try:
    files = dbutils.fs.ls(sample_prefix)
    print(f"S3 access works! Found {len(files)} file(s):")
    for entry in files:
        print(f" -{entry.name} ({entry.size:,} bytes)")
except Exception as err:
    print(f" S3 access failed (expected: {err})")

In [0]:
import pandas as pd
from datetime import datetime, timedelta
import random

# --- Synthetic trade generation (pandas) -> local parquet file ---
# Builds NUM_TRADES random trade records and writes them to a snappy-compressed
# parquet file at a relative path.
NUM_TRADES = 100000
WINDOW_DAYS = 90   # trades are spread over this many days, ending at run time
SEED = 42          # fixed seed so Restart & Run All reproduces the same random draws
SYMBOLS = ['BTC-USD', 'ETH-USD', 'SOL-USD', 'MATIC-USD', 'AVAX-USD']
EXCHANGES = ['Coinbase', 'Binance', 'Kraken', 'Gemini']
START_DATE = datetime.now() - timedelta(days=WINDOW_DAYS)
BASE_PRICES = {'BTC-USD': 45000, 'ETH-USD': 2500, 'SOL-USD': 100, 'MATIC-USD': 0.80, 'AVAX-USD': 35}

# Seed before the first draw. Symbols, prices, volumes, sides, exchanges, users and
# timestamp offsets are then identical on every run; absolute timestamps still
# move with the run time because START_DATE is relative to datetime.now().
random.seed(SEED)

trades = []
for i in range(NUM_TRADES):
    symbol = random.choice(SYMBOLS)
    # Price is the symbol's base price moved by a uniform factor in [-5%, +5%].
    price = BASE_PRICES[symbol] * (1 + random.uniform(-0.05, 0.05))
    trades.append({
        'trade_id': f'TRADE{i:08d}',  # zero-padded sequence number
        'symbol': symbol,
        'price': round(price, 2),
        'volume': round(random.uniform(0.001, 10.0), 8),
        'trade_type': random.choice(['buy', 'sell']),
        'exchange': random.choice(EXCHANGES),
        # Random second inside the WINDOW_DAYS window that starts at START_DATE.
        'timestamp': START_DATE + timedelta(seconds=random.randint(0, WINDOW_DAYS * 24 * 60 * 60)),
        'user_id': f'user{random.randint(1, 1000):04d}'
    })

# Build the frame once from the list of records, then persist it.
df = pd.DataFrame(trades)
df.to_parquet('crypto_trades_new.parquet', compression='snappy', index=False)
print(f"✓ Created {len(df):,} trades")

In [0]:
from pyspark.sql.types import *
from datetime import datetime, timedelta
import random

random.seed(42)

print("🔄 Generating crypto trades...")

# Config
NUM_TRADES = 100000
SYMBOLS = ['BTC-USD', 'ETH-USD', 'SOL-USD', 'MATIC-USD', 'AVAX-USD']
EXCHANGES = ['Coinbase', 'Binance', 'Kraken', 'Gemini']
BASE_PRICES = {'BTC-USD': 45000, 'ETH-USD': 2500, 'SOL-USD': 100, 'MATIC-USD': 0.80, 'AVAX-USD': 35}

# Schema: (column name, Spark type) pairs; every column is declared non-nullable.
_trade_columns = [
    ("trade_id", StringType()),
    ("symbol", StringType()),
    ("price", DoubleType()),
    ("volume", DoubleType()),
    ("trade_type", StringType()),
    ("exchange", StringType()),
    ("timestamp", TimestampType()),
    ("user_id", StringType()),
]
schema = StructType([
    StructField(col_name, col_type, False) for col_name, col_type in _trade_columns
])

# Generate data
start_date = datetime.now() - timedelta(days=90)
_window_seconds = 90 * 24 * 60 * 60


def _make_trade(seq):
    """Return one synthetic trade as a tuple whose fields follow the order of `schema`."""
    # The random draws below must stay in this order: with the fixed seed,
    # reordering them would change the generated values.
    pair = random.choice(SYMBOLS)
    quoted = BASE_PRICES[pair] * (1 + random.uniform(-0.05, 0.05))
    size = round(random.uniform(0.001, 10.0), 8)
    side = random.choice(['buy', 'sell'])
    venue = random.choice(EXCHANGES)
    executed_at = start_date + timedelta(seconds=random.randint(0, _window_seconds))
    trader = f"user{random.randint(1, 1000):04d}"
    return (f"TRADE{seq:08d}", pair, round(quoted, 2), size, side, venue, executed_at, trader)


trades = [_make_trade(seq) for seq in range(NUM_TRADES)]

df = spark.createDataFrame(trades, schema)
generated_rows = df.count()
print(f"✓ Generated {generated_rows:,} trades")

# Write to S3 (overwrite mode replaces whatever is already at the target path)
output = "s3a://crypto-databricks-1760666848-raw-trades/trades/crypto_trades.parquet"
df.write.mode("overwrite").parquet(output)

print(f"✓ Saved to S3: {output}")
display(df.limit(10))

In [0]:
# Read the trades back from S3 to confirm the write: row count, schema, and a 10-row preview.
trades_path = "s3a://crypto-databricks-1760666848-raw-trades/trades/crypto_trades.parquet"
df = spark.read.parquet(trades_path)

loaded_rows = df.count()
print(f"Loaded {loaded_rows:,} trades")
df.printSchema()
display(df.limit(10))