# Processing Logs Explorer

Quick analysis of InfoTransform processing logs stored in a SQLite database.

In [4]:
import sqlite3
import pandas as pd
from pathlib import Path

# Location of the SQLite processing-logs database. The path is relative, so it
# is resolved against the kernel's current working directory
# (presumably the notebook's own folder — confirm before running elsewhere).
DB_PATH = Path("../data/processing_logs.db")


def query_db(sql, params=None, db_path=None):
    """Run a SQL query against the processing-logs database.

    Args:
        sql: SQL statement to execute.
        params: Optional bind parameters, forwarded unchanged to
            ``pd.read_sql_query``.
        db_path: Optional path of the SQLite file to query. When omitted the
            notebook-level ``DB_PATH`` is used.

    Returns:
        pandas.DataFrame containing the query result.

    Raises:
        Any exception raised by ``sqlite3.connect`` or ``pd.read_sql_query``;
        the connection is closed before the exception propagates.
    """
    target = DB_PATH if db_path is None else db_path
    conn = sqlite3.connect(str(target))
    try:
        return pd.read_sql_query(sql, conn, params=params)
    finally:
        # Always release the handle, even when the query fails, so a bad
        # statement in one cell does not leave the database file open.
        conn.close()


# Report which database file the notebook reads. The "Exists" marker follows
# the actual check so a missing file is not shown with a success tick
# (sqlite3.connect would otherwise silently create an empty database there).
db_found = DB_PATH.exists()
print(f"✅ Database: {DB_PATH.absolute()}")
print(f"{'✅' if db_found else '❌'} Exists: {db_found}")
if db_found:
    print(f"✅ Size: {DB_PATH.stat().st_size / 1024:.2f} KB")

✅ Database: /Users/owen/Desktop/dev_projects/InfoTransform/backend/infotransform/notebooks/../data/processing_logs.db
✅ Exists: True
✅ Size: 28.00 KB


## Overall Statistics

In [5]:
# One-row overview of every completed run: run and file counts, token total
# and the average of duration_seconds.
summary_sql = """
    SELECT 
        COUNT(*) as total_runs,
        SUM(total_files) as total_files,
        SUM(successful_files) as successful,
        SUM(failed_files) as failed,
        SUM(total_tokens) as total_tokens,
        AVG(duration_seconds) as avg_duration
    FROM processing_runs
    WHERE status = 'completed'
"""
summary = query_db(summary_sql)

summary

Unnamed: 0,total_runs,total_files,successful,failed,total_tokens,avg_duration
0,1,1,1,0,3174,5.753479


## Recent Runs

In [6]:
# The ten most recently started runs, newest first, in any status.
recent_sql = """
    SELECT 
        run_id,
        start_timestamp,
        model_key,
        total_files,
        successful_files,
        total_tokens,
        duration_seconds,
        status
    FROM processing_runs
    ORDER BY start_timestamp DESC
    LIMIT 10
"""
recent = query_db(recent_sql)

recent

Unnamed: 0,run_id,start_timestamp,model_key,total_files,successful_files,total_tokens,duration_seconds,status
0,0ca2cde3-908b-4e19-b7cf-302ce55447de,2025-10-16T15:41:06.628918+00:00,document_metadata,1,1,3174,5.753479,completed


## Token Usage by Model

In [8]:
# Token consumption of completed runs grouped by model_key, heaviest first.
# The ORDER BY name matches the SUM(total_tokens) output alias, so rows are
# sorted by the per-model total.
tokens_by_model_sql = """
    SELECT 
        model_key,
        COUNT(*) as runs,
        SUM(total_tokens) as total_tokens,
        AVG(total_tokens) as avg_tokens,
        SUM(input_tokens) as input_tokens,
        SUM(output_tokens) as output_tokens
    FROM processing_runs
    WHERE status = 'completed'
    GROUP BY model_key
    ORDER BY total_tokens DESC
"""
tokens_by_model = query_db(tokens_by_model_sql)

tokens_by_model

Unnamed: 0,model_key,runs,total_tokens,avg_tokens,input_tokens,output_tokens
0,document_metadata,1,3174,3174.0,2993,181


## Performance Metrics

In [9]:
# Per-model averages over completed runs.
# success_rate is the mean of each run's own success percentage (not weighted
# by file count). NULLIF(total_files, 0) yields NULL for a run with no files,
# and AVG ignores NULLs, so such runs do not affect the rate.
performance_sql = """
    SELECT 
        model_key,
        COUNT(*) as runs,
        AVG(duration_seconds) as avg_duration,
        AVG(total_files) as avg_files,
        ROUND(AVG(CAST(successful_files AS FLOAT) / NULLIF(total_files, 0) * 100), 2) as success_rate
    FROM processing_runs
    WHERE status = 'completed'
    GROUP BY model_key
"""
performance = query_db(performance_sql)

performance

Unnamed: 0,model_key,runs,avg_duration,avg_files,success_rate
0,document_metadata,1,5.753479,1.0,100.0


## Daily Statistics

In [10]:
# Daily totals for completed runs started within the last 7 days.
# start_timestamp is stored as ISO-8601 text with a "T" separator and a UTC
# offset (e.g. 2025-10-16T15:41:06.628918+00:00), while datetime('now', ...)
# produces "YYYY-MM-DD HH:MM:SS". Comparing the raw strings is lexicographic
# and "T" sorts after " ", so every run on the cutoff date would pass the
# filter whatever its time. Passing the column through datetime() puts both
# sides in the same UTC text format before comparing.
daily = query_db("""
    SELECT 
        DATE(start_timestamp) as date,
        COUNT(*) as runs,
        SUM(total_files) as files,
        SUM(total_tokens) as tokens
    FROM processing_runs
    WHERE status = 'completed'
      AND datetime(start_timestamp) >= datetime('now', '-7 days')
    GROUP BY DATE(start_timestamp)
    ORDER BY date DESC
""")

daily

Unnamed: 0,date,runs,files,tokens
0,2025-10-16,1,1,3174


## Cost Estimation

In [11]:
# Rough spend estimate per model_key. Prices are USD per one million tokens;
# update them as needed. The same rates are applied to every row.
INPUT_PRICE_PER_1M = 0.15  # $0.15 per 1M input tokens
OUTPUT_PRICE_PER_1M = 0.60  # $0.60 per 1M output tokens

# assign() returns a new frame, so tokens_by_model itself is left unchanged.
cost_df = tokens_by_model.assign(
    input_cost=lambda frame: (frame["input_tokens"] / 1_000_000) * INPUT_PRICE_PER_1M,
    output_cost=lambda frame: (frame["output_tokens"] / 1_000_000) * OUTPUT_PRICE_PER_1M,
    total_cost=lambda frame: frame["input_cost"] + frame["output_cost"],
)

print(f"Total Estimated Cost: ${cost_df['total_cost'].sum():.4f}")
cost_df[["model_key", "runs", "total_tokens", "total_cost"]]

Total Estimated Cost: $0.0006


Unnamed: 0,model_key,runs,total_tokens,total_cost
0,document_metadata,1,3174,0.000558


## Export Data

In [12]:
# Dump every completed run (newest first) to a CSV file in the current
# working directory, without the DataFrame index column.
export_sql = "SELECT * FROM processing_runs WHERE status = 'completed' ORDER BY start_timestamp DESC"
all_runs = query_db(export_sql)
all_runs.to_csv("processing_logs_export.csv", index=False)
print(f"✅ Exported {len(all_runs)} runs to processing_logs_export.csv")

✅ Exported 1 runs to processing_logs_export.csv
