# Tool 2 - Structural Classifier (Async Pattern)

**Status:** ✅ Ready for Databricks | **LLM Cost:** ~$0.002 per run | **Performance:** ~10-15s

**Pattern:** Single async function with Pydantic AI classifier agent

**Showcase:** LLM-based data warehouse classification (FACT vs DIMENSION) + heuristic relationship detection.

**Key Features:**
- Single async `classify_structure()` function
- LLM-based FACT/DIMENSION classification with grain detection (transaction/event/snapshot/aggregate)
- Heuristic FK detection (target convention: column suffix matching, `product_id` → `products`; the current implementation matches on table names)
- Size estimation (small/medium/large/huge) and SCD Type 2 detection
- Expected performance: ~10-15s per classification

**TODO:**
- [ ] Validate FK detection accuracy (compare with actual schema metadata)
- [ ] Add support for bridge tables (many-to-many relationships)
- [ ] Test with 50+ table schemas (current test: 10-20)
- [ ] Add confidence scores for classifications

**IDEA:**
- Use actual column metadata (names, types, nullability) instead of table-level heuristics
- Add ML-based FK detection (column name similarity + cardinality analysis)
- Support custom grain definitions (not just 4 predefined types)

In [None]:
# Install dependencies
%pip install pydantic-ai>=0.0.49 pydantic>=2.8.0

In [None]:
# Restart Python kernel to use new packages.
# `dbutils` is never imported in this notebook; it is expected to be injected by
# the Databricks notebook runtime, which is why the call carries `type: ignore`.
dbutils.library.restartPython()  # type: ignore

In [None]:
import asyncio
import json
import os
from datetime import datetime
from pydantic import BaseModel, Field
from pydantic_ai import Agent  # type: ignore

In [None]:
# Configure Azure OpenAI from Databricks secrets.
# All three values are read from the "mcop" secret scope. `dbutils` is provided
# by the Databricks notebook runtime rather than imported.
AZURE_ENDPOINT = dbutils.secrets.get(scope="mcop", key="azure-openai-endpoint")  # type: ignore
AZURE_API_KEY = dbutils.secrets.get(scope="mcop", key="azure-openai-api-key")  # type: ignore
DEPLOYMENT_NAME = dbutils.secrets.get(scope="mcop", key="azure-openai-deployment-name")  # type: ignore

# Set environment variables for Pydantic AI.
# NOTE(review): presumably these are picked up by the OpenAI client behind the
# "openai:" model prefix - confirm against the installed pydantic-ai version.
os.environ["OPENAI_BASE_URL"] = AZURE_ENDPOINT
os.environ["OPENAI_API_KEY"] = AZURE_API_KEY

# Model identifier handed to Agent(): "openai:" prefix + the deployment name.
MODEL_NAME = f"openai:{DEPLOYMENT_NAME}"
print(f"✅ Configured model: {MODEL_NAME}")

In [None]:
# Pydantic schemas
class FactTable(BaseModel):
    """Fact table (transactional/event data)."""
    # NOTE(review): the class docstring and the Field descriptions are kept
    # verbatim - Pydantic includes them in the generated JSON schema, which is
    # presumably what the classifier LLM is shown; confirm before rewording.
    name: str = Field(description="Table name")
    # Identifier linking this table back to a Tool 1 entity mapping.
    entity_id: str = Field(description="Mapped entity ID from Tool 1")
    # Optional; falls back to an empty string when no description is supplied.
    description: str = Field(default="", description="Table description")
    # Plain `str`: the four values named in the description are guidance only
    # and are not enforced by validation.
    grain: str = Field(description="Granularity: transaction, event, snapshot, aggregate")
    # A coarse size bucket (string), not a numeric count, despite the field
    # name; the bucket names are likewise not enforced.
    estimated_row_count: str = Field(description="Size estimate: small/medium/large/huge")

class DimensionTable(BaseModel):
    """Dimension table (reference/master data)."""
    # NOTE(review): docstring and Field descriptions are left verbatim because
    # Pydantic emits them in the model's JSON schema; confirm before rewording.
    name: str = Field(description="Table name")
    # Identifier linking this table back to a Tool 1 entity mapping.
    entity_id: str = Field(description="Mapped entity ID from Tool 1")
    # Optional; falls back to an empty string when no description is supplied.
    description: str = Field(default="", description="Table description")
    # Plain `str`: the listed kinds are guidance only, not validated. The field
    # name shadows the builtin `type` inside the class body; it is part of the
    # serialized output (read as dim['type'] later in this notebook), so it is
    # not renamed.
    type: str = Field(description="Type: master, reference, lookup, bridge")
    # True when the dimension is flagged as SCD Type 2; defaults to False.
    slowly_changing: bool = Field(default=False, description="SCD Type 2?")

class Relationship(BaseModel):
    """Foreign key relationship."""
    # Directed edge: `from_table` is the referencing table, `to_table` the
    # referenced one (detect_fk_relationships() emits fact -> dimension).
    from_table: str = Field(description="Source table name")
    to_table: str = Field(description="Target table name")
    # Plain `str`: the cardinalities named in the description are not enforced.
    relationship_type: str = Field(description="one-to-one, one-to-many, many-to-many")
    # Bounded to the documented 0-1 range so an out-of-range score fails
    # validation instead of flowing into the saved output.
    confidence: float = Field(ge=0.0, le=1.0, description="Detection confidence 0-1")

class StructuralClassification(BaseModel):
    """Complete structural classification."""
    # Passed to the classifier Agent as its `result_type`, so this is the shape
    # the LLM response is validated against. Relationships are not part of it:
    # they are derived afterwards by detect_fk_relationships().
    facts: list[FactTable] = Field(description="Fact tables")
    dimensions: list[DimensionTable] = Field(description="Dimension tables")

class StructuralMetrics(BaseModel):
    """Metrics about the structure."""
    # The three counts are the lengths of the facts / dimensions /
    # relationships lists assembled in classify_structure().
    fact_count: int
    dimension_count: int
    relationship_count: int
    # ISO-8601 string produced by datetime.now().isoformat() in
    # classify_structure(): naive local time, no UTC offset.
    classification_timestamp: str

In [None]:
# Create classifier agent.
# Built once at module level and reused by classify_structure(). Its structured
# output is validated against StructuralClassification (facts + dimensions).
# The system prompt is runtime text sent to the model - edit it deliberately.
# NOTE(review): `result_type` matches the pydantic-ai floor installed above
# (>=0.0.49); newer releases reportedly rename it to `output_type` - confirm
# against the installed version.
classifier_agent = Agent(
    MODEL_NAME,
    result_type=StructuralClassification,
    system_prompt="""You are a data warehouse architect.

Classify tables as FACT or DIMENSION:
- FACT: Transactional/event data (orders, clickstreams, sensor readings)
  * High row count
  * Time-dependent
  * Contains metrics/measures
  * Foreign keys to dimensions

- DIMENSION: Reference/master data (products, customers, locations)
  * Lower row count
  * Relatively static
  * Descriptive attributes
  * Primary keys

For each table, determine:
- Fact grain (transaction/event/snapshot/aggregate)
- Dimension type (master/reference/lookup/bridge)
- Size estimate (small/medium/large/huge)
- SCD Type 2 (slowly changing dimension)

Be specific and data-driven."""
)

print("✅ Classifier agent created")

In [None]:
def detect_fk_relationships(facts: list[FactTable], dimensions: list[DimensionTable]) -> list[Relationship]:
    """Infer likely fact -> dimension relationships from table names.

    This is a table-name heuristic, not column-level FK detection: no column
    metadata is available here. A dimension is linked to a fact when the
    dimension's lower-cased, singularized name occurs as a substring of the
    fact's lower-cased name (e.g. dimension ``products`` matches fact
    ``product_sales``).

    Args:
        facts: Fact tables; only ``name`` is read.
        dimensions: Dimension tables; only ``name`` is read.

    Returns:
        One ``Relationship`` per matching (fact, dimension) pair, ordered by
        fact and then by dimension. Every match is reported as
        ``"one-to-many"`` with a fixed heuristic confidence of 0.7.
    """
    # Singularize each dimension name once, outside the fact loop. Only ONE
    # trailing "s" is removed: rstrip("s") would strip every trailing "s"
    # ("class" -> "cla") and then over-match unrelated fact names.
    dimension_stems = []
    for dim in dimensions:
        stem = dim.name.lower().removesuffix("s")
        # An empty stem is a substring of every name; skip it rather than
        # linking this dimension to all facts.
        if stem:
            dimension_stems.append((dim, stem))

    relationships = []
    for fact in facts:
        fact_name = fact.name.lower()
        for dim, stem in dimension_stems:
            # Heuristic: if fact name contains dimension name, likely FK
            if stem in fact_name:
                relationships.append(Relationship(
                    from_table=fact.name,
                    to_table=dim.name,
                    relationship_type="one-to-many",
                    confidence=0.7  # Heuristic confidence
                ))

    return relationships

print("✅ FK detection function defined")

In [None]:
async def classify_structure(tool0_context: dict, tool1_mappings: dict, metadata: dict) -> dict:
    """Classify business entities into fact and dimension tables.

    Builds a prompt from the three inputs, asks ``classifier_agent`` for a
    ``StructuralClassification``, derives relationships with
    ``detect_fk_relationships`` and returns everything as plain dicts.

    Args:
        tool0_context: Business request context from Tool 0. Reads the
            ``entities``, ``goal`` and ``scope_in`` keys; each is optional.
        tool1_mappings: Entity mappings from Tool 1. Reads the ``mappings``
            key; only the first 10 entries are sent to the model.
        metadata: Technical metadata (Collibra/Unity Catalog). Only the first
            5 items are sent to the model. A dict is expected; a list (JSON
            array root) is accepted as well.

    Returns:
        Dict with keys ``facts``, ``dimensions`` and ``relationships`` (lists
        of dicts) plus ``metrics`` (the three counts and a naive local-time
        ISO-8601 ``classification_timestamp``).
    """
    # Step 1: Prepare prompt context. Inputs are truncated here so the prompt
    # stays small (top 10 mappings, 5 metadata items) to avoid token overflow.
    entities = tool0_context.get("entities", [])
    top_mappings = tool1_mappings.get("mappings", [])[:10]
    if isinstance(metadata, dict):
        metadata_sample = list(metadata.items())[:5]
    else:
        # A JSON array root has no .items(); sample its first elements instead.
        metadata_sample = list(metadata or [])[:5]

    # Everything inside this f-string is sent to the model verbatim, so code
    # comments must stay outside of it.
    prompt = f"""Classify these entities into FACT and DIMENSION tables:

Business Context:
- Goal: {tool0_context.get('goal', 'N/A')}
- Scope: {tool0_context.get('scope_in', 'N/A')}

Entities from Business Request:
{json.dumps(entities, indent=2)}

Mapped Candidates (Tool 1):
{json.dumps(top_mappings, indent=2)}

Technical Metadata Sample:
{json.dumps(metadata_sample, indent=2)}

Classify each entity as FACT or DIMENSION with justification."""

    # Step 2: Call LLM classifier
    result = await classifier_agent.run(prompt)
    classified = result.data  # StructuralClassification

    # Step 3: Detect FK relationships (heuristics)
    relationships = detect_fk_relationships(classified.facts, classified.dimensions)

    # Step 4: Calculate metrics
    metrics = StructuralMetrics(
        fact_count=len(classified.facts),
        dimension_count=len(classified.dimensions),
        relationship_count=len(relationships),
        classification_timestamp=datetime.now().isoformat()
    )

    # Step 5: Assemble final structure as JSON-serializable dicts
    return {
        "facts": [f.model_dump() for f in classified.facts],
        "dimensions": [d.model_dump() for d in classified.dimensions],
        "relationships": [r.model_dump() for r in relationships],
        "metrics": metrics.model_dump()
    }

print("✅ Async classification function defined")

In [None]:
# Load input data from DBFS (all paths use the /dbfs prefix).
tool0_path = "/dbfs/FileStore/mcop/tool0_samples/sample_business_request.json"
tool1_path = "/dbfs/FileStore/mcop/tool1/filtered_dataset.json"
metadata_path = "/dbfs/FileStore/mcop/metadata/BA-BS_Datamarts_metadata.json"

# JSON is read as UTF-8 explicitly so non-ASCII content does not depend on the
# cluster's locale encoding.
with open(tool0_path, "r", encoding="utf-8") as f:
    tool0_context = json.load(f)

with open(tool1_path, "r", encoding="utf-8") as f:
    tool1_mappings = json.load(f)

with open(metadata_path, "r", encoding="utf-8") as f:
    metadata = json.load(f)

# Quick sanity check of what was loaded before spending an LLM call on it.
print(f"✅ Loaded Tool 0 context: {len(tool0_context.get('entities', []))} entities")
print(f"✅ Loaded Tool 1 mappings: {len(tool1_mappings.get('mappings', []))} mappings")
print(f"✅ Loaded metadata: {len(metadata)} items")

In [None]:
# Run classification
structure = await classify_structure(tool0_context, tool1_mappings, metadata)

print(f"\n‚úÖ Classification complete")
print(f"   Facts: {structure['metrics']['fact_count']}")
print(f"   Dimensions: {structure['metrics']['dimension_count']}")
print(f"   Relationships: {structure['metrics']['relationship_count']}")

In [None]:
# Save results to DBFS.
output_path = "/dbfs/FileStore/mcop/tool2/structure.json"
# Create the target directory on first run; a no-op when it already exists.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Written as UTF-8 explicitly so the file's encoding does not depend on the
# cluster's locale.
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(structure, f, indent=2)

print(f"✅ Structure saved: {output_path}")

In [None]:
# Display sample results: headline metrics plus the first three entries of each
# list in `structure` (the dict returned by classify_structure()).
print("\n" + "="*80)
print("STRUCTURAL CLASSIFICATION RESULTS")
print("="*80)

print("\n📊 Metrics:")
print(f"   Facts: {structure['metrics']['fact_count']}")
print(f"   Dimensions: {structure['metrics']['dimension_count']}")
print(f"   Relationships: {structure['metrics']['relationship_count']}")
print(f"   Timestamp: {structure['metrics']['classification_timestamp']}")

print("\n📦 Sample Facts (top 3):")
for fact in structure['facts'][:3]:
    print(f"   - {fact['name']} (grain: {fact['grain']}, size: {fact['estimated_row_count']})")

print("\n🗂️  Sample Dimensions (top 3):")
for dim in structure['dimensions'][:3]:
    print(f"   - {dim['name']} (type: {dim['type']}, SCD2: {dim['slowly_changing']})")

print("\n🔗 Sample Relationships (top 3):")
for rel in structure['relationships'][:3]:
    # Confidence is stored as a 0-1 float; ".1%" renders it as a percentage.
    print(f"   - {rel['from_table']} → {rel['to_table']} ({rel['relationship_type']}, confidence: {rel['confidence']:.1%})")

print("="*80)