# GAO Fraud Ontology - SHACL Validation

This notebook validates the GAO fraud ontology instance data against SHACL shapes.

**Phase 1: Foundation Classes**
- FraudActivity
- FederalAgency
- FederalUnit
- ProgramArea
- FundingStream
- RevenueStream

## Setup

In [None]:
# Install pyshacl if not already installed
!pip install pyshacl rdflib pandas -q

In [None]:
import sys
sys.path.append('/home/claude')

from validate_ontology import SHACLValidator
import pandas as pd
from pathlib import Path

# Configure how pandas renders DataFrames in this notebook: no cap on the
# number of columns shown, cell text up to 100 characters wide, and no fixed
# overall display width.
for option_name, option_value in {
    'display.max_columns': None,
    'display.max_colwidth': 100,
    'display.width': None,
}.items():
    pd.set_option(option_name, option_value)

## Configuration

Update these paths to match your file locations:

In [None]:
# File paths - UPDATE THESE
# Every input and output lives under BASE_DIR, so relocating the project only
# requires changing this one value.
BASE_DIR = "/home/claude"
DATA_FILE = f"{BASE_DIR}/gfo_turtle.ttl"  # Your ontology file
SHAPES_FILE = f"{BASE_DIR}/phase1_foundation_shapes.ttl"  # Phase 1 shapes
OUTPUT_DIR = f"{BASE_DIR}/validation_reports"

# Validation settings
INFERENCE = 'none'  # Options: 'none', 'rdfs', 'owlrl'

# Fail fast, before any validation work starts, if an input is missing.
# is_file() (rather than exists()) also rejects a directory at that path.
assert Path(DATA_FILE).is_file(), f"Data file not found: {DATA_FILE}"
assert Path(SHAPES_FILE).is_file(), f"Shapes file not found: {SHAPES_FILE}"

# Echo the effective configuration so it is recorded in the notebook output.
print("✓ Configuration complete")
print(f"  Data: {DATA_FILE}")
print(f"  Shapes: {SHAPES_FILE}")
print(f"  Inference: {INFERENCE}")

## Run Validation

This will validate your ontology and produce detailed reports.

In [None]:
# Build the validator from the configured data file, shapes file and report
# directory.
validator = SHACLValidator(data_file=DATA_FILE, shapes_file=SHAPES_FILE, output_dir=OUTPUT_DIR)

# Run the full validation with the configured inference mode and ask for the
# reports to be saved. The call returns three values, unpacked below as:
#   conforms - truthy when the data conforms (used as a flag in the summary cell)
#   results  - the individual issue records (turned into a DataFrame later)
#   summary  - mapping read later via 'total_issues' and 'by_severity'
validation_outcome = validator.run_full_validation(inference=INFERENCE, save_reports=True)
conforms, results, summary = validation_outcome

## Detailed Results Analysis

In [None]:
# Convert results to DataFrame for analysis
if results:
    df_results = pd.DataFrame(results)
    print(f"Total issues: {len(df_results)}")
    # Jupyter only auto-renders a bare expression when it is the last
    # top-level statement of a cell; inside this `if` the preview has to be
    # displayed explicitly or it is silently discarded.
    display(df_results.head(20))
else:
    print("No validation issues found!")

### Filter by Severity

In [None]:
if results:
    # Show only Violations
    violations = df_results[df_results['severity'] == 'Violation']
    print(f"\nViolations: {len(violations)}")
    # A bare expression nested in an `if` is not auto-rendered by Jupyter,
    # so the preview is displayed explicitly.
    display(violations.head(20))
else:
    # Reached only when validation produced no results at all.
    print("No violations!")

In [None]:
if results:
    # Show only Warnings
    # NOTE(review): the name `warnings` shadows the standard-library module of
    # the same name for the rest of the kernel session; it is kept unchanged
    # here in case other cells refer to it.
    warnings = df_results[df_results['severity'] == 'Warning']
    print(f"\nWarnings: {len(warnings)}")
    # A bare expression nested in an `if` is not auto-rendered by Jupyter,
    # so the preview is displayed explicitly.
    display(warnings.head(20))
else:
    # Reached only when validation produced no results at all.
    print("No warnings!")

### Analyze by Class Type

In [None]:
if results:
    # Grouping key: the text after the last '/' of each focus node, stored on
    # df_results as the 'class_name' column.
    # NOTE(review): this is the trailing path segment of the focus node; whether
    # it actually names a class depends on how the IRIs are minted - confirm.
    df_results['class_name'] = df_results['focus_node'].str.split('/').str[-1]

    # One row per class_name, one column per severity, plus a row total;
    # the rows with the most issues come first.
    class_issues = (
        df_results.groupby(['class_name', 'severity'])
        .size()
        .unstack(fill_value=0)
        .assign(Total=lambda counts: counts.sum(axis=1))
        .sort_values('Total', ascending=False)
    )

    print("\nIssues by Class Type:")
    print(class_issues)

### Analyze by Property

In [None]:
if results:
    # One row per result_path, one column per severity, plus a row total;
    # the properties with the most issues come first.
    property_issues = (
        df_results.groupby(['result_path', 'severity'])
        .size()
        .unstack(fill_value=0)
        .assign(Total=lambda counts: counts.sum(axis=1))
        .sort_values('Total', ascending=False)
    )

    print("\nIssues by Property:")
    print(property_issues)

## Export Specific Issues for Fixing

Export violations for a specific property to a CSV file for easy fixing.

In [None]:
if results:
    # Example: Export all missing label violations
    # Rows are selected where the result path is exactly 'label' and the
    # severity is exactly 'Violation'.
    missing_labels = df_results[
        (df_results['result_path'] == 'label') &
        (df_results['severity'] == 'Violation')
    ]

    if not missing_labels.empty:
        output_file = Path(OUTPUT_DIR) / "missing_labels.csv"
        # to_csv does not create missing directories, so make sure the report
        # directory exists before writing.
        output_file.parent.mkdir(parents=True, exist_ok=True)
        missing_labels[['focus_node', 'focus_node_label', 'message']].to_csv(
            output_file,
            index=False
        )
        print(f"✓ Exported {len(missing_labels)} missing label issues to: {output_file}")
    else:
        print("No missing label issues found!")

## Quick Stats for Each Class

In [None]:
# List all classes being validated
from rdflib import Namespace

# NOTE(review): SH and RDF are not referenced below (the SPARQL queries spell
# out their own prefixes); they are kept in case other cells rely on them.
SH = Namespace("http://www.w3.org/ns/shacl#")
RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")

# Query for target classes in shapes
target_classes_query = """
    PREFIX sh: <http://www.w3.org/ns/shacl#>
    SELECT DISTINCT ?targetClass
    WHERE {
        ?shape sh:targetClass ?targetClass .
    }
"""

# Build one stats record per class targeted by a shape.
stats = []
for row in validator.shapes_graph.query(target_classes_query):
    target_class = row.targetClass
    # Short display name: the text after the last '/' of the class IRI.
    class_name = str(target_class).split('/')[-1]

    # Count instances
    count_query = f"""
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        SELECT (COUNT(DISTINCT ?instance) as ?count)
        WHERE {{
            ?instance rdf:type <{target_class}> .
        }}
    """

    count_result = list(validator.data_graph.query(count_query))
    instance_count = int(count_result[0][0]) if count_result else 0

    # Count issues for this class (zero when there are no results at all).
    # NOTE(review): this is a substring match of the class name against the
    # focus node, so it relies on instance IRIs containing their class name
    # and may over-count when one class name is contained in another - confirm.
    class_issues_count = sum(
        1 for r in (results or []) if class_name in r['focus_node']
    )

    stats.append({
        'Class': class_name,
        'Instances': instance_count,
        'Issues': class_issues_count,
        'Clean': '✓' if class_issues_count == 0 else '✗'
    })

df_stats = pd.DataFrame(stats)
print("\nValidation Statistics by Class:")
df_stats

## Summary

Run this cell for a final summary:

In [None]:
# Print the overall outcome, then the per-severity breakdown when there are
# any issues. Reads `conforms`, `summary` and OUTPUT_DIR from earlier cells.
print("="*80)
print("VALIDATION SUMMARY")
print("="*80)
print(f"Overall Conforms: {'✓ YES' if conforms else '✗ NO'}")
print(f"Total Issues: {summary['total_issues']}")
if summary['total_issues'] > 0:
    print("\nBy Severity:")
    for severity, count in summary['by_severity'].items():
        print(f"  {severity}: {count}")
print("="*80)

# Closing guidance: a success message, or pointers on what to do next.
if summary['total_issues'] == 0:
    print("\n🎉 Congratulations! All Phase 1 validation checks passed!")
else:
    print(f"\n📊 Detailed reports saved to: {OUTPUT_DIR}/")
    print("\nNext steps:")
    print("  1. Review violations first (highest priority)")
    print("  2. Fix data issues in your ontology")
    print("  3. Re-run validation to verify fixes")
    print("  4. Move on to warnings and info items")