In [2]:
# GAO Ontology Development

## File Loading Tests
### Testing different libraries and approaches for loading the ontology.

## Debugging TTL Issues
### Investigating the line 11 parsing error

## Adding New Content
### Creating instances, classes, and properties

## SPARQL Queries
### Running queries against the ontology.

In [3]:
import owlready2 as owl
import rdflib
import pandas as pd
print ("Libraries loaded successfully!")

Libraries loaded successfully!


In [7]:
# Test RDF/XML file (the one that worked in streamlit)
print("=== Testing RDF/XML Format ===")
g = rdflib.Graph()
try:
    g.parse("app/ontologies/gfo_rdf_xml.rdf", format="xml")
    print(f"[OK] RDF/XML loaded successfully: {len(g)} triples") 

SyntaxError: incomplete input (3972316527.py, line 6)

In [8]:
# Test RDF/XML file (the one that worked in streamlit)
print("=== Testing RDF/XML Format ===")
g = rdflib.Graph()
try:
    g.parse("app/ontologies/gfo_rdf_xml.rdf", format="xml")
    print(f"[OK] RDF/XML loaded successfully: {len(g)} triples")

SyntaxError: incomplete input (2455327125.py, line 6)

In [13]:
# Test RDF/XML file
print("=== Testing RDF/XML Format ===")
g = rdflib.Graph()
try:
    g.parse("/app/ontologies/gfo_rdf_xml.rdf", format="xml")
    print(f"[OK] RDF/XML loaded successfully: {len(g)} triples")
except Exception as e:
    print(f"[ERROR] RDF/XML error: {e}")

=== Testing RDF/XML Format ===
[OK] RDF/XML loaded successfully: 41927 triples


In [14]:
# Get basic info about the ontology
print(f"\nBasic statistics:")
print(f"Total triples: {len(g)}")

# Show a few sample triples
print(f"\nFirst 5 triples:")
for i, (subject, predicate, object) in enumerate(g):
    if i < 5:
        print(f"  {i+1}. {subject} -> {predicate} -> {object}")
    else:
        break

# Get unique subjects, predicates, objects counts
subjects = set(g.subjects())
predicates = set(g.predicates()) 
objects = set(g.objects())

print(f"\nUnique counts:")
print(f"  Subjects: {len(subjects)}")
print(f"  Predicates: {len(predicates)}")
print(f"  Objects: {len(objects)}")


Basic statistics:
Total triples: 41927

First 5 triples:
  1. https://gaoinnovations.gov/antifraud_resource/howfraudworks/gfo/AbusiveTaxStructure -> http://www.w3.org/2000/01/rdf-schema#subClassOf -> Nfbf451b787f04ad5af06dd45791f21c3
  2. https://gaoinnovations.gov/antifraud_resource/howfraudworks/gfo/MoneyLaunderingScheme -> http://www.w3.org/1999/02/22-rdf-syntax-ns#type -> N1b65cd36622b4dbdb4a4ce535c343ae0
  3. https://gaoinnovations.gov/antifraud_resource/howfraudworks/gfo/StructuredQueryLanguageInjection -> http://www.w3.org/2004/02/skos/core#altLabel -> SQL injection
  4. https://gaoinnovations.gov/antifraud_resource/howfraudworks/gfo/TaxRingSheme -> http://www.w3.org/1999/02/22-rdf-syntax-ns#type -> Nfb93c48d57d04350a89cb65ca3215c3b
  5. https://gaoinnovations.gov/antifraud_resource/howfraudworks/gfo/Falsified_Quality_Control_Data_Scheme -> http://www.w3.org/1999/02/22-rdf-syntax-ns#type -> https://gaoinnovations.gov/antifraud_resource/howfraudworks/gfo/FederalFraudScheme

Uniq

In [15]:
# Look at the types of predicates being used
print("Most common predicates:")
predicate_counts = {}
for s, p, o in g:
    predicate_counts[str(p)] = predicate_counts.get(str(p), 0) + 1

# Show top 10 most used predicates
sorted_predicates = sorted(predicate_counts.items(), key=lambda x: x[1], reverse=True)
for pred, count in sorted_predicates[:10]:
    print(f"  {count:5d} - {pred.split('/')[-1] if '/' in pred else pred}")

Most common predicates:
  16247 - 22-rdf-syntax-ns#type
   6954 - owl#onProperty
   6642 - owl#someValuesFrom
   2067 - rdf-schema#label
   1896 - rdf-schema#subClassOf
   1712 - core#definition
   1511 - rdf-schema#isDefinedBy
    601 - hasWebsite
    596 - adaptedFrom
    368 - is_investigated_by


In [16]:
# Find fraud scheme types
print("Looking for fraud-related classes:")
fraud_classes = []
for s, p, o in g:
    if str(p).endswith('#type') and 'fraud' in str(s).lower():
        fraud_classes.append(str(s))

# Show unique fraud classes (first 10)
unique_fraud = list(set(fraud_classes))[:10]
for fraud_type in unique_fraud:
    class_name = fraud_type.split('/')[-1]
    print(f"  {class_name}")

Looking for fraud-related classes:
  CFPB_Fraud_prevention_resources
  isEnforcedBy
  Pandemic_Unemployment_Assistance_Fraud_Scheme
  ED_OIG_-_Fraud_in_Disaster_Recovery_Aid_Programs
  Federal_Reserve_Board_Enforcement_Actions_List
  FailureToReportChangeInBenefitStatus
  Identitytheft.gov
  SupplementalNutritionAssistanceProgram
  IncorrectDiagnosisOrProcedureReporting
  ServiceDisabledVeteranSmallBusiness


In [17]:
# Let's separate classes from instances more clearly
print("=== Classes vs Instances Analysis ===")

# Get all rdf:type statements
type_statements = []
for s, p, o in g:
    if str(p).endswith('#type'):
        type_statements.append((str(s), str(o)))

print(f"Total type statements: {len(type_statements)}")

# Look for OWL/RDFS classes specifically
owl_classes = set()
for s, p, o in g:
    if str(p).endswith('#subClassOf') or (str(p).endswith('#type') and 'Class' in str(o)):
        owl_classes.add(str(s))

print(f"Classes found via subClassOf: {len(owl_classes)}")

# Show a few examples
print("\nSample classes:")
for i, cls in enumerate(list(owl_classes)[:5]):
    print(f"  {cls.split('/')[-1]}")

=== Classes vs Instances Analysis ===
Total type statements: 16247
Classes found via subClassOf: 1010

Sample classes:
  Citizenship
  TimeInstant
  DarkNet
  PaymentCardNumber
  MortgageScheme


In [18]:
# Check for common ontology problems
print("=== Ontology Health Check ===")

# 1. Look for blank nodes (often cause problems)
blank_nodes = set()
for s, p, o in g:
    if str(s).startswith('N') and len(str(s)) == 33:  # Blank node pattern
        blank_nodes.add(str(s))
    if str(o).startswith('N') and len(str(o)) == 33:
        blank_nodes.add(str(o))

print(f"Blank nodes found: {len(blank_nodes)}")

# 2. Check for undefined classes (classes referenced but not defined)
referenced_classes = set()
defined_classes = set()

for s, p, o in g:
    if str(p).endswith('#type'):
        referenced_classes.add(str(o))
    if str(p).endswith('#subClassOf'):
        defined_classes.add(str(s))

undefined = referenced_classes - defined_classes
print(f"Referenced but undefined classes: {len(undefined)}")

# 3. Show a few examples of potential problems
print(f"\nFirst 3 blank nodes:")
for i, bn in enumerate(list(blank_nodes)[:3]):
    print(f"  {bn}")

print(f"\nFirst 3 undefined classes:")
for i, uc in enumerate(list(undefined)[:3]):
    print(f"  {uc.split('/')[-1] if '/' in uc else uc}")

=== Ontology Health Check ===
Blank nodes found: 7069
Referenced but undefined classes: 6058

First 3 blank nodes:
  N6f2ffb09c5494adf86b99d1601a87891
  N9b1fdec153e544898f53626d05342381
  N0a28fa1871144f6584062252723915d0

First 3 undefined classes:
  Nbf6fe42f8c094903a673b28ee539ec19
  N7ca67d9987714a2daec5eca56d1fed2a
  N6f2ffb09c5494adf86b99d1601a87891


In [19]:
# Check if this was a conversion problem from another format
print("=== Investigating Blank Node Usage ===")

# See what blank nodes are being used for
blank_predicates = {}
for s, p, o in g:
    if str(s).startswith('N') and len(str(s)) == 33:
        pred = str(p).split('/')[-1] if '/' in str(p) else str(p)
        blank_predicates[pred] = blank_predicates.get(pred, 0) + 1

print("How blank nodes are used as subjects:")
for pred, count in sorted(blank_predicates.items(), key=lambda x: x[1], reverse=True)[:10]:
    print(f"  {count:4d} - {pred}")

# Check what's pointing TO blank nodes
pointing_to_blanks = {}
for s, p, o in g:
    if str(o).startswith('N') and len(str(o)) == 33:
        pred = str(p).split('/')[-1] if '/' in str(p) else str(p)
        pointing_to_blanks[pred] = pointing_to_blanks.get(pred, 0) + 1

print("\nProperties pointing TO blank nodes:")
for pred, count in sorted(pointing_to_blanks.items(), key=lambda x: x[1], reverse=True)[:10]:
    print(f"  {count:4d} - {pred}")

=== Investigating Blank Node Usage ===
How blank nodes are used as subjects:
  6981 - 22-rdf-syntax-ns#type
  6954 - owl#onProperty
  6642 - owl#someValuesFrom
   301 - owl#hasValue
    87 - 22-rdf-syntax-ns#first
    87 - 22-rdf-syntax-ns#rest
    13 - owl#unionOf
    10 - owl#intersectionOf
     8 - owl#onClass
     5 - owl#maxQualifiedCardinality

Properties pointing TO blank nodes:
  6044 - 22-rdf-syntax-ns#type
   886 - rdf-schema#subClassOf
    50 - 22-rdf-syntax-ns#rest
    14 - rdf-schema#range
    14 - owl#equivalentClass
    13 - owl#unionOf
    12 - owl#propertyChainAxiom
    10 - 22-rdf-syntax-ns#first
    10 - owl#intersectionOf
     9 - rdf-schema#domain


In [20]:
# Find simple, named classes (not blank node expressions)
print("=== Finding Simple, Named Classes ===")

simple_classes = set()
for s, p, o in g:
    # Look for named classes (not blank nodes)
    if (str(p).endswith('#subClassOf') and 
        not str(s).startswith('N') and 
        'gaoinnovations.gov' in str(s)):
        simple_classes.add(str(s))

print(f"Simple GAO fraud classes: {len(simple_classes)}")

# Show some examples
print("\nSample fraud classes:")
for i, cls in enumerate(list(simple_classes)[:10]):
    class_name = cls.split('/')[-1]
    print(f"  {class_name}")

# Now find instances of these simple classes
print(f"\n=== Finding Instances ===")
instance_count = 0
for s, p, o in g:
    if (str(p).endswith('#type') and 
        str(o) in simple_classes and
        not str(s).startswith('N')):
        instance_count += 1

print(f"Named instances found: {instance_count}")

=== Finding Simple, Named Classes ===
Simple GAO fraud classes: 967

Sample fraud classes:
  Citizenship
  DarkNet
  PaymentCardNumber
  OversightParticipant
  MortgageScheme
  FederalAgency
  TaxpayerFraud
  InsurancePremiumDiversion
  AccountTakeover
  EmployeePerformance

=== Finding Instances ===
Named instances found: 1107


In [21]:
# Show some actual fraud scheme instances
print("=== Sample Fraud Instances ===")
instances = []
for s, p, o in g:
    if (str(p).endswith('#type') and 
        str(o) in simple_classes and
        not str(s).startswith('N')):
        instances.append((str(s), str(o)))

# Show first 10 instances with their classes
for i, (instance, class_uri) in enumerate(instances[:10]):
    instance_name = instance.split('/')[-1]
    class_name = class_uri.split('/')[-1]
    print(f"  {instance_name} → {class_name}")

=== Sample Fraud Instances ===
  Falsified_Quality_Control_Data_Scheme → FederalFraudScheme
  Defense_Criminal_Investigative_Service → FederalUnit
  CommodityFuturesTradingCommissionOIG → federal_office_of_inspector_general
  UK_National_Audit_Office_Good_Practice_in_Tackling_External_Fraud → FraudRiskManagementPrinciples
  GAO-23-106696 → GovernmentAccountabilityOfficeProduct
  CashBackMortgageScheme → FederalFraudScheme
  AGA_Training_Material → FraudDetection
  TemporaryAssistanceforNeedyFamilies → SocialSecurityNumber
  DOE_Education_Risk_Management_Tools → FraudDetection
  CallCenterMoneyMuleScheme → FederalFraudScheme


In [22]:
# Test your working SPARQL query
print("=== Testing Your Working SPARQL Query ===")

# Define the query (your working version)
sparql_query = """
PREFIX gfo: <https://gaoinnovations.gov/antifraud_resource/howfraudworks/gfo/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
 
SELECT DISTINCT ?individual ?individualName
WHERE {
    ?individual a gfo:FederalFraudScheme ;
                rdfs:label ?individualName .
   
    {
        # Path 1: Through involves property
        ?individual a ?someClass .
        ?someClass owl:onProperty gfo:involves ;
                   owl:someValuesFrom ?specificFraud .
       
        ?specificFraud rdfs:subClassOf* ?fraudType .
        ?fraudType rdfs:label ?fraudTypeName .
       
        ?specificFraud rdfs:subClassOf* gfo:public_emergency_fraud .
    }
    UNION
    {
        # Path 2: Direct instance of subclass
        ?individual a ?fraudSchemeClass .
        ?fraudSchemeClass rdfs:subClassOf* gfo:public_emergency_fraud .
        ?fraudSchemeClass rdfs:subClassOf* ?fraudType .
        ?fraudType rdfs:label ?fraudTypeName .
       
        FILTER(?fraudSchemeClass != gfo:FederalFraudScheme)
    }
    
    # Filter out the top-level FraudActivity class
    FILTER(?fraudType != gfo:FraudActivity)
}
ORDER BY ?individualName
"""

# Run the query
results = g.query(sparql_query)

print(f"Results found: {len(list(results))}")

# Reset results (queries can only be iterated once)
results = g.query(sparql_query)

print("\nFraud schemes related to public emergency fraud:")
for i, row in enumerate(results):
    individual_name = row.individualName
    individual_uri = row.individual
    print(f"  {i+1}. {individual_name}")
    
    # Show just the class name, not full URI
    class_name = str(individual_uri).split('/')[-1]
    print(f"     URI: {class_name}")

=== Testing Your Working SPARQL Query ===
Results found: 26

Fraud schemes related to public emergency fraud:
  1. Beneficiary Fraud to Obtain Coronavirus Food Assistance Program Funds Participants Scheme
     URI: Beneficiary_Fraud_Scheme_to_Obtain_Coronavirus_Food_Assistance_Program_CFAP_Funds_Participants
  2. Bogus Employees Used to Obtain Pandemic Relief Funds Scheme
     URI: Bogus_Employees_Used_to_Obtain_Pandemic_Relief_Funds
  3. COVID-19 Related Medical Equipment Scheme
     URI: COVID-19_Related_Medical_Equipment_Scheme
  4. COVID-19 Testing Billing Scheme
     URI: COVID-19_Testing_Billing_Scheme
  5. College Students Defraud Higher Education Relief Fund Scheme
     URI: College_Students_Defraud_Higher_Education_Emergency_Relieve_Fund_Scheme
  6. Complex Federal Child Nutrition Program Fraud Scheme
     URI: Complex_Federal_Child_Nutrition_Program_Scheme
  7. Fraudster Operating from Within to Defraud the Unemployment Insurance Program Scheme
     URI: Fraudster_Operating_f

In [23]:
# === COMPARISON TEST: Old vs New SPARQL Query ===
print("=== COMPARISON TEST: Finding Public Emergency Fraud Schemes ===\n")

# Method 1: Current Streamlit approach (simple search)
print("METHOD 1: Current Streamlit approach (simple pattern matching)")
print("-" * 60)

# Simulate what Streamlit currently does - look for classes and instances with pattern matching
simple_results = []
search_term = "public_emergency_fraud"

# Search classes
for s, p, o in g:
    if (str(p).endswith('#type') and 
        search_term.lower() in str(s).lower()):
        simple_results.append(str(s))

# Search individuals 
for s, p, o in g:
    if (str(p).endswith('#type') and 
        'FederalFraudScheme' in str(o) and
        search_term.lower() in str(s).lower()):
        simple_results.append(str(s))

print(f"Simple search results: {len(simple_results)}")
for i, result in enumerate(simple_results[:10]):  # Show first 10
    name = result.split('/')[-1]
    print(f"  {i+1}. {name}")

if len(simple_results) > 10:
    print(f"  ... and {len(simple_results) - 10} more")

print(f"\nTotal found by simple search: {len(simple_results)}")

# Method 2: Your working SPARQL query
print("\n" + "="*60)
print("METHOD 2: Your working SPARQL query (with transitive closure)")
print("-" * 60)

# Your working query
sparql_query = """
PREFIX gfo: <https://gaoinnovations.gov/antifraud_resource/howfraudworks/gfo/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
 
SELECT DISTINCT ?individual ?individualName
WHERE {
    ?individual a gfo:FederalFraudScheme ;
                rdfs:label ?individualName .
   
    {
        # Path 1: Through involves property
        ?individual a ?someClass .
        ?someClass owl:onProperty gfo:involves ;
                   owl:someValuesFrom ?specificFraud .
       
        ?specificFraud rdfs:subClassOf* ?fraudType .
        ?fraudType rdfs:label ?fraudTypeName .
       
        ?specificFraud rdfs:subClassOf* gfo:public_emergency_fraud .
    }
    UNION
    {
        # Path 2: Direct instance of subclass
        ?individual a ?fraudSchemeClass .
        ?fraudSchemeClass rdfs:subClassOf* gfo:public_emergency_fraud .
        ?fraudSchemeClass rdfs:subClassOf* ?fraudType .
        ?fraudType rdfs:label ?fraudTypeName .
       
        FILTER(?fraudSchemeClass != gfo:FederalFraudScheme)
    }
    
    # Filter out the top-level FraudActivity class
    FILTER(?fraudType != gfo:FraudActivity)
}
ORDER BY ?individualName
"""

sparql_results = list(g.query(sparql_query))
print(f"SPARQL query results: {len(sparql_results)}")

# Show first 10 for comparison
for i, row in enumerate(sparql_results[:10]):
    print(f"  {i+1}. {row.individualName}")

if len(sparql_results) > 10:
    print(f"  ... and {len(sparql_results) - 10} more")

print(f"\nTotal found by SPARQL query: {len(sparql_results)}")

# Comparison summary
print("\n" + "="*60)
print("COMPARISON SUMMARY")
print("-" * 60)
print(f"Simple search found:    {len(simple_results)} results")
print(f"SPARQL query found:     {len(sparql_results)} results")
print(f"Difference:             {len(sparql_results) - len(simple_results)} additional results")

if len(sparql_results) > len(simple_results):
    print(f"\n[SUCCESS] SPARQL query found {len(sparql_results) - len(simple_results)} more results!")
    print("This confirms the SPARQL query is capturing indirect relationships")
    print("that the simple search approach misses.")

=== COMPARISON TEST: Finding Public Emergency Fraud Schemes ===

METHOD 1: Current Streamlit approach (simple pattern matching)
------------------------------------------------------------
Simple search results: 1
  1. public_emergency_fraud

Total found by simple search: 1

METHOD 2: Your working SPARQL query (with transitive closure)
------------------------------------------------------------
SPARQL query results: 26
  1. Beneficiary Fraud to Obtain Coronavirus Food Assistance Program Funds Participants Scheme
  2. Bogus Employees Used to Obtain Pandemic Relief Funds Scheme
  3. COVID-19 Related Medical Equipment Scheme
  4. COVID-19 Testing Billing Scheme
  5. College Students Defraud Higher Education Relief Fund Scheme
  6. Complex Federal Child Nutrition Program Fraud Scheme
  7. Fraudster Operating from Within to Defraud the Unemployment Insurance Program Scheme
  8. Identity Theft from a COVID-19 Economic Injury Disaster Loan Program Advance Fraud Scheme
  9. Inflated Company P

In [1]:
# Find all fraud activity classes in your ontology
print("=== All Fraud Activity Classes ===")
fraud_activities = []

for s, p, o in g:
    if (str(p).endswith('#subClassOf') and 
        'FraudActivity' in str(o)):
        class_uri = str(s).split('/')[-1]
        fraud_activities.append(class_uri)

print("Direct subclasses of FraudActivity:")
for activity in sorted(fraud_activities):
    print(f"  gfo:{activity}")

# Also check for labels
print("\n=== With Labels ===")
for s, p, o in g:
    if (str(p).endswith('#subClassOf') and 
        'FraudActivity' in str(o)):
        class_uri = str(s).split('/')[-1]
        
        # Find the label
        for s2, p2, o2 in g:
            if (str(s2) == str(s) and 
                str(p2).endswith('#label')):
                print(f"  {str(o2)} -> gfo:{class_uri}")
                break

=== All Fraud Activity Classes ===


NameError: name 'g' is not defined

In [2]:
# Load the RDF graph first
import rdflib

g = rdflib.Graph()
g.parse("/app/ontologies/gfo_rdf_xml.rdf", format="xml")
print(f"Loaded {len(g)} triples")

Loaded 41927 triples


In [3]:
# Find all fraud activity classes in your ontology
print("=== All Fraud Activity Classes ===")
fraud_activities = []

for s, p, o in g:
    if (str(p).endswith('#subClassOf') and 
        'FraudActivity' in str(o)):
        class_uri = str(s).split('/')[-1]
        fraud_activities.append(class_uri)

print("Direct subclasses of FraudActivity:")
for activity in sorted(fraud_activities):
    print(f"  gfo:{activity}")

# Also check for labels
print("\n=== With Labels ===")
for s, p, o in g:
    if (str(p).endswith('#subClassOf') and 
        'FraudActivity' in str(o)):
        class_uri = str(s).split('/')[-1]
        
        # Find the label
        for s2, p2, o2 in g:
            if (str(s2) == str(s) and 
                str(p2).endswith('#label')):
                print(f"  {str(o2)} -> gfo:{class_uri}")
                break

=== All Fraud Activity Classes ===
Direct subclasses of FraudActivity:
  gfo:AssistanceFraud
  gfo:BeneficiaryFraud
  gfo:CellphoneFraud
  gfo:CharityFraud
  gfo:ConfidenceFraud
  gfo:ConsumerFraud
  gfo:ContractFraud
  gfo:CorporateFraud
  gfo:Corruption
  gfo:CyberEspionage
  gfo:Cyberextortion
  gfo:ElectionFraud
  gfo:EnvironmentalFraud
  gfo:FinancialInstitutionFraud
  gfo:GovernmentFurnishedEquipmentFraud
  gfo:GrantFraud
  gfo:HealthcareFraud
  gfo:HousingFraud
  gfo:IdentityFraud
  gfo:InsuranceFraud
  gfo:InvestmentFraud
  gfo:LaboratoryFraud
  gfo:LienFillingFraud
  gfo:LoanFraud
  gfo:MailFraud
  gfo:MediaManipulation
  gfo:PaymentFraud
  gfo:ProcurementFraud
  gfo:SanctionEvasion
  gfo:StudentFinancialAidFraud
  gfo:TaxFraud
  gfo:Trafficking
  gfo:VisaFraud
  gfo:WireFraud
  gfo:WorkplaceFraud
  gfo:public_emergency_fraud
  gfo:supervised_release

=== With Labels ===
  procurement fraud -> gfo:ProcurementFraud
  mail fraud -> gfo:MailFraud
  charity fraud -> gfo:CharityFra