# USDA Commodity Matcher Workflow

This notebook walks through intelligent commodity matching using fuzzy matching:
1. **Load Matcher**: Initialize the commodity matching module
2. **Show Commodities**: Display available USDA commodities
3. **Query Resources**: Find unmapped resources
4. **Test Fuzzy Matching**: See similarity scoring in action
5. **Apply Matches**: Store mappings in database
6. **Verify Results**: Confirm all mappings successful

**Goal**: Intelligently map all resources to USDA commodity codes ✓

## Step 1: Environment Setup

In [None]:
import os
import sys
import json
from pathlib import Path
from datetime import datetime
import pandas as pd
from sqlalchemy import create_engine, text
from difflib import SequenceMatcher

# Configure environment
# The checkout location defaults to the original hard-coded path; set the
# CA_BIOSITING_ROOT environment variable to run from a different checkout.
workspace_root = Path(os.environ.get('CA_BIOSITING_ROOT', r'c:\Users\meili\forked\ca-biositing'))
sys.path.insert(0, str(workspace_root / 'src' / 'ca_biositing' / 'pipeline'))
sys.path.insert(0, str(workspace_root / 'src' / 'ca_biositing' / 'datamodels'))
os.chdir(str(workspace_root))

from dotenv import load_dotenv
load_dotenv(workspace_root / '.env')

# Fail early with an actionable message instead of handing None to create_engine.
database_url = os.getenv('DATABASE_URL')
if not database_url:
    raise RuntimeError(
        f"DATABASE_URL is not set; define it in the environment or in {workspace_root / '.env'}"
    )
engine = create_engine(database_url)

# create_engine() does not open a connection by itself, so run a trivial query
# here; otherwise the "Database connected" message below would be unverified.
with engine.connect() as conn:
    conn.execute(text("SELECT 1"))

print("âœ“ Environment configured")
print(f"âœ“ Working directory: {os.getcwd()}")
print(f"âœ“ Database connected")

## Step 2: Display Available USDA Commodities

In [None]:
# Read the whole usda_commodity table, ordered by commodity_name. The frame is
# kept in `usda_commodities` for the cells that follow.
commodity_sql = text("""
            SELECT id, usda_code, commodity_name, description
            FROM usda_commodity
            ORDER BY commodity_name
        """)

with engine.connect() as conn:
    usda_commodities = pd.read_sql(commodity_sql, conn)

commodity_total = len(usda_commodities)
print(f"Available USDA Commodities ({commodity_total} total):")
print("=" * 60)
print(usda_commodities.to_string(index=False))

## Step 3: Query Resources Needing Mappings

In [None]:
# Resources that have no row in resource_usda_commodity_map: the LEFT JOIN
# leaves rum.resource_id NULL for exactly those resources.
unmapped_sql = text("""
            SELECT DISTINCT r.id, r.name as resource_name
            FROM resource r
            LEFT JOIN resource_usda_commodity_map rum ON r.id = rum.resource_id
            WHERE rum.resource_id IS NULL
            ORDER BY r.name
        """)

with engine.connect() as conn:
    unmapped_resources = pd.read_sql(unmapped_sql, conn)

unmapped_total = len(unmapped_resources)
preview_rows = 15  # number of rows shown before the "... and N more" line

print(f"Resources Needing Commodity Mapping: {unmapped_total} total")
print("=" * 60)

if unmapped_total == 0:
    print("âœ“ All resources already have commodity mappings!")
else:
    print(unmapped_resources.head(preview_rows).to_string(index=False))
    not_shown = unmapped_total - preview_rows
    if not_shown > 0:
        print(f"... and {not_shown} more")

## Step 4: Test Fuzzy Matching Algorithm

In [None]:
# Score the first unmapped resource against every USDA commodity name using
# difflib.SequenceMatcher on lower-cased strings, and show the best candidates.
if len(unmapped_resources) > 0:
    test_resource = unmapped_resources.iloc[0]['resource_name']
    test_id = unmapped_resources.iloc[0]['id']

    print(f"Testing Fuzzy Matching on: '{test_resource}'")
    print("="*60)

    # Calculate similarity scores.
    # The raw float ratio is stored (not a formatted string): sorting "12.3%"
    # style strings is lexicographic, which ranks "9.5%" above "85.0%".
    matches = []
    for _, commodity in usda_commodities.iterrows():
        similarity = SequenceMatcher(None,
                                     test_resource.lower(),
                                     commodity['commodity_name'].lower()).ratio()
        matches.append({
            'commodity_name': commodity['commodity_name'],
            'usda_code': commodity['usda_code'],
            'similarity': similarity
        })

    if matches:
        # Sort numerically, best match first
        matches_df = pd.DataFrame(matches).sort_values('similarity', ascending=False)

        # Format as a percentage only for display, after the ordering is fixed.
        top_matches = matches_df.head(5).assign(
            similarity=lambda frame: frame['similarity'].map("{:.1%}".format)
        )

        print(f"\nTop 5 Matches (sorted by similarity score):")
        print(top_matches.to_string(index=False))
        print(f"\nâœ“ Best match: {matches_df.iloc[0]['commodity_name']} ({matches_df.iloc[0]['similarity']:.1%})")
    else:
        # With no commodities there is no 'similarity' column to sort on.
        print("No USDA commodities available to match against")
else:
    print("âœ“ No unmapped resources to test")

## Step 5: Load/Create Pending Matches

In [None]:
# Pending matches are kept in a JSON file at the workspace root; start from an
# empty dict when the file does not exist yet.
pending_matches_file = workspace_root / '.usda_pending_matches.json'

if pending_matches_file.exists():
    # Pin the encoding so decoding does not depend on the platform's locale
    # default (e.g. cp1252 on Windows).
    with open(pending_matches_file, 'r', encoding='utf-8') as f:
        pending_matches = json.load(f)
    print(f"âœ“ Loaded {len(pending_matches)} existing pending matches")
    print(f"  From: {pending_matches_file}")
else:
    pending_matches = {}
    print(f"âœ“ Starting fresh (no existing matches)")
    print(f"  Will create: {pending_matches_file}")

print(f"\nPending matches tracked in: {pending_matches_file}")

## Step 6: View Pending Matches

In [None]:
# Summarise the pending matches: a count per status and the first three entries.
if not pending_matches:
    print("No pending matches yet. Run the matcher to create some.")
else:
    print(f"Pending Matches Summary ({len(pending_matches)} total):")
    print("=" * 60)

    # Collect every status once; entries without a 'status' key yield None and
    # therefore fall into none of the three buckets.
    statuses = [entry.get('status') for entry in pending_matches.values()]
    approved_count = statuses.count('approved')
    applied_count = statuses.count('applied')
    pending_count = statuses.count('pending')

    print(f"  Status breakdown:")
    print(f"    - Approved: {approved_count}")
    print(f"    - Applied: {applied_count}")
    print(f"    - Pending: {pending_count}")

    print(f"\n  Sample matches (first 3):")
    sample_entries = list(pending_matches.values())[:3]
    for position, match_info in enumerate(sample_entries, start=1):
        print(f"    {position}. {match_info.get('resource_name')} â†’ {match_info.get('commodity_name')} ({match_info.get('status')})")

## Step 7: Apply Approved Matches to Database

In [None]:
# Insert every match whose status is 'approved' into
# resource_usda_commodity_map, then report how many were applied, skipped
# (any other status) or failed.
print("Applying Approved Matches to Database:")
print("="*60)

applied_count = 0
skipped_count = 0
failed_count = 0

if len(pending_matches) == 0:
    print("No pending matches to apply.")
else:
    # Entries whose INSERT went through. They are only flagged 'applied' after
    # the commit succeeds, so a failed commit cannot leave the in-memory
    # statuses claiming rows that were never stored.
    inserted_matches = []

    with engine.connect() as conn:
        for resource_id, match_info in pending_matches.items():
            if match_info.get('status') != 'approved':
                skipped_count += 1
                continue

            resource_label = match_info.get('resource_name', resource_id)
            usda_code = match_info.get('usda_code')
            usda_row = None
            try:
                # Each match runs inside its own SAVEPOINT. Without it, one
                # failing statement can leave the shared transaction in an
                # aborted state (as PostgreSQL does) and make every later
                # match fail as well.
                with conn.begin_nested():
                    usda_row = conn.execute(
                        text("SELECT id FROM usda_commodity WHERE usda_code = :code"),
                        {'code': str(usda_code)}
                    ).fetchone()

                    if usda_row is not None:
                        conn.execute(
                            text("""
                                INSERT INTO resource_usda_commodity_map
                                (resource_id, usda_commodity_id, created_at)
                                VALUES (:resource_id, :usda_commodity_id, :created_at)
                                ON CONFLICT (resource_id, usda_commodity_id) DO NOTHING
                            """),
                            {
                                'resource_id': int(resource_id),
                                'usda_commodity_id': usda_row[0],
                                'created_at': datetime.now()
                            }
                        )
            except Exception as e:
                # Best-effort per match: report which resource failed and move on.
                failed_count += 1
                print(f"  âœ— Error applying {resource_label}: {str(e)[:200]}")
                continue

            if usda_row is None:
                # Previously this case was dropped without any message.
                failed_count += 1
                print(f"  âœ— {resource_label}: USDA code {usda_code!r} not found in usda_commodity")
                continue

            inserted_matches.append(match_info)
            print(f"  âœ“ {resource_label} â†’ {match_info.get('commodity_name')}")
            applied_count += 1

        conn.commit()

    # NOTE(review): these status changes live only in the in-memory
    # `pending_matches` dict; this cell does not write them back to the JSON
    # file - confirm that a later step persists them.
    applied_at = datetime.now().isoformat()
    for match_info in inserted_matches:
        match_info['status'] = 'applied'
        match_info['applied_at'] = applied_at

print(f"\nâœ“ Applied: {applied_count} mappings")
print(f"âŠ˜ Skipped: {skipped_count} unapproved matches")
if failed_count:
    print(f"  Failed: {failed_count} matches (see errors above)")

## Step 8: Verify Mappings in Database

In [None]:
print("Querying Resource â†” USDA Commodity Mappings:")
print("=" * 80)

# The 20 most recently created mappings, joined to the resource and commodity
# tables so the output shows names rather than ids.
recent_mappings_sql = text("""
            SELECT 
                r.name as resource_name,
                uc.commodity_name,
                uc.usda_code,
                rum.created_at
            FROM resource_usda_commodity_map rum
            JOIN resource r ON r.id = rum.resource_id
            JOIN usda_commodity uc ON uc.id = rum.usda_commodity_id
            ORDER BY rum.created_at DESC
            LIMIT 20
        """)

with engine.connect() as conn:
    result = pd.read_sql(recent_mappings_sql, conn)

if len(result) == 0:
    print("No mappings found in database yet.")
else:
    print(f"âœ“ Found {len(result)} mappings in database:")
    print(f"\n{result.to_string(index=False)}")

## Step 9: Summary Report

In [None]:
banner = "=" * 60

print("\n" + banner)
print("COMMODITY MATCHER - SUMMARY REPORT")
print(banner)

# Get statistics: total resources versus distinct resources that appear in the
# mapping table.
resource_count_sql = text("SELECT COUNT(*) FROM resource")
mapped_count_sql = text(
    "SELECT COUNT(DISTINCT resource_id) FROM resource_usda_commodity_map"
)
with engine.connect() as conn:
    total_resources = conn.execute(resource_count_sql).fetchone()[0]
    mapped_resources = conn.execute(mapped_count_sql).fetchone()[0]
unmapped_resources_count = total_resources - mapped_resources

print(f"\nResource Mapping Coverage:")
print(f"  Total Resources: {total_resources}")
print(f"  Mapped to USDA: {mapped_resources}")
print(f"  Awaiting Match: {unmapped_resources_count}")

# Coverage is only defined when there is at least one resource.
if total_resources > 0:
    coverage = (mapped_resources/total_resources*100)
    print(f"  Coverage: {coverage:.1f}%")

    # Pick the closing line for the coverage band, then print it once.
    if coverage == 100:
        verdict = f"\nðŸŽ‰ ALL RESOURCES MAPPED!"
    elif coverage >= 75:
        verdict = f"\nâœ“ Good coverage - {unmapped_resources_count} resources still need mapping"
    else:
        verdict = f"\nâš  {unmapped_resources_count} resources still need mapping"
    print(verdict)

print(f"\n{banner}")
print(f"âœ“ Commodity matching workflow complete!")
print(f"{banner}")

# USDA Commodity Matcher Workflow

This notebook walks through matching resources in the database to USDA commodity codes using fuzzy matching:
1. **Search**: Find matching USDA commodities for resources
2. **Review**: Inspect pending matches in JSON format
3. **Apply**: Store approved matches in the database
4. **Verify**: Confirm mappings are correct

**Goal**: Intelligently match all resources to USDA commodity codes for ETL integration.

## Step 1: Environment Setup

In [None]:
import os
import sys
from pathlib import Path
import json

# Configure PYTHONPATH
# The checkout location defaults to the original hard-coded path; set the
# CA_BIOSITING_ROOT environment variable to run from a different checkout.
workspace_root = Path(os.environ.get('CA_BIOSITING_ROOT', r'c:\Users\meili\forked\ca-biositing'))
sys.path.insert(0, str(workspace_root / 'src' / 'ca_biositing' / 'pipeline'))
sys.path.insert(0, str(workspace_root / 'src' / 'ca_biositing' / 'datamodels'))
sys.path.insert(0, str(workspace_root / 'src' / 'ca_biositing' / 'webservice'))

# Later cells build paths from workspace_root; the working directory is also
# switched to it here.
os.chdir(str(workspace_root))

# Load environment
from dotenv import load_dotenv
load_dotenv(workspace_root / '.env')

print("âœ“ Environment configured")
print(f"âœ“ Working directory: {os.getcwd()}")

## Step 2: Load Commodity Matcher Module

In [None]:
# Import the commodity matcher script
# It is loaded straight from its file path (it sits at the workspace root)
# and bound to the name `matcher`.
import importlib.util

matcher_path = workspace_root / 'match_usda_commodities.py'
if not matcher_path.is_file():
    raise FileNotFoundError(f"Commodity matcher script not found: {matcher_path}")

spec = importlib.util.spec_from_file_location("commodity_matcher", matcher_path)
# spec_from_file_location may return None when no loader fits the path.
if spec is None or spec.loader is None:
    raise ImportError(f"Could not build an import spec for {matcher_path}")
matcher = importlib.util.module_from_spec(spec)
spec.loader.exec_module(matcher)

print(f"âœ“ Commodity Matcher loaded from {matcher_path}")
print(f"\nAvailable functions:")
# Check the advertised functions against the loaded module so the listing
# cannot silently drift from the script; output is unchanged when all exist.
expected_functions = (
    'search_commodity',
    'load_pending_matches',
    'save_pending_matches',
    'apply_pending_matches_to_db',
)
for function_name in expected_functions:
    is_available = callable(getattr(matcher, function_name, None))
    missing_note = "" if is_available else "  [MISSING from module]"
    print(f"  - {function_name}(){missing_note}")

## Step 3: Display Available USDA Commodities

In [None]:
import pandas as pd
from sqlalchemy import create_engine, text

# Connect to database
engine = create_engine(os.getenv('DATABASE_URL'))

# Get all USDA commodities, ordered by commodity_name
commodity_sql = text("""
            SELECT id, usda_code, commodity_name, description
            FROM usda_commodity
            ORDER BY commodity_name
        """)

with engine.connect() as conn:
    usda_commodities = pd.read_sql(commodity_sql, conn)

# `result` stays bound to the same frame, as before.
result = usda_commodities

print(f"Available USDA Commodities ({len(usda_commodities)} total):")
print("=" * 60)
print(usda_commodities.to_string(index=False))

## Step 4: Query Resources to Match

In [None]:
# Get resources that need commodity mapping: the LEFT JOIN leaves
# rum.resource_id NULL for resources with no row in the mapping table.
unmapped_sql = text("""
            SELECT DISTINCT r.id, r.name as resource_name
            FROM resource r
            LEFT JOIN resource_usda_commodity_map rum ON r.id = rum.resource_id
            WHERE rum.resource_id IS NULL
            ORDER BY r.name
        """)

with engine.connect() as conn:
    unmapped_resources = pd.read_sql(unmapped_sql, conn)

# `result` stays bound to the same frame, as before.
result = unmapped_resources

unmapped_total = len(unmapped_resources)
preview_rows = 10  # number of rows shown before the "... and N more" line

print(f"Resources needing commodity mapping: {unmapped_total}")
print("=" * 60)

if unmapped_total == 0:
    print("âœ“ All resources already have commodity mappings!")
else:
    print(unmapped_resources.head(preview_rows).to_string(index=False))
    not_shown = unmapped_total - preview_rows
    if not_shown > 0:
        print(f"... and {not_shown} more")

## Step 5: Test Fuzzy Matching on Single Resource

In [None]:
from difflib import SequenceMatcher

# Test fuzzy matching on first unmapped resource: score its name against every
# USDA commodity name with difflib.SequenceMatcher on lower-cased strings.
if len(unmapped_resources) > 0:
    test_resource = unmapped_resources.iloc[0]['resource_name']
    test_id = unmapped_resources.iloc[0]['id']

    print(f"Testing Fuzzy Matching on: '{test_resource}'")
    print("="*60)

    # Calculate similarity scores (kept as floats so the sort is numeric)
    matches = []
    for _, commodity in usda_commodities.iterrows():
        similarity = SequenceMatcher(None,
                                     test_resource.lower(),
                                     commodity['commodity_name'].lower()).ratio()
        matches.append({
            'commodity_name': commodity['commodity_name'],
            'usda_code': commodity['usda_code'],
            'similarity': similarity
        })

    if matches:
        # Sort by similarity, best match first
        matches_df = pd.DataFrame(matches).sort_values('similarity', ascending=False)

        print(f"\nTop 5 matches:")
        print(matches_df.head(5).to_string(index=False))
        print(f"\nBest match: {matches_df.iloc[0]['commodity_name']} (score: {matches_df.iloc[0]['similarity']:.2%})")
    else:
        # An empty commodity table gives a frame with no 'similarity' column,
        # so sort_values would raise KeyError; report it instead.
        print("No USDA commodities available to match against")
else:
    print("âœ“ No unmapped resources to test")

## Step 6: Run Commodity Matcher on All Resources