# NER Discovery Testing Notebook

This notebook tests the "Potential New Organizations" discovery functionality from the Streamlit app.
It allows you to test the NER extraction logic, fuzzy matching, and filtering rules in isolation.

## Features:
1. Test NER extraction with the same model used in the app
2. Test fuzzy matching against database organizations
3. Examine filtering rules (generic terms, short terms, etc.)
4. Debug why organizations appear/disappear between runs
5. Test with sample documents

In [None]:
import pandas as pd
import numpy as np
import re
import json
import os
from typing import List, Dict, Tuple
import warnings
warnings.filterwarnings('ignore')

# Import the same libraries used in the Streamlit app
from transformers import AutoTokenizer, pipeline
from fuzzywuzzy import fuzz, process
import psycopg2
from dotenv import load_dotenv

# Load environment variables via python-dotenv so the POSTGRES_* settings read
# later with os.getenv() can be supplied without exporting them in the shell.
load_dotenv()

# Signal that the import/setup cell finished running.
print("Setup complete!")

## 1. Create a Simplified OrganizationExtractor Class for Testing

In [None]:
class TestOrganizationExtractor:
    """Simplified version of OrganizationExtractor for testing NER discovery"""
    
    def __init__(self, postgres_config: Dict[str, str] = None):
        self.min_confidence = 0.85
        self.min_org_length = 3
        
        # Initialize components
        self.master_orgs_df = pd.DataFrame()
        self.org_lookup = {}
        self.known_short_orgs = set()
        self.tokenizer = None
        self.ner_model = None
        
        # Initialize system
        self._initialize_system(postgres_config)
    
    def _initialize_system(self, postgres_config):
        """Initialize system components"""
        try:
            # Initialize tokenizer
            self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
            
            # Load NER model (same as in the app)
            self.ner_model = pipeline(
                "ner",
                model="dbmdz/bert-large-cased-finetuned-conll03-english",
                aggregation_strategy="simple",
                device=-1
            )
            
            # Load organizations from database or use sample data
            if postgres_config:
                self._load_organizations_from_db(postgres_config)
            else:
                self._load_sample_organizations()
            
            self._build_lookup()
            print(f"Initialized with {len(self.master_orgs_df)} organizations in database")
            print(f"Created {len(self.org_lookup)} lookup entries")
            
        except Exception as e:
            print(f"System initialization failed: {e}")
    
    def _load_organizations_from_db(self, postgres_config):
        """Load organizations from database"""
        try:
            conn = psycopg2.connect(
                host=postgres_config['host'],
                port=postgres_config.get('port', 5432),
                database=postgres_config['database'],
                user=postgres_config['user'],
                password=postgres_config['password']
            )
            
            query = """
            SELECT org_id, org_name
            FROM verdantix.org
            WHERE org_name IS NOT NULL 
                AND LENGTH(TRIM(org_name)) > 2
            ORDER BY org_name
            """
            
            self.master_orgs_df = pd.read_sql_query(query, conn)
            conn.close()
            
        except Exception as e:
            print(f"Database connection failed: {e}")
            print("Using sample data instead...")
            self._load_sample_organizations()
    
    def _load_sample_organizations(self):
        """Load sample organizations for testing"""
        sample_orgs = [
            {'org_id': 1, 'org_name': 'Microsoft Corporation'},
            {'org_id': 2, 'org_name': 'Apple Inc.'},
            {'org_id': 3, 'org_name': 'Google LLC'},
            {'org_id': 4, 'org_name': 'Amazon.com Inc.'},
            {'org_id': 5, 'org_name': 'Meta Platforms Inc.'},
            {'org_id': 6, 'org_name': 'Tesla Inc.'},
            {'org_id': 7, 'org_name': 'NVIDIA Corporation'},
            {'org_id': 8, 'org_name': 'JPMorgan Chase & Co.'},
            {'org_id': 9, 'org_name': 'Johnson & Johnson'},
            {'org_id': 10, 'org_name': 'Berkshire Hathaway Inc.'},
            {'org_id': 11, 'org_name': 'Alphabet Inc.'},
            {'org_id': 12, 'org_name': 'Salesforce Inc.'},
            {'org_id': 13, 'org_name': 'Oracle Corporation'},
            {'org_id': 14, 'org_name': 'IBM'},
            {'org_id': 15, 'org_name': 'Intel'},
        ]
        
        self.master_orgs_df = pd.DataFrame(sample_orgs)
    
    def _generate_aliases(self, org_name: str) -> List[str]:
        """Generate aliases for organization names (same as in app)"""
        aliases = []
        
        # Extract acronyms from parentheses
        paren_pattern = r'\(([^)]+)\)'
        paren_matches = re.findall(paren_pattern, org_name)
        for match in paren_matches:
            cleaned = match.strip()
            if 2 <= len(cleaned) <= 10:
                aliases.append(cleaned)
        
        # Remove business suffixes
        suffix_patterns = [
            r'\s+(?:Inc\.?|Corporation|Corp\.?|Company|Co\.?|Limited|Ltd\.?)',
            r'\s+(?:LLC|LLP|LP|PLC|Group|Holdings?)'
        ]
        
        for pattern in suffix_patterns:
            base = re.sub(pattern + r'$', '', org_name, flags=re.IGNORECASE).strip()
            if base != org_name and len(base) > 2:
                aliases.append(base)
        
        # Create acronyms from main text
        main_text = re.sub(paren_pattern, '', org_name).strip()
        words = main_text.split()
        if len(words) > 1:
            skip_words = {'of', 'the', 'and', 'for', 'in', 'on', 'at', 'to', 'a', 'an', '&'}
            meaningful_words = [w for w in words if w.lower() not in skip_words and len(w) > 0]
            if len(meaningful_words) > 1:
                acronym = ''.join([w[0].upper() for w in meaningful_words])
                if 2 <= len(acronym) <= 8:
                    aliases.append(acronym)
        
        return list(set(aliases))
    
    def _build_lookup(self):
        """Build organization lookup dictionary (same as in app)"""
        self.org_lookup = {}
        self.known_short_orgs = set()
        
        for _, org in self.master_orgs_df.iterrows():
            org_id = org['org_id']
            org_name = str(org['org_name']).strip()
            
            if len(org_name) > 2:
                self.org_lookup[org_name.lower()] = {
                    'org_id': org_id,
                    'canonical': org_name,
                    'confidence': 1.0
                }
                
                # Track short orgs from database
                if len(org_name) < 8 and ' ' not in org_name:
                    self.known_short_orgs.add(org_name.lower())
                
                # Add aliases
                aliases = self._generate_aliases(org_name)
                for alias in aliases:
                    if alias.lower() not in self.org_lookup and len(alias) > 2:
                        self.org_lookup[alias.lower()] = {
                            'org_id': org_id,
                            'canonical': org_name,
                            'confidence': 0.85
                        }
                        
                        if len(alias) < 8 and ' ' not in alias:
                            self.known_short_orgs.add(alias.lower())
    
    def _is_generic_term(self, term: str) -> bool:
        """Filter generic business terms (same as in app)"""
        generic_terms = {
            'ai', 'iot', 'esg', 'api', 'cloud', 'data', 'tech', 'digital',
            'smart', 'green', 'cyber', 'auto', 'bio', 'blockchain', 'fintech',
            'saas', 'crm', 'erp', 'hr', 'it', 'covid', 'gdpr'
        }
        return term.lower() in generic_terms
    
    def extract_organizations_debug(self, text: str, debug=True) -> Tuple[List[Dict], List[Dict], Dict]:
        """Extract organizations with detailed debugging info"""
        
        debug_info = {
            'text_length': len(text),
            'ner_predictions': [],
            'db_exact_matches': [],
            'fuzzy_matches': [],
            'filtered_out': [],
            'final_discoveries': []
        }
        
        db_matches = []
        ner_discoveries = []
        
        # Method 1: Database matches (exact)
        text_lower = text.lower()
        sorted_terms = sorted(self.org_lookup.keys(), key=len, reverse=True)
        matched_positions = set()
        
        for term in sorted_terms:
            if (len(term) >= self.min_org_length and 
                not self._is_generic_term(term)):
                
                # Short term filtering
                if len(term) < 8 and ' ' not in term:
                    if term.lower() not in self.known_short_orgs:
                        if debug:
                            debug_info['filtered_out'].append({
                                'term': term,
                                'reason': 'short_term_not_in_known_list'
                            })
                        continue
                
                pattern = r'\b' + re.escape(term) + r'\b'
                
                for match in re.finditer(pattern, text_lower):
                    start, end = match.span()
                    
                    if not any(start < e and s < end for s, e in matched_positions):
                        matched_positions.add((start, end))
                        
                        org_info = self.org_lookup[term]
                        
                        match_data = {
                            'text': text[start:end],
                            'canonical': org_info['canonical'],
                            'confidence': org_info['confidence'],
                            'org_id': org_info['org_id'],
                            'method': 'database'
                        }
                        
                        db_matches.append(match_data)
                        
                        if debug:
                            debug_info['db_exact_matches'].append(match_data)
        
        # Method 2: NER for new organizations
        if self.ner_model:
            try:
                if len(text) > 4000:
                    chunks = [text[i:i+4000] for i in range(0, len(text), 3500)]
                else:
                    chunks = [text]
                
                for chunk in chunks:
                    predictions = self.ner_model(chunk)
                    
                    if debug:
                        debug_info['ner_predictions'].extend(predictions)
                    
                    for pred in predictions:
                        if 'ORG' in pred.get('entity_group', ''):
                            org_text = pred['word'].strip()
                            
                            # Clean tokenization artifacts
                            org_text = re.sub(r'^##', '', org_text)
                            org_text = re.sub(r'[^\w\s&.-]', '', org_text)
                            org_text = ' '.join(org_text.split())
                            
                            if (len(org_text) >= self.min_org_length and
                                pred['score'] >= self.min_confidence and
                                not self._is_generic_term(org_text)):
                                
                                # Check if already in database
                                if org_text.lower() not in self.org_lookup:
                                    # Try fuzzy matching with consistent results
                                    if len(self.master_orgs_df) > 0:
                                        canonical_names = self.master_orgs_df['org_name'].tolist()
                                        
                                        # Sort for deterministic results
                                        canonical_names = sorted(canonical_names)
                                        
                                        best_match = process.extractOne(
                                            org_text, canonical_names, scorer=fuzz.ratio
                                        )
                                        
                                        fuzzy_score = best_match[1] if best_match else 0
                                        
                                        fuzzy_info = {
                                            'org_text': org_text,
                                            'best_match': best_match[0] if best_match else None,
                                            'fuzzy_score': fuzzy_score,
                                            'ner_confidence': pred['score']
                                        }
                                        
                                        if debug:
                                            debug_info['fuzzy_matches'].append(fuzzy_info)
                                        
                                        if best_match and best_match[1] >= 85:
                                            # Close match found - add to DB matches
                                            matched_row = self.master_orgs_df[
                                                self.master_orgs_df['org_name'] == best_match[0]
                                            ].iloc[0]
                                            
                                            fuzzy_match_data = {
                                                'text': org_text,
                                                'canonical': matched_row['org_name'],
                                                'confidence': pred['score'] * (best_match[1] / 100),
                                                'org_id': matched_row['org_id'],
                                                'method': 'ner_fuzzy',
                                                'fuzzy_score': fuzzy_score
                                            }
                                            
                                            db_matches.append(fuzzy_match_data)
                                        else:
                                            # New organization discovery
                                            discovery_data = {
                                                'text': org_text,
                                                'confidence': pred['score'],
                                                'method': 'ner_new',
                                                'best_fuzzy_match': best_match[0] if best_match else None,
                                                'fuzzy_score': fuzzy_score
                                            }
                                            
                                            ner_discoveries.append(discovery_data)
                                            
                                            if debug:
                                                debug_info['final_discoveries'].append(discovery_data)
                                else:
                                    if debug:
                                        debug_info['filtered_out'].append({
                                            'term': org_text,
                                            'reason': 'already_in_database'
                                        })
                            else:
                                if debug:
                                    reasons = []
                                    if len(org_text) < self.min_org_length:
                                        reasons.append('too_short')
                                    if pred['score'] < self.min_confidence:
                                        reasons.append('low_confidence')
                                    if self._is_generic_term(org_text):
                                        reasons.append('generic_term')
                                    
                                    debug_info['filtered_out'].append({
                                        'term': org_text,
                                        'reason': ', '.join(reasons),
                                        'confidence': pred['score']
                                    })
            
            except Exception as e:
                if debug:
                    debug_info['error'] = str(e)
        
        # Deduplicate
        db_matches = self._deduplicate_matches(db_matches)
        ner_discoveries = self._deduplicate_matches(ner_discoveries)
        
        return db_matches, ner_discoveries, debug_info
    
    def _deduplicate_matches(self, matches: List[Dict]) -> List[Dict]:
        """Remove duplicate matches (same as in app)"""
        seen = set()
        deduplicated = []
        
        for match in sorted(matches, key=lambda x: x['confidence'], reverse=True):
            canonical = match.get('canonical', match['text']).lower()
            if canonical not in seen:
                seen.add(canonical)
                deduplicated.append(match)
        
        return deduplicated

# Confirmation that the class-definition cell ran to completion
print("TestOrganizationExtractor class created!")

## 2. Initialize the Test Extractor

In [None]:
# Build the PostgreSQL settings from environment variables. Host and user have
# non-empty defaults, so in practice a missing POSTGRES_PASSWORD is what makes
# the notebook fall back to the sample organizations.
postgres_config = {
    'host': os.getenv('POSTGRES_HOST', 'localhost'),
    'port': os.getenv('POSTGRES_PORT', '5432'),
    'database': os.getenv('POSTGRES_DATABASE', 'postgres'),
    'user': os.getenv('POSTGRES_USER', 'postgres'),
    'password': os.getenv('POSTGRES_PASSWORD', '')
}

# This only checks that credentials are present. The connection itself is
# attempted inside the extractor, which falls back to sample data on failure.
if not all([postgres_config['host'], postgres_config['user'], postgres_config['password']]):
    print("Database credentials not complete, using sample data")
    postgres_config = None

# Initialize extractor
print("Initializing test extractor...")
extractor = TestOrganizationExtractor(postgres_config)
print("\nExtractor ready!")
print(f"Database organizations loaded: {len(extractor.master_orgs_df)}")
print(f"Lookup entries created: {len(extractor.org_lookup)}")
print(f"Known short orgs: {len(extractor.known_short_orgs)}")

# Initialization errors are only printed by the extractor, so call out the
# states that would make every later cell report empty results.
if extractor.ner_model is None:
    print("WARNING: NER model is not loaded - no NER discoveries will be produced")
if len(extractor.master_orgs_df) == 0:
    print("WARNING: no organizations loaded - database matching will find nothing")

## 3. Test Sample Texts

In [None]:
# Sample test texts - mix of known and unknown organizations.
# "Known" means present in the extractor's organization list (for the sample
# data: Microsoft, Apple, Google, ...); the other names are made up so they can
# exercise the new-organization path. What NER actually returns for them
# depends on the model.
test_texts = [
    # Text with mix of known and potentially unknown orgs
    "Microsoft Corporation announced a partnership with Acme Digital Solutions to develop AI capabilities. The collaboration also involves DataTech Innovations and CloudFlow Systems.",
    
    # Text with variations of known company names
    "Apple Inc reported strong earnings, outperforming Google and Meta. Tesla's stock price also rose following the announcement.",
    
    # Text with potentially new organizations
    "Quantum Computing Solutions partnered with BioTech Laboratories to advance medical research. The initiative is supported by Green Energy Partners and Smart City Technologies.",
    
    # Text with short terms and acronyms
    "IBM and AWS are competing with GCP in the cloud market. The CEO of AMD commented on the partnership between HPE and Dell.",
    
    # Complex text with many organizations
    "The merger between Innovative Software Corp and Digital Transformation Ltd was approved by RegTech Compliance Services. This follows recent acquisitions by Enterprise Solutions Group and Data Analytics Partners."
]

# Show a 100-character preview of each sample so the cell output stays short
print(f"Created {len(test_texts)} test texts for analysis")
for i, text in enumerate(test_texts, 1):
    print(f"\nText {i}: {text[:100]}...")

## 4. Run NER Discovery Tests

In [None]:
def analyze_text(text, title=""):
    """Analyze a single text with the module-level ``extractor`` and print a report.

    The report has one section each for the summary counts, database matches,
    potential new organizations, fuzzy-matching details and filtered-out terms
    (first 10 only). If the extractor recorded an NER error it is printed
    before the summary.

    Args:
        text: Text to analyse.
        title: Label shown in the report header.

    Returns:
        The ``(db_matches, ner_discoveries, debug_info)`` tuple returned by
        ``extractor.extract_organizations_debug(text, debug=True)``.
    """
    print(f"\n{'='*60}")
    print(f"ANALYZING: {title}")
    print(f"{'='*60}")
    print(f"TEXT: {text}")
    print(f"Length: {len(text)} characters")

    # Run extraction with debug info
    db_matches, ner_discoveries, debug_info = extractor.extract_organizations_debug(text, debug=True)

    # A failed NER pass otherwise looks like "nothing found"
    if debug_info.get('error'):
        print(f"\nNER ERROR: {debug_info['error']}")

    print(f"\n📊 SUMMARY:")
    print(f"  Database Matches: {len(db_matches)}")
    print(f"  New Discoveries: {len(ner_discoveries)}")
    print(f"  Total NER Predictions: {len(debug_info['ner_predictions'])}")
    print(f"  Filtered Out: {len(debug_info['filtered_out'])}")

    # Show database matches
    if db_matches:
        print(f"\n✅ DATABASE MATCHES ({len(db_matches)}):")
        for i, match in enumerate(db_matches, 1):
            method_icon = "🎯" if match['method'] == 'database' else "🔍"
            print(f"  {i}. {method_icon} '{match['text']}' → {match['canonical']} (conf: {match['confidence']:.3f}, method: {match['method']})")
            if 'fuzzy_score' in match:
                print(f"      Fuzzy score: {match['fuzzy_score']}%")

    # Show new discoveries
    if ner_discoveries:
        print(f"\n🔍 POTENTIAL NEW ORGANIZATIONS ({len(ner_discoveries)}):")
        for i, discovery in enumerate(ner_discoveries, 1):
            print(f"  {i}. 🆕 '{discovery['text']}' (conf: {discovery['confidence']:.3f})")
            if discovery.get('fuzzy_score', 0) > 0:
                print(f"      Closest match: '{discovery['best_fuzzy_match']}' ({discovery['fuzzy_score']}%)")
            else:
                print(f"      No close matches found")

    # Show fuzzy matching details
    if debug_info['fuzzy_matches']:
        print(f"\n🔍 FUZZY MATCHING DETAILS:")
        for fuzzy in debug_info['fuzzy_matches']:
            status = "✅ Matched (≥85%)" if fuzzy['fuzzy_score'] >= 85 else "❌ No match (<85%)"
            print(f"  '{fuzzy['org_text']}' → '{fuzzy['best_match']}' ({fuzzy['fuzzy_score']}%) {status}")

    # Show filtered out items
    if debug_info['filtered_out']:
        print(f"\n❌ FILTERED OUT ({len(debug_info['filtered_out'])}):")
        for filtered in debug_info['filtered_out'][:10]:  # Show first 10
            # Only NER-filtered entries carry a confidence value
            conf_str = f" (conf: {filtered['confidence']:.3f})" if 'confidence' in filtered else ""
            print(f"  '{filtered['term']}' - {filtered['reason']}{conf_str}")
        if len(debug_info['filtered_out']) > 10:
            print(f"  ... and {len(debug_info['filtered_out']) - 10} more")

    return db_matches, ner_discoveries, debug_info

# Analyse every sample text and keep each outcome for the summary cell
all_results = []
for i, text in enumerate(test_texts, start=1):
    db_matches, ner_discoveries, debug_info = analyze_text(text, f"Test Text {i}")
    all_results.append(dict(
        text=text,
        db_matches=db_matches,
        ner_discoveries=ner_discoveries,
        debug_info=debug_info,
    ))

## 5. Test Consistency - Run Same Text Multiple Times

In [None]:
def test_consistency(text, runs=3):
    """Test if the same text produces consistent results across multiple runs"""
    print(f"\n{'='*60}")
    print(f"CONSISTENCY TEST - {runs} RUNS")
    print(f"{'='*60}")
    print(f"TEXT: {text[:100]}...")
    
    results = []
    for run in range(runs):
        print(f"\n--- Run {run + 1} ---")
        db_matches, ner_discoveries, debug_info = extractor.extract_organizations_debug(text, debug=False)
        
        run_result = {
            'run': run + 1,
            'db_matches': len(db_matches),
            'ner_discoveries': len(ner_discoveries),
            'db_orgs': [m['canonical'] for m in db_matches],
            'new_orgs': [d['text'] for d in ner_discoveries],
            'fuzzy_matches': [m for m in db_matches if m.get('method') == 'ner_fuzzy']
        }
        
        results.append(run_result)
        
        print(f"  DB matches: {run_result['db_matches']}")
        print(f"  New discoveries: {run_result['ner_discoveries']}")
        print(f"  Fuzzy matches: {len(run_result['fuzzy_matches'])}")
        
        if run_result['new_orgs']:
            print(f"  New orgs: {run_result['new_orgs']}")
    
    # Check consistency
    print(f"\nüìä CONSISTENCY ANALYSIS:")
    
    # Check if counts are consistent
    db_counts = [r['db_matches'] for r in results]
    discovery_counts = [r['ner_discoveries'] for r in results]
    
    db_consistent = len(set(db_counts)) == 1
    discovery_consistent = len(set(discovery_counts)) == 1
    
    print(f"  DB match counts: {db_counts} - {'‚úÖ Consistent' if db_consistent else '‚ùå Inconsistent'}")
    print(f"  Discovery counts: {discovery_counts} - {'‚úÖ Consistent' if discovery_consistent else '‚ùå Inconsistent'}")
    
    # Check if specific organizations are consistent
    all_db_orgs = set()
    all_new_orgs = set()
    
    for r in results:
        all_db_orgs.update(r['db_orgs'])
        all_new_orgs.update(r['new_orgs'])
    
    # Check for organizations that appear in some runs but not others
    inconsistent_db = []
    inconsistent_new = []
    
    for org in all_db_orgs:
        appearances = sum(1 for r in results if org in r['db_orgs'])
        if appearances != runs:
            inconsistent_db.append((org, appearances))
    
    for org in all_new_orgs:
        appearances = sum(1 for r in results if org in r['new_orgs'])
        if appearances != runs:
            inconsistent_new.append((org, appearances))
    
    if inconsistent_db:
        print(f"\n‚ùå INCONSISTENT DB MATCHES:")
        for org, count in inconsistent_db:
            print(f"  '{org}' appeared in {count}/{runs} runs")
    
    if inconsistent_new:
        print(f"\n‚ùå INCONSISTENT NEW DISCOVERIES:")
        for org, count in inconsistent_new:
            print(f"  '{org}' appeared in {count}/{runs} runs")
    
    if not inconsistent_db and not inconsistent_new:
        print(f"\n‚úÖ ALL RESULTS PERFECTLY CONSISTENT!")
    
    return results

# Test consistency on the first test text: five repeated extractions with the
# same extractor instance; the per-run summaries are kept in consistency_results
consistency_results = test_consistency(test_texts[0], runs=5)

## 6. Test Fuzzy Matching Behavior

In [None]:
def test_fuzzy_matching():
    """Test fuzzy matching behavior with edge cases"""
    print(f"\n{'='*60}")
    print(f"FUZZY MATCHING TESTS")
    print(f"{'='*60}")
    
    # Test cases with variations of known company names
    fuzzy_test_cases = [
        # Slight variations that should match
        "Microsoft Corp announced new products",
        "Apple Computer reported strong earnings", 
        "Google Inc is expanding globally",
        "Amazon Web Services launched new features",
        
        # Edge cases near the 85% threshold
        "Microsft Corporation has a typo",  # Typo
        "Appple Inc has double p",  # Typo
        "Micros Corp is very short",  # Shortened
        "Apple Technology Inc",  # Added words
        
        # Cases that shouldn't match
        "Microsoft Technologies Solutions Group",  # Too different
        "Apple Fruit Company",  # Different context
    ]
    
    for i, test_case in enumerate(fuzzy_test_cases, 1):
        print(f"\nTest {i}: {test_case}")
        
        db_matches, ner_discoveries, debug_info = extractor.extract_organizations_debug(test_case, debug=True)
        
        # Show what was found by NER first
        ner_orgs = [pred['word'].strip() for pred in debug_info['ner_predictions'] if 'ORG' in pred.get('entity_group', '')]
        print(f"  NER found: {ner_orgs}")
        
        # Show fuzzy matching results
        for fuzzy in debug_info['fuzzy_matches']:
            result = "‚Üí DB MATCH" if fuzzy['fuzzy_score'] >= 85 else "‚Üí NEW ORG"
            print(f"  '{fuzzy['org_text']}' ~ '{fuzzy['best_match']}' ({fuzzy['fuzzy_score']}%) {result}")
        
        # Show final categorization
        if db_matches:
            fuzzy_matches = [m for m in db_matches if m.get('method') == 'ner_fuzzy']
            if fuzzy_matches:
                print(f"  ‚úÖ Fuzzy matched to DB: {[m['canonical'] for m in fuzzy_matches]}")
        
        if ner_discoveries:
            print(f"  üÜï New discoveries: {[d['text'] for d in ner_discoveries]}")
        
        if not db_matches and not ner_discoveries:
            print(f"  ‚ùå No organizations found")

# Run the fuzzy-matching edge cases against the extractor created earlier
test_fuzzy_matching()

## 7. Interactive Testing

In [None]:
def interactive_test():
    """Prompt for text in a loop and analyse each entry with ``analyze_text``.

    The loop ends when the user enters ``quit``, ``exit`` or ``q``
    (case-insensitive), interrupts the kernel, or when ``input()`` itself
    fails (end of input, or a front-end without stdin support). Empty input is
    ignored. An error raised while analysing an entry is printed and the loop
    continues with the next prompt.
    """
    print("\n🧪 INTERACTIVE TESTING")
    print("Enter text to analyze (or 'quit' to stop):")

    while True:
        # Reading input is guarded separately from the analysis: if input()
        # cannot work it will fail on every iteration, so stop instead of
        # looping forever.
        try:
            user_text = input("\nEnter text: ").strip()
        except (KeyboardInterrupt, EOFError):
            print("\nGoodbye!")
            break
        except Exception as e:
            print(f"Error: {e}")
            break

        if user_text.lower() in ['quit', 'exit', 'q']:
            print("Goodbye!")
            break

        if not user_text:
            continue

        try:
            analyze_text(user_text, "Interactive Test")
        except KeyboardInterrupt:
            print("\nGoodbye!")
            break
        except Exception as e:
            print(f"Error: {e}")

# Uncomment the line below to run interactive testing
# interactive_test()

## 8. Summary and Insights

In [None]:
def create_summary():
    """Print the extractor configuration, aggregate test results and debugging tips.

    Reads the module-level ``extractor`` (thresholds and loaded data sizes)
    and ``all_results`` (one entry per analysed sample text, each with
    ``db_matches`` and ``ner_discoveries`` lists). Nothing is returned.
    """
    print(f"\n{'='*60}")
    print(f"TESTING SUMMARY & INSIGHTS")
    print(f"{'='*60}")

    print(f"\n📊 EXTRACTOR CONFIGURATION:")
    print(f"  Min confidence: {extractor.min_confidence}")
    print(f"  Min org length: {extractor.min_org_length}")
    print(f"  Fuzzy match threshold: 85%")
    print(f"  Database orgs: {len(extractor.master_orgs_df)}")
    print(f"  Lookup entries: {len(extractor.org_lookup)}")
    print(f"  Known short orgs: {len(extractor.known_short_orgs)}")

    # Analyze all test results
    total_db_matches = sum(len(r['db_matches']) for r in all_results)
    total_discoveries = sum(len(r['ner_discoveries']) for r in all_results)
    # Fuzzy matches are the database matches that came from the NER pass
    total_fuzzy_matches = sum(
        1
        for r in all_results
        for m in r['db_matches']
        if m.get('method') == 'ner_fuzzy'
    )

    print(f"\n📈 TEST RESULTS ACROSS {len(all_results)} TEXTS:")
    print(f"  Total DB matches: {total_db_matches}")
    print(f"  Total new discoveries: {total_discoveries}")
    print(f"  Fuzzy matches: {total_fuzzy_matches}")

    # Show all unique new discoveries
    all_discoveries = set()
    for r in all_results:
        all_discoveries.update(d['text'] for d in r['ner_discoveries'])

    if all_discoveries:
        print(f"\n🆕 ALL UNIQUE NEW DISCOVERIES ({len(all_discoveries)}):")
        for discovery in sorted(all_discoveries):
            print(f"  • {discovery}")

    print(f"\n💡 KEY INSIGHTS FOR DEBUGGING STREAMLIT APP:")
    print(f"  1. Check if fuzzy matching is causing inconsistency (orgs moving between categories)")
    print(f"  2. Verify database connection and org loading")
    print(f"  3. Look for session state issues with approved/rejected lists")
    print(f"  4. Test with your actual documents to see real-world behavior")
    print(f"  5. Monitor fuzzy scores near the 85% threshold for edge cases")

    print(f"\n🔧 RECOMMENDATIONS:")
    print(f"  • Add fuzzy score logging in Streamlit app debug mode")
    print(f"  • Consider making fuzzy threshold configurable")
    print(f"  • Add consistency checks for multiple runs")
    print(f"  • Monitor session state variables more carefully")

# Print the summary; relies on all_results filled in by the "Run NER Discovery Tests" cell
create_summary()

## 9. Test with Your Own Text

Use the cell below to test with your own text or document content:

In [None]:
# Replace the placeholder below with your own test text
your_text = """
Paste your document text here to test the NER discovery functionality.
This could be content from a PDF, Word doc, or any other document you're 
testing in the Streamlit app.
"""

# The analysis only runs once the placeholder has been replaced: the text must
# be non-empty and must no longer contain the phrase "Paste your document".
if your_text.strip() and "Paste your document" not in your_text:
    print("Testing your custom text...")
    db_matches, ner_discoveries, debug_info = analyze_text(your_text, "Your Custom Text")
else:
    print("Replace the text above with your own content to test!")