In [1]:
import os
import sys
# Make the project root (parent of the notebook's working directory) and its
# src/init package directory importable from this notebook.
current_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(current_dir, os.pardir))
paths_to_add = [
    project_root,
    os.path.join(project_root, "src", "init"),
]
for path in paths_to_add:
    # Skip entries that are already registered so re-running the cell is harmless
    if path in sys.path:
        continue
    sys.path.append(path)

In [2]:
import psycopg2
from contextlib import contextmanager

@contextmanager
def get_db_connection(connection_string):
    """
    Open a PostgreSQL connection for the duration of a ``with`` block.

    Args:
        connection_string: PostgreSQL connection string handed to psycopg2.connect

    Yields:
        The connection returned by psycopg2.connect. It is closed when the
        ``with`` block exits, whether normally or through an exception.
    """
    # If connect() itself raises there is nothing to close, so it sits outside the try
    connection = psycopg2.connect(connection_string)
    try:
        yield connection
    finally:
        if connection:
            connection.close()

def execute_sql_query(query, connection_string, params=None):
    """
    Execute a SQL query and return results.

    Args:
        query: SQL query string to execute. Prefer placeholders (``%s`` or
            ``%(name)s``) together with ``params`` over formatting values
            into the string yourself.
        connection_string: PostgreSQL connection string
        params: Optional sequence or mapping of values bound to the
            placeholders in ``query``. Defaults to None (no binding), which
            keeps the original two-argument call working unchanged.

    Returns:
        List of results (each row as a tuple), or None if query fails

    Note:
        This is a best-effort helper: any exception (connection failure, SQL
        error, statement that returns no rows) is printed and reported as
        None rather than raised.
    """
    try:
        with get_db_connection(connection_string) as conn:
            # The cursor context manager closes the cursor even if execute/fetch raises
            with conn.cursor() as cursor:
                cursor.execute(query, params)
                return cursor.fetchall()
    except Exception as e:
        print(f"Error executing query: {e}")
        return None

In [3]:
# Sanity-check the business glossary before it is used for term search.
# Judging by the recorded output, the check reports whether every term referenced in synonyms exists in key_terms.
from business_glossary import check_glossary_consistency
check_glossary_consistency()

✅ All terms in synonyms exist in key_terms


In [12]:
from src.init.business_glossary import key_terms, synonyms, related_terms
from difflib import get_close_matches

def _question_mentions(term, question_lower, question_words, cutoff=0.85):
    """
    Tell whether a (lower-cased) glossary term is mentioned in the user question.

    A term counts as mentioned when it is a verbatim substring of the question, or
    when it fuzzy-matches with difflib similarity >= cutoff:
      - one-word terms are compared against each word of the question;
      - multi-word terms are compared against every run of consecutive question
        words that has the same number of words as the term.

    Args:
        term: lower-cased term or phrase to look for.
        question_lower: the lower-cased user question.
        question_words: question_lower split on whitespace.
        cutoff: minimum similarity passed to difflib.get_close_matches.

    Returns:
        True if the term is mentioned, False otherwise.
    """
    if not term.strip():
        # '' is a substring of every string; a blank glossary entry must never match
        return False

    # Keyword lookup (exact substring match)
    if term in question_lower:
        return True

    width = len(term.split())
    if width > 1:
        # Slide a window of `width` words over the question and fuzzy-match each phrase
        windows = (
            ' '.join(question_words[start:start + width])
            for start in range(len(question_words) - width + 1)
        )
        return any(
            get_close_matches(term, [window], n=1, cutoff=cutoff)
            for window in windows
        )

    # Single word term - fuzzy match directly against the question's words
    return bool(get_close_matches(term, question_words, n=1, cutoff=cutoff))


def search_terms(user_question, key_terms, synonyms, related_terms):
    """
    Searches for key terms, synonyms, and related terms in a user question.
    Uses keyword lookup and fuzzy matching.

    Args:
        user_question: the question text typed by the user.
        key_terms: list of glossary term dicts; each has a 'name' and may have an
            'exists_in_database' flag (treated as False when absent).
        synonyms: dict mapping a synonym word/phrase to the name of a key term
            (underscores in the key term name are treated as spaces).
        related_terms: list of groups (lists) of related term names
            (underscores are treated as spaces).

    Returns:
        A dict with the following keys:

        - key_terms: key terms that exist in the database, including the ones reached
          through a synonym or a related term. 1:1 pull from business_glossary as a
          list of dictionaries.

        - synonym_searched_for: the synonym (as spelled in the glossary) that was
          found in the user question. String or None.

        - synonym: the key term the synonym points to. 1:1 as a dict from key_terms.
          None if no synonym with a database-backed key term was found.

        - synonym_exists_in_db (bool): True if such a synonym was found.

        - synonym_docu: (for objects_documentation) "<term_1> is synonym with <term_2>"
          or None, where <term_1> is the synonym found in the user question and
          <term_2> is the key term that exists in the database. Only the first
          matching synonym is reported.

        - related_term_searched_for: the term (as spelled in the glossary's related
          terms) found in the user question for which 1 or more related terms exist
          in the db. When several groups match, this is the term of the first group.
          String or None.

        - related_term_exists_in_db: True if exactly 1 related term exists in DB.

        - related_terms: the related terms found in the db: a single dict from
          key_terms when exactly 1 was found, a list of dicts when more than 1 was
          found, None otherwise.

        - related_terms_exists_in_db: True if > 1 related term exists in DB.

        - related_terms_docu: (for objects_documentation)
          "<term_1> is related (similar but different) with: <term_2>, <term_3> etc",
          where <term_1> is the term found in the user question and
          <term_2> / <term_3> are related terms that exist in db. If multiple terms
          from the user question have related terms, each is on a new line with the
          format above. None if nothing was found.
    """
    question_lower = user_question.lower()
    # Split once: every fuzzy comparison below works on the same word list
    question_words = question_lower.split()

    # Lookup dictionary for fast access to a key term by its lower-cased name
    key_terms_lookup = {term['name'].lower(): term for term in key_terms}

    # 1. Direct key terms that exist in the database (keyword and fuzzy match)
    key_terms_found = [
        term
        for term in key_terms
        if term.get('exists_in_database', False)
        and _question_mentions(term['name'].lower(), question_lower, question_words)
    ]

    # 2. Synonyms: report the first synonym whose key term exists in the database
    synonym_searched_for = None
    synonym = None
    synonym_exists_in_db = False
    synonym_docu = None
    for syn_word, key_term_ref in synonyms.items():
        if not _question_mentions(syn_word.lower(), question_lower, question_words):
            continue

        actual_term = key_terms_lookup.get(key_term_ref.replace('_', ' ').lower())
        if actual_term is None or not actual_term.get('exists_in_database', False):
            # Unknown key term or one that is not in the database: try the next synonym
            continue

        synonym_exists_in_db = True
        synonym = actual_term
        synonym_searched_for = syn_word
        if actual_term not in key_terms_found:
            key_terms_found.append(actual_term)
        synonym_docu = f"{syn_word} is synonym with {actual_term.get('name', '')}"
        break  # Only process first synonym match

    # 3. Related terms (process ALL groups, not just the first match)
    all_related_matches = []
    for term_group in related_terms:
        normalized_group = [
            (group_term, group_term.replace('_', ' ').lower())
            for group_term in term_group
        ]

        # The first term of the group that the question mentions is the one "asked for"
        mentioned = next(
            (
                pair for pair in normalized_group
                if _question_mentions(pair[1], question_lower, question_words)
            ),
            None,
        )
        if mentioned is None:
            continue
        term_from_question, mentioned_normalized = mentioned

        # Collect all OTHER terms of the group that exist in the database
        related_terms_in_db = []
        for _, group_term_normalized in normalized_group:
            if group_term_normalized == mentioned_normalized:
                continue
            term_data = key_terms_lookup.get(group_term_normalized)
            if term_data is None or not term_data.get('exists_in_database', False):
                continue
            related_terms_in_db.append(term_data)
            if term_data not in key_terms_found:
                key_terms_found.append(term_data)

        if related_terms_in_db:
            all_related_matches.append({
                'term_from_question': term_from_question,
                'related_terms_in_db': related_terms_in_db
            })

    # Summarize the related-term matches
    related_term_searched_for = None
    related_term_exists_in_db = False
    related_terms_found = None
    related_terms_exists_in_db = False
    related_terms_docu = None
    if all_related_matches:
        # Every stored match holds at least one term, so the flat list is never empty
        all_related_terms = [
            term_data
            for match in all_related_matches
            for term_data in match['related_terms_in_db']
        ]
        related_term_searched_for = all_related_matches[0]['term_from_question']

        # One documentation line per matched term from the question
        related_terms_docu = '\n'.join(
            f"{match['term_from_question']} is related (similar but different) with: "
            f"{', '.join(t.get('name', '') for t in match['related_terms_in_db'])}"
            for match in all_related_matches
        )

        if len(all_related_terms) == 1:
            related_term_exists_in_db = True
            related_terms_found = all_related_terms[0]  # Single dict
        else:
            related_terms_exists_in_db = True
            related_terms_found = all_related_terms  # List of dicts

    return {
        'key_terms': key_terms_found,
        'synonym_searched_for': synonym_searched_for,
        'synonym': synonym,
        'synonym_exists_in_db': synonym_exists_in_db,
        'synonym_docu': synonym_docu,
        'related_term_searched_for': related_term_searched_for,
        'related_term_exists_in_db': related_term_exists_in_db,
        'related_terms': related_terms_found,
        'related_terms_exists_in_db': related_terms_exists_in_db,
        'related_terms_docu': related_terms_docu
    }

In [None]:
# Scenario: the user asks for a vague term (undefined at the org level) that is related to multiple terms available in the DB.
user_question = 'Payments associated with advisors from firm Cedar Capital LLC'
# Expected: enter the disambiguation node, explain that "payment" can mean net revenue (revenue retained by Capital Partners)
# or payout (dollar amount paid to the advisor), and ask the user which one they prefer.
search_terms_output = search_terms(user_question, key_terms, synonyms, related_terms)
search_terms_output

In [None]:
# Scenario: the user phrases the question loosely (here "Cedar Capital" rather than the full firm name "Cedar Capital LLC").
user_question = 'net revenue associated with Cedar Capital advisors'
# Expected: filter for the sum of net revenue for advisors belonging to the firm Cedar Capital LLC.
search_terms_output = search_terms(user_question, key_terms, synonyms, related_terms)
search_terms_output

In [None]:
# Scenario: the user asks for a term that is defined in the glossary but not available in the DB, and that is related to multiple terms available in the DB.
user_question = 'distribution of advisor ID 8'
# Expected: enter the disambiguation node and explain that distribution (advisor payout after tech fees are deducted) is not in the tables it has access to,
# but that it can offer net revenue (revenue retained by Capital Partners) or payout (dollar amount paid to the advisor), then ask which one the user prefers.
search_terms_output = search_terms(user_question, key_terms, synonyms, related_terms)
search_terms_output

In [None]:
# Scenario: the user asks for a vague term (undefined at the org level) that is related to a single term available in the DB: "producing assets".
user_question = 'producing assets for annuity business line'
# Expected: query advisory assets instead, and explain that advisory assets are the assets in the Managed Portfolio and SMA business lines.
search_terms_output = search_terms(user_question, key_terms, synonyms, related_terms)
search_terms_output

In [None]:
# Scenario: the user asks for a term that is defined in the glossary but not available in the DB, and that is related to a single term available in the DB.
user_question = 'liquid assets of custody accounts'
# Expected: say that it has no access to liquid assets (assets easily converted to cash), and return the results for advisory assets (assets in the Managed Portfolio and SMA business lines) instead.
# Filter for account_type = 'Custody', account.account_status = 'Active' and account.to_date = '9999-12-31'.
search_terms_output = search_terms(user_question, key_terms, synonyms, related_terms)
search_terms_output

In [None]:
# User asks for a term that is not in the DB, with no synonyms and no related terms in the DB:

In [None]:
# Ad-hoc check: a question that mentions an age threshold and total assets.
# NOTE(review): no expected outcome is recorded for this case — TODO document it.
user_question = 'List advisors under age 40 who have more than $300M in total assets.'
search_terms_output = search_terms(user_question, key_terms, synonyms, related_terms)
search_terms_output

In [None]:
# Ad-hoc check: a question that mentions an age threshold and compensation.
# NOTE(review): no expected outcome is recorded for this case — TODO document it.
user_question = 'List advisors under age 40 who have more than $300M in compensation'
search_terms_output = search_terms(user_question, key_terms, synonyms, related_terms)
search_terms_output