In [1]:
import os
import sys
# Treat the parent of the notebook's working directory as the project root.
current_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(current_dir, os.pardir))

# Directories that must be importable: the project root itself and src/init.
paths_to_add = [project_root, os.path.join(project_root, "src", "init")]

# Register each directory only once so re-running the cell adds no duplicates.
for path in paths_to_add:
    if path in sys.path:
        continue
    sys.path.append(path)

In [2]:
import psycopg2
from contextlib import contextmanager

@contextmanager
def get_db_connection(connection_string):
    """Yield an open database connection and close it when the block exits.

    Args:
        connection_string: PostgreSQL connection string passed to
            psycopg2.connect.

    Yields:
        The connection object returned by psycopg2.connect.

    Note:
        Only close() is called on exit; no commit or rollback is issued here.
    """
    # A failed connect leaves nothing to clean up, so it happens before the try.
    connection = psycopg2.connect(connection_string)
    try:
        yield connection
    finally:
        # Runs on normal exit and when the with-body raises.
        connection.close()

def execute_sql_query(query, connection_string, params=None):
    """
    Execute a SQL query and return all fetched rows.

    Args:
        query: SQL query string to execute. When values must be supplied,
            use placeholders together with ``params`` instead of formatting
            the values into the string.
        connection_string: PostgreSQL connection string
        params: Optional sequence or mapping of values bound to the
            placeholders in ``query``. Defaults to None, which runs the
            query exactly as given.

    Returns:
        List of results (each row as a tuple), or None if the query fails.
        Failures are printed rather than raised, so callers must check for None.
    """
    try:
        with get_db_connection(connection_string) as conn:
            # Close the cursor explicitly instead of relying on the
            # connection teardown to release it.
            with conn.cursor() as cursor:
                cursor.execute(query, params)
                return cursor.fetchall()
    except Exception as e:
        # Deliberate best-effort: report the problem and signal failure with None.
        print(f"Error executing query: {e}")
        return None

In [None]:
# Build database_schema_context: one plain-text description of the schema made of
#   1. every table with its columns (plus sampled values where a lookup query is configured),
#   2. the relationships between tables,
#   3. the available date ranges, and
#   4. the glossary key terms with their query instructions.
# source: database_schema dict

from src.init.database_schema import database_schema, table_relationships
from src.init.business_glossary import key_terms

# connection to db
connection_string = os.getenv('CONNECTION_STRING_DB')
if not connection_string:
    # execute_sql_query reports failures and returns None, so the cell still
    # runs; the context is then built without column values and date ranges.
    print("Warning: CONNECTION_STRING_DB is not set; column value and date range lookups will likely fail.")

# Sections are collected here and joined once at the end, so that
# database_schema_context is only ever bound to the final string.
context_parts = []
date_range_entries = []

# Add all tables with all their columns AND values
for table in database_schema:
    table_name = table['table_name']
    table_desc = table['table_description']

    # Start with table info
    table_lines = [f"Table {table_name}: {table_desc}\n", "Columns:\n"]

    # Add all columns for this table
    for column_name, column_info in table['columns'].items():
        table_lines.append(f"  - Column {column_name}: {column_info['description']}\n")

        # Optional query that lists the values stored in this column
        query = column_info.get('query_to_get_column_values', '')
        if query and query.strip():
            results = execute_sql_query(query, connection_string)
            if results:
                # Keep the first field of every row, skipping NULLs
                column_values = [
                    str(row[0]) for row in results if row and row[0] is not None
                ]
                # Skip the line entirely when every value was NULL, otherwise
                # an empty "Values in column ..." entry would be emitted.
                if column_values:
                    values_str = ' | '.join(column_values)  # ' | '-separated list
                    table_lines.append(f"    Values in column {column_name}: {values_str}\n")

        # Optional query that describes the date range covered by this column
        date_query = column_info.get('query_to_get_date_range', '')
        if date_query and date_query.strip():
            results = execute_sql_query(date_query, connection_string)
            # Only the first field of the first row is used
            if results and results[0] and results[0][0] is not None:
                date_info = str(results[0][0])
                date_range_entries.append(f"  - Table {table_name}, column {column_name}: {date_info}\n")

    table_text = "".join(table_lines)
    context_parts.append(table_text)

# Add ALL table relationships
relationships_text = "\nRelationships between Tables:\n" + "".join(
    f"  {rel['key1']} -> {rel['key2']}\n" for rel in table_relationships
)
context_parts.append(relationships_text)

# Add date range information (section omitted when no range was retrieved)
if date_range_entries:
    date_range = "\nImportant considerations about dates available:\n"
    date_range += "".join(date_range_entries)
    context_parts.append(date_range)

# Add key terms with query instructions
key_terms_lines = ["\nQuery instructions for key terms:\n"]
for term in key_terms:
    term_name = term['name']
    # Definition and instructions are treated as optional, matching how
    # terms_helper_function reads glossary entries.
    term_definition = term.get('definition', '')
    query_instructions = term.get('query_instructions', '')

    if term_definition:
        key_terms_lines.append(f"  - {term_name}: {term_definition}\n")
    else:
        key_terms_lines.append(f"  - {term_name}\n")

    if query_instructions:
        key_terms_lines.append(f"    {query_instructions}\n")

key_terms_text = "".join(key_terms_lines)
context_parts.append(key_terms_text)

# Join all parts
database_schema_context = "\n".join(context_parts)

print(database_schema_context)

In [2]:
# Check whether terms referenced by the glossary are missing from key_terms.
# The import goes through the src.init package path, like the other cells,
# so the glossary module is not loaded a second time under the separate
# top-level name "business_glossary".
from src.init.business_glossary import check_glossary_consistency
check_glossary_consistency()

✅ All terms in synonyms and related_terms exist in key_terms


In [3]:
from src.init.business_glossary import key_terms, synonyms, related_terms
from difflib import get_close_matches

def _phrase_in_question(phrase, question_lower, question_words, cutoff=0.85):
    """Return True when ``phrase`` occurs in the question, exactly or approximately.

    Args:
        phrase: Lower-cased term to look for (one or more words).
        question_lower: The lower-cased question text.
        question_words: ``question_lower`` split on whitespace.
        cutoff: Minimum difflib similarity for a fuzzy match.

    Returns:
        True on a substring hit or a fuzzy hit, otherwise False.
    """
    # Plain substring test: this also fires when the phrase sits inside a
    # longer word (e.g. a singular term inside its plural).
    if phrase in question_lower:
        return True

    span = len(phrase.split())
    if span > 1:
        # Multi-word phrase: compare against every window of the same word
        # count so the phrase is matched as a whole.
        for start in range(len(question_words) - span + 1):
            window = ' '.join(question_words[start:start + span])
            if get_close_matches(phrase, [window], n=1, cutoff=cutoff):
                return True
        return False

    # Single word: fuzzy match against the individual question words.
    return bool(get_close_matches(phrase, question_words, n=1, cutoff=cutoff))


def search_terms(user_question, key_terms, synonyms, related_terms):
    """
    Searches for key terms, synonyms, and related terms in a user question.
    Uses keyword lookup and fuzzy matching.

    Args:
        user_question: The user's question string
        key_terms: List of key term dictionaries (each needs a 'name' key)
        synonyms: Dictionary mapping synonyms to key terms
        related_terms: List of related term groups

    Returns:
        dict with
        'key_terms_found': key terms mentioned in the question,
        'synonyms_found': key terms reached through a mentioned synonym,
        'related_terms_found': the other members of any related-term group
            that has a mentioned member, excluding terms already listed above,
        'synonym_searched_for': the last synonym that added a key term, or None,
        'term_searched_for_related': the mentioned term of the last matching
            related-term group, or None
    """
    user_question_lower = user_question.lower()
    # Split once; every matching pass below reuses the same word list.
    question_words = user_question_lower.split()

    def is_mentioned(phrase):
        return _phrase_in_question(phrase, user_question_lower, question_words)

    # Lookup by lower-cased name. Synonym targets and related terms are
    # normalized with '_' -> ' ' before they are looked up here.
    key_terms_lookup = {term['name'].lower(): term for term in key_terms}

    # 1. Check for direct key terms (keyword and fuzzy match)
    key_terms_found = [term for term in key_terms if is_mentioned(term['name'].lower())]

    # 2. Check for synonyms
    synonyms_found = []
    synonym_searched_for = None  # Track which synonym was found in query
    for synonym, key_term_ref in synonyms.items():
        if not is_mentioned(synonym.lower()):
            continue
        term_data = key_terms_lookup.get(key_term_ref.replace('_', ' ').lower())
        # Ignore synonyms pointing at unknown key terms and avoid duplicates.
        if term_data is not None and term_data not in synonyms_found:
            synonyms_found.append(term_data)
            synonym_searched_for = synonym

    # 3. Check for related terms
    related_terms_found = []
    term_searched_for_related = None  # Track which term triggered related terms
    for term_group in related_terms:
        normalized_group = [name.replace('_', ' ').lower() for name in term_group]

        # The first group member mentioned in the question triggers the group.
        found_term_in_group = next(
            (name for name in normalized_group if is_mentioned(name)), None
        )
        if not found_term_in_group:
            continue

        term_searched_for_related = found_term_in_group
        # Add all OTHER members of the group (excluding the one found).
        for group_term_normalized in normalized_group:
            if group_term_normalized == found_term_in_group:
                continue
            term_data = key_terms_lookup.get(group_term_normalized)
            if term_data is None:
                continue
            # Don't add if already reported in any of the three result lists.
            if (
                term_data in related_terms_found
                or term_data in key_terms_found
                or term_data in synonyms_found
            ):
                continue
            related_terms_found.append(term_data)

    return {
        'key_terms_found': key_terms_found,
        'synonyms_found': synonyms_found,
        'related_terms_found': related_terms_found,
        'synonym_searched_for': synonym_searched_for,
        'term_searched_for_related': term_searched_for_related
    }

In [None]:
def terms_helper_function(search_terms_output):
    """
    Turn a search_terms result into availability flags and user-facing messages.

    A synonym hit is reported whenever a synonym-resolved term exists in the
    database. Related terms are only offered as a fallback when at least one
    directly found key term does not exist in the database.

    Args:
        search_terms_output: Dict with 'key_terms_found', 'synonyms_found',
            'related_terms_found', 'synonym_searched_for' and
            'term_searched_for_related'

    Returns:
        dict with three boolean flags ('synonym_exists_in_db',
        'related_term_exists_in_db', 'related_key_terms_exist_in_db') and three
        message strings ('synonym_text', 'related_single_term_text',
        'related_multiple_terms_text'); a message is None when it does not apply
    """
    def only_in_db(entries):
        # Keep the glossary entries flagged as present in the database.
        return [entry for entry in entries if entry.get('exists_in_database', False)]

    def describe(entry, separator):
        # "<name><separator><definition>", or just the name without a definition.
        name = entry.get('name', '')
        definition = entry.get('definition', '')
        return f"{name}{separator}{definition}" if definition else f"{name}"

    outcome = {
        'synonym_exists_in_db': False,
        'related_term_exists_in_db': False,
        'related_key_terms_exist_in_db': False,
        'synonym_text': None,
        'related_single_term_text': None,
        'related_multiple_terms_text': None
    }

    # A directly found key term that is missing from the database is what
    # enables the related-term fallback further down.
    missing_key_term = any(
        not entry.get('exists_in_database', False)
        for entry in search_terms_output.get('key_terms_found', [])
    )

    # (1) Synonyms are evaluated regardless of the key-term check; only the
    # first synonym available in the database is described.
    synonym_hits = only_in_db(search_terms_output.get('synonyms_found', []) or [])
    if synonym_hits:
        outcome['synonym_exists_in_db'] = True
        outcome['synonym_text'] = describe(synonym_hits[0], " is ")

    if not missing_key_term:
        return outcome

    # (2) / (3) Related terms that exist in the database.
    related_hits = only_in_db(search_terms_output.get('related_terms_found', []) or [])

    if len(related_hits) == 1:
        # Exactly one alternative: say what was missing and what was returned.
        outcome['related_term_exists_in_db'] = True
        requested = search_terms_output.get('term_searched_for_related') or "the requested term"
        lead_in = f"{requested} does not exist in the tables I have access to. I returned the data for "
        outcome['related_single_term_text'] = lead_in + describe(related_hits[0], " which is ")
    elif len(related_hits) > 1:
        # Several alternatives: list them one per line.
        outcome['related_key_terms_exist_in_db'] = True
        outcome['related_multiple_terms_text'] = "\n".join(
            "- " + describe(entry, ": ") for entry in related_hits
        )

    return outcome

In [11]:
# Display the raw search_terms result. search_terms_output is assigned by the
# cell that calls search_terms(...), so that cell must be run before this one.
search_terms_output

{'key_terms_found': [{'name': 'Advisor',
   'definition': '',
   'query_instructions': "to get recent records, filter for advisors.advisor_status = 'Active' and advisors.to_date = '9999-12-31'",
   'exists_in_database': True}],
 'synonyms_found': [],
 'related_terms_found': [],
 'synonym_searched_for': None,
 'term_searched_for_related': None}

In [10]:
# Sample questions for exercising the glossary lookup; keep exactly one active.
#user_question = 'What is distinct count of active CFN advisors?'
#user_question = 'What is the total AUM adjusted for a split% of the ID?'
user_question = 'List all advisors affiliated with Main Firm KBK Wealth Management, LLC.'
#user_question = 'Total assets per client and affiliation credit?'
# user_question = 'List advisors under age 40 who have more than $300M in total assets.'
#user_question = 'List advisors under age 40 who have more than $300M in compensation'

# Find glossary terms in the question, then derive the database-availability
# flags and messages from that result (displayed as the cell output).
search_terms_output = search_terms(user_question, key_terms, synonyms, related_terms)
terms_helper_function(search_terms_output)

{'synonym_exists_in_db': False,
 'related_term_exists_in_db': False,
 'related_key_terms_exist_in_db': False,
 'synonym_text': None,
 'related_single_term_text': None,
 'related_multiple_terms_text': None}