## Similarity Search with CoA (Chart of Accounts)

In [None]:
from app.services.vectorise_data import get_embedding

# Embed a sample query string with the project's embedding helper.
# The second argument ('retrieval.query') is passed through to get_embedding;
# presumably it selects a query-side embedding task — confirm against the helper.
response = await get_embedding("salary payment to developers", 'retrieval.query')

In [None]:
import voyageai
import os 
from dotenv import load_dotenv
from typing import Literal
load_dotenv()

# Read the Voyage AI key from the environment (load_dotenv above reads a .env file).
voyage_api_key = os.getenv("VOYAGE_API_KEY")

client = voyageai.Client(api_key=voyage_api_key)
# Type alias naming two input_type values; it is not referenced by the call below.
input_type = Literal["document", "query"]

# `texts` is documented as a list of strings, so the single sample text is
# wrapped in a list; response.embeddings then holds one vector per list entry.
response = client.embed(
    texts=["salary payment to developers"],
    model="voyage-finance-2",
    input_type="document"
)
response.embeddings

In [None]:
from app.services.xero import vectorise_coa

# Run the project's chart-of-accounts vectorisation helper with the "ALL" argument.
# NOTE(review): the meaning of "ALL" is defined by vectorise_coa, which is not visible here.
await vectorise_coa("ALL")

### Import bank data from csv

In [None]:
import pandasai as pai
import pandas as pd
import ast
from dotenv import load_dotenv
import os 

load_dotenv()

# pai_api_key = os.getenv("OPENAI_API_KEY")
# if not pai_api_key:
#     raise ValueError("PAI_API_KEY is not set")
# else:
#     print(f"OPENAI_API_KEY is set")

df = pd.read_csv("bank_data_23-25/barclays_072.csv")

# # Sample DataFrame
# df = pai.DataFrame(df)

# pai.api_key.set(pai_api_key)


### extract currency and amount
# Each 'transactionAmount' cell is the text of a dict literal with 'currency'
# and 'amount' keys; parse every cell once and reuse the result for both columns.
parsed_amounts = df['transactionAmount'].apply(ast.literal_eval)
df['currency'] = parsed_amounts.apply(lambda item: item['currency'])
df['amount'] = parsed_amounts.apply(lambda item: float(item['amount']))
# Convert to integer hundredths. Round before casting: a float product such as
# 0.29 * 100 evaluates to 28.999999999999996, and astype(int) alone would
# truncate it to 28 and lose a unit.
df['amount'] = (df['amount'] * 100).round().astype(int)
df = df.drop('transactionAmount', axis=1)

## setting datetimeindex
df['bookingDate'] = pd.to_datetime(df['bookingDate'])
df.set_index('bookingDate', inplace=True)
df = df.drop(['valueDate', 'bookingDateTime', 'valueDateTime', 'internalTransactionId'], axis=1)
df = df.rename(columns={'remittanceInformationUnstructured': 'remittanceInfo'})

# df.chat("What are the total expenses for 2024?")


### LLM Reconciliation Time

- Add new columns "coa_agent", "coa_reason", "coa_confidence" to hold the LLM's classification output


In [None]:
""" classify a single transaction (method belonging to TransactionClassifier) """
def classify_transaction(self, transaction: Dict, chart_of_accounts: list) -> Tuple[str, str, float]:
    """Ask the LLM to classify one transaction against the chart of accounts.

    Returns an ``(account, reasoning, confidence)`` tuple taken from the
    model's JSON reply. Any failure (request error, unparsable reply, missing
    keys) is logged and reported as ``("ERROR", <message>, 0.0)`` instead of
    being raised.
    """
    logger.info(f"Processing transaction: {transaction}")
    self._rate_limit()

    # Human-turn prompt carrying the transaction and the candidate accounts.
    transaction_prompt = f"""
    Please classify the following transaction:
    Transaction: {transaction}
    
    chart of accounts: {chart_of_accounts}
    """

    messages = [
        SystemMessage(content=self.system_prompt),
        HumanMessage(content=transaction_prompt)
    ]

    try:
        logger.debug("Sending request to LLM")
        # Console output mirrors the log lines so progress is visible in the notebook.
        print("\n=== Sending request to LLM ===")
        response = openai_model.invoke(messages)
        print("\n=== Raw LLM Response ===")
        print(response.content)
        logger.debug(f"Raw LLM response: {response.content}")

        try:
            # Happy path: the whole reply is a JSON document.
            result = json.loads(response.content)
        except json.JSONDecodeError:
            # Fallback: pull the outermost {...} span out of surrounding text.
            logger.warning("Direct JSON parsing failed, attempting to extract JSON from response")
            import re
            embedded = re.search(r'\{.*\}', response.content, re.DOTALL)
            if not embedded:
                raise ValueError("No JSON structure found in response")
            result = json.loads(embedded.group())
            logger.info("Successfully extracted and parsed JSON from response")
        else:
            logger.info("Successfully parsed JSON response")

        # The reply must carry all three fields before it is trusted.
        required_keys = {'account', 'reasoning', 'confidence'}
        if any(key not in result for key in required_keys):
            missing_keys = required_keys - result.keys()
            raise ValueError(f"Missing required keys in response: {missing_keys}")

        logger.info(f"Classification successful: {result['account']}")
        return (result['account'], result['reasoning'], result['confidence'])

    except Exception as e:
        logger.error(f"Classification failed: {str(e)}", exc_info=True)
        return ("ERROR", f"Classification failed: {str(e)}", 0.0)

In [24]:
""" fetch transactions + ntropy enrich from supabase """
from pydantic import BaseModel
import pandas as pd

from app.services.supabase import get_supabase
supabase = await get_supabase()

class TransactionToLLM(BaseModel):
    entity_name : str
    amount : float
    remittance_info : str
    ntropy_enrich : bool
    ntropy_entity : str
    ntropy_category : str

# Query both tables
ntropy_response = await supabase.table('ntropy_transactions').select('*').execute()
gocardless_response = await supabase.table('gocardless_transactions').select('*').execute()

# Convert to DataFrames
ntropy_df = pd.DataFrame(ntropy_response.data)
gocardless_df = pd.DataFrame(gocardless_response.data)

# Merge DataFrames on the specified keys
merged_df = pd.merge(
    gocardless_df,
    ntropy_df,
    how='left',
    left_on='id',
    right_on='ntropy_id'
)

# Create a new DataFrame with selected columns
result_df = pd.DataFrame({
    'id': merged_df['id'],
    'entity_name': merged_df.apply(
        lambda row: row['creditor_name'] if pd.notnull(row['creditor_name'])
        else row['debtor_name'] if pd.notnull(row['debtor_name'])
        else row['enriched_data']['entities']['counterparty']['name'] if pd.notnull(row['enriched_data'])
        else None,
        axis=1
    ),
    'amount': merged_df['amount']/100,
    'remittance_info': merged_df['remittance_info'],
    'ntropy_enrich': merged_df['enriched_data'].notnull(),
    'ntropy_entity': merged_df['enriched_data'].apply(lambda x: x['entities']['counterparty']['name'] if pd.notnull(x) else None),
    'ntropy_category': merged_df['enriched_data'].apply(lambda x: x['categories']['general'] if pd.notnull(x) else None)
})



2025-02-18 15:56:34,675 - INFO - HTTP Request: GET https://lvhbpccylcxeehgxtrwv.supabase.co/rest/v1/ntropy_transactions?select=%2A "HTTP/2 200 OK"
2025-02-18 15:56:34,803 - INFO - HTTP Request: GET https://lvhbpccylcxeehgxtrwv.supabase.co/rest/v1/gocardless_transactions?select=%2A "HTTP/2 200 OK"


In [48]:
""" LLM inference for reconciliation"""

from langchain_groq import ChatGroq
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage, SystemMessage
from dotenv import load_dotenv
import time
from typing import Dict, Tuple, List
import logging
import json

# Configure logging
# Requests INFO level with timestamped records on the root logger, so the
# logger.debug(...) calls in this cell are not expected to be emitted.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Load variables from a .env file (presumably the LLM API keys — confirm).
load_dotenv()

# Initialize the chat models
# NOTE(review): groq_model is created here, but the classifier code visible in
# this cell only ever calls openai_model.
groq_model = ChatGroq(model_name="Llama-3.3-70b-Specdec")
openai_model = ChatOpenAI(model="gpt-4o")


class TransactionClassifier:
    """Classify batches of transactions against a chart of accounts via a chat LLM.

    Requests are spaced at least ``rate_limit_delay`` seconds apart, and every
    failure is reported as per-transaction ``("ERROR: ...", "", 0.0)`` tuples
    rather than raised.
    """

    def __init__(self):
        logger.info("Initializing TransactionClassifier")
        self.system_prompt = """
        You are a helpful financial expert responsible for classifying transactions.
        You will be given multiple transactions and a list of chart of accounts to reconcile. 

        For each transaction, you must provide:
        1. A brief thought process and explanation of which chart of account is appropriate for this transaction
        2. The code for the most appropriate chart of account
        3. A confidence score between 0 and 1 (e.g., 0.95 for high confidence, 0.40 for low confidence)

        # IMPORTANT: Instructions for your response:
        You must respond with valid JSON in the following format only:
        {
          "classifications": [
            {
              "transaction_index": 0,
              "reasoning": "string",
              "account": "string",
              "confidence": float
            },
            {
              "transaction_index": 1,
              "reasoning": "string",
              "account": "string",
              "confidence": float
            },
            {
              "transaction_index": 2,
              "reasoning": "string",
              "account": "string",
              "confidence": float
            },
          ]
        }
        """
        self.last_request_time = 0
        self.rate_limit_delay = 2  # 2 seconds between requests (30 requests/minute)

    def _rate_limit(self):
        """Sleep just long enough to keep requests ``rate_limit_delay`` seconds apart."""
        current_time = time.time()
        time_since_last_request = current_time - self.last_request_time
        if time_since_last_request < self.rate_limit_delay:
            delay = self.rate_limit_delay - time_since_last_request
            logger.debug(f"Rate limiting: waiting {delay:.2f} seconds")
            time.sleep(delay)
        self.last_request_time = time.time()

    @staticmethod
    def _align_classifications(classifications, expected_count):
        """Return exactly ``expected_count`` result tuples, one per input transaction.

        When the model returned one entry per transaction, the entries are used
        in ``transaction_index`` order. Otherwise each entry is placed at its own
        ``transaction_index`` and the gaps are filled with an error placeholder,
        so a missing or surplus entry can never shift a result onto a different
        transaction.
        """
        ordered = sorted(classifications, key=lambda c: c['transaction_index'])
        if len(ordered) == expected_count:
            return [(c['account'], c['reasoning'], c['confidence']) for c in ordered]

        aligned = [("ERROR: No classification returned", "", 0.0)] * expected_count
        for c in ordered:
            position = c['transaction_index']
            if isinstance(position, int) and 0 <= position < expected_count:
                aligned[position] = (c['account'], c['reasoning'], c['confidence'])
        return aligned

    def classify_transactions_batch(self, transactions: List[Dict], chart_of_accounts: list) -> List[Tuple[str, str, float]]:
        """Classify a batch of transactions using the LLM.

        Args:
            transactions: One dict per transaction; the list is interpolated
                into the prompt as-is.
            chart_of_accounts: Candidate accounts, also interpolated as-is.

        Returns:
            A list with one ``(account, reasoning, confidence)`` tuple per input
            transaction, in input order. On any failure every entry is an
            ``("ERROR: ...", "", 0.0)`` tuple.
        """
        # Nothing to classify: skip the rate-limit wait and the LLM call.
        if not transactions:
            return []

        logger.info(f"Starting classification of batch with {len(transactions)} transactions")
        # Only serialise for the log when DEBUG is on; default=str keeps values
        # that json cannot encode from raising before the request is sent.
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f"Transactions to classify: {json.dumps(transactions, indent=2, default=str)}")
        logger.debug(f"Number of chart of accounts: {len(chart_of_accounts)}")

        self._rate_limit()

        # Format the transactions for the LLM
        transactions_prompt = f"""
        Please classify the following transactions:
        Transactions: {transactions}
        
        chart of accounts: {chart_of_accounts}
        """
        logger.debug(f"Generated prompt: {transactions_prompt}")

        messages = [
            SystemMessage(content=self.system_prompt),
            HumanMessage(content=transactions_prompt)
        ]

        try:
            logger.info("Sending request to LLM")
            response = openai_model.invoke(messages)
            logger.debug(f"Raw LLM response: {response.content}")

            try:
                import re
                # Take the outermost {...} span so surrounding prose or code fences are ignored.
                json_match = re.search(r'\{.*\}', response.content, re.DOTALL)
                if json_match:
                    logger.debug("Found JSON structure in response")
                    result = json.loads(json_match.group())
                    classifications = result.get('classifications', [])
                    logger.info(f"Successfully parsed {len(classifications)} classifications")
                    if logger.isEnabledFor(logging.DEBUG):
                        logger.debug(f"Classifications: {json.dumps(classifications, indent=2, default=str)}")

                    if len(classifications) != len(transactions):
                        logger.warning(
                            f"Expected {len(transactions)} classifications but received {len(classifications)}"
                        )
                    return self._align_classifications(classifications, len(transactions))
                else:
                    logger.error("No JSON structure found in response")
                    return [(f"ERROR: No JSON found", "", 0.0)] * len(transactions)

            except json.JSONDecodeError as e:
                logger.error(f"Failed to parse JSON response: {e}")
                logger.debug(f"Problematic response content: {response.content}")
                return [(f"ERROR: {str(e)}", "", 0.0)] * len(transactions)

        except Exception as e:
            logger.error(f"Batch classification failed: {str(e)}", exc_info=True)
            return [(f"ERROR: {str(e)}", "", 0.0)] * len(transactions)

""" fetch transactions + ntropy enrich from supabase """
from pydantic import BaseModel
import pandas as pd

from app.services.supabase import get_supabase

class TransactionToLLM(BaseModel):
    # Field layout of one prepared transaction row; the names match the
    # columns built in fetch_and_prepare_transactions (minus 'id').
    entity_name: str
    amount: float
    remittance_info: str
    ntropy_enrich: bool
    ntropy_entity: str
    ntropy_category: str

def process_transactions(
    df: pd.DataFrame,
    chart_of_accounts: list,
    batch_size: int = 3,
    max_batches: "int | None" = 6,
) -> pd.DataFrame:
    """Classify transactions group-by-group and write the results back per row.

    Rows are grouped by ``remittance_info``; the first row of each group is sent
    to the LLM as its representative and the returned classification is applied
    to every row of that group.

    Args:
        df: Transactions; must contain a ``remittance_info`` column.
        chart_of_accounts: Candidate accounts forwarded to the classifier.
        batch_size: Number of groups sent to the LLM per request.
        max_batches: Upper bound on LLM requests for this run. The default (6)
            keeps the previous hard-coded cap; pass ``None`` to process every
            group.

    Returns:
        A copy of ``df`` with ``coa_agent``, ``coa_reason`` and
        ``coa_confidence`` columns. Rows of groups that were not reached keep
        ``None`` in those columns.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    logger.info(f"Starting to process {len(df)} transactions")
    logger.debug(f"Input DataFrame shape: {df.shape}")

    classifier = TransactionClassifier()

    # Create copy of DataFrame with new columns
    df = df.copy()
    df['coa_agent'] = None
    df['coa_reason'] = None
    df['coa_confidence'] = None

    # Group by remittance_info and get representative samples
    grouped = df.groupby('remittance_info')
    unique_groups = list(grouped.groups.keys())

    total_batches = (len(unique_groups) + batch_size - 1) // batch_size
    processed_batches = 0

    for i in range(0, len(unique_groups), batch_size):
        # Honour the request cap before spending another LLM call.
        if max_batches is not None and processed_batches >= max_batches:
            break

        batch_end = min(i + batch_size, len(unique_groups))
        current_batch = (i // batch_size) + 1
        logger.info(f"Processing batch {current_batch}/{total_batches} (groups {i} to {batch_end-1})")

        # Get representative transactions from each group in this batch
        batch_transactions = []
        group_indices = []

        for remittance_info in unique_groups[i:batch_end]:
            group_df = grouped.get_group(remittance_info)
            # Take the first transaction as representative for the group;
            # to_dict() already carries its 'amount'.
            representative = group_df.iloc[0]
            batch_transactions.append({
                **representative.to_dict(),
                'group_size': len(group_df)
            })
            group_indices.append(remittance_info)

        # Get classifications for the batch
        classifications = classifier.classify_transactions_batch(batch_transactions, chart_of_accounts)
        if len(classifications) != len(group_indices):
            # zip() below stops at the shorter sequence; make the gap visible.
            logger.warning(
                f"Batch {current_batch}: expected {len(group_indices)} classifications, got {len(classifications)}"
            )

        # Apply classifications to all transactions in each group
        for remittance_info, (account, reasoning, confidence) in zip(group_indices, classifications):
            mask = df['remittance_info'] == remittance_info
            df.loc[mask, 'coa_agent'] = account
            df.loc[mask, 'coa_reason'] = reasoning
            df.loc[mask, 'coa_confidence'] = confidence

            group_size = int(mask.sum())
            # The model may return a non-string (or null) reasoning; never let
            # the log preview crash the run.
            reason_words = reasoning.split() if isinstance(reasoning, str) else []
            logger.info(f"Applied classification to group '{remittance_info}' ({group_size} transactions):")
            logger.info(f"Account: {account}")
            logger.info(f"Confidence: {confidence}")
            logger.info(f"Reason Preview: {' '.join(reason_words[:10])}...")
            logger.info("-" * 50)

        processed_batches += 1

    remaining_batches = total_batches - processed_batches
    if remaining_batches > 0:
        logger.warning(
            f"Stopped after {processed_batches} batches (max_batches={max_batches}); "
            f"{remaining_batches} batches left unclassified"
        )
    else:
        logger.info("Finished processing all transaction groups")
    logger.debug(f"Final DataFrame shape: {df.shape}")
    return df

async def fetch_and_prepare_transactions() -> pd.DataFrame:
    """Load both Supabase transaction tables and return the classifier input frame.

    Every ``gocardless_transactions`` row is kept and left-joined to
    ``ntropy_transactions`` on ``id`` == ``ntropy_id``. The returned frame has
    the columns ``id``, ``entity_name``, ``amount`` (source amount divided by
    100), ``remittance_info``, ``ntropy_enrich``, ``ntropy_entity`` and
    ``ntropy_category``. Any exception is logged and re-raised.
    """

    def _counterparty_name(enriched):
        # Enrichment payload is absent for rows without an ntropy match.
        if pd.notnull(enriched):
            return enriched['entities']['counterparty']['name']
        return None

    def _general_category(enriched):
        if pd.notnull(enriched):
            return enriched['categories']['general']
        return None

    def _entity_name(row):
        # Preference order: creditor, debtor, then the enriched counterparty.
        for column in ('creditor_name', 'debtor_name'):
            if pd.notnull(row[column]):
                return row[column]
        return _counterparty_name(row['enriched_data'])

    logger.info("Starting to fetch transactions from Supabase")

    try:
        supabase = await get_supabase()
        logger.info("Successfully connected to Supabase")

        # Full-table reads, one after the other
        logger.info("Querying Supabase tables...")
        ntropy_response = await supabase.table('ntropy_transactions').select('*').execute()
        gocardless_response = await supabase.table('gocardless_transactions').select('*').execute()

        logger.info(f"Retrieved {len(ntropy_response.data)} ntropy transactions")
        logger.info(f"Retrieved {len(gocardless_response.data)} gocardless transactions")

        ntropy_df = pd.DataFrame(ntropy_response.data)
        gocardless_df = pd.DataFrame(gocardless_response.data)

        # Left join keeps transactions that have no enrichment row
        merged_df = gocardless_df.merge(
            ntropy_df,
            how='left',
            left_on='id',
            right_on='ntropy_id'
        )

        result_df = pd.DataFrame({
            'id': merged_df['id'],
            'entity_name': merged_df.apply(_entity_name, axis=1),
            'amount': merged_df['amount'] / 100,
            'remittance_info': merged_df['remittance_info'],
            'ntropy_enrich': merged_df['enriched_data'].notnull(),
            'ntropy_entity': merged_df['enriched_data'].apply(_counterparty_name),
            'ntropy_category': merged_df['enriched_data'].apply(_general_category)
        })

        logger.info(f"Final prepared DataFrame contains {len(result_df)} rows")
        logger.debug(f"DataFrame columns: {result_df.columns.tolist()}")

        return result_df

    except Exception:
        logger.error("Error in fetch_and_prepare_transactions", exc_info=True)
        raise

async def main():
    """Run the reconciliation pipeline end to end and return the classified frame.

    Steps: read ``coa.json``, keep the accounts whose ``Status`` is ``ACTIVE``,
    fetch the prepared transactions, then classify them. Any exception is
    logged and re-raised.
    """
    logger.info("Starting reconciliation process")

    try:
        logger.info("Loading chart of accounts from coa.json")
        with open('coa.json', 'r') as coa_file:
            coa_payload = json.load(coa_file)

        # Keep only the fields the classifier prompt needs, for active accounts.
        parsed_accounts = []
        for account in coa_payload['Accounts']:
            if account.get('Status') != 'ACTIVE':
                continue
            parsed_accounts.append({
                'code': account.get('Code', ''),
                'name': account.get('Name', ''),
                'type': account.get('Type', ''),
                'description': account.get('Description', ''),
                'class': account.get('Class', ''),
            })
        logger.info(f"Loaded {len(parsed_accounts)} active accounts from CoA")

        logger.info("Fetching and preparing transactions")
        result_df = await fetch_and_prepare_transactions()
        logger.info(f"Retrieved {len(result_df)} transactions to process")

        logger.info("Starting transaction processing")
        df_reconciled = process_transactions(result_df, parsed_accounts)
        logger.info("Completed reconciliation process")

        return df_reconciled

    except Exception:
        logger.error("Error in main function", exc_info=True)
        raise

# Entry point: run the async pipeline to completion and print the result.
# NOTE(review): asyncio.run() raises RuntimeError when called while an event
# loop is already running; other cells in this notebook use top-level `await`
# instead — confirm which form is intended here.
if __name__ == "__main__":
    import asyncio
    df_reconciled = asyncio.run(main())
    print(df_reconciled)


2025-02-18 16:32:09,204 - INFO - Starting reconciliation process
2025-02-18 16:32:09,205 - INFO - Loading chart of accounts from coa.json
2025-02-18 16:32:09,206 - INFO - Loaded 83 active accounts from CoA
2025-02-18 16:32:09,206 - INFO - Fetching and preparing transactions
2025-02-18 16:32:09,207 - INFO - Starting to fetch transactions from Supabase
2025-02-18 16:32:09,207 - INFO - Successfully connected to Supabase
2025-02-18 16:32:09,207 - INFO - Querying Supabase tables...
2025-02-18 16:32:09,301 - INFO - HTTP Request: GET https://lvhbpccylcxeehgxtrwv.supabase.co/rest/v1/ntropy_transactions?select=%2A "HTTP/2 200 OK"
2025-02-18 16:32:09,370 - INFO - HTTP Request: GET https://lvhbpccylcxeehgxtrwv.supabase.co/rest/v1/gocardless_transactions?select=%2A "HTTP/2 200 OK"
2025-02-18 16:32:09,372 - INFO - Retrieved 10 ntropy transactions
2025-02-18 16:32:09,373 - INFO - Retrieved 126 gocardless transactions
2025-02-18 16:32:09,380 - INFO - Final prepared DataFrame contains 126 rows
2025-02

Unnamed: 0,id,entity_name,amount,remittance_info,ntropy_enrich,ntropy_entity,ntropy_category,coa_agent,coa_reason,coa_confidence
0,2c37580c-5bd5-45a2-a2bb-2f4371b1199c9ef65f8b-e...,,172.37,PAYMENT RECEIVED - THANK YOU,False,,,260,This transaction indicates a payment received ...,0.8
1,99e0c076-c6c3-490b-85a2-3687ac9c79551b84f6b4-2...,SOHO HOUSE WHITE CITY,-46.60,3CPAYMENT*WHITE CITY HO LONDON,False,,,420,The transaction is a payment to 'SOHO HOUSE WH...,0.7
2,daa2a72e-9e19-41dd-92ac-b57eb0fbc3c75d494b7c-2...,WESTFIELD LONDON - CAR PARK,-8.50,WESTFIELD LONDON - CAR LONDON,False,,,449,This transaction involves a payment for a car ...,0.85
3,9bc38ce4-c82a-473b-8498-7fe5a49ffcf7f5c10402-4...,AUDIBLE,-7.99,AUDIBLE UK ADBL.CO/PYMT,False,,,485,The transaction is identified as a payment to ...,0.95
4,c928468a-1500-439a-a6a1-3500daf43476bba85042-7...,METRICOOL,-18.39,METRICOOL.COM MADRID,False,,,463,"The transaction is a payment to 'METRICOOL', i...",0.9
...,...,...,...,...,...,...,...,...,...,...
121,8db6dbd2-862e-40c5-8140-d8e11b63cfc062e2e853-9...,FLOWON.AI,-0.99,FLOWON.AI LONDON,False,,,,,
122,36760625-422b-4d79-93a2-cdb844da08be92e37836-6...,METRICOOL,-18.29,METRICOOL.COM MADRID,False,,,,,
123,42b5a4d9-bf37-417a-a716-138718236d17af9852a7-3...,AUDIBLE,-7.99,AUDIBLE UK ADBL.CO/PYMT,False,,,,,
124,bbf5786a-75f9-4b4c-93d5-550cdd6cd73f98707b70-8...,HEATHROW NORTH SERVICE STATION,-25.26,EUG00840 HEATHROW NORTH Hayes,False,,,,,


In [49]:
# Display the reconciled DataFrame assigned by the previous cell.
df_reconciled

Unnamed: 0,id,entity_name,amount,remittance_info,ntropy_enrich,ntropy_entity,ntropy_category,coa_agent,coa_reason,coa_confidence
0,2c37580c-5bd5-45a2-a2bb-2f4371b1199c9ef65f8b-e...,,172.37,PAYMENT RECEIVED - THANK YOU,False,,,260,This transaction indicates a payment received ...,0.8
1,99e0c076-c6c3-490b-85a2-3687ac9c79551b84f6b4-2...,SOHO HOUSE WHITE CITY,-46.60,3CPAYMENT*WHITE CITY HO LONDON,False,,,420,The transaction is a payment to 'SOHO HOUSE WH...,0.7
2,daa2a72e-9e19-41dd-92ac-b57eb0fbc3c75d494b7c-2...,WESTFIELD LONDON - CAR PARK,-8.50,WESTFIELD LONDON - CAR LONDON,False,,,449,This transaction involves a payment for a car ...,0.85
3,9bc38ce4-c82a-473b-8498-7fe5a49ffcf7f5c10402-4...,AUDIBLE,-7.99,AUDIBLE UK ADBL.CO/PYMT,False,,,485,The transaction is identified as a payment to ...,0.95
4,c928468a-1500-439a-a6a1-3500daf43476bba85042-7...,METRICOOL,-18.39,METRICOOL.COM MADRID,False,,,463,"The transaction is a payment to 'METRICOOL', i...",0.9
...,...,...,...,...,...,...,...,...,...,...
121,8db6dbd2-862e-40c5-8140-d8e11b63cfc062e2e853-9...,FLOWON.AI,-0.99,FLOWON.AI LONDON,False,,,,,
122,36760625-422b-4d79-93a2-cdb844da08be92e37836-6...,METRICOOL,-18.29,METRICOOL.COM MADRID,False,,,,,
123,42b5a4d9-bf37-417a-a716-138718236d17af9852a7-3...,AUDIBLE,-7.99,AUDIBLE UK ADBL.CO/PYMT,False,,,,,
124,bbf5786a-75f9-4b4c-93d5-550cdd6cd73f98707b70-8...,HEATHROW NORTH SERVICE STATION,-25.26,EUG00840 HEATHROW NORTH Hayes,False,,,,,


In [None]:
""" LLM inference for reconciliation"""

from langchain_groq import ChatGroq
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage, SystemMessage
from dotenv import load_dotenv
import time
from typing import Dict, Tuple, List
import logging
import json

# Configure logging
# Requests INFO level with timestamped records on the root logger, so the
# logger.debug(...) calls in this cell are not expected to be emitted.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Load variables from a .env file (presumably the LLM API keys — confirm).
load_dotenv()

# Initialize the chat models
# NOTE(review): groq_model is created here, but the classifier code visible in
# this cell only ever calls openai_model.
groq_model = ChatGroq(model_name="Llama-3.3-70b-Specdec")
openai_model = ChatOpenAI(model="gpt-4o")


class TransactionClassifier:
    """Classify batches of transactions against a chart of accounts via a chat LLM.

    Requests are spaced at least ``rate_limit_delay`` seconds apart, and every
    failure is reported as per-transaction ``("ERROR: ...", "", 0.0)`` tuples
    rather than raised.
    """

    def __init__(self):
        logger.info("Initializing TransactionClassifier")
        self.system_prompt = """
        You are a helpful financial expert responsible for classifying transactions.
        You will be given multiple transactions and a list of chart of accounts to reconcile. 

        For each transaction, you must provide:
        1. A brief thought process and explanation of which chart of account is appropriate for this transaction
        2. The code for the most appropriate chart of account
        3. A confidence score between 0 and 1 (e.g., 0.95 for high confidence, 0.40 for low confidence)

        # IMPORTANT: Instructions for your response:
        You must respond with valid JSON in the following format only:
        {
          "classifications": [
            {
              "transaction_index": 0,
              "reasoning": "string",
              "account": "string",
              "confidence": float
            },
            {
              "transaction_index": 1,
              "reasoning": "string",
              "account": "string",
              "confidence": float
            },
            {
              "transaction_index": 2,
              "reasoning": "string",
              "account": "string",
              "confidence": float
            },
          ]
        }
        """
        self.last_request_time = 0
        self.rate_limit_delay = 2  # 2 seconds between requests (30 requests/minute)

    def _rate_limit(self):
        """Sleep just long enough to keep requests ``rate_limit_delay`` seconds apart."""
        current_time = time.time()
        time_since_last_request = current_time - self.last_request_time
        if time_since_last_request < self.rate_limit_delay:
            delay = self.rate_limit_delay - time_since_last_request
            logger.debug(f"Rate limiting: waiting {delay:.2f} seconds")
            time.sleep(delay)
        self.last_request_time = time.time()

    @staticmethod
    def _align_classifications(classifications, expected_count):
        """Return exactly ``expected_count`` result tuples, one per input transaction.

        When the model returned one entry per transaction, the entries are used
        in ``transaction_index`` order. Otherwise each entry is placed at its own
        ``transaction_index`` and the gaps are filled with an error placeholder,
        so a missing or surplus entry can never shift a result onto a different
        transaction.
        """
        ordered = sorted(classifications, key=lambda c: c['transaction_index'])
        if len(ordered) == expected_count:
            return [(c['account'], c['reasoning'], c['confidence']) for c in ordered]

        aligned = [("ERROR: No classification returned", "", 0.0)] * expected_count
        for c in ordered:
            position = c['transaction_index']
            if isinstance(position, int) and 0 <= position < expected_count:
                aligned[position] = (c['account'], c['reasoning'], c['confidence'])
        return aligned

    def classify_transactions_batch(self, transactions: List[Dict], chart_of_accounts: list) -> List[Tuple[str, str, float]]:
            """Classify a batch of transactions using the LLM.

            Args:
                transactions: One dict per transaction; the list is interpolated
                    into the prompt as-is.
                chart_of_accounts: Candidate accounts, also interpolated as-is.

            Returns:
                A list with one ``(account, reasoning, confidence)`` tuple per
                input transaction, in input order. On any failure every entry
                is an ``("ERROR: ...", "", 0.0)`` tuple.
            """
            # Nothing to classify: skip the rate-limit wait and the LLM call.
            if not transactions:
                return []

            logger.info(f"Processing batch of {len(transactions)} transactions")
            self._rate_limit()

            # Format the transactions for the LLM
            transactions_prompt = f"""
            Please classify the following transactions:
            Transactions: {transactions}
            
            chart of accounts: {chart_of_accounts}
            """

            messages = [
                SystemMessage(content=self.system_prompt),
                HumanMessage(content=transactions_prompt)
            ]

            try:
                # Console output keeps request/response visible in the notebook.
                print("\n=== Sending batch request to LLM ===")
                response = openai_model.invoke(messages)
                print("\n=== Raw LLM Response ===")
                print(response.content)

                try:
                    import re
                    # Take the outermost {...} span so surrounding prose or code fences are ignored.
                    json_match = re.search(r'\{.*\}', response.content, re.DOTALL)
                    if json_match:
                        result = json.loads(json_match.group())
                        classifications = result.get('classifications', [])
                        logger.info(f"Successfully parsed {len(classifications)} classifications")

                        if len(classifications) != len(transactions):
                            logger.warning(
                                f"Expected {len(transactions)} classifications but received {len(classifications)}"
                            )
                        return self._align_classifications(classifications, len(transactions))
                    else:
                        logger.error("No JSON structure found in response")
                        return [(f"ERROR: No JSON found", "", 0.0)] * len(transactions)

                except json.JSONDecodeError as e:
                    logger.error(f"Failed to parse JSON response: {e}")
                    return [(f"ERROR: {str(e)}", "", 0.0)] * len(transactions)

            except Exception as e:
                logger.error(f"Batch classification failed: {str(e)}", exc_info=True)
                return [(f"ERROR: {str(e)}", "", 0.0)] * len(transactions)


def process_transactions(df: pd.DataFrame, chart_of_accounts: list) -> pd.DataFrame:
    """Classify every row of ``df`` in batches of 3 and return an annotated copy.

    Args:
        df: Transactions to classify. ``amount`` is forwarded to the LLM
            unchanged: the frame passed in by this notebook (``result_df``)
            has already been divided by 100 when it was built, so dividing
            again here would understate every amount by a factor of 100.
        chart_of_accounts: Candidate accounts forwarded to the classifier.

    Returns:
        A copy of ``df`` with ``coa_agent``, ``coa_reason`` and
        ``coa_confidence`` columns filled from the classifier results.
    """
    logger.info(f"Starting to process {len(df)} transactions")
    classifier = TransactionClassifier()

    df = df.copy()
    df['coa_agent'] = None
    df['coa_reason'] = None
    df['coa_confidence'] = None

    # Column positions do not change inside the loop; look them up once.
    agent_col = df.columns.get_loc('coa_agent')
    reason_col = df.columns.get_loc('coa_reason')
    confidence_col = df.columns.get_loc('coa_confidence')

    # Process in batches of 3
    batch_size = 3
    for i in range(0, len(df), batch_size):
        batch_rows = df.iloc[i:i + batch_size]

        # Convert batch to list of dictionaries
        batch_transactions = [row.to_dict() for _, row in batch_rows.iterrows()]

        # Get classifications for the batch
        classifications: list = classifier.classify_transactions_batch(batch_transactions, chart_of_accounts)

        # zip() bounds the write-back to this batch's rows, so surplus
        # classifications can never spill into the next batch.
        row_positions = range(i, i + len(batch_rows))
        for idx, (account, reasoning, confidence) in zip(row_positions, classifications):
            df.iloc[idx, agent_col] = account
            df.iloc[idx, reason_col] = reasoning
            df.iloc[idx, confidence_col] = confidence

            # Print summary (reasoning may be empty or a non-string from the model)
            if isinstance(reasoning, str) and reasoning:
                reason_preview = ' '.join(reasoning.split()[:10]) + '...'
            else:
                reason_preview = 'No reason provided'
            logger.info(f"Transaction {idx}:")
            logger.info(f"Account: {account}")
            logger.info(f"Reason Preview: {reason_preview}")
            logger.info("-" * 50)
    logger.info("Finished processing all transactions")
    return df

""" Fetch and store chart of accounts into parsed_accounts """
import json

# Read the JSON file
with open('coa.json', 'r') as file:
    data = json.load(file)

# Extract only the required fields from each ACTIVE account. Filtering inside
# the comprehension guarantees an inactive account contributes nothing (the
# append must never run for an account that failed the status check).
parsed_accounts = [
    {
        'code': account.get('Code', ''),
        'name': account.get('Name', ''),
        'type': account.get('Type', ''),
        'description': account.get('Description', ''),  # Note: Not present in sample but included as requested
        'class': account.get('Class', ''),
    }
    for account in data['Accounts']
    if account.get('Status') == 'ACTIVE'
]

df_reconciled = process_transactions(result_df, parsed_accounts)

df_reconciled

In [5]:
# Write the reconciled transactions to CSV in the working directory.
# NOTE(review): with pandas' default arguments the DataFrame index is written
# as an unnamed first column — confirm that is wanted by downstream readers.
df_reconciled.to_csv("transactions_ai_reconciled.csv")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x13f979090>

In [None]:
""" fuzzy matching clustering"""
from thefuzz import fuzz
import numpy as np
from sklearn.cluster import DBSCAN

def calculate_similarity(str1, str2):
    """Score how alike two remittance strings are, on a 0-100 scale.

    Tokens containing 'LONDON' or 'CO.UK' are stripped first; the score is the
    mean of three fuzzy ratios, plus 20 when the text before the first '*'
    matches, halved when exactly one of the raw strings mentions 'LONDON'.
    """
    def _merchant_part(text):
        # Drop location-like tokens so only the merchant wording is compared.
        tokens = text.strip().split()
        return ' '.join(tok for tok in tokens if 'LONDON' not in tok and 'CO.UK' not in tok)

    merchant_a = _merchant_part(str1)
    merchant_b = _merchant_part(str2)

    # Three complementary fuzzy metrics, averaged.
    scores = (
        fuzz.ratio(merchant_a, merchant_b),
        fuzz.token_sort_ratio(merchant_a, merchant_b),
        fuzz.token_set_ratio(merchant_a, merchant_b),
    )
    similarity = sum(scores) / 3

    # Bonus when both share the same prefix before '*'.
    if merchant_a.split('*')[0] == merchant_b.split('*')[0]:
        similarity = similarity + 20

    # Penalty when only one side carries a LONDON marker.
    only_one_in_london = ('LONDON' in str1) != ('LONDON' in str2)
    if only_one_in_london:
        similarity *= 0.5

    # Clamp into [0, 100].
    return max(0, min(100, similarity))

def create_distance_matrix(merchant_names):
    """Build an n x n matrix of pairwise distances, (100 - similarity) / 100.

    The diagonal is zero; every off-diagonal cell is computed independently
    with calculate_similarity(names[row], names[col]).
    """
    size = len(merchant_names)
    matrix = np.zeros((size, size))

    # Visit cells in row-major order; diagonal cells keep their initial zero.
    for row, col in np.ndindex(size, size):
        if row == col:
            continue
        score = calculate_similarity(merchant_names[row], merchant_names[col])
        # Convert similarity to a non-negative distance.
        matrix[row, col] = max(0, (100 - score) / 100)

    return matrix

def cluster_similar_merchants(df, eps=0.3, min_samples=2):
    """Group transactions whose descriptions look like the same merchant.

    Unique values of ``df['remittance_info']`` are compared pairwise with
    ``create_distance_matrix`` and clustered with DBSCAN on that precomputed
    matrix. The input frame is not modified.

    Args:
        df: DataFrame with a ``remittance_info`` column.
        eps: DBSCAN neighbourhood radius, applied to the precomputed distances.
        min_samples: DBSCAN minimum neighbourhood size.

    Returns:
        A copy of ``df`` with an added ``cluster`` column. Rows whose
        description is missing, or which DBSCAN labels as noise (-1), get NaN.
    """
    # Missing descriptions are not strings and cannot be fuzzy-matched, so
    # leave them out of the comparison; they stay unclustered (NaN) below.
    merchant_names = df['remittance_info'].dropna().unique()

    merchant_clusters = {}
    # DBSCAN cannot fit zero samples; with nothing to compare every row is
    # simply left without a cluster.
    if len(merchant_names) > 0:
        distances = create_distance_matrix(merchant_names)

        clustering = DBSCAN(eps=eps, min_samples=min_samples, metric='precomputed')
        clusters = clustering.fit_predict(distances)

        # -1 is DBSCAN's noise label; those merchants get no cluster entry.
        merchant_clusters = {
            merchant: cluster_id
            for merchant, cluster_id in zip(merchant_names, clusters)
            if cluster_id != -1
        }

    # Attach the labels to a copy so the caller's frame is untouched.
    df_grouped = df.copy()
    df_grouped['cluster'] = df_grouped['remittance_info'].map(merchant_clusters)

    return df_grouped

# Apply clustering
clustered_df = cluster_similar_merchants(result_df)

# Rows that belong to no cluster carry NaN in 'cluster'. NaN is not None and
# never compares equal or ordered, so it has to be dropped before sorting;
# otherwise the ordering of the cluster ids is unreliable.
cluster_ids = sorted(clustered_df['cluster'].dropna().unique())

# Show clusters with their transactions and amounts
for cluster in cluster_ids:
    cluster_data = clustered_df[clustered_df['cluster'] == cluster]
    merchants = cluster_data['remittance_info'].unique()

    if len(merchants) > 1:  # Only show groups with multiple merchants
        total_amount = cluster_data['amount'].sum()
        print(f"\nCluster {cluster} (Total amount: {total_amount}):")
        for merchant in merchants:
            # Sum of all transactions carrying exactly this description.
            amount = cluster_data[cluster_data['remittance_info'] == merchant]['amount'].sum()
            print(f"  {merchant:<40} Amount: {amount}")


Cluster 0.0 (Total amount: -128.51999999999998):
  AMZNMKTPLACE*SV3139IO5  AMAZON.CO.UK     Amount: -17.98
  AMZNMKTPLACE*A37DG8PS5  AMAZON.CO.UK     Amount: -28.88
  AMZNMKTPLACE*1K8C45SV5  AMAZON.CO.UK     Amount: -19.98
  AMZNMKTPLACE*BX8LY0GQ5  AMAZON.CO.UK     Amount: -15.98
  AMZNMKTPLACE*E50UR0V15  AMAZON.CO.UK     Amount: -32.52
  AMZNMKTPLACE*T95QP19W4  AMAZON.CO.UK     Amount: -13.18

Cluster 1.0 (Total amount: -53.940000000000005):
  AMAZON PRIME*VV9BJ3IL5  AMZN.CO.UK/PM    Amount: -17.98
  AMAZON PRIME*O99TT0FV5  AMZN.CO.UK/PM    Amount: -17.98
  AMAZON PRIME*TE26Y4ZT4  AMZN.CO.UK/PM    Amount: -17.98

Cluster 2.0 (Total amount: -67.78):
  AMAZON.CO.UK*ZP8HS0JD5  AMAZON.CO.UK     Amount: -27.0
  AMAZON.CO.UK*D54RH4I65  AMAZON.CO.UK     Amount: -40.78

Cluster 3.0 (Total amount: -109.98000000000002):
  GREGGS HEATHROW NORTH   HAYES            Amount: -9.4
  EUG00840 HEATHROW NORTH Hayes            Amount: -100.58000000000001

Cluster 4.0 (Total amount: -35.940000000000005):

In [None]:
" NTROPY "
import requests

# Read the Ntropy API key from the environment and stop early when it is absent.
ntropy_api_key = os.getenv("NTROPY_API_KEY")
if not ntropy_api_key:
    raise ValueError("NTROPY_API_KEY is not set")
else:
    print("NTROPY_API_KEY is set")

# CREATE NEW ACCOUNT HOLDER
url = "https://api.ntropy.com/v3/account_holders"

# Shared request headers; later Ntropy cells reuse this `headers` dict.
headers = {
    "Accept": "application/json",
    "X-API-KEY": ntropy_api_key,
    "Content-Type": "application/json",
}

# Account holder payload; the same id is used when transforming transactions.
data = {
    "id": "35b927b6-6fda-40aa-93b8-95b47c2b2cad",
    "type": "business",
    "name": "Michael Ali",
    "website": "https://flowon.ai",
    "industry": "ai software",
}

# requests applies no timeout by default, so an unresponsive server would
# block the kernel forever; bound the wait explicitly (seconds).
response = requests.post(url, headers=headers, json=data, timeout=30)
print(response.json())


In [None]:
""" NTROPY BATCH PROCESS TRANSACTIONS"""
import uuid

url = "https://api.ntropy.com/v3/batches/"

# Batch request body: the operation to run and the list of transactions.
# NOTE: `transformed_data` is built in the transform cell further down the
# notebook and `headers` in the account-holder cell, so both must have been
# executed before this one.
data = {
    "operation": "POST /v3/transactions",
    "data": transformed_data,
}

# requests applies no timeout by default, so an unresponsive server would
# block the kernel forever; bound the wait explicitly (seconds).
response = requests.post(url, headers=headers, json=data, timeout=30)
print(response.json())


In [None]:
""" NOW CHECK TRANSACTION STATUS """
# Hard-coded id of the batch to inspect.
# NOTE(review): presumably copied from an earlier batch-submit response — confirm.
batch_id = "1a2bc613-111b-49b1-b35c-77e9b1d7a2fc"

url = f"https://api.ntropy.com/v3/batches/{batch_id}/results"

# requests applies no timeout by default, so an unresponsive server would
# block the kernel forever; bound the wait explicitly (seconds).
get_batch = requests.get(url, headers=headers, timeout=30)

# Bare expression so the notebook renders the decoded JSON body.
get_batch.json()

In [None]:
# Show only the first entry under the 'results' key of the batch response.
get_batch.json()['results'][0]

In [None]:

# Hard-coded id of a single transaction to look up.
transaction_id = "1177539c-b570-4588-9953-d76ae4647afb"

url = f"https://api.ntropy.com/v3/transactions/{transaction_id}"

# requests applies no timeout by default, so an unresponsive server would
# block the kernel forever; bound the wait explicitly (seconds).
get_transaction = requests.get(url, headers=headers, timeout=30)

# Bare expression so the notebook renders the decoded JSON body.
get_transaction.json()

In [None]:
import json 

""" DF TO JSON FIELDS """
# Convert DataFrame to JSON
def prepare_df_for_frontend(df):
    """Convert a bookingDate-indexed DataFrame into a list of plain dicts.

    The index is moved back into a regular ``bookingDate`` column, formatted
    as ``YYYY-MM-DDTHH:MM:SS`` text, and the frame is serialised through JSON
    so the result contains only JSON-compatible Python values. The caller's
    frame is left unchanged.

    Args:
        df: DataFrame whose index is a datetime index named ``bookingDate``.

    Returns:
        A list with one dict per row.
    """
    timestamp_format = '%Y-%m-%dT%H:%M:%S'

    # reset_index exposes bookingDate as a column; assign swaps it for text.
    records_frame = df.reset_index().assign(
        bookingDate=lambda frame: frame['bookingDate'].dt.strftime(timestamp_format)
    )

    # Serialise to a JSON string and parse it straight back into Python objects.
    return json.loads(records_frame.to_json(orient='records', date_format='iso'))

# Serialise the cleaned bank DataFrame into a list of row dicts.
json_data = prepare_df_for_frontend(df)



In [None]:
import uuid 
"""  """
def transform_transaction(transaction, account_holder_id):
    """Map one bank-transaction record onto the enrichment payload shape.

    Args:
        transaction: Mapping with ``remittanceInfo``, ``bookingDate`` (text
            containing a ``T`` separator), ``amount`` and ``currency`` keys.
        account_holder_id: Identifier stored on the resulting payload.

    Returns:
        A dict with a freshly generated UUID ``id``, the description, the
        date part of ``bookingDate``, the amount divided by 100 as a
        non-negative number, the direction of the entry, the currency, the
        account holder id and a fixed ``GB`` location.
    """
    new_id = str(uuid.uuid4())
    raw_amount = transaction["amount"]

    # Everything before the 'T' is the calendar date.
    date_part, _, _ = transaction["bookingDate"].partition("T")

    # Negative amounts are money leaving the account.
    if raw_amount < 0:
        direction = "outgoing"
    else:
        direction = "incoming"

    payload = {
        "id": new_id,
        "description": transaction["remittanceInfo"],
        "date": date_part,
        "amount": abs(raw_amount / 100),  # sign is carried by entry_type
        "entry_type": direction,
        "currency": transaction["currency"],
        "account_holder_id": account_holder_id,
        "location": {"country": "GB"},
    }
    return payload

# Account holder the transactions are attributed to; this is the same id the
# account-holder creation cell sends.
account_holder_id = "35b927b6-6fda-40aa-93b8-95b47c2b2cad"

# Convert every record of json_data into the transaction payload shape.
transformed_data = [transform_transaction(record, account_holder_id) for record in json_data]

# Bare expression so the notebook displays the payload list.
transformed_data

## To evaluate responses of the LLM

In [6]:
# Persist df_processed to CSV in the working directory.
# NOTE(review): this is the same path the df_reconciled export earlier in the
# notebook writes to, so running both overwrites one with the other — confirm
# that is intended.
df_processed.to_csv("transactions_ai_reconciled.csv")