<a href="https://colab.research.google.com/github/pastrop/kaggle/blob/master/transacton_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install anthropic
!pip install base64

In [2]:
import anthropic
import pandas as pd
import json
import base64
from typing import List, Dict, Optional
from datetime import datetime
from pathlib import Path

In [3]:
class ContractParser:
    """Extracts commission structures from PDF contracts using Claude API.

    Attributes:
        client: Anthropic API client used for every request.
        model: Model identifier sent with each request.
        max_tokens: Upper bound on generated tokens per response.
    """

    # Class-level defaults; __init__ overrides them only when asked to.
    model = "claude-sonnet-4-20250514"
    max_tokens = 4096

    def __init__(self, api_key: str, model: Optional[str] = None,
                 max_tokens: Optional[int] = None):
        """
        Create a parser bound to one Anthropic API key.

        Args:
            api_key: Anthropic API key.
            model: Optional model identifier; defaults to the class-level value.
            max_tokens: Optional response token limit; defaults to the
                class-level value.
        """
        self.client = anthropic.Anthropic(api_key=api_key)
        if model is not None:
            self.model = model
        if max_tokens is not None:
            self.max_tokens = max_tokens

    @staticmethod
    def _extract_json(response_text: str):
        """Decode the JSON payload of a model reply, tolerating fences and stray prose."""
        cleaned = response_text.strip()

        # Remove markdown fencing if the model added it despite the instructions.
        for fence in ('```json', '```'):
            if cleaned.startswith(fence):
                cleaned = cleaned[len(fence):]
                break
        if cleaned.endswith('```'):
            cleaned = cleaned[:-3]
        cleaned = cleaned.strip()

        decoder = json.JSONDecoder()
        try:
            return decoder.decode(cleaned)
        except json.JSONDecodeError as err:
            original_error = err

        # Fallback: the reply may wrap the JSON in a preamble or trailing text.
        # Only the FIRST opening bracket is tried, so a malformed outer value
        # can never be silently replaced by one of its nested children.
        starts = [pos for pos in (cleaned.find('{'), cleaned.find('[')) if pos != -1]
        if not starts:
            raise original_error
        try:
            value, _ = decoder.raw_decode(cleaned, min(starts))
        except json.JSONDecodeError:
            raise original_error from None
        return value

    def parse_contract(self, pdf_path: str) -> Dict:
        """
        Parse a PDF contract and extract rates and fees information.

        Args:
            pdf_path: Path to the PDF contract file

        Returns:
            Dictionary with structured contract data, plus a 'source_file'
            key holding ``pdf_path``.

        Raises:
            OSError: If the PDF cannot be read.
            ValueError: If the response was truncated, contains no text,
                is not valid JSON (``json.JSONDecodeError``), or is not a
                JSON object.
        """
        # Read and encode PDF
        with open(pdf_path, 'rb') as f:
            pdf_data = base64.standard_b64encode(f.read()).decode('utf-8')

        # Prompt for structured extraction
        # NOTE(review): the prompt is kept verbatim; it contains a typo
        # ("exrtacted") and a duplicated opening instruction that could be
        # tidied in a dedicated prompt change.
        extraction_prompt = """
        Analyze this contract and extract the fees and rates information.

        Return a JSON object containing any rate or fee information (no markdown, no preamble).
        The exrtacted data should include information about:
        Analyze this contract and extract all fee and rate-related information. For each fee or rate, provide:

        Fee/Rate Name or Type (e.g., transaction fee, monthly fee, interchange fee, processing rate)
        Amount or Percentage (exact values, including any tiered structures)
        Calculation Method (how it's applied - per transaction, monthly, annual, etc.)
        Conditions or Tiers (if rates vary based on volume, transaction type, or other factors)
        Currency (if specified)
        Effective Date (when rates apply or change)

        Also identify:

        Any minimum or maximum fee caps
        Setup or onboarding fees
        Termination or cancellation fees
        Late payment penalties or interest rates
        Volume-based discounts or pricing tiers
        Any fees that may be waived under certain conditions

        If  rates or fees are percentage-based, convert to decimal (e.g., 5% -> 0.05).
        If rates or fees depends on the region, currency, Alternative Payment Methods
        (digital wallets, bank transfers, etc., not cash/cards),  return all types
        """

        # Call Claude API with PDF
        message = self.client.messages.create(
            model=self.model,
            max_tokens=self.max_tokens,
            messages=[{
                "role": "user",
                "content": [
                    {
                        "type": "document",
                        "source": {
                            "type": "base64",
                            "media_type": "application/pdf",
                            "data": pdf_data
                        }
                    },
                    {
                        "type": "text",
                        "text": extraction_prompt
                    }
                ]
            }]
        )

        # A reply cut off at the token limit is incomplete JSON; report that
        # directly instead of surfacing an opaque decode error.
        if getattr(message, 'stop_reason', None) == 'max_tokens':
            raise ValueError(
                f"Response for {pdf_path} was truncated at "
                f"max_tokens={self.max_tokens}; the JSON is incomplete"
            )

        # Collect text blocks only rather than assuming content[0] is text.
        response_text = ''.join(
            block.text
            for block in message.content
            if getattr(block, 'type', None) == 'text'
        )
        if not response_text.strip():
            raise ValueError(f"Response for {pdf_path} contained no text content")

        contract_data = self._extract_json(response_text)

        # The prompt asks for a JSON object; anything else cannot carry the
        # 'source_file' key added below.
        if not isinstance(contract_data, dict):
            raise ValueError(
                f"Expected a JSON object for {pdf_path}, "
                f"got {type(contract_data).__name__}"
            )
        contract_data['source_file'] = pdf_path

        return contract_data

    def parse_multiple_contracts(self, pdf_directory: str) -> List[Dict]:
        """
        Parse all PDFs in a directory.

        Files are matched on a case-insensitive '.pdf' suffix and processed
        in sorted path order so results are reproducible. A file that fails
        to parse is reported on stdout and skipped.

        Args:
            pdf_directory: Directory containing the PDF contracts

        Returns:
            List of parsed contract dictionaries, one per successful file
        """
        contracts = []
        pdf_dir = Path(pdf_directory)

        pdf_files = sorted(
            entry for entry in pdf_dir.glob('*')
            if entry.suffix.lower() == '.pdf' and entry.is_file()
        )

        for pdf_file in pdf_files:
            print(f"Parsing {pdf_file.name}...")
            try:
                contract_data = self.parse_contract(str(pdf_file))
            except Exception as e:
                # Best-effort batch: one bad contract must not stop the rest.
                print(f"Error parsing {pdf_file.name}: {e}")
            else:
                contracts.append(contract_data)

        return contracts

In [8]:
from google.colab import userdata

# The Anthropic key is read from Colab Secrets; the name passed to
# userdata.get() must match the stored secret exactly ('Antropic' here).
API_KEY = userdata.get('Antropic')

# Show only the first five characters so the full key never reaches the output.
key_prefix = API_KEY[:5]
print(f"Retrieved API Key: {key_prefix}...")

Retrieved API Key: sk-an...


In [10]:
# Step 1: run the extraction on a single contract PDF
print("=== Parsing Contracts ===")
parser = ContractParser(API_KEY)
contracts = parser.parse_contract("/content/faro_test.pdf")

# Persist the extracted data as indented JSON in the working directory
serialized_contracts = json.dumps(contracts, indent=2)
Path('parsed_contracts.json').write_text(serialized_contracts)

=== Parsing Contracts ===
