<a href="https://colab.research.google.com/github/pastrop/kaggle/blob/master/transacton_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
!pip install anthropic
!pip install base64

In [None]:
import anthropic
import base64

In [None]:
import json
#from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional

import pandas as pd

In [None]:
class ContractParserSend:
    """Extracts commission structures from PDF contracts using Claude API.

    A contract PDF is base64-encoded and sent to the model together with an
    extraction prompt; the reply is expected to be a single JSON object,
    which is decoded and tagged with the path of the file it came from.
    """

    def __init__(self, api_key: str):
        """Create the Anthropic client used for every request.

        Args:
            api_key: API key passed to ``anthropic.Anthropic``.
        """
        self.client = anthropic.Anthropic(api_key=api_key)

    @staticmethod
    def _strip_code_fence(response_text: str) -> str:
        """Return the reply without outer whitespace or a Markdown code fence."""
        cleaned = response_text.strip()
        # Check the longer opener first so "```json" is not cut down to "json".
        for fence in ('```json', '```'):
            if cleaned.startswith(fence):
                cleaned = cleaned[len(fence):]
                break
        if cleaned.endswith('```'):
            cleaned = cleaned[:-3]
        return cleaned.strip()

    @classmethod
    def _parse_response(cls, response_text: str, pdf_path: str) -> Dict:
        """Decode the model reply into a dict and record its source file."""
        contract_data = json.loads(cls._strip_code_fence(response_text))
        # The prompt asks for a JSON object; fail with a clear message if the
        # model answered with a list, string or number instead.
        if not isinstance(contract_data, dict):
            raise TypeError(
                "Expected a JSON object in the model response, got "
                f"{type(contract_data).__name__}"
            )
        contract_data['source_file'] = pdf_path
        return contract_data

    def parse_contract(self, pdf_path: str) -> Dict:
        """
        Parse a PDF contract and extract rates and fees information.

        Args:
            pdf_path: Path to the PDF contract file

        Returns:
            Dictionary decoded from the model's JSON reply, with an added
            'source_file' key holding ``pdf_path``.

        Raises:
            OSError: If the PDF cannot be opened (e.g. FileNotFoundError).
            json.JSONDecodeError: If the reply is not valid JSON.
            TypeError: If the reply is valid JSON but not a JSON object.

        Errors raised by the API client call itself propagate unchanged.
        """
        # Read and encode PDF
        with open(pdf_path, 'rb') as f:
            pdf_data = base64.standard_b64encode(f.read()).decode('utf-8')

        # Prompt for structured extraction
        extraction_prompt = """
        Analyze this contract and extract the fees and rates information.

        Return a JSON object containing any rate or fee information (no markdown, no preamble).
        Analyze this contract and extract all fees, commisions, renumneration and rate-related information.
        For each of the above being found, provide:

        Fee/Rate Name or Type (e.g., transaction fee, monthly fee, interchange fee, processing rate, renumeration, etc.,)
        Amount or Percentage (exact values, including any tiered structures)
        Calculation Method if specified (how it's applied - per transaction, monthly, annual, etc.)
        Conditions or Tiers (if rates vary based on volume, transaction type, or other factors)
        Currency (if specified)
        Effective Date (when rates apply or change)

        Also identify:

        Any minimum or maximum fee caps
        Setup or onboarding fees
        Termination or cancellation fees
        Late payment penalties or interest rates
        Volume-based discounts or pricing tiers
        Any fees that may be waived under certain conditions

        If  rates or fees are percentage-based, convert to decimal (e.g., 5% -> 0.05).
        If rates or fees depends on the region, currency, Alternative Payment Methods
        (digital wallets, bank transfers, etc., not cash/cards),  return all types
        """

        # Call Claude API with PDF: the document block comes first, then the
        # text instructions that refer to it.
        message = self.client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=4096,
            messages=[{
                "role": "user",
                "content": [
                    {
                        "type": "document",
                        "source": {
                            "type": "base64",
                            "media_type": "application/pdf",
                            "data": pdf_data
                        }
                    },
                    {
                        "type": "text",
                        "text": extraction_prompt
                    }
                ]
            }]
        )

        # The reply text is read from the first content block of the message.
        return self._parse_response(message.content[0].text, pdf_path)

    def parse_multiple_contracts(self, pdf_directory: str) -> List[Dict]:
        """Parse all PDFs in a directory.

        Args:
            pdf_directory: Directory searched (non-recursively) for '*.pdf'.

        Returns:
            One parsed dictionary per PDF that was processed successfully.
            Files that fail are reported on stdout and skipped.
        """
        contracts = []
        pdf_dir = Path(pdf_directory)

        # Sort so the processing order (and the result order) is reproducible
        # instead of depending on how the filesystem lists the directory.
        for pdf_file in sorted(pdf_dir.glob('*.pdf')):
            print(f"Parsing {pdf_file.name}...")
            try:
                contract_data = self.parse_contract(str(pdf_file))
            except Exception as e:
                # Best effort: one bad contract must not abort the whole batch.
                print(f"Error parsing {pdf_file.name}: {e}")
            else:
                contracts.append(contract_data)

        return contracts

In [None]:
class ContractParserSend_all_contracts_fromJSON:
    """Extracts commission structures from PDF contracts using Claude API.

    The contracts to process are listed in a JSON file (see
    ``load_contracts``). Each successfully parsed contract is written to
    "<file_name>.json" in the current working directory.
    """

    def __init__(self, api_key: str):
        """Create the Anthropic client used for every request.

        Args:
            api_key: API key passed to ``anthropic.Anthropic``.
        """
        self.client = anthropic.Anthropic(api_key=api_key)

    @staticmethod
    def _strip_code_fence(response_text: str) -> str:
        """Return the reply without outer whitespace or a Markdown code fence."""
        cleaned = response_text.strip()
        # Check the longer opener first so "```json" is not cut down to "json".
        for fence in ('```json', '```'):
            if cleaned.startswith(fence):
                cleaned = cleaned[len(fence):]
                break
        if cleaned.endswith('```'):
            cleaned = cleaned[:-3]
        return cleaned.strip()

    @classmethod
    def _parse_response(cls, response_text: str, pdf_path: str) -> Dict:
        """Decode the model reply into a dict and record its source file."""
        contract_data = json.loads(cls._strip_code_fence(response_text))
        # The prompt asks for a JSON object; fail with a clear message if the
        # model answered with a list, string or number instead.
        if not isinstance(contract_data, dict):
            raise TypeError(
                "Expected a JSON object in the model response, got "
                f"{type(contract_data).__name__}"
            )
        contract_data['source_file'] = pdf_path
        return contract_data

    def parse_contract(self, pdf_path: str) -> Dict:
        """
        Parse a PDF contract and extract rates and fees information.

        Args:
            pdf_path: Path to the PDF contract file

        Returns:
            Dictionary decoded from the model's JSON reply, with an added
            'source_file' key holding ``pdf_path``.

        Raises:
            OSError: If the PDF cannot be opened (e.g. FileNotFoundError).
            json.JSONDecodeError: If the reply is not valid JSON.
            TypeError: If the reply is valid JSON but not a JSON object.

        Errors raised by the API client call itself propagate unchanged.
        """
        # Read and encode PDF
        with open(pdf_path, 'rb') as f:
            pdf_data = base64.standard_b64encode(f.read()).decode('utf-8')

        # Prompt for structured extraction
        extraction_prompt = """
        Analyze this contract and extract the fees and rates information.

        Return a JSON object containing any rate or fee information (no markdown, no preamble).
        Analyze this contract and extract all fees, commisions, renumneration and rate-related information.
        For each of the above being found, provide:

        Fee/Rate Name or Type (e.g., transaction fee, monthly fee, interchange fee, processing rate, renumeration, etc.,)
        Amount or Percentage (exact values, including any tiered structures)
        Calculation Method if specified (how it's applied - per transaction, monthly, annual, etc.)
        Conditions or Tiers (if rates vary based on volume, transaction type, or other factors)
        Currency (if specified)
        Effective Date (when rates apply or change)

        Also identify:

        Any minimum or maximum fee caps
        Setup or onboarding fees
        Termination or cancellation fees
        Late payment penalties or interest rates
        Volume-based discounts or pricing tiers
        Any fees that may be waived under certain conditions

        If  rates or fees are percentage-based, convert to decimal (e.g., 5% -> 0.05).
        If rates or fees depends on the region, currency, Alternative Payment Methods
        (digital wallets, bank transfers, etc., not cash/cards),  return all types
        """

        # Call Claude API with PDF: the document block comes first, then the
        # text instructions that refer to it.
        message = self.client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=4096,
            messages=[{
                "role": "user",
                "content": [
                    {
                        "type": "document",
                        "source": {
                            "type": "base64",
                            "media_type": "application/pdf",
                            "data": pdf_data
                        }
                    },
                    {
                        "type": "text",
                        "text": extraction_prompt
                    }
                ]
            }]
        )

        # The reply text is read from the first content block of the message.
        return self._parse_response(message.content[0].text, pdf_path)

    @staticmethod
    def load_contracts(json_file: str = "contracts_data.json") -> List[Dict[str, Any]]:
        """
        Load contracts data from JSON file.

        Declared as a static method so it works both as
        ``self.load_contracts()`` and directly on the class.

        Args:
            json_file: Path to the JSON file (default: "contracts_data.json")

        Returns:
            List of contract dictionaries

        Raises:
            FileNotFoundError: If the JSON file doesn't exist
            json.JSONDecodeError: If the JSON file is invalid
            KeyError: If the 'contracts' key is missing from the JSON
        """
        json_path = Path(json_file)

        if not json_path.exists():
            raise FileNotFoundError(f"JSON file not found: {json_path}")

        with json_path.open('r', encoding='utf-8') as f:
            data = json.load(f)

        if 'contracts' not in data:
            raise KeyError("'contracts' key not found in JSON file")

        return data['contracts']

    def parse_multiple_contracts(self, pdf_directory: str,
                                 json_file: str = "contracts_data.json") -> List[Dict]:
        """Parse every contract listed in the JSON file and save each result.

        Args:
            pdf_directory: Not used by this implementation; kept so existing
                callers that pass a directory keep working.
            json_file: JSON file listing the contracts
                (default: "contracts_data.json").

        Returns:
            The parsed dictionaries of the contracts that succeeded. Failures
            are reported on stdout and skipped.

        Raises:
            FileNotFoundError, json.JSONDecodeError, KeyError: Propagated from
                ``load_contracts`` when the listing itself cannot be read.
        """
        contracts = self.load_contracts(json_file)
        parsed_contracts = []

        for contract in contracts:
            file_name = contract['file_name']
            print(f"Parsing {file_name}...")
            try:
                # NOTE(review): parse_contract opens its argument as a file
                # path, so 'pdf_data' presumably holds the path of the PDF --
                # confirm against the schema of the contracts JSON.
                contract_data = self.parse_contract(contract['pdf_data'])
                with open(f"{file_name}.json", 'w') as f:
                    json.dump(contract_data, f, indent=2)
            except Exception as e:
                # Best effort: one bad contract must not abort the whole batch.
                print(f"Error parsing {file_name}: {e}")
            else:
                parsed_contracts.append(contract_data)

        return parsed_contracts

In [None]:
from google.colab import userdata

# Fetch the API key from Colab Secrets with userdata.get(<secret name>).
# The secret is looked up under the name 'Antropic', spelled exactly like that.
# If your key is stored under a different name, for example 'ANTHROPIC_API_KEY',
# change the argument accordingly: userdata.get('ANTHROPIC_API_KEY')

# API_KEY is passed to the contract parser classes when they are instantiated.
API_KEY = userdata.get('Antropic')
print(f"Retrieved API Key: {API_KEY[:5]}...") # Show only the first 5 characters so the full key never appears in the cell output

Retrieved API Key: sk-an...


In [None]:
# Step 1: extract the fee/rate data from one contract PDF
print("=== Parsing Contracts ===")
parser = ContractParserSend(API_KEY)
contracts = parser.parse_contract("/content/PIXTHENA.pdf")

# Persist the extracted dictionary as indented JSON
with open('parsed_contracts.json', 'w') as f:
    f.write(json.dumps(contracts, indent=2))

=== Parsing Contracts ===


# Code Snippets

In [2]:
import os

# Where the demo file goes; the folder is created first if it is missing.
directory = '/content/my_data'
os.makedirs(directory, exist_ok=True)

# Build the full path and write the file (an existing file is overwritten).
file_path = os.path.join(directory, 'another output.txt')
with open(file_path, 'w') as f:
    f.write("Hello from Google Colab!")

print(f"File saved to: {file_path}")

File saved to: /content/my_data/another output.txt


In [None]:
def load_contracts(json_file: str = "contracts_data.json") -> List[Dict[str, Any]]:
    """
    Return the value stored under the top-level 'contracts' key of a JSON file.

    Args:
        json_file: Location of the JSON file (default: "contracts_data.json")

    Returns:
        List of contract dictionaries

    Raises:
        FileNotFoundError: If nothing exists at ``json_file``
        json.JSONDecodeError: If the file content is not valid JSON
        KeyError: If the decoded JSON has no 'contracts' key
    """
    source = Path(json_file)

    # Fail early with an explicit message rather than the bare OS error.
    if not source.exists():
        raise FileNotFoundError(f"JSON file not found: {source}")

    payload = json.loads(source.read_text(encoding='utf-8'))

    if 'contracts' in payload:
        return payload['contracts']

    raise KeyError("'contracts' key not found in JSON file")
# Load the contract list from the default file, "contracts_data.json".
contracts = load_contracts()

In [None]:
class ContractParser:
    """Extracts commission structures from PDF contracts using Claude API.

    A contract PDF is base64-encoded and sent to the model together with an
    extraction prompt; the reply is expected to be a single JSON object,
    which is decoded and tagged with the path of the file it came from.
    """

    def __init__(self, api_key: str):
        """Create the Anthropic client used for every request.

        Args:
            api_key: API key passed to ``anthropic.Anthropic``.
        """
        self.client = anthropic.Anthropic(api_key=api_key)

    @staticmethod
    def _strip_code_fence(response_text: str) -> str:
        """Return the reply without outer whitespace or a Markdown code fence."""
        cleaned = response_text.strip()
        # Check the longer opener first so "```json" is not cut down to "json".
        for fence in ('```json', '```'):
            if cleaned.startswith(fence):
                cleaned = cleaned[len(fence):]
                break
        if cleaned.endswith('```'):
            cleaned = cleaned[:-3]
        return cleaned.strip()

    @classmethod
    def _parse_response(cls, response_text: str, pdf_path: str) -> Dict:
        """Decode the model reply into a dict and record its source file."""
        contract_data = json.loads(cls._strip_code_fence(response_text))
        # The prompt asks for a JSON object; fail with a clear message if the
        # model answered with a list, string or number instead.
        if not isinstance(contract_data, dict):
            raise TypeError(
                "Expected a JSON object in the model response, got "
                f"{type(contract_data).__name__}"
            )
        contract_data['source_file'] = pdf_path
        return contract_data

    def parse_contract(self, pdf_path: str) -> Dict:
        """
        Parse a PDF contract and extract rates and fees information.

        Args:
            pdf_path: Path to the PDF contract file

        Returns:
            Dictionary decoded from the model's JSON reply, with an added
            'source_file' key holding ``pdf_path``.

        Raises:
            OSError: If the PDF cannot be opened (e.g. FileNotFoundError).
            json.JSONDecodeError: If the reply is not valid JSON.
            TypeError: If the reply is valid JSON but not a JSON object.

        Errors raised by the API client call itself propagate unchanged.
        """
        # Read and encode PDF
        with open(pdf_path, 'rb') as f:
            pdf_data = base64.standard_b64encode(f.read()).decode('utf-8')

        # Prompt for structured extraction
        extraction_prompt = """
        Analyze this contract and extract the fees and rates information.

        Return a JSON object containing any rate or fee information (no markdown, no preamble).
        The exrtacted data should include information about:
        Analyze this contract and extract all fee and rate-related information. For each fee or rate, provide:

        Fee/Rate Name or Type (e.g., transaction fee, monthly fee, interchange fee, processing rate)
        Amount or Percentage (exact values, including any tiered structures)
        Calculation Method (how it's applied - per transaction, monthly, annual, etc.)
        Conditions or Tiers (if rates vary based on volume, transaction type, or other factors)
        Currency (if specified)
        Effective Date (when rates apply or change)

        Also identify:

        Any minimum or maximum fee caps
        Setup or onboarding fees
        Termination or cancellation fees
        Late payment penalties or interest rates
        Volume-based discounts or pricing tiers
        Any fees that may be waived under certain conditions

        If  rates or fees are percentage-based, convert to decimal (e.g., 5% -> 0.05).
        If rates or fees depends on the region, currency, Alternative Payment Methods
        (digital wallets, bank transfers, etc., not cash/cards),  return all types
        """

        # Call Claude API with PDF: the document block comes first, then the
        # text instructions that refer to it.
        message = self.client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=4096,
            messages=[{
                "role": "user",
                "content": [
                    {
                        "type": "document",
                        "source": {
                            "type": "base64",
                            "media_type": "application/pdf",
                            "data": pdf_data
                        }
                    },
                    {
                        "type": "text",
                        "text": extraction_prompt
                    }
                ]
            }]
        )

        # The reply text is read from the first content block of the message.
        return self._parse_response(message.content[0].text, pdf_path)

    def parse_multiple_contracts(self, pdf_directory: str) -> List[Dict]:
        """Parse all PDFs in a directory.

        Args:
            pdf_directory: Directory searched (non-recursively) for '*.pdf'.

        Returns:
            One parsed dictionary per PDF that was processed successfully.
            Files that fail are reported on stdout and skipped.
        """
        contracts = []
        pdf_dir = Path(pdf_directory)

        # Sort so the processing order (and the result order) is reproducible
        # instead of depending on how the filesystem lists the directory.
        for pdf_file in sorted(pdf_dir.glob('*.pdf')):
            print(f"Parsing {pdf_file.name}...")
            try:
                contract_data = self.parse_contract(str(pdf_file))
            except Exception as e:
                # Best effort: one bad contract must not abort the whole batch.
                print(f"Error parsing {pdf_file.name}: {e}")
            else:
                contracts.append(contract_data)

        return contracts