# README
## Notebook Instructions

1. (In Section 1) Insert your Together.AI API key.
2. Make sure that the files 'Stanford_Expenses Pre-Processed.json' and 'Stanford University Expense Policy With Labeled Sections.jsonl' are in the local directory.
3. Add a pdf file of an expense receipt to the local directory. The expense must be of one of the following categories: "Airfare", "Lodging", "Ground Transportation", "Meals", "Other Reimbursable Business Expenses", "Employee Gifts", "Business Meals", and "Travel Meals".
4. (In Section 7) Insert the filepath of the receipt's pdf.
5. Fill in the user information in Section 7: status and date.
6. Run all cells except for those in Section 6.
7. Wait for the last cell, which performs the main execution, to finish running.

# SECTION 1: Load client and packages
[Action Item] Insert your Together.AI API key

In [None]:
!pip install together
!pip install PyPDF2

Collecting PyPDF2
  Using cached pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Using cached pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
from together import Together
import json
import re
from collections import defaultdict
from dataclasses import dataclass, field
from typing import Dict, Optional, List
import sys
import PyPDF2
from IPython.display import display, Markdown

## [Action Item] Insert your Together.AI API key

In [None]:
client = Together(api_key=FILL HERE)

# SECTION 2: Loading files (Policy and Receipts)

In [None]:
# Function to read a JSONL file
def read_jsonl(file_path):
    """Read a JSON Lines file into a list of parsed records.

    Args:
        file_path (str): Path to a file holding one JSON document per line.

    Returns:
        list: The parsed value of every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8 so the result does not depend on the platform's
    # default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        stripped_lines = (line.strip() for line in file)
        # Blank lines (e.g. a trailing newline at end of file) carry no
        # record and would make json.loads raise, so they are skipped.
        return [json.loads(line) for line in stripped_lines if line]

In [None]:
def extract_receipt_info(pdf_path):
    """
    Extract information from a receipt PDF and structure it into a JSON format.

    The text of every PDF page is concatenated, embedded in a prompt, and sent
    to the chat model through the module-level ``client``. The first
    ``{ ... }`` span of the reply is parsed as JSON.

    Args:
        pdf_path (str): Path to the PDF receipt

    Returns:
        dict: Structured receipt information, or an empty dict when the reply
        contains no JSON object or the object cannot be parsed (an error
        message is printed in both cases).
    """
    # Extract text from PDF
    def extract_text_from_pdf(file_path):
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # `or ""` keeps a page that yields no text from breaking the join.
            return "".join(page.extract_text() or "" for page in reader.pages)

    # Get raw text from receipt
    receipt_text = extract_text_from_pdf(pdf_path)

    # NOTE: doubled braces are literal braces inside the f-string. The
    # template below is kept balanced so the model sees a well-formed shape.
    prompt = f"""
    # Instructions
    You are an AI assistant helping to extract information from receipts into a structured JSON format.
    Analyze the receipt text and create a JSON object with relevant information.

    The structure should be:
    {{
      "receipt": {{
        "general_information": {{
          "receipt_id": (if available),
          "receipt_date": (date in any format found),
          "receipt_time": (if available),
          "vendor_name": (business name),
          "vendor_address": (if available),
          "transaction_type": (if specified),
          "payment_method": (if available),
          "currency": (default to "USD" if not specified)
        }},
        "expense_details": {{
          // For standard receipts with subtotal/tax/tip:
          "total_amount": (total including tax and tip),
          "subtotal": (before tax and tip),
          "taxes": (if itemized),
          "discounts": (if any),
          "tip": (if included),
          // OR for transportation:
          // Include any specific fees, charges, or fare details
        }}
      }}
    }}

    Receipt Text:
    {receipt_text}

    Important:
    - Include only fields where information is available
    - Format numbers as numbers, not strings
    - Format dates and times as strings
    - Use consistent formatting
    - Include category-specific details based on receipt type
    - Maintain proper JSON structure

    Return only valid JSON without any additional text or explanation.
    """

    response = client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
        messages=[{"role": "user", "content": prompt}],
    )

    # Extract JSON from response
    output = response.choices[0].message.content

    # Greedy match: from the first '{' to the last '}' in the reply, so any
    # surrounding commentary or code fences are ignored.
    json_match = re.search(r'({[\s\S]*})', output)
    if not json_match:
        print("Error: Couldn't find JSON in response")
        return {}

    try:
        return json.loads(json_match.group())
    except json.JSONDecodeError:
        print("Error: Found JSON-like structure but couldn't parse")
        return {}

In [None]:
# Hard-coded sample receipt (a restaurant meal) using the nested
# "receipt" -> "general_information" / "expense_details" layout that the
# extract_receipt_info prompt asks the model to produce.
# The amounts are self-consistent: subtotal 32 + taxes 2.88 + tip 4 = 38.88.
# NOTE(review): presumably a stand-in for running the pipeline without a PDF;
# confirm against the main execution cell.
extracted_receipt_json =  {
              "receipt": {
                "general_information": {
                  "receipt_id": "12345",
                  "receipt_date": "10/09/2024",
                  "receipt_time":"9:23 PM",
                  "vendor_name": "Shake Shack",
                  "vendor_address": "459 Lagunita Dr",
                  "transaction_type": "Sale",
                  "payment_method": "VISA 1234",
                  "currency": "USD"
                },
                "expense_details": {
                  "total_amount": 38.88,
                  "subtotal": 32,
                  "taxes": 2.88,
                  "discounts": 0,
                  "tip": 4,
                  "category_specific_details": {
                    "meals": {
                      "location": "Stanford CA",
                      "items": {
                          "1 Burger": 12,
                          "1 Wine": 10,
                          "1 Fries": 10
                      }
                    }
                  }
                }
              }
            }

In [None]:
"""
extracted_receipt_json =  {
              "receipt": {
                "general_information": {
                  "receipt_date": "December 2 2024",
                  "vendor_name": "Uber",
                  "payment_method": "VISA 9130",
                  "currency": "USD",
                  "from": "SFO International Airport",
                  "to": "1035 Campus Drive, Stanford CA 94305",
                },
                "expense_details": {
                  "Trip fair": 37.56,
                  "Booking Fee": 11.61,
                  "SFO Airport Surcharge": 5.50,
                  "Access for All Fee": 0.10,
                  "CA Driver Benefits": 0.32,
                  "Tip": 5.00,
                  "Promotion": -2.75,
                  "Total": 57.34
                }
              }
            }
"""

'\nextracted_receipt_json =  {\n              "receipt": {\n                "general_information": {\n                  "receipt_date": "December 2 2024",\n                  "vendor_name": "Uber",\n                  "payment_method": "VISA 9130",\n                  "currency": "USD",\n                  "from": "SFO International Airport",\n                  "to": "1035 Campus Drive, Stanford CA 94305",\n                },\n                "expense_details": {\n                  "Trip fair": 37.56,\n                  "Booking Fee": 11.61,\n                  "SFO Airport Surcharge": 5.50,\n                  "Access for All Fee": 0.10,\n                  "CA Driver Benefits": 0.32,\n                  "Tip": 5.00,\n                  "Promotion": -2.75,\n                  "Total": 57.34\n                }\n              }\n            }\n'

# SECTION 3: Pre Processing functions

In [None]:
# Function to summarize content using an LLM
def summarize_content(content):
    """Ask the chat model for an applicability summary of a policy segment.

    Args:
        content: Policy text to be summarized; it is interpolated into the
            prompt as-is.

    Returns:
        The text of the first completion choice returned by the model.
    """
    summary_request = f"""
    You are given a segment of an expense policy. Your task is to produce a concise summary that highlights which types of expenses this section applies to, along with the conditions or contexts in which it is relevant. The summary should:

    1. Identify the categories of expenses covered (e.g., travel, lodging, meals, conference fees, equipment).
    2. Highlight any roles or stakeholders mentioned (e.g., employees, faculty, students, guests).
    3. Mention any conditions or constraints (e.g., domestic vs. international travel, allowable amounts, required documentation).
    4. Avoid irrelevant details and focus on information that would help determine whether this section is applicable to a given expense.
    5. Be written as a standalone summary, without referencing the instruction text.
    Policy:
    {content}

    Summary:
    """

    # Single-turn conversation: the whole request travels as one user message.
    user_message = {"role": "user", "content": summary_request}
    completion = client.chat.completions.create(
        model="meta-llama/Llama-3.2-3B-Instruct-Turbo",
        messages=[user_message],
    )

    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
# Function to analyze content and structure it into individual atomic clauses
def analyze_content_to_clauses(content):
    """Ask the chat model to split policy text into individual atomic rules.

    Args:
        content: Policy text to analyze; it is interpolated into the prompt.

    Returns:
        The raw text of the first completion choice. The prompt requests a
        list of rules, but the reply is returned unparsed.
    """
    clause_request = f"""
        Analyze the following policy content and extract individual atomic clauses. Each clause should correspond to a specific rule that can be applied to evaluating whether an expense is valid or not.
        Format the output as a list of rules.
        For example:
        [\"1. An expense must have a receipt\",\"2. Travel meals should not be more than $50 USD.\"]

        Content to analyze:
        {content}
        Rules:
        """

    # Single-turn conversation: the whole request travels as one user message.
    user_message = {"role": "user", "content": clause_request}
    completion = client.chat.completions.create(
        model="meta-llama/Llama-3.2-3B-Instruct-Turbo",
        messages=[user_message],
    )

    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
def label_section(policy, section_path, organization):
    """
    Break a policy section into logically coherent clauses and label each one.

    The section text is looked up on ``policy`` and sent to the chat model,
    which is asked to return a JSON array of ``{"text", "label"}`` objects.

    Args:
        policy: Tree object exposing ``get_section(*path)``; the returned node
            must have a ``raw_content`` attribute.
        section_path (str): Titles from the tree root down to the section,
            joined with ">" (surrounding whitespace is ignored).
        organization (str): Organization name inserted into the prompt.

    Returns:
        list: The parsed JSON array, or an empty list when the section does
        not exist, no array is found in the reply, or the array cannot be
        parsed (an error message is printed in each case).
    """
    # Convert section path to policy tree path
    path_parts = [part.strip() for part in section_path.split(">")]

    # Look the section up on the tree that was passed in, not on a global,
    # so the function works for whichever tree the caller hands over.
    section_node = policy.get_section(*path_parts)
    if section_node is None:
        print(f"Error: Couldn't find section '{section_path}' in policy")
        return []
    section_content = section_node.raw_content

    prompt = f"""
    # Instructions
    You are helping with expense auditing for {organization}.
    The goal is to pre-process the expense policy so that an automated system can later check if a receipt complies with the policy.

    For the following section break it up into sentences or logically-connected sentences and label each as one of the following:

    1. "Need user detail to determine if valid expense"
       Use this label when:
       - The clause contains words like "must", "required", "only", "should" that create a requirement
       - The system needs specific information to verify if the requirement was met
       - The requirement could make the expense invalid if not met

       Examples:
       - "Must adhere to policy X" -> Need user detail (system needs to verify adherence)
       - "Only economy flights allowed" -> Need user detail (system needs flight class)
       - "Should be used when X condition applies" -> Need user detail (system needs to verify condition)

    2. "Required Action"
       Use this label when the clause requires specific additional steps like:
       - Submit extra documentation
       - Get approvals
       - Complete forms
       - Provide comparisons

       Examples:
       - "Upload comparative quotes"
       - "Attach receipt copies"
       - "Get manager signature"

    3. "Keep In Mind"
       Use this label ONLY for:
       - Pure information with no requirements
       - Process descriptions that don't affect validity
       - Helpful context that doesn't create any rules

       Examples:
       - "Reimbursements are processed weekly"
       - "The university has preferred vendors"
       - "You can create a travel account"

    Section:
    {section_content}

    Key points:
    - If the clause uses words like "must", "required", "should", "only" - it's usually "Need user detail"
    - If it requires new documents/approvals or the user - it's "Required Action"
    - If it's purely informational with no requirements - it's "Keep In Mind"

    Analyze the policy section and return a JSON array where each element has two fields:
    - "text": The exact sentence or logically-connected group of sentences without any modifications, numbering, or formatting changes
    - "label": One of these three exact strings:
        - "Need user detail to determine if valid expense"
        - "Required Action"
        - "Keep In Mind"

    Return nothing by the JSON array, with no additional text what so ever.
    """

    response = client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
        messages=[{"role": "user", "content": prompt}],
    )

    output = response.choices[0].message.content

    # Find the JSON array in the response using regex; anything the model
    # adds before or after the array is ignored.
    array_match = re.search(r'\[\s*{.*}\s*\]', output, re.DOTALL)
    if not array_match:
        print("Error: Couldn't find JSON array in response")
        return []

    try:
        return json.loads(array_match.group())
    except json.JSONDecodeError:
        print("Error: Found array-like structure but couldn't parse as JSON")
        return []

In [None]:
@dataclass
class PolicyNode:
    """A single node of the policy hierarchy (root, document, or section)."""
    # Heading of this node; the parent stores the node under this key.
    title: str
    # Policy text attached to this node, when an item was loaded for it.
    raw_content: Optional[str] = None
    # Clause labels: taken from an item's 'labels' field or assigned by
    # PolicyTree.label_all_sections.
    content_labels: Optional[List[Dict]] = None
    # Source URL copied from the loaded item.
    url: Optional[str] = None
    # Question/answer dicts; PolicyTree.generate_all_questions reads their
    # 'question' and 'answer' keys.
    qualifying_questions: Optional[List[dict]] = None
    # Child nodes keyed by their title.
    children: Dict[str, 'PolicyNode'] = field(default_factory=dict)


class PolicyTree:
    """Tree of policy documents and their nested sections.

    Layout: root -> document title -> section -> sub-section ..., where each
    level is a ``PolicyNode`` stored in its parent's ``children`` dict.
    """

    def __init__(self, policy_name: str = "Policy", organization: str = ""):
        self.root = PolicyNode(policy_name)
        self.organization = organization

    def add_policy_item(self, item: dict):
        """Add a policy item to the tree structure.

        ``item`` must contain 'document_title', 'section_title', 'content'
        and 'url'; 'qualifying_questions' and 'labels' are optional. A
        'section_title' of the form "A > B" places the item under document
        -> A -> B; an empty section title attaches it to the document node.
        """
        section_title = item['section_title']
        section_path = section_title.split(' > ') if section_title else []
        raw_content = item['content']
        url = item['url']

        # Walk from the root through the document and every section level,
        # creating nodes that do not exist yet.
        current_node = self.root
        for title in [item['document_title'], *section_path]:
            if title not in current_node.children:
                current_node.children[title] = PolicyNode(title)
            current_node = current_node.children[title]

        # Store the item's data (including pre-processed fields, if present)
        # on the node the path ends at.
        current_node.raw_content = raw_content
        current_node.qualifying_questions = item.get('qualifying_questions', None)
        current_node.url = url
        current_node.content_labels = item.get('labels', None)

    def label_all_sections(self):
        """
        Find and label all sections in the tree that have content.

        Nodes that already carry labels are left untouched. The root is never
        labelled: ``label_section`` locates a node by its path, and the root's
        path is empty.
        """
        def traverse_and_label(node, path=()):
            if path and node.raw_content and not node.content_labels:
                node.content_labels = label_section(
                    self, " > ".join(path), self.organization
                )

            for child_title, child_node in node.children.items():
                traverse_and_label(child_node, path + (child_title,))

        traverse_and_label(self.root)

    def load_from_jsonl(self, items: List[dict]):
        """Load multiple policy items"""
        for item in items:
            self.add_policy_item(item)

    def get_section(self, *path) -> Optional[PolicyNode]:
        """Get a specific section by path; an empty path yields the root."""
        current = self.root
        for section in path:
            if section not in current.children:
                return None
            current = current.children[section]
        return current

    def get_children_at_level(self, *path) -> List[str]:
        """Get all subsection titles at a specific level"""
        node = self.get_section(*path)
        if node:
            return list(node.children.keys())
        return []

    def get_content(self, *path) -> Optional[str]:
        """
        Get content at a specific path

        Returns:
            - None if the path doesn't exist or is root
            - raw_content if path exists
        """
        if not path:  # Don't return content for root
            return None
        node = self.get_section(*path)
        return node.raw_content if node else None

    def get_url(self, *path) -> Optional[str]:
        """Get URL at a specific path"""
        node = self.get_section(*path)
        return node.url if node else None

    def get_labels(self, *path) -> Optional[List[Dict]]:
        """Get content labels for a specific path"""
        node = self.get_section(*path)
        return node.content_labels if node else None

    def generate_all_questions(self, question_generator_func):
        """
        Generate questions for all nodes in the tree, level by level.
        Parent questions are passed to children for context.

        ``question_generator_func`` is called as
        ``func(child_title, previous_questions_text, organization)`` and its
        return value replaces the child's ``qualifying_questions``.
        """
        def format_questions_for_prompt(questions_list):
            """Format questions into the expected string format for the prompt"""
            formatted = []
            for q in questions_list:
                formatted.append(f"Q: {q['question']}")
                formatted.append(f"A: {q['answer']}")
            return "\n".join(formatted)

        def process_level(current_path=None):
            if current_path is None:
                current_path = []

            current_node = self.get_section(*current_path) if current_path else self.root
            children = self.get_children_at_level(*current_path)

            # Collect the questions of every node along the path (root
            # excluded), ancestors first.
            accumulated_questions = []
            path_cursor = self.root
            for section in current_path:
                path_cursor = path_cursor.children[section]
                if path_cursor.qualifying_questions:
                    accumulated_questions.extend(path_cursor.qualifying_questions)

            # Siblings share the same ancestor context, so format it once.
            prev_questions_str = format_questions_for_prompt(accumulated_questions)

            for child in children:
                # Generate questions for this child
                child_node = current_node.children[child]
                child_node.qualifying_questions = question_generator_func(
                    child,
                    prev_questions_str,
                    self.organization
                )

                # Recurse only after the child's own questions are set, so
                # they are part of the context seen by its descendants.
                process_level(current_path + [child])

        # Start processing from root
        process_level()

    def get_questions(self, *path) -> Optional[List[dict]]:
        """Get qualifying questions for a specific node"""
        node = self.get_section(*path)
        return node.qualifying_questions if node else None

    def get_questions_with_context(self, *path) -> List[dict]:
        """Get all questions up to and including this path"""
        questions = []
        current_path = []
        for section in path:
            current_path.append(section)
            node = self.get_section(*current_path)
            if node and node.qualifying_questions:
                questions.extend(node.qualifying_questions)
        return questions

    def to_jsonl(self, output_file: str) -> None:
        """Save the policy tree to a JSONL file.

        One line is written per node that has content or qualifying
        questions; missing fields are written as '' or [].
        """
        def node_to_dict(node: PolicyNode, doc_title: str, section_path: List[str]) -> dict:
            """Convert a node to a dictionary format"""
            return {
                'document_title': doc_title,
                'section_title': ' > '.join(section_path) if section_path else '',
                'content': node.raw_content or '',
                'labels': node.content_labels or [],
                'url': node.url or '',
                'qualifying_questions': node.qualifying_questions or []
            }

        def traverse_tree(node: PolicyNode, doc_title: str = "", path: List[str] = None) -> List[dict]:
            """Traverse the tree and collect all nodes"""
            if path is None:
                path = []

            items = []

            if not doc_title:
                # No document title yet: the children of this node are the
                # documents themselves.
                for child_title, child_node in node.children.items():
                    items.extend(traverse_tree(child_node, child_title))
            else:
                if node.raw_content or node.qualifying_questions:
                    items.append(node_to_dict(node, doc_title, path))

                for child_title, child_node in node.children.items():
                    items.extend(traverse_tree(child_node, doc_title, path + [child_title]))

            return items

        items = traverse_tree(self.root)

        with open(output_file, 'w', encoding='utf-8') as f:
            for item in items:
                f.write(json.dumps(item) + '\n')

In [None]:
def clean_markdown(markdown):
    """
    Ask the chat model to make a report's bullet points read naturally.

    The bullet text is quoted verbatim from the policy, so it can contain
    lead-ins that only make sense in the original context; the model is
    instructed to remove such text and change nothing else.

    Args:
        markdown: Markdown of the expense auditing report.

    Returns:
        The edited markdown exactly as returned by the model.
    """
    cleanup_request = f"""
    # Instructions
    You are given markdown for an expense auditing report. The text for the bullet points is taken verbatim from the policy.
    Remove text from the bullet points to make the language more natural in its isolated bullet point form.

    Example:
    "In this context, you should submit an explanation document" -> "You should submit an explanation document"

    Markdown:
    {markdown}

    Do not edit the text in any other way.
    Respond only with the edited markdown, nothing more.
    """

    user_message = {"role": "user", "content": cleanup_request}
    completion = client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
        messages=[user_message],
    )

    return completion.choices[0].message.content

# SECTION 4: Retrieval functions

In [None]:
def extract_titles_subtitles(policy_jsonl_data):
    """Return a copy of each policy record without its 'content' and 'url'.

    Args:
        policy_jsonl_data: Iterable of dict records.

    Returns:
        list: One new dict per record, in the same order, holding every
        key/value pair except 'content' and 'url'. The input records are
        not modified.
    """
    dropped_keys = ('content', 'url')
    return [
        {key: value for key, value in record.items() if key not in dropped_keys}
        for record in policy_jsonl_data
    ]

In [None]:
# 1-step prompt-based section retrieval based on receipt data
def filter_relevant_records(extracted_titles_subtitles, receipt_data, status, organization, location, date):
    """Single-prompt retrieval: ask the model which policy records fit a receipt.

    Args:
        extracted_titles_subtitles: Policy records (titles/section titles);
            interpolated into the prompt via their string form.
        receipt_data: Receipt information, interpolated into the prompt.
        status, organization, location, date: User context inserted into the
            opening line of the prompt.

    Returns:
        The raw text of the model's reply (the prompt asks for a JSON list,
        but no parsing happens here).
    """
    selection_prompt = f"""
      System information: The user that provided this receipt is a {status} at {organization}, located in {location}, and the current date is {date}.
      # Instructions\n
      You are tasked with identifying the relevant sections of an organization's expense policy based on the provided receipt data. Use the following guidelines to filter the records:\n
      1. Analyze the **receipt data** to then compare it with the **titles and section subtitles** of the policy records.
      2. Select records that explicitly mention or relate to the details in the receipt, such as:
        - Expense category (e.g., meals, lodging, transportation, airfare, gifts, etc.).
        - Policy sections that provide relevant guidance for the receipt type or scenario.
      3. Ignore records that do not contain information relevant to the receipt's purpose or type. If you are not certain about whether a record is relevant, include it.
      4. You must only return the filtered records in the same JSON format.\n\n
      # Receipt Data\n
      {receipt_data}
      # Policy Document, Section Titles and Questions\n
      {extracted_titles_subtitles}\n
      The output must only contain the filtered list of json records of document and section titles.
      JSON:
      """

    user_message = {"role": "user", "content": selection_prompt}
    completion = client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
        messages=[user_message],
    )

    return completion.choices[0].message.content

In [None]:
# 2-step prompt-based section retrieval based on receipt data
## 1st step: filter relevant documents given their sections and content summaries
def filter_relevant_documents_by_batch(batch_of_document, receipt_data, status, organization, location, date):
    """First retrieval step: ask the model whether a policy document is useful.

    Args:
        batch_of_document: Document title with its section titles;
            interpolated into the prompt via its string form.
        receipt_data: Receipt information, interpolated into the prompt.
        status, organization, location, date: User context inserted into the
            "User Information" part of the prompt.

    Returns:
        The raw text of the model's reply. The prompt requests a '<YES>' or
        '<NO>' verdict followed by a short explanation; the reply is not
        parsed here.
    """
    relevance_prompt = f"""
    # Instructions
    You are tasked with determining whether a given policy document is useful for validating compliance of an expense. To make this determination, consider both the provided receipt data and user information, and compare them against the document’s title and its section subtitles. Each subtitle indicates the type of policies contained within that section of the document.

    Follow these guidelines:

    1. **Analysis of Input:**
      - Examine the **Receipt Data** and **User Information** carefully.
      - Compare these details to the **Policy Document and Section Titles** you are given.

    2. **Criteria for Usefulness:**
      - If any of the sections within the document might apply to the expense scenario (e.g., relevant expense category, applicable user context, or timeframe), then the document should be considered useful.
      - Answer '<YES>' if you find at least one relevant match between the policy content (as implied by the document and section titles) and the receipt details.
      - Answer '<NO>' if none of the information in the document appears relevant. If in doubt, favor '<NO>'.

    3. **Justification:**
      - Along with the '<YES>' or '<NO>' answer, provide a concise rationale explaining the reasoning behind your decision.
      - The reasoning should clarify which elements of the receipt or user profile influenced your decision and how they relate (or do not relate) to the policy document and its section titles.

    4. **Formatting the Output:**
      - Do not produce code.
      - Provide two lines of output:
        1. The first line should be either '<YES>' or '<NO>'.
        2. The second line should be a brief explanation of why you chose that answer, referencing relevant parts of the receipt and policy document.

    # Receipt Data
    {receipt_data}

    # User Information
    The user that provided this receipt is a {status} at {organization}, located in {location}, and the current date is {date}.

    # Policy Document and Section Titles
    {batch_of_document}

    # Final Answer:
    <YES or NO> Explanation of the reasoning...
    """

    user_message = {"role": "user", "content": relevance_prompt}
    completion = client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
        messages=[user_message],
    )

    return completion.choices[0].message.content

## 2nd step: filter relevant sections within a document given their content summary
def filter_relevant_sections_by_batch(batch_of_sections, receipt_data, status, organization, location, date):
    """Second retrieval step: ask the model which sections to keep for audit.

    Args:
        batch_of_sections: Document titles, section titles and summaries;
            interpolated into the prompt via their string form.
        receipt_data: Receipt information, interpolated into the prompt.
        status, organization, location, date: User context inserted into the
            "User Information" part of the prompt.

    Returns:
        The raw text of the model's reply. The prompt requests a JSON list
        wrapped in triple backticks with 'document_title', 'section_title',
        'reasoning' and 'include' per entry; the reply is not parsed here.
    """
    # The instructions are an f-string; the output example that follows is a
    # plain string so its literal braces need no escaping.
    instructions = f"""
    # Instructions
    You are tasked with identifying the relevant sections of an organization's expense policy based on the given receipt data, user information, and policy section titles with their summaries. Your goal is to decide for each policy section whether it should be included for further audit review or not. Follow these guidelines:

    1. **Analysis of Input:**
      - Consider the **Receipt data** and **User information** as the primary reference.
      - Compare these details with the **Policy Document Titles, Section Titles, and Summaries**.

    2. **Criteria for Inclusion:**
      - Include sections that directly relate to the expense type, category, or scenario indicated by the receipt data.
      - Consider aspects like expense category (meals, lodging, transportation, airfare, gifts), user’s role and affiliation, submission date, and any other contextual information from the user profile.
      - If uncertain about a section's relevance, err on the side of inclusion.

    3. **Exclusion Criteria:**
      - Exclude sections that are clearly unrelated to the receipt’s details (e.g., no mentioned expense category, irrelevant policy guidance).

    4. **Output Format Requirements:**
      - Do not produce code.
      - Return the results as a list of objects in valid JSON format.
      - Each object should contain:
        - 'document_title': The exact document title given.
        - 'section_title': The exact section title given.
        - 'reasoning': A brief explanation of why this section was included or excluded.
        - 'include': 'YES' or 'NO' depending on whether the section is considered relevant.

    5. **Formatting:**
      - The output must be enclosed in triple backticks as shown below.
      - Do not add extra commentary outside the JSON structure.

    # Receipt Data
    {receipt_data}

    # User Information
    The user that provided this receipt is a {status} at {organization}, located in {location}, and the current date is {date}.

    # Policy Document Titles, Section Titles and Summaries
    {batch_of_sections}

    # Required JSON output example:
    """
    output_example = """
    ```
    [{
        "document_title": "",
        "section_title": "",
        "reasoning": "",
        "include": ""
    },
    {
        "document_title": "",
        "section_title": "",
        "reasoning": "",
        "include": ""
    }
    ]
    ```
    # Final Answer:
    """

    user_message = {"role": "user", "content": instructions + output_example}
    completion = client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
        messages=[user_message],
    )

    return completion.choices[0].message.content

In [None]:
# Section retrieval with Ranking LLM (Llama-Rank-V1)
def rerank_relevant_records(policy_jsonl, receipt_data, status, organization, location, date):
    """Rank policy records against a receipt with the rerank endpoint.

    Args:
        policy_jsonl: Policy records passed to the reranker as ``documents``;
            they are ranked on their 'document_title', 'section_title' and
            'content' fields.
        receipt_data: Receipt information, interpolated into the query.
        status, organization, location, date: User context inserted into the
            opening line of the query.

    Returns:
        The response object of ``client.rerank.create`` (documents included),
        returned unchanged.
    """
    rerank_query = f"""
        System information: The user that provided this receipt is a {status} at {organization}, located in {location}, and the current date is {date}.
        # Instructions\n
        You are tasked with identifying the most relevant sections of an organization's expense policy based on the provided receipt data. We want to answer the question: What policies are applicable to this expense? Is this expense compliant or not?\n
        # Receipt Data\n
        {receipt_data}
        """

    ranked_fields = ["document_title", "section_title", "content"]
    return client.rerank.create(
        model="Salesforce/Llama-Rank-V1",
        query=rerank_query,
        documents=policy_jsonl,
        return_documents=True,
        rank_fields=ranked_fields,
    )

In [None]:
# extract original policy content of filtered sections
def extract_relevant_blocks_content_url(filtered_extracted_titles_subtitles, original_records):
    """
    Filters the original records to return the content and URL of records
    that match the filtered titles and subtitles.

    Records are matched on the exact ('document_title', 'section_title')
    pair. When several original records share a pair, the last one wins.

    Args:
        filtered_extracted_titles_subtitles (list | None): Filtered titles and
            subtitles. ``None`` or an empty list (e.g. when parsing the model
            output failed upstream) yields an empty result.
        original_records (list): Original records containing content and URLs.

    Returns:
        list: A list of dictionaries with 'document_title', 'section_title',
        'content' and 'url', in the order of the filtered records.
    """
    kept_fields = ("document_title", "section_title", "content", "url")

    # Preprocess the original records into a dictionary for quick lookup
    record_map = {
        (record.get("document_title"), record.get("section_title")): {
            name: record.get(name) for name in kept_fields
        }
        for record in original_records
    }

    # Filter the relevant blocks by looking up in the dictionary
    relevant_blocks = []
    for record in filtered_extracted_titles_subtitles or []:
        key = (record.get("document_title"), record.get("section_title"))
        if key in record_map:
            relevant_blocks.append(record_map[key])

    return relevant_blocks

In [None]:
# Extract json type from LLM output string
def extract_json_from_llm_output(llm_output):
    """
    Parse the JSON payload inside the first triple-backtick fence of an LLM reply.

    A language tag directly after the opening fence (models commonly open the
    fence with the tag "json") is ignored so that the payload itself is parsed.

    Args:
        llm_output (str): Raw text returned by the model.

    Returns:
        The decoded JSON value (typically a list or dict), or None when the
        reply has no fenced block or the fenced text is not valid JSON. A
        message is printed in both failure cases.
    """
    # Treat None / non-string replies like a reply without any fenced block.
    text = llm_output if isinstance(llm_output, str) else ""

    fenced = re.search(r"```(.*?)```", text, re.DOTALL)
    if fenced is None:
        print("No JSON content found between backticks.")
        return None

    json_text = fenced.group(1).strip()
    # Drop a leading "json" language tag; without this json.loads would be
    # handed 'json\n[...]' and always fail on tagged fences.
    json_text = re.sub(r"^json\b", "", json_text, count=1, flags=re.IGNORECASE).lstrip()

    try:
        return json.loads(json_text)
    except json.JSONDecodeError as e:
        print("Error decoding JSON:", e)
        return None

# SECTION 5: Evaluation Functions

In [None]:
def save_test_to_drive(test, path):
    """
    Append a test record to the JSON list stored at `path`.

    The file is created when it does not exist yet. Any failure (invalid JSON
    in the existing file, a non-serializable record, an I/O error, ...) is
    reported with a printed message instead of being raised.

    Args:
        test: JSON-serializable record to append.
        path (str): Location of the JSON file holding the list of tests.
    """
    def _read_saved_tests():
        # A missing file just means nothing has been saved so far.
        try:
            with open(path, 'r') as existing_file:
                return json.load(existing_file)
        except FileNotFoundError:
            return []

    try:
        saved_tests = _read_saved_tests()
        saved_tests.append(test)

        # Rewrite the whole file with the extended list.
        with open(path, 'w') as output_file:
            json.dump(saved_tests, output_file, indent=4)

        print(f"Test saved to {path}: {test}")
    except Exception as error:
        print(f"Error saving test: {error}")

In [None]:
def check_clauses_with_LLM(clauses, receipt, organization):
    """
    Batch evaluate multiple policy clauses against a receipt.

    The model is asked for exactly one result object per clause, in the same
    order as `clauses`, because callers pair results with clauses by position.

    Args:
        clauses (list): Clauses to evaluate; each is a dict with a 'text' key.
        receipt (dict): Receipt information
        organization (str): Organization name

    Returns:
        list: Parsed JSON array of evaluation objects ('verdict' and
            'explanation'), or an empty list when there is nothing to
            evaluate or the model reply contains no parsable JSON array.
    """
    # Nothing to evaluate: skip the API call entirely.
    if not clauses:
        return []

    prompt = f"""
    # Instructions
    You are helping with expense auditing for {organization}.
    You need to evaluate if a receipt meets several policy requirements. Not all requirements are applicable to the expense type or category.
    For each requirement, determine:
    - 'yes' if the receipt directly meets it
    - 'no' if the receipt directly does not meet it
    - 'unclear' if you're not certain or missing information
    - 'not applicable' if the requirement is not applicable to the expense type or category

    Analyze this receipt against the following requirements.
    Receipt:
    {receipt}

    Requirements:
    {json.dumps([clause['text'] for clause in clauses], indent=2)}

    Return a JSON array containing exactly one object per requirement, in the same order as the requirements are listed above (use 'not applicable' where a requirement does not apply). Each object should have:
    - 'verdict': exactly 'yes', 'no', 'unclear', or 'not applicable'
    - 'explanation': brief explanation in 5-10 words

    Return only the JSON array, no other text.
    """

    response = client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
        messages=[{"role": "user", "content": prompt}],
    )

    # Extract and parse JSON array (the reply may wrap it in extra text)
    output = response.choices[0].message.content
    array_match = re.search(r'\[\s*{.*}\s*\]', output, re.DOTALL)
    if array_match is None:
        print("Error: Couldn't find JSON array in response")
        return []

    try:
        return json.loads(array_match.group())
    except json.JSONDecodeError:
        print("Error: Found array-like structure but couldn't parse as JSON")
        return []

In [None]:
def process_multiple_sections(policy_tree, section_paths, organization, receipt):
    """
    Process multiple policy sections and combine them into a single report.

    Clauses labelled "Need user detail to determine if valid expense" are sent
    to check_clauses_with_LLM; all clauses are then grouped by label and
    rendered as Markdown. The clause dicts returned by policy_tree.get_labels
    are annotated with 'ai_evaluation', 'ai_explanation' and 'section_path'.

    Args:
        policy_tree: Object providing get_labels(*path_parts) and
            get_url(*path_parts).
        section_paths (list): Section paths written as "Document > Section".
        organization (str): Organization name passed to the LLM check.
        receipt (dict): Receipt information passed to the LLM check.

    Returns:
        str: The Markdown report after being passed through clean_markdown.
    """
    needs_eval_label = "Need user detail to determine if valid expense"

    print("\n🔍 Starting Policy Analysis...\n")

    all_items = {
        "Need user detail to determine if valid expense": [],
        "Required Action": [],
        "Keep In Mind": []
    }
    all_evaluations = []

    # Process each section
    for section_path in section_paths:
        print(f"📋 Analyzing section: {section_path}")
        path_parts = [part.strip() for part in section_path.split(">")]
        section_labels = policy_tree.get_labels(*path_parts)
        # Filter clauses needing evaluation
        need_eval_clauses = [item for item in section_labels
                             if item['label'] == needs_eval_label]

        if need_eval_clauses:
            print("   Evaluating clauses against receipt...")
            evaluations = check_clauses_with_LLM(need_eval_clauses, receipt, organization)

            # Match evaluations back to clauses by position. The LLM may
            # return fewer (or malformed) results; every clause still gets a
            # verdict so report formatting below never hits a missing key.
            for index, clause in enumerate(need_eval_clauses):
                evaluation = evaluations[index] if index < len(evaluations) else None
                if not isinstance(evaluation, dict):
                    evaluation = {}
                verdict = str(evaluation.get('verdict', 'unclear'))
                clause['ai_evaluation'] = verdict
                clause['ai_explanation'] = evaluation.get(
                    'explanation', "No AI evaluation was returned for this requirement"
                )
                all_evaluations.append(verdict.strip().lower())

        # Process all items including those that don't need evaluation
        for item in section_labels:
            if item['label'] != needs_eval_label:
                item['ai_evaluation'] = None
                item['ai_explanation'] = None
            item['section_path'] = section_path
            all_items[item['label']].append(item)

        print("   ✓ Section analysis complete\n")

    # Any "no" wins over "unclear", which wins over the default "Yes".
    print("📊 Determining overall evaluation...")
    overall_eval = None
    if all_evaluations:
        overall_eval = "Yes"
        if "no" in all_evaluations:
            overall_eval = "No"
        elif "unclear" in all_evaluations or "maybe" in all_evaluations:
            overall_eval = "Unclear"

    print("📝 Formatting final report...")
    markdown = []
    markdown.append("#Expense Analysis Report")
    for label in ["Need user detail to determine if valid expense",
                  "Required Action",
                  "Keep In Mind"]:
        if all_items[label]:
            if label == needs_eval_label:
                markdown.append("##Requirements")
            else:
                markdown.append(f"##{label}")

            # Items are sorted by section so each section heading is emitted once.
            current_section = None
            for item in sorted(all_items[label], key=lambda x: x['section_path']):
                path_parts = [part.strip() for part in item['section_path'].split(">")]
                url = policy_tree.get_url(*path_parts)

                if item['section_path'] != current_section:
                    if url:
                        markdown.append(f"\n###[{path_parts[-1]}]({url})")
                    else:
                        markdown.append(f"\n###{path_parts[-1]}")
                    current_section = item['section_path']

                markdown.append(f"- {item['text']}")
                if item['ai_evaluation'] is not None:
                    markdown.append(f"  - AI Evaluation: {item['ai_evaluation']}")
                    markdown.append(f"  - Why? {item['ai_explanation']}")

            if label == needs_eval_label and overall_eval:
                markdown.append(f"\n**Overall AI Evaluation: {overall_eval}**")
            markdown.append("")

    combined_markdown = "\n".join(markdown)
    clean_combined_markdown = clean_markdown(combined_markdown)

    print("\n✅ Analysis complete!\n")
    return clean_combined_markdown

In [None]:
def generate_report(policy_tree, receipt_info, organization, filtered_sections):
    """Build the report for the selected policy sections and display it as Markdown."""
    display(
        Markdown(
            process_multiple_sections(policy_tree, filtered_sections, organization, receipt_info)
        )
    )

In [None]:
def fiter_and_retrieve_documents(data, receipt_info, status, organization, location, date):
    """
    Keep the policy documents that the relevance filter answers "yes" for.

    Records obtained from extract_titles_subtitles(data) are grouped by
    'document_title'; each group is passed to filter_relevant_documents_by_batch
    and kept when the reply contains "YES", "Yes" or "yes".

    Returns:
        list: The kept groups, each a list of records sharing a document title.
    """
    # Bucket the records per document, keeping first-seen document order.
    records_by_document = {}
    for entry in extract_titles_subtitles(data):
        records_by_document.setdefault(entry['document_title'], []).append(entry)

    print("🔎 Searching Policy Corpus for relevant documents...\n   Relevant documents:\n")

    relevant_batches = []
    for document_batch in records_by_document.values():
        verdict = filter_relevant_documents_by_batch(
            document_batch, receipt_info, status, organization, location, date
        )
        if not any(marker in verdict for marker in ("YES", "Yes", "yes")):
            continue
        relevant_batches.append(document_batch)
        print("   - " + document_batch[0]['document_title'])

    return relevant_batches

In [None]:
# NOTE: superseded, un-chunked draft of fiter_and_retrieve_sections. It is kept
# only as an inert string literal and is never executed (it would not even
# parse: note the dangling `while`). The definition that follows is the one used.
"""
def fiter_and_retrieve_sections(filtered_batches, receipt_info, status, organization, location, date):
    filtered_sections = []

    for batch in filtered_batches:
        print(f"🔎 Searching '{batch[0]['document_title']}' for relevant sections...\n   Relevant sections:\n")
        while
        response = filter_relevant_sections_by_batch(batch, receipt_info, status, organization,location, date)
        print(response)
        marked_sections_json = extract_json_from_llm_output(response)
        for section in marked_sections_json:
            if section['include'] == 'yes' or section['include'] == 'Yes' or section['include'] == 'YES':
                filtered_sections.append(section)
                print("   - " + section['section_title'])

    final_sections = []
    for section in filtered_sections:
        final_sections.append(section['document_title'] + " > " + section['section_title'])

    return final_sections
"""
def fiter_and_retrieve_sections(filtered_batches, receipt_info, status, organization, location, date, chunk_size=20):
    """
    Select the relevant sections of each relevant policy document.

    Every batch (the records of one document) is sent to
    filter_relevant_sections_by_batch in chunks of `chunk_size` records. The
    JSON reply of each chunk is parsed and the sections whose 'include' field
    is "yes" (case-insensitive) are kept.

    Args:
        filtered_batches (list): Lists of section records, one list per document.
        receipt_info: Receipt information forwarded to the section filter.
        status, organization, location, date: User context forwarded to the
            section filter.
        chunk_size (int): Maximum number of records per LLM call.

    Returns:
        list: Section paths written as "Document title > Section title" (only
            the document title when the section title is blank).
    """
    final_sections = []

    for batch in filtered_batches:
        # An empty batch has no document title and nothing to search.
        if not batch:
            continue

        batch_title = batch[0]['document_title']
        # Instead of calling the function on the entire batch at once, break it into chunks
        print(f"🔎 Searching '{batch_title}' for relevant sections...\n   Relevant sections:\n")
        for start in range(0, len(batch), chunk_size):
            sub_batch = batch[start:start + chunk_size]
            response = filter_relevant_sections_by_batch(sub_batch, receipt_info, status, organization, location, date)

            # The parser returns None when the reply has no parsable JSON;
            # skip that chunk instead of crashing the whole run.
            marked_sections = extract_json_from_llm_output(response)
            if not isinstance(marked_sections, list):
                print("   ! Could not read the section list returned for this chunk; skipping it")
                continue

            for section in marked_sections:
                if not isinstance(section, dict):
                    continue
                # Normalize the include field to uppercase (it may not be a string)
                include_value = str(section.get('include', '')).strip().upper()
                if include_value != 'YES':
                    continue

                section_title = str(section.get('section_title') or '')
                # Fall back to the batch's own title if the model omitted it.
                document_title = section.get('document_title') or batch_title
                print("   - " + section_title)
                final_sections.append(
                    document_title + (" > " + section_title if section_title.strip() else "")
                )

    return final_sections

# SECTION 6: [Do not run] Pre Processing Execution

In [None]:
# One-off pre-processing, wrapped in a string literal so that running the cell
# executes nothing. As written it would add a 'summary' to every policy record,
# replace each record's 'content' with structured clauses, and dump the result
# to a Google Drive path. It expects `policy_jsonl_data` to be defined already.
"""
# Iterate over records and add summaries
for record in policy_jsonl_data:
    record['summary'] = summarize_content(record['content'])

# Process records to replace content with structured clauses
for record in policy_jsonl_data:
    record['content'] = analyze_content_to_clauses(record['content'])

print(policy_jsonl_data[0])

# Specify the path where you want to save the file in your Google Drive
file_path = '/content/drive/My Drive/Stanford_Expenses_V1_Pre_Processed.json'

# Save the JSON data to the file
with open(file_path, 'w') as json_file:
    json.dump(policy_jsonl_data, json_file, indent=4)
"""

NameError: name 'policy_jsonl_data' is not defined

# SECTION 7: Main Execution

# [Action item] Run with desired parameters and local file paths

In [None]:
# --- User-editable run parameters -------------------------------------------
organization = "Stanford University"
user_status = "Student" # Student, Faculty or Staff
organization_location = "459 Lagunita Dr, Stanford CA 94305"
date = "12/07/2024" # Today's date (NOTE(review): presumably MM/DD/YYYY; it is passed to the LLM prompts as plain text)
# NOTE(review): the three paths below are absolute (leading '/'), while the
# notebook instructions say the files sit in the local directory and use
# different file names. Confirm these match your environment before running.
policy_summaries_file_path = '/Stanford_Expenses_V1_Pre_Processed.json'
policy_tree_file_path = '/Stanford Policy With Labeled Sections V3.jsonl'
receipt_pdf_file_path = '/Uber_Receipt.pdf' # insert receipt pdf file path

# Load the preprocessed JSON Policy with summaries
# (a single JSON document, read with json.load, despite the variable's "jsonl" name)
with open(policy_summaries_file_path, 'r') as json_file:
    policy_jsonl_data = json.load(json_file)

# Load pre-processed labeled policies to policy tree
# (JSON Lines file: one JSON object per line)
with open(policy_tree_file_path, 'r') as f:
    items = [json.loads(line) for line in f]
policy_tree = PolicyTree("Expense Policy", "Stanford University")
policy_tree.load_from_jsonl(items)

# Extract json receipt data from receipt pdf
receipt_info = extract_receipt_info(receipt_pdf_file_path)

# Retrieve relevant sections: first narrow down to relevant documents, then to
# relevant sections within those documents.
# NOTE(review): fiter_and_retrieve_documents applies extract_titles_subtitles to
# its first argument again, so the extraction runs twice on this path; confirm
# that is intended.
extracted_titles_subtitles_summaries = extract_titles_subtitles(policy_jsonl_data)
filtered_documents = fiter_and_retrieve_documents(extracted_titles_subtitles_summaries, receipt_info, user_status, organization, organization_location, date)
filtered_sections = fiter_and_retrieve_sections(filtered_documents, receipt_info, user_status, organization, organization_location, date)

# Final Evaluation and Report (rendered as Markdown in the notebook output)
generate_report(policy_tree, receipt_info, organization, filtered_sections)