### Process SEC filing data

#### Step 1: Populate cik_ticker and submissions collections 
in company_eval db of mongodb for the 8 tickers specified. Submissions collection (table) only holds the dates of the submissions and not the actual filing data. 

In [2]:
# Import the necessary functions from edgar_utils.py
from db.edgar_utils import (
    download_cik_ticker_map,
    cik_from_ticker,
    download_all_cik_submissions,
)
ticker = 'nmr'
cik = cik_from_ticker(ticker)
print(f'CIK for {ticker} is {cik}')

CIK for nmr is 0001163653


In [1]:
# Import the necessary functions from edgar_utils.py
from db.edgar_utils import download_cik_ticker_map, cik_from_ticker, download_all_cik_submissions

# Define the tickers for the companies you're interested in
tickers = ["AAPL",  "META", "MSFT", "GOOG", "AMZN", "NVDA", "TSLA" ]
# tickers = ["NMR"]

# Download the CIK-ticker map
download_cik_ticker_map()

# For each ticker, get the CIK and download all submissions
for ticker in tickers:
    cik = cik_from_ticker(ticker)
    download_all_cik_submissions(cik)

print("All done!")

All done!


#### Step 2: Get the latest filing documents and financial data for a given company

In [2]:

from db.edgar_utils import download_latest_filings, download_financial_data, cik_from_ticker

# Define the tickers for the companies you're interested in
tickers = ["AAPL", "ATKR", "META", "MSFT", "GOOG", "AMZN", "NVDA", "TSLA"]
# tickers = [ "NMR"]

for ticker in tickers:
    filings = download_latest_filings(ticker)
    
    # Now get the financial data for the company
    cik = cik_from_ticker(ticker)
    download_financial_data(cik)
    
    # print(filings)
    
print("All done!")

Document already exists: https://www.sec.gov/Archives/edgar/data/0000320193/000032019324000123/aapl-20240928.htm
Document already exists: https://www.sec.gov/Archives/edgar/data/0000320193/000032019325000008/aapl-20241228.htm
submissions file not found in mongodb for 0001666138
2025-01-30 (10-K): https://www.sec.gov/Archives/edgar/data/0001326801/000132680125000017/meta-20241231.htm
10-K downloaded successfully
Closest '10-K' document downloaded successfully.
2024-07-30 (10-K): https://www.sec.gov/Archives/edgar/data/0000789019/000095017024087843/msft-20240630.htm
10-K downloaded successfully
Closest '10-K' document downloaded successfully.
2025-01-29 (10-Q): https://www.sec.gov/Archives/edgar/data/0000789019/000095017025010491/msft-20241231.htm
10-Q downloaded successfully
Closest '10-Q' document downloaded successfully.
2025-02-05 (10-K): https://www.sec.gov/Archives/edgar/data/0001652044/000165204425000014/goog-20241231.htm
10-K downloaded successfully
Closest '10-K' document downlo

#### Step 3: Parse Document

    This gets documents from documents collection and parses it. Parsing stores each sections oject with section titile and text.

In [3]:
from db.edgar_utils import cik_from_ticker
from db.utils import parse_latest_filings

tickers = ["AAPL", "ATKR", "META", "MSFT", "GOOG", "AMZN", "NVDA", "TSLA"]
# tickers = ["CUK"]

for ticker in tickers:
    try:
        print(f"Starting to parse filings for {ticker}...")
        cik = cik_from_ticker(ticker)
        if cik:  # Check if a CIK was successfully retrieved
            parse_latest_filings(cik)
            print(f"Done parsing filings for {ticker}")
        else:
            print(f"CIK not found for {ticker}")
    except Exception as e:
        print(f"Failed to parse filings for {ticker}. Error: {e}") 

Starting to parse filings for AAPL...
Done parsing filings for AAPL
Starting to parse filings for ATKR...
No 10-K document found for cik 0001666138
Done parsing filings for ATKR
Starting to parse filings for META...
form type: 		10-K
cik                   0001326801
name        Meta Platforms, Inc.
ticker                      META
exchange                  Nasdaq
Name: 5, dtype: object
No 10-Q document found for cik 0001326801 after the 10-K document
Done parsing filings for META
Starting to parse filings for MSFT...
form type: 		10-K
cik             0000789019
name        MICROSOFT CORP
ticker                MSFT
exchange            Nasdaq
Name: 2, dtype: object
form type: 		10-Q
cik             0000789019
name        MICROSOFT CORP
ticker                MSFT
exchange            Nasdaq
Name: 2, dtype: object
Done parsing filings for MSFT
Starting to parse filings for GOOG...
form type: 		10-K
cik            0001652044
name        Alphabet Inc.
ticker              GOOGL
exchange       

#### Step 3 Alternative way for Parsing documents

In [None]:
from db.edgar_utils import cik_from_ticker
from db.utils import parse_document
from pymongo import MongoClient

# Create a MongoClient to the running MongoDB instance
client = MongoClient('localhost', 27017)
# Access your database
db = client['company_eval']

tickers = ["AAPL", "BABA", "ATKR", "META", "MSFT", "GOOG", "AMZN", "NVDA"]
filing_types = ["10-K", "10-Q", "8-K"]

for ticker in tickers:
	cik = cik_from_ticker(ticker)

	for filing_type in filing_types:
		# Get all documents for a given cik and filing type, sorted by filing date in descending order
		query = {"cik": cik, "form_type": filing_type}
		doc_cursor = db["documents"].find(
			query,
			{"_id": 1, "form_type": 1, "filing_date": 1, "cik": 1, "html": 1}
		).sort("filing_date", -1)

		# Efficiently check if any documents were found
		first_doc = next(doc_cursor, None)
		if first_doc:
			# Since we've already fetched the first document, process it
			print(f"Parsing {filing_type} document for ticker {ticker}")
			parse_document(first_doc)
			# Iterate over the rest of the documents and parse them
			for doc in doc_cursor:
				print(f"Parsing {filing_type} document for ticker {ticker}")
				parse_document(doc)
		else:
			print(f"No {filing_type} document found for ticker {ticker}")

print("Processing completed for all tickers and filing types.")

#### Step 4: Call Risk Memo Tool to generate Risk Memo
This will generate risk memo in pdf format in save it generated_files folder in the project root

In [None]:
from ai.tools.risk_memo_tool import RiskMemoTool

risk_memo_tool = RiskMemoTool()
response = risk_memo_tool.generate_risk_memo("AAPL")

#### Get named section from parsed document using ticker in _id

In [None]:
import json
from db.utils import get_section_text

# Example usage
ticker ="AAPL"
section = "risk factors"
response_json = get_section_text(ticker, section)

# Convert the JSON string back to a dictionary
response_dict = json.loads(response_json)

# Accessing the newly formatted response
print(f"Ticker: {response_dict['ticker']}")
print(f"Section Name: {response_dict['section_name']}")
if response_dict['section_text']:
    print(f"Text for section '{response_dict['section_name']}':\n{response_dict['section_text']}\n")
else:
    print("No section text found.\n")
    

### Access MongoDB to check
The following code will show you contents of a collection (table) in the MongoDB database.

In [None]:
# # Define DB connection
# from pymongo import MongoClient
# # Create a MongoClient to the running MongoDB instance
# client = MongoClient('localhost', 27017)
# # Access your database
# db = client['company_eval']

# # Access the 'submissions' collection
# submissions = db['submissions']

# # Query all documents in the collection
# # for doc in submissions.find().limit(8):
# #     print(doc)

# # find fields in documents collection
# # doc = db["documents"].find_one()
# # print(doc.keys())


# gets a url for the latest 10-K form for a company
# docs = db["documents"].find({"_id": {"$regex": "nvda"}, "form_type": "10-K"}, {"_id": 1}).sort("_id", -1).limit(1)
# for doc in docs:
#     print(doc)

# gets the section text for a document
# doc = db["parsed_documents"].find_one({"form_type":"10-K"})
# if doc and 'sections' in doc:
#     for section, text in doc['sections'].items():
#         print(f"Section: {section}")
#         print(f"Text: {text}")

from db.edgar_utils import company_from_ticker


ticker = "aapl"
company = company_from_ticker(ticker)
print(company)

#### Test sec_tool

In [None]:
# Import necessary modules
from ai.tools.sec import SecTool  
import asyncio

# Initialize the SecTool object
sec_tool = SecTool()

# Define an async function to call the method
async def get_section():
    # Call the get_filing_section_in_kb method with "AAPL" and "Risk Factors" and await its result
    result = await sec_tool.get_filing_section_in_kb("AAPL", "risk factors")
    return result

# Run the async function and get the result
result = await get_section()
print(result)


####  Get all sections from risk_memos

In [None]:
from db.utils import get_all_sections_from_risk_memo

sections = get_all_sections_from_risk_memo("AAPL")

print(sections)

#### Test process_sections (includes db inserts_)

In [None]:
from ai.tools.process_risk_memo import process_sections



respoonse = process_sections("AAPL")

#### Test get sections from risk memo
Once we get ALL the available sections then these sections together as one are sent to Assistant. 
Assistant then generates final draft of the risk memo as a single document.
This document is then saved to the MOngodb - risk_memos under section risk_memo_final_draft. 

In [None]:
from db.utils import (
    get_all_sections_from_risk_memo, 
    get_field_from_risk_memo, 
    add_section_to_risk_memo,
    get_section_text
    )
    
# from ai.tools.process_risk_memo import RiskMemoGenerator

# risk_memo_generator = RiskMemoGenerator()

# from ai.risk_memo_drafter import memo_draft_assistant

ticker = "AAPL"
# Get section text
section_text = get_section_text(ticker, "profile")
print(section_text)

# Get all summerised sections from the risk memo
# risk_memo_sections = get_all_sections_from_risk_memo(ticker)
# print(risk_memo_sections)

# company = get_field_from_risk_memo(ticker, "companyName")
# print(company)

# # Generate the final draft of the risk memo
# draft_memo = risk_memo_generator.draft_risk_memo(risk_memo_sections)
# print(draft_memo)


# # Save the final draft in the risk_memos collection of mongoDB
# add_section_response = add_section_to_risk_memo(ticker, ticker, "risk_memo_final_draft", draft_memo)

# print(add_section_response)


In [None]:
from db.utils import add_section_to_risk_memo


# Test data
ticker = "AAPL"
section_name = "financials"
text = '{"text": "This is the financial section content."}'

# Call the function with test data
result = add_section_to_risk_memo(ticker, section_name, text)

print(result)

#### Process Sections (LLm call)

In [None]:
from process_risk_memo import  process_sections

response = process_sections("TSLA")

#### Test the final PDF generation and its various functions

#### Test Create PDF

In [None]:
from ai.tools.risk_memo_pdf_generator import RiskMemoGenerator

generator = RiskMemoGenerator()
file_name = generator.create_pdf("MSFT")

print(file_name)

#### Test Market Data

In [None]:
from market_data import get_company_profile, get_financial_overview

# response = get_company_profile("AAPL")

response = get_financial_overview("MSFT")

print(response)

#### Test Google Document AI

In [8]:
# Import necessary libraries
import os
from google.cloud import documentai_v1beta3 as documentai

# ====== Configuration ======

# Path to your service account key JSON file
service_key_path = "/home/rhythm/Documents/Code/regenai/backend/credentials/regenai-service-key.json"

# Replace the following placeholders with your actual values
project_id = "regenai"                       # Your Google Cloud project ID
location = "eu"                              # Processor location, e.g., "us", "eu"
processor_id = "41265319209e4d75"            # Your Document AI processor ID
processor_version_id = None                  # Optional: Specify if using a specific processor version
pdf_file_path = "vtest8.pdf"                 # Path to the PDF file you want to process

# =============================

# Step 1: Set the environment variable for Google Application Credentials
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = service_key_path

# Step 2: Verify that the credentials file exists
if not os.path.isfile(service_key_path):
    raise FileNotFoundError(f"Credentials file not found at: {service_key_path}")

# Step 3: Initialize the Document AI client
client_options = {"api_endpoint": f"{location}-documentai.googleapis.com"}
client = documentai.DocumentProcessorServiceClient(client_options=client_options)

# Step 4: Define the processor name
if processor_version_id:
    # Full resource name of the processor version
    processor_name = client.processor_version_path(project_id, location, processor_id, processor_version_id)
else:
    # Full resource name of the processor
    processor_name = client.processor_path(project_id, location, processor_id)

# Step 5: Verify that the PDF file exists
if not os.path.isfile(pdf_file_path):
    raise FileNotFoundError(f"PDF file not found at: {pdf_file_path}")

# Step 6: Read the PDF file into memory
with open(pdf_file_path, "rb") as file:
    pdf_content = file.read()

# Step 7: Create a RawDocument
raw_document = documentai.RawDocument(content=pdf_content, mime_type="application/pdf")

# Step 8: Create the process request
request = documentai.ProcessRequest(
    name=processor_name,
    raw_document=raw_document,
)

# Step 9: Process the document
result = client.process_document(request=request)

# Step 10: Retrieve the document object
document = result.document

# # Helper function to extract text from layout
# def layout_to_text(layout, text):
#     """Extracts text from the document based on the layout."""
#     if not layout.text_anchor.text_segments:
#         return ""
#     text_chunks = []
#     for segment in layout.text_anchor.text_segments:
#         start_index = segment.start_index if segment.start_index is not None else 0
#         end_index = segment.end_index if segment.end_index is not None else len(text)
#         text_chunks.append(text[start_index:end_index])
#     return ''.join(text_chunks)

def layout_to_text(layout, text):
    """
    Extracts text from the document based on the layout, handling multiple text segments.
    """
    if not layout.text_anchor.text_segments:
        return ""
    # Concatenate all text segments for the layout
    return "".join(
        text[int(segment.start_index) : int(segment.end_index)]
        for segment in layout.text_anchor.text_segments
    )

# ====== Extract and Print Form Fields ======

print("\nExtracting form fields:")
for page in document.pages:
    if not page.form_fields:
        continue  # Skip pages without form fields
    print(f"\nFound {len(page.form_fields)} form field(s) on page {page.page_number}:")
    for field in page.form_fields:
        name = layout_to_text(field.field_name, document.text)
        value = layout_to_text(field.field_value, document.text)
        print(f"    * {repr(name.strip())}: {repr(value.strip())}")

# ====== Extract and Print Entities ======

print("\nExtracting entities:")
if not document.entities:
    print("No entities found in the document.")
else:
    for entity in document.entities:
        key = entity.type_
        text_value = layout_to_text(entity, document.text)
        confidence = entity.confidence
        normalized_value = entity.normalized_value.text if entity.normalized_value else ""
        print(f"    * {repr(key)}: {repr(text_value.strip())} ({confidence:.1%} confident)")
        if normalized_value:
            print(f"      Normalized Value: {repr(normalized_value)}")


Extracting form fields:

Found 6 form field(s) on page 1:
    * 'Effective Date:': '6/1/2023'
    * '4) How many years has Assured been in business:': '6'
    * 'Diving from shore/fixed objects in navigable waters:': '20 %'
    * 'Diving in non-navigable waters:': '0 %'
    * 'Part Time': '_2_'
    * 'Submission 8': '8 Street Rd\nLong Beach, CA 90808'

Found 3 form field(s) on page 3:
    * 'Mixed gas diving:': '39 (nitrox)%'
    * 'Shallow air diving:': '60%'
    * "Deep air diving (over 130' fsw):": '1%'

Found 1 form field(s) on page 4:
    * 'Diver/\nDive Tender/\nDive Supervisor\nPayroll': '$30,000'

Extracting entities:
    * 'generic_entities': '' (0.0% confident)
    * 'generic_entities': '' (0.0% confident)
    * 'generic_entities': '' (0.0% confident)


In [2]:
# Import necessary libraries
import os
import json
from google.cloud import documentai_v1beta3 as documentai
from google.protobuf.json_format import MessageToDict

# ====== Configuration ======

# Path to your service account key JSON file
service_key_path = "/home/rhythm/Documents/Code/regenai/backend/credentials/regenai-service-key.json"

# Replace the following placeholders with your actual values
project_id = "regenai"                       # Your Google Cloud project ID
location = "eu"                              # Processor location, e.g., "us", "eu"
processor_id = "41265319209e4d75"            # Your Document AI processor ID
processor_version_id = None                  # Optional: Specify if using a specific processor version
pdf_file_path = "vtest2b.pdf"                 # Path to the PDF file you want to process

# =============================

# Set the environment variable for Google Application Credentials
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = service_key_path

# Verify that the credentials file exists
if not os.path.isfile(service_key_path):
    raise FileNotFoundError(f"Credentials file not found at: {service_key_path}")

# Initialize the Document AI client
client_options = {"api_endpoint": f"{location}-documentai.googleapis.com"}
client = documentai.DocumentProcessorServiceClient(client_options=client_options)

# Define the processor name
if processor_version_id:
    # Full resource name of the processor version
    processor_name = client.processor_version_path(project_id, location, processor_id, processor_version_id)
else:
    # Full resource name of the processor
    processor_name = client.processor_path(project_id, location, processor_id)

# Verify that the PDF file exists
if not os.path.isfile(pdf_file_path):
    raise FileNotFoundError(f"PDF file not found at: {pdf_file_path}")

# Read the PDF file into memory
with open(pdf_file_path, "rb") as file:
    pdf_content = file.read()

# Create a RawDocument
raw_document = documentai.RawDocument(content=pdf_content, mime_type="application/pdf")

# Create the process request
request = documentai.ProcessRequest(
    name=processor_name,
    raw_document=raw_document,
)

# Process the document
result = client.process_document(request=request)

# Retrieve the document object
document = result.document

# Helper function to extract text from layout
def layout_to_text(layout, text):
    """
    Extracts text from the document based on the layout, handling multiple text segments.
    """
    if not layout.text_anchor.text_segments:
        return ""
    # Concatenate all text segments for the layout
    return "".join(
        text[int(segment.start_index) : int(segment.end_index)]
        for segment in layout.text_anchor.text_segments
    )

# Initialize the dictionary to hold the extracted data
extracted_data = {
    "form_fields": {},
    "entities": {},
    "tables": [],
    "text": ""
}

# Extract Form Fields
for page in document.pages:
    for field in page.form_fields:
        field_name = layout_to_text(field.field_name, document.text).strip()
        field_value = layout_to_text(field.field_value, document.text).strip()
        extracted_data["form_fields"][field_name] = field_value

# Extract Entities
for entity in document.entities:
    entity_type = entity.type_.strip()
    entity_text = layout_to_text(entity, document.text).strip()
    # Handle multiple entities of the same type
    if entity_type in extracted_data["entities"]:
        if isinstance(extracted_data["entities"][entity_type], list):
            extracted_data["entities"][entity_type].append(entity_text)
        else:
            extracted_data["entities"][entity_type] = [extracted_data["entities"][entity_type], entity_text]
    else:
        extracted_data["entities"][entity_type] = entity_text

# Extract Tables
for page in document.pages:
    for table in page.tables:
        table_data = []
        # Extract header cells
        headers = []
        if table.header_rows:
            for header_cell in table.header_rows[0].cells:
                header_text = layout_to_text(header_cell.layout, document.text).strip()
                headers.append(header_text)
        # Extract body cells
        for row in table.body_rows:
            row_data = {}
            for idx, cell in enumerate(row.cells):
                cell_text = layout_to_text(cell.layout, document.text).strip()
                if headers and idx < len(headers):
                    row_data[headers[idx]] = cell_text
                else:
                    row_data[f"column_{idx+1}"] = cell_text
            table_data.append(row_data)
        extracted_data["tables"].append(table_data)

# Extract Full Text
extracted_data["text"] = document.text

# Optionally, save extracted data to JSON file
with open('extracted_data.json', 'w', encoding='utf-8') as f:
    json.dump(extracted_data, f, ensure_ascii=False, indent=2)

# Print the extracted data (optional)
print(json.dumps(extracted_data, ensure_ascii=False, indent=2))

{
  "form_fields": {
    "Yes": "☑",
    "No": "☑",
    "MEL Application": "020121",
    "ACORD Workers' Compensation Application": "",
    "Minimum 4 years and currently valued Loss Runs": "",
    "Description of operations": "",
    "City": "EHT",
    "W.C.:": "500,000\n$",
    "L.S.H.W.A.:": "0\n$",
    "Total gross annual payroll: $": "700,000",
    "Jones Act:": "$",
    "5. How many years has Insured been in operation?": "5",
    "Street": "2 Street Rd\n:",
    "6. Full details of Insured's overwater operations?": "Soil boring from non-owned barge",
    "State": "NJ",
    "Zip": "08234",
    "1. Full Name of Insured:": "Submission 2",
    "EHT": "City",
    "N\nNo": "",
    "No\n은": "",
    "No\nN": "☑",
    "Expiring Date": "02/24/2023",
    "Limits carried": "$ 1,000,000",
    "Limit Required": "$1,000,000",
    "Premium Charged": "$ 35,000"
  },
  "entities": {
    "generic_entities": [
      "",
      ""
    ]
  },
  "tables": [
    [],
    [
      {
        "Limits carried":

In [4]:
import re
import json
from datetime import datetime, timedelta

# OCR text provided (truncated for brevity)
ocr_text = """
Submission Requirements:
ACORD Workers' Compensation Application
Minimum 4 years and currently valued Loss Runs
Description of operations
MEL Application
MEL Application
1. Full Name of Insured: Submission 2
2 Street Rd
2. Physical Address:
EHT
NJ
08234
Street
City
State Zip
3. Insured Email Address:
4. Telephone:
Fax:
5
5. How many years has Insured been in operation?
6. Full details of Insured's overwater operations? Soil boring from non-owned barge
7. Total number of employees:
Total gross annual payroll: $ 700,000
4
8. Total number of employees exposed overwater per annum:
4
9. Total payroll for employees exposed overwater:
4
10. Maximum number of employees exposed overwater at any one time:
11. Gross payroll split for last 12 months:
Jones Act: $
0
L.S.H.W.A.: $
500,000
W.C.: $
12. Gross split for next 12 months:
Jones Act: $15,000
L.S.H.W.A.: $15,000
W.C.: $ 570,000
...
"""

import sys

# Define a function to extract information based on possible field names
def extract_field(text, field_names, after_colon=True):
    pattern = r'(?:' + '|'.join(re.escape(field) for field in field_names) + r')\s*:?(.+)'
    matches = re.findall(pattern, text, re.IGNORECASE)
    if matches:
        # Clean up and return the first match
        value = matches[0].strip()
        if after_colon:
            # Remove any labels from the value
            value = re.sub(r'^\s*[:-]\s*', '', value)
        return value.strip()
    return ''

# Initialize the result dictionary
result = {
    "Date": "",  # Current date
    "Insured": "",  # Name and address
    "Effective Start Date": "",  # Today's date
    "Effective End Date": "",  # One year from today's date
    "Deductible": "$5000",  # Static value
    "Premium Rate": "",  # Leave blank
    "Estimated Annual Payroll": "",  # Total gross annual payroll
    "Policyholder/ Applicant’s Printed Name": ""  # Name from the Insured
}

# Set current date
current_date = datetime.now().date()
result["Date"] = current_date.strftime("%Y-%m-%d")

# Set Effective Start Date
result["Effective Start Date"] = current_date.strftime("%Y-%m-%d")

# Set Effective End Date (one year from today)
effective_end_date = current_date + timedelta(days=365)
result["Effective End Date"] = effective_end_date.strftime("%Y-%m-%d")

# Extract Insured Name and Address
# We will extract lines after '1. Full Name of Insured:' and '2. Physical Address:'
insured_name_field = ['1. Full Name of Insured', 'Full Name of Insured']
insured_address_field = ['2. Physical Address', 'Physical Address']
# Split the text into lines for easier processing
lines = ocr_text.splitlines()

# Extract Insured Name
insured_name = ''
insured_address = ''
for i, line in enumerate(lines):
    if any(field.lower() in line.lower() for field in insured_name_field):
        # The name might be on the same line or the next line
        if ':' in line:
            insured_name = line.split(':', 1)[1].strip()
            if not insured_name and i + 1 < len(lines):
                insured_name = lines[i + 1].strip()
        else:
            if i + 1 < len(lines):
                insured_name = lines[i + 1].strip()
    if any(field.lower() in line.lower() for field in insured_address_field):
        # The address might be on the same line or the next few lines
        address_lines = []
        for addr_line in lines[i+1:i+5]:  # We assume address is within the next 4 lines
            if addr_line.strip():
                address_lines.append(addr_line.strip())
            else:
                break
        insured_address = ', '.join(address_lines)
        break  # Assuming only one address to find

if insured_name:
    result["Insured"] = insured_name + ', ' + insured_address
    result["Policyholder/ Applicant’s Printed Name"] = insured_name

# Extract Estimated Annual Payroll
payroll_field_names = [
    'Total gross annual payroll',
    'Total gross payroll',
    'Estimated Annual Payroll',
    'Total gross or annual payroll'
]
estimated_payroll = extract_field(ocr_text, payroll_field_names)
# Clean up the payroll value by removing any non-digit characters except dot and comma
estimated_payroll = re.sub(r'[^\d.,]', '', estimated_payroll)
# Normalize the number (remove commas)
estimated_payroll = estimated_payroll.replace(',', '')
result["Estimated Annual Payroll"] = estimated_payroll

# Now, output the result as JSON
print(json.dumps(result, indent=2))

{
  "Date": "2024-12-14",
  "Insured": "Submission 2, EHT, NJ, 08234, Street",
  "Effective Start Date": "2024-12-14",
  "Effective End Date": "2025-12-14",
  "Deductible": "$5000",
  "Premium Rate": "",
  "Estimated Annual Payroll": "700000",
  "Policyholder/ Applicant\u2019s Printed Name": "Submission 2"
}


### Extract json from MEL Application using Google Document AI - Candidate SOlution 

In [10]:
# Import necessary libraries
import os
import re
import json
from datetime import datetime, timedelta
from google.cloud import documentai_v1beta3 as documentai

# ====== Configuration ======

# Path to your service account key JSON file
service_key_path = "/home/rhythm/Documents/Code/regenai/backend/credentials/regenai-service-key.json"

# Replace the following placeholders with your actual values
project_id = "regenai"                       # Your Google Cloud project ID
location = "eu"                              # Processor location, e.g., "us", "eu"
processor_id = "41265319209e4d75"            # Your Document AI processor ID
processor_version_id = None                  # Optional: Specify if using a specific processor version
pdf_file_path = "vtest11.pdf"                 # Path to the PDF file you want to process

# =============================

# Step 1: Set the environment variable for Google Application Credentials
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = service_key_path

# Step 2: Verify that the credentials file exists
if not os.path.isfile(service_key_path):
    raise FileNotFoundError(f"Credentials file not found at: {service_key_path}")

# Step 3: Initialize the Document AI client
client_options = {"api_endpoint": f"{location}-documentai.googleapis.com"}
client = documentai.DocumentProcessorServiceClient(client_options=client_options)

# Step 4: Define the processor name
if processor_version_id:
    # Full resource name of the processor version
    processor_name = client.processor_version_path(project_id, location, processor_id, processor_version_id)
else:
    # Full resource name of the processor
    processor_name = client.processor_path(project_id, location, processor_id)

# Step 5: Verify that the PDF file exists
if not os.path.isfile(pdf_file_path):
    raise FileNotFoundError(f"PDF file not found at: {pdf_file_path}")

# Step 6: Read the PDF file into memory
with open(pdf_file_path, "rb") as file:
    pdf_content = file.read()

# Step 7: Create a RawDocument
raw_document = documentai.RawDocument(content=pdf_content, mime_type="application/pdf")

# Step 8: Create the process request
request = documentai.ProcessRequest(
    name=processor_name,
    raw_document=raw_document,
)

# Step 9: Process the document
result = client.process_document(request=request)

# Step 10: Retrieve the document object
document = result.document

# =============================
# Extract text and parse required fields
# =============================

# Extract the full OCR text from the document
ocr_text = document.text

# Initialize the result dictionary
result_data = {
    "Date": "",  # Current date
    "Insured": "",  # Name and address
    "Effective Start Date": "",  # Today's date
    "Effective End Date": "",  # One year from today's date
    "Deductible": "$5000",  # Static value
    "Premium Rate": "",  # Leave blank
    "Estimated Annual Payroll": "",  # Total gross annual payroll
    "Policyholder/ Applicant’s Printed Name": ""  # Name from the Insured
}

# Set current date
current_date = datetime.now().date()
result_data["Date"] = current_date.strftime("%Y-%m-%d")

# Set Effective Start Date
result_data["Effective Start Date"] = current_date.strftime("%Y-%m-%d")

# Set Effective End Date (one year from today)
effective_end_date = current_date + timedelta(days=365)
result_data["Effective End Date"] = effective_end_date.strftime("%Y-%m-%d")

# Define a function to extract information based on possible field names
def extract_field(text, field_names, num_lines=1):
    lines = text.splitlines()
    for i, line in enumerate(lines):
        for field in field_names:
            if field.lower() in line.lower():
                # Extract the value(s) from the next 'num_lines' lines
                value_lines = []
                # Check if the value is on the same line after a colon
                if ':' in line:
                    value = line.split(':', 1)[1].strip()
                    if value:
                        value_lines.append(value)
                # If not, get the next 'num_lines' lines
                if not value_lines:
                    for j in range(1, num_lines + 1):
                        if i + j < len(lines):
                            next_line = lines[i + j].strip()
                            if next_line:
                                value_lines.append(next_line)
                            else:
                                break
                return ' '.join(value_lines).strip()
    return ''

# Extract Insured Name and Address
insured_name_field = [
    '1. Full Name of Insured',
    'Full Name of Insured',
    'Name of Insured',
    'Insured Name'
]
insured_address_field = [
    '2. Physical Address',
    'Physical Address',
    'Address'
]

# Extract Insured Name
insured_name = extract_field(ocr_text, insured_name_field)
result_data["Insured"] = insured_name
result_data["Policyholder/ Applicant’s Printed Name"] = insured_name

# Extract Insured Address (assume address spans up to 4 lines)
insured_address = extract_field(ocr_text, insured_address_field, num_lines=4)
if insured_address:
    result_data["Insured"] += ', ' + insured_address

# Extract Estimated Annual Payroll
payroll_field_names = [
    'Total gross annual payroll',
    'Total gross payroll',
    'Estimated Annual Payroll',
    'Total gross or annual payroll',
    'Total All'
]
estimated_payroll = extract_field(ocr_text, payroll_field_names)
# Clean up the payroll value by removing any non-digit characters except dot and comma
estimated_payroll = re.sub(r'[^\d.,]', '', estimated_payroll)
# Normalize the number (remove commas)
estimated_payroll = estimated_payroll.replace(',', '')
result_data["Estimated Annual Payroll"] = estimated_payroll

# Output the result as JSON
print(json.dumps(result_data, indent=2))

{
  "Date": "2024-12-14",
  "Insured": ", Los Alamitos California 90720 3",
  "Effective Start Date": "2024-12-14",
  "Effective End Date": "2025-12-14",
  "Deductible": "$5000",
  "Premium Rate": "",
  "Estimated Annual Payroll": "142000",
  "Policyholder/ Applicant\u2019s Printed Name": ""
}
