## Invoice processing in bulk

In [1]:
import os
from PyPDF2 import PdfReader
from docx import Document

# Helpers for text extraction: visitor_body (defined below) decides which PDF text fragments are kept.


# Returns a new list containing only the strings that are not empty or whitespace-only
def remove_empty_strings(string_list):
    """Return a new list without the empty or whitespace-only entries.

    Args:
        string_list: Iterable of strings to filter.

    Returns:
        list: The entries whose ``strip()`` result is non-empty, in their
        original order and otherwise unmodified (surrounding whitespace of
        kept entries is preserved).
    """
    kept = []
    for candidate in string_list:
        # strip() is only used as the emptiness test; the original value is kept.
        if candidate.strip():
            kept.append(candidate)
    return kept


# Shared buffer: every text fragment accepted by visitor_body is appended here.
parts = []


def visitor_body(text, cm, tm, fontDict, fontSize):
    """Collect a text fragment into ``parts`` when its y value is in range.

    Passed as the ``visitor_text`` callback of ``page.extract_text``.

    Args:
        text: The text fragment being visited.
        cm: Unused here (presumably the current transformation matrix).
        tm: Indexable sequence; element 5 is read as the fragment's y value
            (presumably the vertical offset of the text matrix).
        fontDict: Unused here.
        fontSize: Unused here.

    Returns:
        None. The only effect is appending ``text`` to the module-level
        ``parts`` list.
    """
    vertical_pos = tm[5]
    # Narrow this window (e.g. 350 < y < 720) to keep only part of the page,
    # such as a title area. Both bounds are exclusive.
    if not (1 < vertical_pos < 1000):
        return
    parts.append(text)


def extractTextPDForDOCX(file_path, filename):
    """Return the text of a PDF or Word document, one fragment per line.

    Args:
        file_path: Path of the file to open.
        filename: Name used to decide the file type from its extension
            (``.pdf`` or ``.docx``, matched case-insensitively).

    Returns:
        str: For PDFs, every non-blank fragment collected by ``visitor_body``
        followed by a newline. For Word documents, every paragraph's text
        followed by a newline. An empty string for any other extension.
    """
    lowered_name = filename.lower()

    if lowered_name.endswith(".pdf"):
        # visitor_body appends to the module-level `parts` list. Empty it first,
        # otherwise text from previously processed PDFs leaks into this result.
        parts.clear()
        with open(file_path, "rb") as pdf_file:
            for page in PdfReader(pdf_file).pages:
                # The return value is not needed; the visitor collects the text.
                page.extract_text(visitor_text=visitor_body)
        fragments = remove_empty_strings(parts)
        return "".join(fragment + "\n" for fragment in fragments)

    if lowered_name.endswith(".docx"):
        doc = Document(file_path)
        return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)

    # Unsupported file type.
    return ""

In [2]:
import openai
import os

from dotenv import load_dotenv, find_dotenv

# Load variables from the .env file located by find_dotenv() into the process
# environment. This must run before the key is read below.
_ = load_dotenv(find_dotenv())

# Read the API key from the environment instead of hard-coding it in the notebook.
openai.api_key = os.getenv("OPENAI_API_KEY")
# Keep this disabled: printing the key would store it in the saved cell output.
# print(openai.api_key )

# Client interface of the openai 1.x library (openai>=1.0.0).
# get_completion (defined below) sends its requests through this object.
client = openai.OpenAI()

# model="gpt-3.5-turbo"
# model="gpt-4o-mini"


def get_completion(prompt, model="gpt-4o-mini"):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Text placed in the ``content`` of the one user message.
        model: Name of the chat model to use. Defaults to ``"gpt-4o-mini"``.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    completion = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [3]:
import os

# Extract text from all pdf and docx files in a folder.
# The folder can be overridden with the INVOICE_FOLDER environment variable so the
# notebook is not tied to one machine; the original location stays the default.
folder_path = os.environ.get(
    "INVOICE_FOLDER",
    "/Users/mahtabsyed/Library/Mobile Documents/com~apple~CloudDocs/All Family Storage/1 - Career - Mahtab/UST/Clients/JPMorgan/Data/Invoices/",
)

# One extracted text per invoice file, in file-name order.
invoice_text_arr = []

# os.listdir returns names in arbitrary order; sort them so that re-running the
# notebook processes (and prints) the invoices in the same order every time.
for filename in sorted(os.listdir(folder_path)):
    if filename.endswith((".pdf", ".docx")):
        file_path = os.path.join(folder_path, filename)
        invoice_text = extractTextPDForDOCX(file_path, filename)
        invoice_text_arr.append(invoice_text)

# Show each extracted text between separator lines for a quick visual check.
for invoice_text in invoice_text_arr:
    print("**************")
    print(invoice_text)
    print("**************")

**************
McCarthy Plumbing Group
6a/8 Supertron Court
Laverton North 3026
PO Box 7207
Point Cook 3030
Tel. 03 9931 0905
www.mccarthyplumbinggroup.
com.au
ABN 29422604042
Licence # 47867
PLEASE PAY BY
06/03/2024
AMOUNT
$2,322.09
INVOICE DATE
06/03/2024
TAX INVOICE NO. 14544 - FINAL CLAIM
Mahtab Syed
32 Townley Boulevard
Werribee VIC 3030
Order No.:
Site Contact:
Mahtab Syed
Site Address:
32 Townley Boulevard
Werribee VIC 3030
Job Name:
RPZ and bidet installation
Description
06/03/2024
Attended to carry out works as per Quote No 5626
copy of test report attached, which has been forwarded to Greater Western Water 
This payment claim is made under the Building and Construction
Industry Security of Payment Act 2002
Thank you.
Sub-Total ex GST

GST

Total inc GST

Amount Applied

Balance Due

This payment claim is made under the Building and Construction Industry Security of Payment Act 2002
How To Pay
INVOICE NO. 14544
Mail
Detach this section an

In [4]:
# Process each invoice text: ask the model for the key fields as JSON and for a
# short email, then print the reply between separator lines.
for invoice_text in invoice_text_arr:
    # Plain multi-line f-string, no backslash line continuations: the earlier
    # version ended with a backslash followed by spaces (an invalid escape
    # sequence) and glued the example lines into one run. Literal braces of the
    # JSON example are doubled because this is an f-string.
    prompt = f"""
    As a Financial Analyst you are processing invoices.
    Extract the information listed below from the invoice delimited by triple quotes,
    then write a short email to the vendor to ask for the payment in a polite manner.
    If you see any issues, ask them to call back.

    Invoice: '''{invoice_text}'''

    Follow these steps:
    Step 1 - Extract the information in JSON format only, using this example for the format:
    {{
      "Name": "XYZ",
      "Invoice Number": "123",
      "Invoice Date": "2022-01-01",
      "Amount": "1000",
      "Due Date": "2022-02-01",
      "Vendor Email": "vendor@example.com"
    }}

    Step 2 - Write a polite email to the vendor asking for the payment by the Due Date and, if it is overdue, request a call back.
    Add the signature "Invoice Processing Team @ NRMA".
    """
    print("##############")
    response = get_completion(prompt)
    print(response)
    print("")
    print("##############")

##############
```json
{
  "Name": "Mahtab Syed",
  "Invoice Number": "14544",
  "Invoice Date": "2024-03-06",
  "Amount": "2322.09",
  "Due Date": "2024-03-06",
  "Vendor Email": "info@mccarthyplumbinggroup.com.au"
}
```

---

Subject: Payment Reminder for Invoice #14544

Dear Mahtab Syed,

I hope this message finds you well. This is a friendly reminder regarding the payment for Invoice #14544, which is due today, March 6, 2024, in the amount of $2,322.09.

If you have already processed this payment, please disregard this message. However, if there are any issues or if you require further assistance, please do not hesitate to call us at your earliest convenience.

Thank you for your attention to this matter.

Best regards,

Invoice Processing Team  
@ NRMA

##############
