In [2]:
import ast
import json
import os
import re

import openai
import pandas as pd
from dotenv import load_dotenv, find_dotenv
from langchain.agents.agent_types import AgentType
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from pypdf import PdfReader
# Load variables from the .env file located by find_dotenv() (if any) into the
# process environment; the return value is intentionally discarded.
_ = load_dotenv(find_dotenv())
# Indexing os.environ raises KeyError when OPENAI_API_KEY is not set.
openai.api_key  = os.environ['OPENAI_API_KEY'] 

In [5]:

def main(pdf_files=None, output_csv="CSV_Bills.csv"):
    """Extract invoice fields from PDF bills, print the mean amount, and save a CSV.

    Args:
        pdf_files: Iterable of PDF paths passed to ``create_docs``. When
            ``None``, the three sample bills ``bill1.pdf``..``bill3.pdf``
            are used.
        output_csv: Path of the CSV file to write (UTF-8, no index column).

    Returns:
        None. Progress, the average amount, and any error are printed; the
        first rows of the extracted frame are shown with ``display``.

    Any exception raised while extracting, converting, or writing is caught
    and reported with ``print`` so that the cell does not end in a traceback.
    """
    print("Bill Extractor AI Assistant...")

    # Upload Bills
    print("Upload your bills in PDF format only")
    if pdf_files is None:
        pdf_files = ["bill1.pdf", "bill2.pdf", "bill3.pdf"]  # Replace with your PDF file paths

    try:
        extracted = create_docs(pdf_files)
        if extracted.empty:
            # Nothing to average or export; also avoids using `.str` on the
            # empty frame's integer-typed AMOUNT column.
            print("No data could be extracted from the supplied PDFs.")
            return
        display(extracted.head())

        # AMOUNT may hold strings such as "$2,500.00" or bare numbers,
        # depending on what the LLM returned. Normalise to text first so the
        # `.str` accessor is always valid, strip everything except digits,
        # sign and decimal point, then coerce unparseable values to NaN
        # instead of aborting the whole run.
        amount_text = extracted["AMOUNT"].astype(str).str.replace(r"[^\d\-+\.]", "", regex=True)
        data_frame = extracted.assign(AMOUNT=pd.to_numeric(amount_text, errors="coerce"))

        unparsed_count = int(data_frame["AMOUNT"].isna().sum())
        if unparsed_count:
            print(f"Warning: {unparsed_count} amount(s) could not be parsed and were left blank")
        print("Average bill amount: ", data_frame['AMOUNT'].mean())

        # Convert to CSV
        convert_to_csv = data_frame.to_csv(index=False).encode("utf-8")

        print("Download Extracted Data as CSV")
        with open(output_csv, "wb") as f:
            f.write(convert_to_csv)
        print("Success!!")
    except Exception as e:  # Catch any exceptions
        print(f"An error occurred: {str(e)}")

# Extract Info from PDF file
def get_pdf_text(pdf_doc):
    """Return the text of every page in ``pdf_doc`` concatenated in page order.

    Args:
        pdf_doc: Whatever ``PdfReader`` accepts as its source argument.

    Returns:
        A single string made of each page's ``extract_text()`` result joined
        with no separator; an empty string when the reader has no pages.
    """
    reader = PdfReader(pdf_doc)
    page_texts = [pdf_page.extract_text() for pdf_page in reader.pages]
    return "".join(page_texts)

# Extract data from text
def extract_data_from_llm(pages_data, temperature=0):
    """Ask the OpenAI completion model to pull the invoice fields out of raw text.

    Args:
        pages_data: Text of one bill (as produced by ``get_pdf_text``).
        temperature: Sampling temperature passed to ``OpenAI``. Defaults to 0
            so that repeated runs over the same bill give the same fields.

    Returns:
        The raw model reply as returned by the ``llm(...)`` call, or ``None``
        when ``pages_data`` is empty/whitespace or the call raises.
    """
    # With no input text the model could only echo the example from the
    # prompt, which would then be recorded as if it were a real bill.
    if not pages_data or not str(pages_data).strip():
        print("No text to extract data from; skipping LLM call")
        return None

    # The example lists exactly the requested fields, each once, under the
    # same key names the prompt asks for. Doubled braces are literal braces
    # in the template; {pages} is the only placeholder.
    template = """Extract all the following values : Invoice ID, DESCRIPTION, Issue Date,
             UNIT PRICE, AMOUNT, Bill For, From and Terms from: {pages}

             Expected output: remove any dollar symbols {{'Invoice ID': '1001329','DESCRIPTION': 'Office Chair','Issue Date': '5/4/2023','UNIT PRICE': '1100.00','AMOUNT': '1100.00', 'Bill For': 'james', 'From': 'excel company', 'Terms': 'pay this now'}}
             """
    prompt_template = PromptTemplate(input_variables=["pages"], template=template)
    llm = OpenAI(temperature=temperature)
    try:
        full_response = llm(prompt_template.format(pages=pages_data))
        return full_response
    except Exception as e:  # Handle OpenAI API errors
        print(f"Error calling OpenAI: {str(e)}")
        return None

def clean_extracted_data(extracted_text):
    """Parse the brace-delimited part of an LLM reply into a dictionary.

    The reply is searched for the span running from its first ``{`` to its
    last ``}``. That span is parsed as a Python literal and, if that fails,
    as JSON. No code from the reply is ever executed.

    Args:
        extracted_text: Raw text returned by the LLM.

    Returns:
        The parsed ``dict``, or ``None`` when the input is not a string, has
        no ``{...}`` span, cannot be parsed, or parses to something that is
        not a dict.
    """
    if not isinstance(extracted_text, str):
        return None

    # Greedy match with DOTALL: first "{" through last "}", across newlines.
    match = re.search(r'{(.+)}', extracted_text, re.DOTALL)
    if not match:
        return None
    literal_text = match.group(0)

    try:
        # literal_eval accepts only literals (strings, numbers, dicts, ...),
        # unlike eval, which would run arbitrary expressions that a crafted
        # PDF could get the model to emit.
        data_dict = ast.literal_eval(literal_text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as literal_error:
        try:
            # Fall back to JSON for replies using null/true/false.
            data_dict = json.loads(literal_text)
        except ValueError:
            print(f"Failed to parse extracted data: {str(literal_error)}")
            return None

    if not isinstance(data_dict, dict):
        # e.g. "{1, 2}" is a valid set literal but not a record.
        print(f"Failed to parse extracted data: expected a dict, got {type(data_dict).__name__}")
        return None
    return data_dict

# Create documents from the uploaded pdfs
def create_docs(user_pdf_list):
    """Run every PDF through text extraction, the LLM, and parsing; tabulate the results.

    Args:
        user_pdf_list: Iterable of PDF sources, each passed to ``get_pdf_text``.

    Returns:
        A ``pandas.DataFrame`` with one row per successfully parsed bill.
        The eight expected invoice columns always come first, in a fixed
        order; any additional keys returned for a bill follow them. When no
        bill could be parsed, an empty frame with the eight typed columns is
        returned.

    A file that cannot be read, extracted, or parsed is reported with
    ``print`` and skipped; the remaining files are still processed.
    """
    # Expected columns and the dtypes used for the empty-result frame.
    schema = {
        'Invoice ID': 'int',
        'DESCRIPTION': 'str',
        'Issue Date': 'str',
        'UNIT PRICE': 'str',
        'AMOUNT': 'int',
        'Bill For': 'str',
        'From': 'str',
        'Terms': 'str',
    }

    # Collect plain dicts and build the frame once at the end, rather than
    # concatenating a new one-row frame per file.
    records = []
    for filename in user_pdf_list:
        print(filename)
        try:
            raw_data = get_pdf_text(filename)
        except Exception as e:  # One unreadable PDF must not abort the batch
            print(f"Failed to read PDF: {filename} ({str(e)})")
            continue

        llm_extracted_data = extract_data_from_llm(raw_data)
        if not llm_extracted_data:
            print(f"Failed to extract data from PDF: {filename}")
            continue

        cleaned_data = clean_extracted_data(llm_extracted_data)
        if isinstance(cleaned_data, dict) and cleaned_data:
            records.append(cleaned_data)
        else:
            print(f"Failed to clean data for file: {filename}")

    print("********************DONE***************")

    if not records:
        return pd.DataFrame({column: pd.Series(dtype=kind) for column, kind in schema.items()})

    df = pd.DataFrame(records)
    extra_columns = [column for column in df.columns if column not in schema]
    # reindex adds any expected column the LLM omitted (filled with NaN).
    return df.reindex(columns=list(schema) + extra_columns)




# Run the pipeline; main() prints its progress and writes CSV_Bills.csv by default.
main()




Bill Extractor AI Assistant...
Upload your bills in PDF format only
bill1.pdf
bill2.pdf
bill3.pdf
********************DONE***************

   Invoice ID          DESCRIPTION  Issue Date UNIT PRICE     AMOUNT  \
0           2  Phone and data bill  11/27/2026    $500.00    $500.00   
1     1001329         Condo Rental  11/27/2026  $2,500.00  $2,500.00   
2        1000     Water and Sewage  11/27/2026    $134.00    $134.00   

     Bill For                                               From  \
0  Paul Regex      DR-TeleP 1583 E. TanneVa Ln Nekaspo, WE 99010   
1  Paul Regex  DR-TeleP 1583 E. TanneVa Ln Nekaspo, WE 99010 ...   
2  Paul Regex               1583 E. TanneVa Ln Nekaspo, WE 99010   

              Terms  
0  Due upon receipt  
1  Due upon receipt  
2  Due upon receipt  

Average bill amount:  1044.6666666666667
Download Extracted Data as CSV
Success!!

