In [None]:
# @title Install Libraries

In [None]:
# Install the notebook's third-party dependencies with the %pip magic.
# NOTE(review): no versions are pinned, so a fresh run may resolve different releases — consider pinning.
%pip install transformers pytesseract pdfminer.six python-docx pandas
%pip install PyPDF2
# NOTE(review): presumably the PyPI package `tesseract` is not the Tesseract OCR engine itself — confirm whether this line is needed.
%pip install tesseract
# llama_index, langchain-community and faiss-cpu are only referenced from commented-out code in the visible cells.
%pip install llama_index
%pip install langchain-community
%pip install faiss-cpu
%pip install --upgrade --quiet  langchain-google-genai



In [None]:
# @title Set API Key

In [None]:
import os
import json

# Load the JSON key file; it must contain a "GOOGLE_API_KEY" entry.
with open('/content/key.json', 'r') as file:
    config = json.load(file)
# Export the key as an environment variable for this kernel.
# NOTE: `config` is a notebook-level name; a later cell rebinds `config` to the
# Gemini generation settings returned by get_model().
os.environ["GOOGLE_API_KEY"] = config["GOOGLE_API_KEY"]

In [None]:
# @title Define Functions to read pdf, image and docx files

In [None]:
import pytesseract
from PIL import Image
from PyPDF2 import PdfReader
import docx
import pandas as pd
from transformers import pipeline
from pathlib import Path

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_path: Value handed to ``PdfReader`` (the notebook passes a file path).

    Returns:
        str: Page texts concatenated in page order, with leading and trailing
            whitespace stripped. Pages that yield no text contribute nothing.
    """
    reader = PdfReader(pdf_path)
    # A page without extractable text may yield a falsy value (None in some
    # PyPDF2 releases — TODO confirm for the installed version); ``or ""``
    # keeps one such page from raising TypeError for the whole document.
    page_texts = [page.extract_text() or "" for page in reader.pages]
    return "".join(page_texts).strip()

# Function to package an image file (MIME type + raw bytes) for a Gemini request
def image_format(image_path):
    """Package an image file as an inline-data part for a model request.

    Args:
        image_path: Location of the image on disk (str or path-like).

    Returns:
        list[dict]: A single-element list whose dict holds the image's
            ``mime_type`` and its raw bytes under ``data``.

    Raises:
        FileNotFoundError: If ``image_path`` does not exist.
    """
    import mimetypes  # local import so the block does not depend on another cell

    img = Path(image_path)

    if not img.exists():
        raise FileNotFoundError(f"Could not find image: {img}")

    # Derive the MIME type from the extension so .jpg/.jpeg files are not
    # labelled as PNG; keep "image/png" as the fallback for unknown extensions.
    guessed_type, _ = mimetypes.guess_type(img.name)
    if guessed_type and guessed_type.startswith("image/"):
        mime_type = guessed_type
    else:
        mime_type = "image/png"

    return [
        {
            "mime_type": mime_type,
            "data": img.read_bytes()
        }
    ]

# Function to extract text from a Word document
def extract_text_from_docx(docx_path):
    """Read a Word document and return the text of its paragraphs.

    Args:
        docx_path: Value handed to ``docx.Document`` (the notebook passes a file path).

    Returns:
        str: The ``text`` of each paragraph joined with newlines, with
            leading and trailing whitespace stripped.
    """
    document = docx.Document(docx_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    combined = "\n".join(paragraph_texts)
    return combined.strip()

In [None]:
# @title Setup Gemini model

In [None]:
# from llama_index.core.text_splitter import TokenTextSplitter
# from langchain_google_genai import GoogleGenerativeAIEmbeddings
# import faiss
# from langchain_community.vectorstores import FAISS
import google.generativeai as genai

def get_model(data):
  """Build the Gemini model and the generation settings used with it.

  Args:
    data: Not used by the current implementation (the indexing step that
      would have consumed it is disabled); kept so existing calls still work.

  Returns:
    tuple: ``(model, config)`` — a ``genai.GenerativeModel`` for
      'gemini-1.5-flash' created with the safety settings below, and the
      generation-config dict.
  """
  # Every listed harm category uses the same blocking threshold.
  harm_categories = (
      "HARM_CATEGORY_HARASSMENT",
      "HARM_CATEGORY_HATE_SPEECH",
      "HARM_CATEGORY_SEXUALLY_EXPLICIT",
      "HARM_CATEGORY_DANGEROUS_CONTENT",
  )
  safety_settings = [
      {"category": category, "threshold": "BLOCK_MEDIUM_AND_ABOVE"}
      for category in harm_categories
  ]

  model = genai.GenerativeModel('gemini-1.5-flash', safety_settings=safety_settings)
  config = dict(max_output_tokens=2048, temperature=0, top_p=1, top_k=32)

  return model, config

In [None]:
# @title Define Prompt

In [None]:
# Instruction text placed first in every request this notebook sends to the model
# (see gemini_output() and the image branch of get_info()). It asks for the document
# to be returned as DataFrame-friendly JSON plus an answer to the user's question.
# NOTE(review): the text contains typos (e.g. "realted"); they are left untouched
# because this string is sent to the model verbatim and edits may change its output.
prompt = """
You are an advanced document processing system.Handle all kind of text and convert to json.
Your task is to extract and organize data from the provided document into a structured format suitable for conversion into a DataFrame.
Instructions:
Identify and extract relevant information such as:Invoice Date,Total Amount,Item Descriptions,
Any other important details. also include other details if you found from documents.
Also In case of, document text have general information realted to Industry use cases etc.
then convert text into a structured JSON format, organizing it by industry use cases
and their associated tasks:
Return the extracted data in JSON format,
ensuring that each key accurately represents the corresponding piece of information,
and that values are correctly paired.
Also return answer to user questions.
Only give data from provided text only. Do not include internet or random data.
The JSON structure should be well-formed, with keys that are appropriately named for easy conversion into a DataFrame.
"""

In [None]:

def generate(prompt):
    """Stream a model response for ``prompt`` and return it as one string.

    Reads the notebook-level names ``model`` and ``config``; both must be
    bound (e.g. from ``get_model``) before this function is called.

    Args:
        prompt: Content passed unchanged to ``model.generate_content``.

    Returns:
        str: The ``text`` of every streamed chunk, concatenated in arrival order.
    """
    chunk_stream = model.generate_content(
        prompt,
        generation_config=config,
        stream=True,
    )
    return "".join(chunk.text for chunk in chunk_stream)

# Take the API key from the GOOGLE_API_KEY environment variable (set in the
# "Set API Key" cell) instead of embedding the secret in the notebook source.
# SECURITY: the key that was previously hard-coded on this line has been exposed
# and should be revoked and replaced.
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

def gemini_output(data,prompt,user_question):
    """Ask the model about ``data`` and return the generated text.

    Args:
        data: Document content to analyse.
        prompt: Instruction text placed first in the request.
        user_question: Question placed last in the request.

    Returns:
        Whatever ``generate`` returns for the list ``[prompt, data, user_question]``.
    """
    return generate([prompt, data, user_question])

In [None]:
import io
import pandas as pd
from google.colab import files
import json

def upload_file():
  """Prompt for an upload through Colab and return the first uploaded name.

  Returns:
    The first key of the mapping returned by ``files.upload()``.

  Raises:
    IndexError: If the upload mapping is empty.
  """
  print("Choose a single file")
  uploaded_names = list(files.upload().keys())
  return uploaded_names[0]

def get_info(filename,user_question):
  """Run the extraction prompt against an uploaded PDF, image or Word file.

  Args:
    filename: Name of a file under ``/content/``. Its extension, matched
      case-insensitively, selects the handler (.pdf, .png/.jpg/.jpeg, .docx).
    user_question: Question sent after the prompt and the document content.

  Returns:
    The model's response text, or None when the file type is unsupported or
    no text could be extracted from a .docx file.
  """
  # generate() reads the notebook-level `model` and `config`. Binding them as
  # locals here never reached it, so they are declared global instead.
  global model, config

  file_path = '/content/' + filename
  # Compare against a lower-cased name so e.g. "Bill.PDF" is still recognised.
  lowered_name = filename.lower()
  data = None
  if lowered_name.endswith('.pdf'):
    pdf_text = extract_text_from_pdf(file_path)
    model, config = get_model(pdf_text)
    data = gemini_output(pdf_text, prompt, user_question)
  elif lowered_name.endswith(('.png', '.jpg', '.jpeg')):
    image_info = image_format(file_path)
    model, config = get_model(image_info)
    input_prompt = [prompt, image_info[0], user_question]
    # Use the same generation settings as the text path.
    response = model.generate_content(input_prompt, generation_config=config)
    data = response.text
  elif lowered_name.endswith('.docx'):
    text_data = extract_text_from_docx(file_path)
    if text_data:
      model, config = get_model(text_data)
      data = gemini_output(text_data, prompt, user_question)
    else:
      # This message belongs to the empty-document case, not to unknown file types.
      print("Failed to extract text from the document.")
  else:
    print(f"Unsupported file type: {filename}")
  if data is None:
      print("No data was generated or extracted.")

  return data

In [None]:
# @title Operations for pdf file

In [None]:
# Ask for a file and keep the returned name in the notebook-level `filename` for the next cell.
filename = upload_file()

Choose a single file


Saving Bill-4963276.pdf to Bill-4963276 (14).pdf


In [None]:
# Run the extraction on the file named by `filename` and ask one question about it.
user_question = "What is total amount?"
data = get_info(filename,user_question)
from IPython.display import Markdown
# Last expression of the cell: renders the response text as Markdown.
Markdown(data)

2024-09-07 11:44:57.614 200 POST /v1beta/models/gemini-1.5-flash:streamGenerateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 9542.27ms


```json
{
  "Invoice_Details": {
    "Invoice_Number": "74485",
    "Invoice_Period": "July2024 To September2024",
    "Invoice_Date": "01 Jul 2024",
    "Due_Date": "30 Jul 2024",
    "Total_Amount": "16,788.85"
  },
  "Owner_Details": {
    "Owner_Name": "LT COL NAIR PK CHANDRAHASAN",
    "Owner_Address": null,
    "House": "E18-0903",
    "Area": "2055"
  },
  "Tenant_Details": {
    "Tenant_Name": "Sonal Singh",
    "Tenant_Address": null
  },
  "Society_Details": {
    "Society_Name": "Sandeep Vihar Owners Welfare Association",
    "Society_Address": "Sandeep Vihar, Kannamangala P.O. Bangalore,Bangalore,Karnataka-560115",
    "Phone": "+91 80 49562570",
    "Society_Reg_No": "DRB-3/SOR/497/2016-17",
    "GST_Number": "29AAPAS4383L1ZK",
    "PAN_Number": "AAPAS4383L"
  },
  "Maintenance_Charges": [
    {
      "Description": "SVOWA, MAINTENANCE CHARGES",
      "HSN/SAC": null,
      "Amount": "15,234.00"
    },
    {
      "Description": "SVCC, MAINTENANCE CHARGES",
      "HSN/SAC": null,
      "Amount": "1,350.00"
    },
    {
      "Description": "Penalty/Interest for batch id 4556079 inv #69528 (calculation: 01-05-2024 to 25-05-2024 is Rs 204.85 )",
      "HSN/SAC": null,
      "Amount": "204.85"
    }
  ],
  "Payment_Instructions": {
    "Option_1": {
      "Method": "MyGate App",
      "Steps": [
        "Go To MyGate App OR Click on the link given in the Invoice Email that you received.",
        "Click on COMMUNITY button.",
        "Next Screen, click on SOCIETY DUES",
        "You will see MMC Invoice(s) with NEW mentioned in blue against it. It will have a PAY NOW button. Click that.",
        "You will now see the TOTAL AMOUNT and PAYMENT TYPE (UPI/NetBanking/Debit Card/Credit Card/Wallets). Click your choice and CHECKOUT.",
        "Complete the payment cycles including OTP etc.",
        "Automatically, it will generate a receipt and send it to your email ID and simultaneously update your Apartment Account Books."
      ]
    },
    "Option_2": {
      "Method": "SBI Collect Portal",
      "Steps": [
        "Go To https://www.onlinesbi.com/sbicollect/icollecthome.htm?corpID=3762566 which is the SBI Collect Portal",
        "Click on I Accept button and then click on \"PROCEED\" button",
        "Next Page enter State of Corporate/ Institution= Karnataka (from drop down) and Type of Corporate/ Institution = Others (from drop down)",
        "Next page choose in the Others Name dropdown value= \"Sandeep Vihar Apartment Owners Association\". Click SUBMIT",
        "Next Screen is the Sandeep Vihar page , in dropdown menu select  Maintenance Dues MMC",
        "Select your TOWER. Then Select your Apartment No. Remember Penthouse owners must not use 1301 to 1404 , instead they have to use Penthouse references 0A01, 0B02, 0C01 etc.",
        "Enter the remaining details including AMOUNT",
        "Remember to give your Mobile No, Name, Email ID etc if you have to get a receipt. Click SUBMIT",
        "On the next page choose the payment option and make the payment. Complete the payment cycles including OTP etc.",
        "Ensure you record the DU Reference generated..",
        "Automatically, it will generate a receipt and send it to your email ID."
      ]
    }
  },
  "Notes": "Dues not paid within stipulated time are subject to Penalty/Interest as per Society laws."
}
```

**Answer to the user question:**

The total amount is **16,788.85**. 


In [None]:
# @title Operations for pdf file

In [None]:
# Upload a second file and repeat the same question against it.
filename = upload_file()
user_question = "What is total amount?"
data = get_info(filename,user_question)
from IPython.display import Markdown
# Last expression of the cell: renders the response text as Markdown.
Markdown(data)

Choose a single file


Saving I2425_67766393_1723724302835.pdf to I2425_67766393_1723724302835 (1).pdf


2024-09-07 11:45:50.625 200 POST /v1beta/models/gemini-1.5-flash:streamGenerateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 5742.23ms


```json
{
  "Invoice Details": {
    "Invoice Date": "15/08/2024",
    "Invoice No": "I2425/67766393",
    "Bill to": "SONAL SINGH",
    "Total Amount": "252.00",
    "Total invoice value (In words)": "Rs Two Hundred Fifty-two Only",
    "Items": [
      {
        "S.No": 1,
        "Item Description": "E-Recharge",
        "SAC": "99715821",
        "Amount": "211.02",
        "Discount": "--",
        "Taxable Value": "211.02",
        "IGST": "1837.98",
        "CGST": "00.00",
        "SGST": "00.00"
      },
      {
        "S.No": 2,
        "Item Description": "Platform Fee",
        "SAC": "999799",
        "Amount": "2.54",
        "Discount": "--",
        "Taxable Value": "2.54",
        "IGST": "180.46",
        "CGST": "00.00",
        "SGST": "00.00"
      }
    ]
  },
  "Company Details": {
    "Company Name": "PhonePe Private Limited",
    "GSTIN": "29AACCF1132H2ZX",
    "CIN": "U67190MH2012PTC337657",
    "PAN": "AACCF1132H",
    "STATE CODE": "29",
    "Address": "4th,5th and 6th Floor, 80/1, 81/1 and 81/2, Salarpuria Softzone Varthur Hobli Bangalore Karnataka - 560037"
  },
  "Payment Details": {
    "Transaction Id": "NX24081517345611390667191",
    "Transaction Date": "Thu Aug 15 17:35:00 IST 2024",
    "Amount of payment": "252.00",
    "Mode of payment": "[UPI]"
  }
}
```

**Answer to user question:**

The total amount is Rs 252.00. 


In [None]:
# @title Operations for pdf file

In [None]:
# Calls the extraction helpers directly on a fixed path instead of going through
# upload_file()/get_info(). The file must already exist at that location.
user_question = "What is total amount?"
pdf_text = extract_text_from_pdf('/content/MH011516161-MHW24OCS0192936 (4).PDF')
# Binds the notebook-level `model` and `config`, which generate() reads.
model,config = get_model(pdf_text)
data = gemini_output(pdf_text, prompt,user_question)
# `Markdown` is imported in an earlier cell, which must have been run first.
Markdown(data)

2024-09-07 11:55:14.593 200 POST /v1beta/models/gemini-1.5-flash:streamGenerateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 4201.83ms


```json
{
  "Invoice_Details": {
    "GSTIN": "29AAGCM5933R2ZK",
    "Hospital_No": "MH011516161",
    "Name": "MRS MANJU LATA SINGH",
    "Age/Sex": "73/Female",
    "Address": "Sandeep Vihar",
    "OPD_No": "O01001888633",
    "OPD_Date": "12/08/2024",
    "Bill_No": "MHW24OCS0192936",
    "Bill_Date": "12/08/2024 06:00PM",
    "Department": "INTERNAL MEDICINE",
    "Doctor": "Dr Adithi Nagaraju"
  },
  "Items": [],
  "Total_Amount": "0.00",
  "Amount_Paid": "0.00",
  "Generated_By": "Kavitha S"
}
```

**Answer to the question:** The total amount is 0.00. 


In [None]:
# @title Operations for docx file

In [None]:
# Upload a Word document and ask a free-form question about its content.
filename = upload_file()
user_question = "Tell me about Healthcare industry"
data = get_info(filename,user_question)
from IPython.display import Markdown
# Last expression of the cell: renders the response text as Markdown.
Markdown(data)

Choose a single file


Saving Applicability of assignment in differenct sectors.docx to Applicability of assignment in differenct sectors (3).docx


2024-09-07 11:51:41.404 200 POST /v1beta/models/gemini-1.5-flash:streamGenerateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 3746.29ms


```json
{
  "Healthcare": {
    "Industry Use Cases": [
      "Medical Record Digitization",
      "Clinical Research",
      "Insurance Claims Processing"
    ],
    "Tasks": [
      "Converting unstructured medical records (e.g., doctor's notes, prescriptions) into structured data for electronic health records (EHRs).",
      "Extracting relevant information from medical literature and patient records for research studies.",
      "Automating the extraction of information from insurance claims to expedite the claims process."
    ]
  }
}
```

**Answer to your question:**

The Healthcare industry utilizes data structurization and extraction for various purposes, including:

* **Medical Record Digitization:** Converting handwritten or scanned medical records into structured data for electronic health records (EHRs). This allows for easier access, analysis, and sharing of patient information.
* **Clinical Research:** Extracting relevant information from medical literature and patient records for research studies. This helps researchers identify trends, develop new treatments, and improve patient care.
* **Insurance Claims Processing:** Automating the extraction of information from insurance claims to expedite the claims process. This reduces manual effort and improves efficiency in processing claims. 


In [None]:
# @title Operations for Image

In [None]:
# Upload an image and ask the same total-amount question; get_info() selects
# the image handler from the file extension.
filename = upload_file()
user_question = "What is total amount?"
data = get_info(filename,user_question)
from IPython.display import Markdown
# Last expression of the cell: renders the response text as Markdown.
Markdown(data)

Choose a single file


Saving invoice-template-us-neat-750px.png to invoice-template-us-neat-750px (5).png


2024-09-07 11:51:58.092 200 POST /v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 4808.13ms


```json
{
  "invoice_details": {
    "Invoice_Date": "11/02/2019",
    "Invoice_Number": "US-001",
    "P.O. Number": "2312/2019",
    "Due_Date": "26/02/2019",
    "Bill_To": "John Smith\n2 Court Square\nNew York, NY 12210",
    "Ship_To": "John Smith\n3787 Pineview Drive\nCambridge, MA 12210",
    "Total_Amount": "$154.06"
  },
  "item_details": [
    {
      "QTY": 1,
      "Description": "Front and rear brake cables",
      "Unit_Price": 100.00,
      "Amount": 100.00
    },
    {
      "QTY": 2,
      "Description": "New set of pedal arms",
      "Unit_Price": 15.00,
      "Amount": 30.00
    },
    {
      "QTY": 3,
      "Description": "Labor 3hrs",
      "Unit_Price": 5.00,
      "Amount": 15.00
    }
  ],
  "other_details": {
    "Subtotal": 145.00,
    "Sales_Tax": "6.25%",
    "Sales_Tax_Amount": 9.06
  },
  "terms_and_conditions": "Payment is due within 15 days\nPlease make checks payable to: East Repair Inc."
}
```
The total amount is $154.06.