In [1]:
import os
import time
import requests
import fitz  # PyMuPDF
import google.generativeai as genai

In [32]:
# Connection settings. Both values can be overridden through environment
# variables so the notebook can be shared without editing this cell.
SOLR_URL = os.environ.get(
    "SOLR_URL", "https://solr.roguedev.local/solr/financialservices"
)
# The Gemini API key is a secret and must not be stored in the notebook, so it
# is read from the GEMINI_API_KEY environment variable (empty string if unset).
# NOTE(review): a key was previously hard-coded on this line; treat it as
# leaked and revoke it.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")

In [33]:
def upload_pdf_to_solr(doc_id, pdf_path, verify_ssl=False):
    """
    Posts a PDF to Solr's ``/update/extract`` handler and commits it.

    Args:
        doc_id (str): Value sent as ``literal.id`` for the uploaded document.
        pdf_path (str): Path of the PDF file to upload.
        verify_ssl (bool): Passed to ``requests`` as ``verify``. The default
            (False) skips TLS certificate verification.

    Raises:
        OSError: If ``pdf_path`` cannot be opened.
        requests.exceptions.HTTPError: If Solr answers with an error status.
    """
    url = f"{SOLR_URL}/update/extract"
    # Hand the query string to requests as a dict so every value is
    # URL-encoded; an ID containing spaces, '&', '#' or '=' would otherwise
    # corrupt the request.
    params = {
        "commit": "true",
        "literal.id": doc_id,
        "uprefix": "attr_",
        "fmap.content": "content",
    }
    with open(pdf_path, "rb") as f:
        response = requests.post(
            url, params=params, files={"myfile": f}, verify=verify_ssl
        )
    response.raise_for_status()
    print(f"Uploaded PDF '{pdf_path}' to Solr as ID '{doc_id}'")

In [34]:
def get_extracted_text(doc_id, verify_ssl=False):
    """
    Fetches the ``attr_content`` field of one document from Solr.

    Args:
        doc_id (str): ID of the document to look up.
        verify_ssl (bool): Passed to ``requests`` as ``verify``. The default
            (False) skips TLS certificate verification.

    Returns:
        str: The ``attr_content`` value; a list value is joined with single
        spaces, and a missing field yields an empty string.

    Raises:
        LookupError: If the query returns no document.
        requests.exceptions.HTTPError: If Solr answers with an error status.
    """
    url = f"{SOLR_URL}/select"
    # Quote the ID and escape backslashes/quotes so the query parser reads it
    # as one literal value even when it contains spaces or query operators.
    escaped_id = str(doc_id).replace("\\", "\\\\").replace('"', '\\"')
    params = {
        "q": f'id:"{escaped_id}"',
        "fl": "attr_content",
        "rows": 1,  # only the first match is used below
        "wt": "json",
    }
    # Wait a moment for Solr to commit the document
    time.sleep(2)
    response = requests.get(url, params=params, verify=verify_ssl)
    response.raise_for_status()

    docs = response.json()["response"]["docs"]
    if not docs:
        raise LookupError(f"No document found with ID '{doc_id}'")
    text = docs[0].get("attr_content", "")
    if isinstance(text, list):
        text = " ".join(text)
    return text

In [35]:
def update_solr_with_summary(doc_id, summary, verify_ssl=False):
    """
    Sets the ``summary`` field of a Solr document with an atomic update and commits.

    Args:
        doc_id (str): ID of the document to update.
        summary (str): Text to store in the ``summary`` field.
        verify_ssl (bool): Passed to ``requests`` as ``verify``. The default
            (False) skips TLS certificate verification.

    Raises:
        ValueError: If ``summary`` is None.
        requests.exceptions.HTTPError: If Solr answers with an error status.
    """
    if summary is None:
        # summarize_document() returns None on failure, and an atomic
        # {"set": null} removes the field; refuse instead of silently wiping
        # the stored summary and reporting success.
        raise ValueError(
            f"No summary to store for document ID '{doc_id}' (summary is None)"
        )

    url = f"{SOLR_URL}/update"
    data = [
        {
            "id": doc_id,
            "summary": {"set": summary}
        }
    ]
    # json= serialises the payload and sets the JSON Content-Type header,
    # so no explicit headers are needed.
    response = requests.post(
        url, params={"commit": "true"}, json=data, verify=verify_ssl
    )
    response.raise_for_status()
    print(f"Updated document ID '{doc_id}' with summary")

In [38]:
# --- 2. Summarize with Gemini ---
def summarize_document(document_text, model_name='gemini-1.5-flash'):
    """
    Asks Gemini which financial or banking services the given act governs.

    Args:
        document_text (str): The text content of the document to analyse.
        model_name (str): Name of the Gemini model to query. Defaults to
            'gemini-1.5-flash'; pass another model name (e.g. 'gemini-1.5-pro')
            for larger contexts.

    Returns:
        str: The text parts of the first response candidate joined together,
        or None if the text is empty or whitespace-only, the response carries
        no usable content, or the API call raises.
    """
    # Extracted PDF text can consist of nothing but whitespace; there is
    # nothing to send to the model in that case either.
    if not document_text or not document_text.strip():
        print("No document text provided for summarization.")
        return None

    genai.configure(api_key=GEMINI_API_KEY)
    model = genai.GenerativeModel(model_name)

    prompt = f"What financial or banking services are governed by the act in the following document:\n\n{document_text}. Provide a detailed list of all the services you can find.\n\nSummary:"

    try:
        response = model.generate_content(prompt)
        # Access the text from the candidate, handling cases where it might not exist
        candidates = response.candidates
        if candidates and candidates[0].content and candidates[0].content.parts:
            return "".join(part.text for part in candidates[0].content.parts)
        print("Gemini response did not contain a valid summary.")
        return None
    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller
        # decide what to do with a None result.
        print(f"Error summarizing document with Gemini: {e}")
        # If the error is due to safety settings, you might want to log more details:
        if hasattr(e, 'response') and hasattr(e.response, 'prompt_feedback'):
            print(f"Gemini Prompt Feedback: {e.response.prompt_feedback}")
        return None

In [18]:
# Index the Banking Act PDF in Solr under the ID "banking_act".
upload_pdf_to_solr(
    doc_id="banking_act",
    pdf_path="/home/junior/Downloads/Banking-Act (Website Download).pdf",
)



Uploaded PDF '/home/junior/Downloads/Banking-Act (Website Download).pdf' to Solr as ID 'banking_act'


In [30]:
# Read back the text Solr stored for the uploaded document.
content = get_extracted_text(doc_id="banking_act")



In [31]:
# Show the extracted text as the cell output.
content

' \n \n  \n  \n  \n  \n  \n  \n  \n  \n  \n  \n  \n  \n  \n  \n  \n  \n  \n  \n  \n  \n  \n  \n  \n  \n  \n  \n  \n  \n  \n  \n  \n  \n  \n  \n  \n  \n  \n  \n  \n  \n  \n  \n  \n  \n Microsoft Word - Banking Act 1995.doc \n \n    \n Supplement A - Botswana Government Gazette dated 7th July, 1995 \n \n BANKING ACT 1995 \n \n NO.13 of 1995 \n \n  \n \n SECTION                                            ARRANGEMENT OF SECTIONS \n \n PART I - Preliminary \n \n 1. Short title and commencement \n \n 2. Interpretation \n \n PART II - Licencing of Banks \n \n 3. Authority to transact banking business \n \n 4. Representative office \n \n 5. Investigation of unlicenced banking \n \n 6 Application for banking licence \n \n 7. Appeal to the Minister \n \n 8. Conditions for issuing banking licence \n \n 9. General conditions \n \n 10. Power to vary conditions of licences \n \n 11. Revocation and surrender of licences \n \n 12. Procedure in case of urgency \n \n                               PART I

In [36]:
# Ask Gemini which services the extracted act text covers.
summary = summarize_document(document_text=content)

Consider implementing text chunking for very large documents.
Document truncated for summarization due to potential token limits.


In [37]:
# Show Gemini's answer as the cell output.
summary

"The Botswana Banking Act of 1995 governs a wide range of financial and banking services.  Based on the provided text, these services can be categorized as follows:\n\n**I. Core Banking Services:**\n\n* **Acceptance of Deposits:** This is explicitly defined in the act as including deposits repayable on demand, after fixed periods, or after notice; accepted by check or other means.  This covers various deposit accounts, including demand deposits (checking accounts), savings accounts, and time deposits.\n\n* **Extension of Credit:** This encompasses loans, advances, overdrafts, and other similar credit facilities.  The act details regulations and limitations on these, including unsecured loans, limits on lending to individuals and related parties, and requirements for security.\n\n* **Investment Activities:**  The act allows banks to make investments, though it specifies limitations and restrictions on certain types of investments, such as those in the bank's own shares or those of other

In [39]:
# Write the generated summary back onto the "banking_act" document.
update_solr_with_summary(
    doc_id="banking_act",
    summary=summary,
    verify_ssl=False,
)

Updated document ID 'banking_act' with summary


