In [18]:
import google.generativeai as genai
import os
import requests
import json
from pypdf import PdfReader # For PDF text extraction
import uuid # To generate unique IDs for Solr documents

import urllib3
from urllib3.exceptions import InsecureRequestWarning
urllib3.disable_warnings(InsecureRequestWarning)

In [19]:
GEMINI_API_KEY = ""

In [21]:
if not GEMINI_API_KEY:
    print("WARNING: GEMINI_API_KEY environment variable not set. Using placeholder.")
    GEMINI_API_KEY = "YOUR_GEMINI_API_KEY_HERE" # Replace with your actual key if not using env var

In [22]:
SOLR_URL = "https://solr.roguedev.local/solr/" # Replace with your Solr URL
SOLR_CORE = "myuploads" # Replace with your Solr core/collection name
SOLR_UPDATE_URL = f"{SOLR_URL}{SOLR_CORE}/update/json/docs?commit=true"

In [23]:
# Solr Schema Fields (customize based on your Solr schema)
SOLR_ID_FIELD = "id"
SOLR_SUMMARY_FIELD = "summary"
SOLR_ORIGINAL_TEXT_FIELD = "content"
SOLR_TITLE_FIELD = "title" # Optional: if you want to store a title

In [24]:
# --- 1. Get the Document Content ---
def get_document_content(source_type, source_input):
    """
    Retrieves document content based on source type.

    Args:
        source_type (str): "pdf" for a local PDF file path, or "text" for direct string.
        source_input (str): The file path if source_type is "pdf", or the text content
                            if source_type is "text".

    Returns:
        str: The extracted text content of the document, or None if an error occurs.
    """
    document_content = None
    if source_type == "pdf":
        try:
            reader = PdfReader(source_input)
            text = ""
            for page in reader.pages:
                text += page.extract_text() or "" # extract_text can return None
            document_content = text
            print(f"Successfully extracted text from PDF: {source_input}")
        except FileNotFoundError:
            print(f"Error: PDF file not found at {source_input}")
        except Exception as e:
            print(f"Error extracting text from PDF {source_input}: {e}")
    elif source_type == "text":
        document_content = source_input
        print("Using text from variable.")
    else:
        print(f"Error: Unsupported source type '{source_type}'. Choose 'pdf' or 'text'.")
    return document_content

In [36]:
# --- 2. Summarize with Gemini ---
def summarize_document(document_text):
    """
    Summarizes the given document text using the Gemini API.

    Args:
        document_text (str): The text content of the document to summarize.

    Returns:
        str: The summarized text, or None if an error occurs during summarization.
    """
    if not document_text:
        print("No document text provided for summarization.")
        return None

    genai.configure(api_key=GEMINI_API_KEY)
    model = genai.GenerativeModel('gemini-1.5-flash') # Or 'gemini-1.5-flash', 'gemini-1.5-pro' for larger contexts

    # Important: Consider token limits for very long documents.
    # Gemini-pro has a 30,720 token context window.
    # If your document is much longer, you might need to implement chunking
    # and summarize chunks, then summarize the summaries.
    if len(document_text) > 25000: # Rough estimate, actual tokens depend on content
        print("Warning: Document is very long. It might exceed Gemini's token limit.")
        print("Consider implementing text chunking for very large documents.")
        # For simplicity, we'll just truncate for this example if it's excessively long
        #document_text = document_text[:25000] # Truncate for demonstration
        print("Document truncated for summarization due to potential token limits.")

    prompt = f"What financial or banking services are governed by the act in the following document:\n\n{document_text}. Provide a detailed list of all the services you can find.\n\nSummary:"

    try:
        response = model.generate_content(prompt)
        # Access the text from the candidate, handling cases where it might not exist
        if response.candidates and response.candidates[0].content and response.candidates[0].content.parts:
            summary = "".join([part.text for part in response.candidates[0].content.parts])
            return summary
        else:
            print("Gemini response did not contain a valid summary.")
            return None
    except Exception as e:
        print(f"Error summarizing document with Gemini: {e}")
        # If the error is due to safety settings, you might want to log more details:
        if hasattr(e, 'response') and hasattr(e.response, 'prompt_feedback'):
            print(f"Gemini Prompt Feedback: {e.response.prompt_feedback}")
        return None


In [37]:
# --- 3. Prepare for Solr & 4. Upload to Solr ---
def upload_to_solr(doc_id, title, original_text, summary):
    """
    Constructs a Solr document and uploads it to the configured Solr instance.

    Args:
        doc_id (str): A unique ID for the document in Solr.
        title (str): The title of the document.
        original_text (str): The full original text of the document.
        summary (str): The summarized text of the document.
    """
    if not all([doc_id, original_text, summary]):
        print("Missing required data (ID, original text, or summary) for Solr upload.")
        return

    solr_document = {
        SOLR_ID_FIELD: doc_id,
        SOLR_TITLE_FIELD: title,
        SOLR_ORIGINAL_TEXT_FIELD: original_text,
        SOLR_SUMMARY_FIELD: summary
    }

    headers = {"Content-Type": "application/json"}
    SOLR_CERT_PATH = "/home/junior/Downloads/localcert"
    try:
        response = requests.post(SOLR_UPDATE_URL, data=json.dumps([solr_document]), headers=headers, verify=False)
        response.raise_for_status() # Raise an exception for HTTP errors
        print(f"Successfully uploaded document '{doc_id}' to Solr. Status: {response.status_code}")
    except requests.exceptions.RequestException as e:
        print(f"Error uploading document '{doc_id}' to Solr: {e}")
        if hasattr(e, 'response') and e.response is not None:
            print(f"Solr Response Error: {e.response.text}")
        else:
            print("No detailed Solr response available.")

In [38]:
pdf_file_path = "/home/junior/Downloads/Banking-Act (Website Download).pdf"
if not os.path.exists(pdf_file_path):
    print(f"\n--- Creating a dummy PDF for demonstration at '{pdf_file_path}' ---")
    try:
        from reportlab.lib.pagesizes import letter
        from reportlab.pdfgen import canvas
        c = canvas.Canvas(pdf_file_path, pagesize=letter)
        c.drawString(100, 750, "This is a sample PDF document for testing purposes.")
        c.drawString(100, 730, "It contains some text that Gemini will summarize.")
        c.drawString(100, 710, "The quick brown fox jumps over the lazy dog.")
        c.drawString(100, 690, "Botswana is a beautiful country in Southern Africa.")
        c.save()
        print("Dummy PDF created.")
    except ImportError:
        print("Warning: reportlab not installed. Cannot create dummy PDF.")
        print("Please install it: pip install reportlab or manually provide a PDF.")
        print("Skipping PDF example.")
        pdf_file_path = None # Disable PDF example if reportlab is not there
        
if pdf_file_path and os.path.exists(pdf_file_path):
    doc_id_2 = str(uuid.uuid4()) # Generate another unique ID
    document_title_2 = "Sample PDF Document"
        
    print(f"\n--- Processing Document: '{document_title_2}' (from PDF file) ---")
    document_content_2 = get_document_content("pdf", pdf_file_path)

    if document_content_2:
        summary_2 = summarize_document(document_content_2)
        if summary_2:
            print("\nGemini Summary:")
            print(summary_2)
            upload_to_solr(doc_id_2, document_title_2, document_content_2, summary_2)
        else:
            print("Summary could not be generated for document 2.")
    else:
        print("Could not extract content from PDF for document 2.")


--- Processing Document: 'Sample PDF Document' (from PDF file) ---
Successfully extracted text from PDF: /home/junior/Downloads/Banking-Act (Website Download).pdf
Consider implementing text chunking for very large documents.
Document truncated for summarization due to potential token limits.

Gemini Summary:
The Botswana Banking Act of 1995 governs a wide range of financial and banking services.  Based on the provided text, these services include:

**I. Core Banking Services:**

* **Accepting Deposits:** This encompasses deposits of money repayable on demand, after fixed periods, or after notice; accepted by cheque or other means.  This is explicitly defined as part of "banking business."

* **Making Loans and Advances:**  This involves extending loans, advances, overdrafts, and other similar credit facilities.  The Act specifies limitations on the amount of unsecured lending and lending to related parties.

* **Investment Activities:** Employing deposits in investments is another cor