<a href="https://colab.research.google.com/github/mehdihoore/DocuSage/blob/main/astraphilo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install astrapy langchain-community openai pypdf python-dotenv docx2txt tiktoken nltk sentence-transformers transformers torch hazm

In [None]:
!pip install --upgrade astrapy

In [None]:
!pip install astrapy langchain-community openai pypdf python-dotenv docx2txt tiktoken nltk sentence-transformers transformers torch hazm
import os
import json
import uuid
from pathlib import Path
from typing import Any, List, Tuple, Optional
from dataclasses import dataclass
from pypdf import PdfReader
from langchain.docstore.document import Document
from langchain.vectorstores import VectorStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import Docx2txtLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import TextLoader
from astrapy import DataAPIClient
import re
from google.colab import userdata
from astrapy.constants import VectorMetric
from astrapy.database import Database
from astrapy.collection import Collection
from astrapy.exceptions import *
import time
import google.generativeai as genai
import requests
import pandas as pd
import csv


# Constants and configurations
TEXT_FILE_TYPES = ["txt", "docx", "pdf"]

# Credentials are read from Colab secrets so they never live in the notebook.
ASTRA_DB_APPLICATION_TOKEN = userdata.get("PHILO_API")
ASTRA_DB_API_ENDPOINT = userdata.get("PHILO_END")
GEMINI_API_KEY = userdata.get("GOOGLE_API_KEY_BHR1").strip()
GEMINI_EMBEDDING_MODEL_NAME = "models/text-embedding-004"
GEMINI_TEXT_MODEL_NAME = "gemini-1.5-flash-8b"

# Cloudflare Workers AI configuration
CLOUDFLARE_API_BASE_URL = "https://api.cloudflare.com/client/v4/accounts/67f16cc2dc0c1850198f5fd061d3cdf2/ai/run/"
# SECURITY: the API token used to be hard-coded on this line. Any token that
# was committed with the notebook must be treated as leaked and revoked.
# Store the replacement as a Colab secret named "CLOUDFLARE_API_TOKEN".
CLOUDFLARE_API_TOKEN = userdata.get("CLOUDFLARE_API_TOKEN").strip()

# Default model used by the single-model Cloudflare helpers.
CLOUDFLARE_MODEL = "@cf/meta/llama-3-8b-instruct"

# CSV file configuration
CSV_FILE_PATH = "/content/drive/MyDrive/processed_data.csv"
CSV_COLUMNS = ["doc_id", "doc_name", "chunk_number", "content", "embedding", "keywords", "summary", "references"]

# --- Error Handling ---
class EmbeddingError(Exception):
    """Raised when embeddings cannot be generated."""


class AstraDBError(Exception):
    """Raised when the Astra DB client or database cannot be initialised."""


class FileProcessingError(Exception):
    """Raised when a source file cannot be loaded or yields no content."""


class SummarizationError(Exception):
    """Raised when a summary cannot be generated."""


class KeywordExtractionError(Exception):
    """Raised when keywords cannot be extracted."""


class CSVError(Exception):
    """Raised when the CSV backup cannot be written."""

# --- PDF Loader (using PyPDFLoader from LangChain for simplicity) ---
class PDFLoader:
    """Wrap LangChain's ``PyPDFLoader`` and report failures as ``FileProcessingError``."""

    def __init__(self, file_path):
        self.file_path = file_path
        self.loader = PyPDFLoader(file_path)

    def load(self) -> List[Document]:
        """Return the documents produced by ``PyPDFLoader.load_and_split``.

        :raises FileProcessingError: if the underlying loader raises anything
        """
        try:
            documents = self.loader.load_and_split()
        except Exception as e:
            raise FileProcessingError(f"Error loading PDF: {e}")
        else:
            return documents

@dataclass
class Data:
    """Container for the text taken from one source file."""

    # Combined text of the loaded file.
    content: str = ""
    # Reference string for the file; stays empty unless a caller sets it.
    references: str = ""

# --- Document Processor ---
class DocumentProcessor:
    """Load one txt/docx/pdf file and return its text as a ``Data`` object."""

    def __init__(self, path: str, silent_errors: bool = False):
        self.path = path
        self.silent_errors = silent_errors
        self.status = ""

    def resolve_path(self, path: str) -> str:
        """Return *path* as an absolute path."""
        return os.path.abspath(path)

    def load_file(self) -> Tuple[Data, str]:
        """Read the configured file and return ``(data, file_stem)``.

        :raises ValueError: if no path was given or the extension is unsupported
        :raises FileProcessingError: if loading fails or produces nothing
        """
        if not self.path:
            raise ValueError("Please upload a file to use this component.")

        absolute_path = self.resolve_path(self.path)
        file_path = Path(absolute_path)
        suffix = file_path.suffix[1:].lower()

        if suffix not in TEXT_FILE_TYPES:
            raise ValueError(f"Unsupported file type: {suffix}")

        # Anything that is neither docx nor pdf is read as plain text.
        loader_classes = {"docx": Docx2txtLoader, "pdf": PDFLoader}
        loader = loader_classes.get(suffix, TextLoader)(absolute_path)

        try:
            documents = loader.load()
        except Exception as e:
            raise FileProcessingError(f"Error loading file: {e}")

        if not (isinstance(documents, list) and len(documents) > 0):
            raise FileProcessingError("No data loaded from file")

        joined_text = " ".join(document.page_content for document in documents)
        return Data(content=joined_text), file_path.stem

# --- Text Splitter ---
class RecursiveCharacterTextSplitterComponent:
    """Split text with a ``RecursiveCharacterTextSplitter`` using fixed settings."""

    def __init__(self):
        splitter_options = {
            "separators": ["\n\n", "\n", ". ", " "],
            "chunk_size": 5000,
            "chunk_overlap": 500,
        }
        self.splitter = RecursiveCharacterTextSplitter(**splitter_options)

    def split_text(self, text: str) -> List[str]:
        """Return the chunks the configured splitter produces for *text*."""
        pieces = self.splitter.split_text(text)
        return pieces

# --- Gemini Embeddings ---
class GeminiEmbeddingsComponent:
    def __init__(self, gemini_api_key: str, max_retries: int = 5, initial_retry_delay: int = 60, backoff_factor: float = 2.0):
        """Configure the Gemini client, the retry policy and the Cloudflare model pool.

        :param gemini_api_key: API key handed to ``genai.configure``
        :param max_retries: maximum number of attempts used by ``_retry_on_exception``
        :param initial_retry_delay: first wait, in seconds, between retries
        :param backoff_factor: multiplier applied to the wait after each retry
        """
        genai.configure(api_key=gemini_api_key)

        # Gemini model identifiers
        self.embedding_model = GEMINI_EMBEDDING_MODEL_NAME
        self.text_model = GEMINI_TEXT_MODEL_NAME

        # Retry policy
        self.max_retries = max_retries
        self.retry_delay = initial_retry_delay
        self.backoff_factor = backoff_factor

        # Cloudflare models tried in rotation; all share the same base endpoint.
        cloudflare_model_names = (
            "@cf/meta/llama-3.2-1b-instruct",
            "@cf/meta/llama-3.3-70b-instruct-fp8-fast",
            "@cf/meta/llama-3.1-8b-instruct-fast",
            "@cf/google/gemma-7b-it-lora",
            "@cf/google/gemma-13b-it-lora",
        )
        self.cloudflare_models = [
            {"name": model_name, "endpoint": CLOUDFLARE_API_BASE_URL}
            for model_name in cloudflare_model_names
        ]
        self.current_model_index = 0
    def _rotate_cloudflare_model(self):
        """Rotate to the next Cloudflare AI model."""
        self.current_model_index = (self.current_model_index + 1) % len(self.cloudflare_models)
        return self.cloudflare_models[self.current_model_index]
    def _call_cloudflare_ai(self, prompt: str, task: str = "keywords", timeout: float = 120.0) -> str:
        """
        Call Cloudflare AI with the currently selected model.

        On any failure the component rotates to the next model before the
        error is raised, so the caller's next attempt uses a different model.

        :param prompt: The input text; only the first 2000 characters are sent
        :param task: "keywords" for keyword extraction; any other value requests a summary
        :param timeout: Seconds to wait for the HTTP request before giving up
        :return: The model's response text, stripped of surrounding whitespace
        :raises requests.exceptions.RequestException: on HTTP/network errors or timeout
        :raises ValueError: if the response body is not valid JSON
        :raises RuntimeError: if the response carries no usable text
        """
        current_model = self.cloudflare_models[self.current_model_index]

        headers = {"Authorization": f"Bearer {CLOUDFLARE_API_TOKEN}"}

        # Prepare system and user messages based on the task
        if task == "keywords":
            system_message = "You are a helpful assistant that extracts keywords from text."
            user_message = f"Extract 5-7 most significant keywords from the text:\n{prompt[:2000]}"
        else:  # summary
            system_message = "You are a helpful assistant that summarizes text."
            user_message = f"Provide a concise summary of the text:\n{prompt[:2000]}"

        input_data = {
            "messages": [
                {"role": "system", "content": system_message},
                {"role": "user", "content": user_message},
            ]
        }

        try:
            # Without a timeout a stalled connection would block the whole run.
            response = requests.post(
                f"{current_model['endpoint']}{current_model['name']}",
                headers=headers,
                json=input_data,
                timeout=timeout,
            )
            response.raise_for_status()
            result = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a body that is not valid JSON.
            print(f"Error calling {current_model['name']}: {e}")
            self._rotate_cloudflare_model()
            raise

        payload = result.get("result") if isinstance(result, dict) else None
        answer = payload.get("response") if isinstance(payload, dict) else None
        if answer:
            return answer.strip()

        print(f"No response from {current_model['name']}. Rotating models.")
        self._rotate_cloudflare_model()
        raise RuntimeError("No valid response from current model")

    def _retry_on_exception(self, func, *args, **kwargs):
        """Helper function for retries with exponential backoff."""
        retries = 0
        current_delay = self.retry_delay

        while retries < self.max_retries:
            try:
                return func(*args, **kwargs)
            except Exception as e:
                print(f"Attempt {retries + 1} failed: {e}")
                retries += 1  # Increment retry counter

                # Check if the error is related to rate limiting or other retryable issues
                if 'quota' in str(e).lower() or isinstance(e, genai.types.generation_types.BlockedPromptException):
                    print(f"Waiting {current_delay} seconds before retry...")
                    time.sleep(current_delay)
                    current_delay *= self.backoff_factor  # Exponential backoff
                else:
                    print("Non-retryable error encountered. Skipping retries.")
                    break  # Exit retry loop for non-retryable errors
        # If max retries are reached or a non-retryable error occurred, raise the exception
        raise EmbeddingError("Max retries reached for embedding generation or non-retryable error occurred")

    def extract_keywords(self, text: str) -> List[str]:
        """Extract keywords with model rotation."""
        try:
           # Try Cloudflare AI models with rotation
            for _ in range(len(self.cloudflare_models)):
                try:
                    result = self._call_cloudflare_ai(text, task="keywords")
                    keywords = [k.strip() for k in result.split(',') if k.strip()]
                    return keywords
                except Exception as model_error:
                    print(f"Model rotation error: {model_error}")

            # First try Gemini

        except Exception as e:
            print(f"Gemini keyword extraction failed: {e}. Falling back to Cloudflare Workers AI.")
            return self._extract_keywords_gemini(text)


            # If all models fail
            raise KeywordExtractionError("All models failed to extract keywords")

    """def extract_keywords(self, text: str) -> List[str]:
        #Extract keywords with retry mechanism.
        def _extract_keywords(text):
            model = genai.GenerativeModel(self.text_model)
            prompt = f"Extract 5-7 most significant keywords from the text:
            {text[:2000]}"

            response = model.generate_content(prompt)
            keywords = [k.strip() for k in response.text.split(',') if k.strip()]
            return keywords

        return self._retry_on_exception(_extract_keywords, text)"""
    def _extract_keywords_gemini(self, text: str) -> List[str]:
        """Ask the Gemini text model for keywords and split its reply on commas.

        Only the first 2000 characters of *text* are sent.

        :raises KeywordExtractionError: if the Gemini call or parsing fails
        """
        model = genai.GenerativeModel(self.text_model)
        prompt = f"Extract 5-7 most significant keywords from the text:\n        {text[:2000]}"

        try:
            reply = model.generate_content(prompt)
            stripped_parts = (part.strip() for part in reply.text.split(','))
            return [part for part in stripped_parts if part]
        except Exception as e:
            raise KeywordExtractionError(f"Gemini keyword extraction failed: {e}")

    def _extract_keywords_cloudflare(self, text: str, timeout: float = 120.0) -> List[str]:
        """Extract keywords with the single default Cloudflare model.

        :param text: source text; only the first 2000 characters are sent
        :param timeout: seconds to wait for the HTTP request before giving up
        :return: keywords obtained by splitting the reply on commas
        :raises KeywordExtractionError: on HTTP/network errors, invalid JSON,
            or a response without usable text
        """
        headers = {"Authorization": f"Bearer {CLOUDFLARE_API_TOKEN}"}
        input_data = {
            "messages": [
                {"role": "system", "content": "You are a helpful assistant that extracts keywords from text."},
                {"role": "user", "content": f"Extract 5-7 most significant keywords from the text:\n{text[:2000]}"},
            ]
        }
        try:
            # Without a timeout a stalled connection would block the whole run.
            response = requests.post(
                f"{CLOUDFLARE_API_BASE_URL}{CLOUDFLARE_MODEL}",
                headers=headers,
                json=input_data,
                timeout=timeout,
            )
            response.raise_for_status()
            result = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a body that is not valid JSON.
            raise KeywordExtractionError(f"Error calling Cloudflare Workers AI: {e}") from e

        if result and result.get("result") and result["result"].get("response"):
            # Assuming the response is a comma-separated list of keywords
            return [k.strip() for k in result["result"]["response"].split(',') if k.strip()]

        raise KeywordExtractionError(f"Unexpected response from Cloudflare Workers AI: {result}")


    def generate_chunk_summary(self, text: str) -> str:
        """Generate summary with model rotation."""
        try:
            for _ in range(len(self.cloudflare_models)):
                  try:
                      return self._call_cloudflare_ai(text, task="summary")
                  except Exception as model_error:
                      print(f"Model rotation error: {model_error}")
            # First try Gemini

        except Exception as e:
            print(f"Gemini summarization failed: {e}. Falling back to Cloudflare Workers AI.")
            return self._generate_gemini_summary(text)

            # Try Cloudflare AI models with rotation


            # If all models fail
            raise SummarizationError("All models failed to generate summary")

    def _generate_gemini_summary(self, text: str) -> str:
        """Ask the Gemini text model for a summary of the first 2000 characters of *text*.

        :raises SummarizationError: if the Gemini call fails
        """
        model = genai.GenerativeModel(self.text_model)
        prompt = f"Provide a concise summary of the text:\n        {text[:2000]}"
        try:
            reply = model.generate_content(prompt)
            summary = reply.text.strip()
            return summary
        except Exception as e:
            raise SummarizationError(f"Gemini summarization failed: {e}")

    def _generate_cloudflare_summary(self, text: str, timeout: float = 120.0) -> str:
        """Summarise *text* with the single default Cloudflare model.

        :param text: source text; only the first 2000 characters are sent
        :param timeout: seconds to wait for the HTTP request before giving up
        :return: the summary, stripped of surrounding whitespace
        :raises SummarizationError: on HTTP/network errors, invalid JSON,
            or a response without usable text
        """
        headers = {"Authorization": f"Bearer {CLOUDFLARE_API_TOKEN}"}
        input_data = {
            "messages": [
                {"role": "system", "content": "You are a helpful assistant that summarizes text."},
                {"role": "user", "content": f"Provide a concise summary of the text:\n{text[:2000]}"},
            ]
        }
        try:
            # Without a timeout a stalled connection would block the whole run.
            response = requests.post(
                f"{CLOUDFLARE_API_BASE_URL}{CLOUDFLARE_MODEL}",
                headers=headers,
                json=input_data,
                timeout=timeout,
            )
            response.raise_for_status()  # Raise an exception for bad status codes
            result = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a body that is not valid JSON.
            raise SummarizationError(f"Error calling Cloudflare Workers AI: {e}") from e

        if result and result.get("result") and result["result"].get("response"):
            return result["result"]["response"].strip()

        raise SummarizationError(f"Unexpected response from Cloudflare Workers AI: {result}")

    def build_embeddings(self, texts: List[str], expected_dim: int = 768) -> List[List[float]]:
        embeddings = []
        batch_size = 5

        if not texts:
            print("No texts provided for embedding")
            return []

        if len(texts) > 100:
            print(f"Large number of texts detected. Processing in chunks of 100.")

        for i in range(0, len(texts), 100):
            batch_texts = texts[i:i + 100]

            for j in range(0, len(batch_texts), batch_size):
                batch = batch_texts[j:j + batch_size]

                try:
                    result = self._retry_on_exception(
                        genai.embed_content,
                        model=self.embedding_model,
                        content=batch,
                        task_type="retrieval_document"
                    )

                    if 'embedding' in result and result['embedding']:
                        for embedding in result['embedding']:
                            if len(embedding) == expected_dim:
                                embeddings.append(embedding)
                            else:
                                print(f"Warning: Skipping embedding with incorrect dimension (expected {expected_dim}, got {len(embedding)})")
                    else:
                        print("No embeddings returned for this batch")

                except EmbeddingError as e:
                    print(f"Failed to generate embeddings for batch: {e}")

            if i + 100 < len(texts):
                print("Pausing before processing the next chunk of 100 texts...")
                time.sleep(self.retry_delay)

        return embeddings

# --- Astra DB Manager ---
class AstraDBManager:
    def __init__(self, api_endpoint: str, token: str, collection_name: str, namespace: Optional[str] = None):
        """Store the connection settings and open the Astra DB database handle.

        :param api_endpoint: Astra DB API endpoint passed to ``get_database``
        :param token: application token passed to ``DataAPIClient``
        :param collection_name: collection used by ``add_documents``
        :param namespace: stored on the instance; "default_namespace" when omitted
        :raises AstraDBError: if the client or database cannot be created
        """
        self.api_endpoint, self.token = api_endpoint, token
        self.collection_name = collection_name
        self.namespace = namespace if namespace else "default_namespace"

        try:
            self.client = DataAPIClient(token)
            self.database = self.client.get_database(api_endpoint)
        except (UnauthorizedException, DataAPIException, AstraPyException) as e:
            raise AstraDBError(f"Failed to initialize Astra DB client or database: {e}")



    def get_or_create_collection(self, collection_name: str, dimension: int = 768):
        try:
            print(f"Checking for collection {collection_name} in database.")
            collections = self.database.list_collections()
            if collection_name in collections:
                print(f"* Collection {collection_name} already exists.")
                return self.database.get_collection(collection_name)
            else:
                print(f"* Collection {collection_name} does not exist. Creating...")
                collection = self.database.create_collection(
                    name=collection_name,
                    dimension=dimension,
                    metric=VectorMetric.COSINE,
                )
                print(f"* Collection {collection_name} created successfully.")
                return collection
        except CollectionAlreadyExistsException:
            print(f"* Collection {collection_name} already exists. Skipping creation.")
            return self.database.get_collection(collection_name)
        except Exception as e:
            print(f"Error handling collection {collection_name}: {e}")
            raise

    def add_documents(self, embeddings: List[List[float]], doc_name: str, references: str,
                      chunks: List[str], keywords_list: List[List[str]], summaries: List[str]):
        if not isinstance(embeddings, list) or (embeddings and not isinstance(embeddings[0], list)):
            raise ValueError("Embeddings must be a list of lists")

        collection = self.get_or_create_collection(collection_name=self.collection_name)
        insertion_errors = 0
        successful_insertions = 0
        min_length = min(len(embeddings), len(chunks), len(keywords_list), len(summaries))
        batch_data = []
        for index in range(min_length):
            try:
                if len(embeddings[index]) != 768:
                    print(f"Warning: Skipping document {index} - incorrect embedding dimension")
                    continue

                content_chunk = chunks[index][:15000]

                doc_id = str(uuid.uuid4())
                document = {
                    "_id": doc_id,
                    "content": content_chunk,
                    "$vector": embeddings[index],
                    "metadata": {
                        "doc_name": doc_name,
                        "references": references,
                        "keywords": keywords_list[index],
                        "summary": summaries[index],
                        "chunk_number": index
                    }
                }

                result = collection.insert_one(document)

                if result and result.inserted_id:
                    successful_insertions += 1
                    print(f"Successfully inserted document {doc_id}")
                else:
                    insertion_errors += 1
                    print(f"Failed to insert document {doc_id}: No error message returned from Astra DB")

            except (DataAPIException, AstraPyException) as e:
                insertion_errors += 1
                print(f"Insertion error for document {index}: {e}")
            # Save the data to CSV as a backup
            try:
                self.save_to_csv(batch_data)
                print("Data saved to CSV as backup.")
            except CSVError as e:
                print(f"Failed to save data to CSV: {e}")

        print(f"Insertion Summary:")
        print(f"Total documents: {min_length}")
        print(f"Successful insertions: {successful_insertions}")
        print(f"Failed insertions: {insertion_errors}")
    def save_to_csv(self, data: List[dict]):
        """Write *data* to ``CSV_FILE_PATH``, appending when the file already has content.

        Rows are arranged in ``CSV_COLUMNS`` order. The header is written only
        when the file is missing or empty.

        :raises CSVError: if building the frame or writing the file fails
        """
        try:
            frame = pd.DataFrame(data).reindex(columns=CSV_COLUMNS)

            # A non-empty existing file already carries the header row.
            has_content = os.path.isfile(CSV_FILE_PATH) and os.path.getsize(CSV_FILE_PATH) > 0
            write_mode, write_header = ('a', False) if has_content else ('w', True)

            frame.to_csv(CSV_FILE_PATH, mode=write_mode, header=write_header, index=False)
        except Exception as e:
            raise CSVError(f"Error saving data to CSV: {e}")

# --- Helper Functions ---
def extract_references(text: str) -> str:
    """Return the first run of four hyphen-separated numbers in *text*.

    Returns "No references found" when *text* contains no such run.
    """
    first_match = re.search(r'\d+-\d+-\d+-\d+', text)
    if first_match is None:
        return "No references found"
    return first_match.group(0)

# --- Main Function ---
def main(file_paths: List[str]):
    """Process each file: load, split, embed, annotate and store it in Astra DB.

    A failure in one file is printed and the loop moves on to the next file.

    :param file_paths: paths of the txt/docx/pdf files to process
    """
    text_splitter = RecursiveCharacterTextSplitterComponent()
    embeddings_component = GeminiEmbeddingsComponent(gemini_api_key=GEMINI_API_KEY)
    astradb_manager = AstraDBManager(
        api_endpoint=ASTRA_DB_API_ENDPOINT,
        token=ASTRA_DB_APPLICATION_TOKEN,
        collection_name="philosophical_texts"
    )

    for file_path in file_paths:
        print(f"Processing file: {file_path}")
        try:
            processor = DocumentProcessor(file_path)
            data, doc_name = processor.load_file()

            if not data.content:
                raise FileProcessingError(f"No content extracted from {file_path}")

            chunks = text_splitter.split_text(data.content)
            print(f"Total chunks extracted: {len(chunks)}")

            if not chunks:
                raise FileProcessingError("No chunks extracted from the document")

            embeddings = embeddings_component.build_embeddings(chunks)

            if not embeddings:
                raise EmbeddingError("No embeddings could be generated")

            keywords_list = [embeddings_component.extract_keywords(chunk) for chunk in chunks]
            summaries = [embeddings_component.generate_chunk_summary(chunk) for chunk in chunks]
            references = extract_references(data.content)
            # Add the data to Astra DB and also save to CSV as backup
            astradb_manager.add_documents(embeddings, doc_name, references, chunks, keywords_list, summaries)
            print(f"Finished processing file: {file_path}")

        # ValueError comes from load_file (unsupported type) and add_documents
        # (malformed embeddings). It and KeywordExtractionError are listed so
        # that one bad file cannot abort the rest of the batch.
        except (ValueError, FileProcessingError, EmbeddingError, AstraDBError,
                SummarizationError, KeywordExtractionError, CSVError) as e:
            print(f"Error processing {file_path}: {e}")

# Script entry point: run the pipeline over the files listed below.
if __name__ == "__main__":
    # Paths of the files to process. Commented-out entries are not processed.
    file_paths = [
        #"/content/drive/MyDrive/بریده کتابها/پرونده هگل/
         #(suhrkamp taschenbuch wissenschaft) Georg Wilhelm Friedrich Hegel - Grundlinien der Philosophie des Rechts oder Naturrecht und Staatswissenschaft im Grundrisse-Suhrkamp Verlag (2000).pdf",
        #"/content/drive/MyDrive/بریده کتابها/پرونده هگل/
        #G. W. F. Hegel, Dietmar Köhler, Otto Pöggeler - Phänomenologie des Geistes-Akademie-Verlag (1998).pdf",
        #"/content/drive/MyDrive/بریده کتابها/پرونده هگل/
        #Georg Wilhelm Friedrich Hegel - Vorlesungen uber die Geschichte der Philosophie - 1. Griechische Philosophie-Digitale Bibliothek (1836) (1).pdf",
#"/content/drive/MyDrive/بریده کتابها/پرونده هگل/
        #Georg Wilhelm Friedrich Hegel - Vorlesungen uber die Geschichte der Philosophie - 1. Griechische Philosophie-Digitale Bibliothek (1836).pdf",
        #"/content/drive/MyDrive/بریده کتابها/پرونده هگل/Untitled folder/
         #(Hegel Gesammelte Werke 8) G. W. F. Hegel - Jenaer Systementwürfe III (-Felix Meiner (1976).pdf",
        #"/content/drive/MyDrive/بریده کتابها/پرونده هگل/
        #Georg Wilhelm Friedrich Hegel, Dieter Henrich (Hrsg.) - Philosophie des Rechts_ Die Vorlesung von 1819_20 in einer Nachschrift-Suhrkamp Verlag (1983).pdf",
        #"/content/drive/MyDrive/بریده کتابها/پرونده هگل/Untitled folder/
        #Georg Wilhelm Friedrich Hegel_ Walter Jaeschke (editor) - Gesammelte Werke 18. Vorlesungsmanuskripte _ 2. (1816-1831).-Meiner (1995).pdf",
        #"/content/drive/MyDrive/بریده کتابها/پرونده هگل/Untitled folder/Georg Wilhelm Friedrich Hegel, Walter Jaeschke - Vorlesungen über die Philosophie der Religion. Teil 1_ Der Begriff der Religion (Philosophische Bibliothek)-F. Meiner (1993).pdf",
        #"/content/drive/MyDrive/بریده کتابها/پرونده هگل/Untitled folder/Hegel, Georg Wilhelm Friedrich_ Hölderlin, Friedrich_ Rathgeb, Eberhard - Zwei Hälften des Lebens. Hegel und Hölderlin. Eine Freundschaft-München Blessing (2019).pdf",
#"/content/drive/MyDrive/بریده کتابها/پرونده هگل/G.W.F Hegel - Werke, Band 08 - Enzyklopädie I. 8-Suhrkamp (1989).pdf",
        #"/content/drive/MyDrive/بریده کتابها/پرونده هگل/G.W.F Hegel - Werke, Band 09 - Enzyklopädie der philosophischen Wissenschaften im Grundrisse II. 9-Suhrkamp (1986).pdf",
        #"/content/drive/MyDrive/بریده کتابها/پرونده هگل/G.W.F Hegel - Werke, Band 10 - Enzyklopädie der philosophischen Wissenschaften im Grundrisse III. 10-Suhrkamp (1986).pdf"
        "/content/drive/MyDrive/بریده کتابها/پرونده هگل/Georg Wilhelm Friedrich Hegel - Gesammelte Werke 10-2. Nürnberger Gymnasialkurse und Gymnasialreden (1808–1816)-Meiner,  F (2006).pdf",
        "/content/drive/MyDrive/بریده کتابها/پرونده هگل/(Philosophische Bibliothek_ 171b-d) Hegel, Georg Wilhelm Friedrich - Vorlesungen über die Philosophie der Weltgeschichte. Band II–IV_ Die orientalische Welt. Die griechische und die römische Welt. Die.pdf"]
    main(file_paths)

**Which books are stored in Astra DB?**

In [None]:
# Install necessary libraries (run these in a Colab cell)
#!pip install astrapy

from astrapy.db import AstraDB, AstraDBCollection
from astrapy import DataAPIClient
from google.colab import userdata
import json

# --- Astra DB Connection Setup ---

# Token and endpoint are read from the Colab secrets "PHILO_API" and "PHILO_END".
ASTRA_DB_APPLICATION_TOKEN = userdata.get("PHILO_API")
ASTRA_DB_API_ENDPOINT = userdata.get("PHILO_END")
# Collection queried by the functions in this cell.
ASTRA_DB_COLLECTION_NAME = "philosophical_texts"

def connect_to_astra():
    """Open the Astra DB database handle.

    Returns the database object, or ``None`` (after printing the error) when
    anything goes wrong.
    """
    try:
        client = DataAPIClient(ASTRA_DB_APPLICATION_TOKEN)
        database = client.get_database(ASTRA_DB_API_ENDPOINT)
        print("Connected to Astra DB.")
    except Exception as e:
        print(f"Error connecting to Astra DB: {e}")
        return None
    return database

def get_column_names(database):
    """Return the top-level keys of one document from the collection.

    The keys come from whatever ``find_one()`` returns, so they describe the
    whole collection only if all documents share the same fields. Returns
    ``[]`` when there is no connection, the collection is empty, or an error
    occurs.
    """
    if database is None:
        print("Database connection is not established.")
        return []

    try:
        sample_doc = database.get_collection(ASTRA_DB_COLLECTION_NAME).find_one()
        if not sample_doc:
            print("Collection is empty. Cannot determine column names.")
            return []
        return list(sample_doc.keys())
    except Exception as e:
        print(f"Error retrieving column names: {e}")
        return []

def get_unique_books_data(database, column_names):
    """Return one metadata record per distinct ``doc_name`` in the collection.

    The record for a book is taken from the first document seen with that
    name. Returns ``[]`` when there is no connection, when 'metadata' is not
    among *column_names*, or when an error occurs.
    """
    if database is None:
        print("Database connection is not established.")
        return []

    if 'metadata' not in column_names:
        print("The 'metadata' field does not exist in the collection.")
        return []

    optional_fields = ("references", "keywords", "summary", "chunk_number")

    try:
        collection = database.get_collection(ASTRA_DB_COLLECTION_NAME)
        books = []
        seen_names = set()  # doc_names already emitted

        for doc in collection.find(projection={"metadata": 1}):
            if not doc or "metadata" not in doc:
                continue
            metadata = doc["metadata"]
            if not isinstance(metadata, dict) or "doc_name" not in metadata:
                continue

            doc_name = metadata["doc_name"]
            if doc_name is None or doc_name in seen_names:
                continue

            record = {"doc_name": doc_name}
            for field in optional_fields:
                record[field] = metadata.get(field, "N/A")
            books.append(record)
            seen_names.add(doc_name)

        return books
    except Exception as e:
        print(f"Error retrieving book data: {e}")
        return []


def format_for_html_hover(books):
    """Render *books* as an HTML ``<ul>`` with per-book hover details.

    Every value is HTML-escaped before insertion: titles come from file names
    and keywords/summaries are model-generated, so they may contain ``<``,
    ``>`` or ``&`` that would otherwise break or inject markup.

    :param books: list of dicts with the keys "doc_name", "references",
        "keywords", "summary" and "chunk_number"
    :return: the HTML fragment as a string
    """
    import html  # local import: this block is the only user

    if not books:
        return '<ul id="book-list"><li>No books found.</li></ul>'

    def esc(value):
        # Values may be lists or numbers, so stringify before escaping.
        return html.escape(str(value))

    parts = ['<ul id="book-list">\n']
    for position, book in enumerate(books, start=1):
        parts.append('  <li class="book-item">\n')
        parts.append(f'    <span class="book-title">{esc(book["doc_name"])}</span>\n')
        parts.append(f'    <span class="book-count">{position}</span>\n')
        parts.append('    <div class="book-details">\n')
        parts.append(f'      <p><strong>References:</strong> {esc(book["references"])}</p>\n')
        parts.append(f'      <p><strong>Keywords:</strong> {esc(book["keywords"])}</p>\n')
        parts.append(f'      <p><strong>Summary:</strong> {esc(book["summary"])}</p>\n')
        parts.append(f'      <p><strong>Chunk Number:</strong> {esc(book["chunk_number"])}</p>\n')
        parts.append('    </div>\n')
        parts.append('  </li>\n')
    parts.append('</ul>')
    return "".join(parts)


# --- Main Execution ---

# Entry point: list the books stored in Astra DB and print them as HTML.
if __name__ == "__main__":
    database = connect_to_astra()
    if database:
        column_names = get_column_names(database)
        print("Column Names:", column_names)

        books_data = get_unique_books_data(database, column_names)

        # --- HTML Formatting ---
        html_output = format_for_html_hover(books_data)

        # One print call; each item ends up on its own line as before.
        print(
            "\nFormatted HTML Output:",
            html_output,
            "\n--- Instructions ---",
            "1. Copy the *ENTIRE* Formatted HTML Output above.",
            "2. Paste it inside the  `<div id=\"existing-books\" ...>`  element in your HTML,",
            "   REPLACING the existing content within that div.",
            "3. Ensure you have the CSS styles from the previous response in your HTML's `<style>` tag.",
            sep="\n",
        )

Connected to Astra DB.
Column Names: ['_id', 'content', 'metadata']

Formatted HTML Output:
<ul id="book-list">
  <li class="book-item">
    <span class="book-title">Georg Wilhelm Friedrich Hegel, Rolf-Peter Horstmann - Jenaer Systementwürfe III_ Naturphilosophie und Philosophie des Geistes (Philosophische Bibliothek)-F. Meiner (1986)</span>
    <span class="book-count">1</span>
    <div class="book-details">
      <p><strong>References:</strong> 3-7873-0684-6</p>
      <p><strong>Keywords:</strong> ['Here are 7 key keywords extracted from the text:\n\n1. Selbsterhalt\n2. Einheit\n3. Realität\n4. Kristall\n5. Gegenständlichkeit\n6. Selbstzweck (力其自身的)\n7. Wirklichkeit']</p>
      <p><strong>Summary:</strong> Here is a concise summary of the text:

The text describes a philosophical metaphor for the diamond crystal, which is considered the fundamental substance, the "diamant der Rande", or the "external unity" at its core. This substance is a marriage of simplicity, clarity, and substa

In [None]:
# Install necessary libraries (run these in a Colab cell)
!pip install astrapy sentence-transformers

from astrapy.db import AstraDB, AstraDBCollection
from astrapy import DataAPIClient
from google.colab import userdata
import json
#NEW
from sentence_transformers import SentenceTransformer
import re
from typing import List

# --- Astra DB Connection Setup ---

ASTRA_DB_APPLICATION_TOKEN = userdata.get("PHILO_API")
ASTRA_DB_API_ENDPOINT = userdata.get("PHILO_END")
ASTRA_DB_COLLECTION_NAME = "philosophical_texts"

# --- Model Setup (for summarization and keyword filtering) ---
# Use a multilingual model.  'paraphrase-multilingual-MiniLM-L12-v2' is a good balance of size/performance.
# You can explore other models: https://www.sbert.net/docs/pretrained_models.html
SUMMARY_MODEL_NAME = 'paraphrase-multilingual-MiniLM-L12-v2'
summary_model = SentenceTransformer(SUMMARY_MODEL_NAME)

def connect_to_astra():
    """Open a database handle through ``DataAPIClient``.

    The token and endpoint are read from the module-level constants
    ``ASTRA_DB_APPLICATION_TOKEN`` and ``ASTRA_DB_API_ENDPOINT``.

    Returns:
        The object returned by ``client.get_database``; ``None`` when any
        step raised (the error is printed, not propagated).
    """
    try:
        token, api_endpoint = ASTRA_DB_APPLICATION_TOKEN, ASTRA_DB_API_ENDPOINT
        database = DataAPIClient(token).get_database(api_endpoint)
    except Exception as e:
        print(f"Error connecting to Astra DB: {e}")
        return None

    print("Connected to Astra DB.")
    return database

def get_column_names(database):
    """List the top-level field names of one document in the collection.

    Only a single document (``find_one``) is inspected, so the result is
    representative only if all documents share the same fields.

    Args:
        database: Database handle, or ``None`` when the connection failed.

    Returns:
        A list of field names; an empty list when there is no connection,
        the collection is empty, or the lookup raised.
    """
    if database is None:
        print("Database connection is not established.")
        return []

    try:
        sample_doc = database.get_collection(ASTRA_DB_COLLECTION_NAME).find_one()
        if not sample_doc:
            print("Collection is empty. Cannot determine column names.")
            return []
        return list(sample_doc.keys())
    except Exception as e:
        print(f"Error retrieving column names: {e}")
        return []


def is_mostly_english(text: str) -> bool:
    """Return True when at most 20% of the characters in *text* are non-ASCII.

    The previous implementation first called ``text.encode('ascii')``, which
    raises on the very first non-ASCII character; the ratio check below it
    could therefore never see a non-zero count and any string containing a
    single accented letter was rejected. The ratio is now computed directly.

    Args:
        text: String to classify. Empty (or otherwise falsy) input counts
            as English.

    Returns:
        ``True`` if the share of code points above 127 is <= 0.20.
    """
    if not text:
        return True  # Consider empty strings as English

    non_ascii_count = sum(1 for char in text if ord(char) > 127)
    return (non_ascii_count / len(text)) <= 0.20

def filter_keywords(keywords_list: List[str]) -> List[str]:
    """Keep only keywords that look like clean, mostly-English single tokens.

    Each entry is converted to ``str`` and stripped. It is kept when it is
    non-empty, passes ``is_mostly_english``, is longer than one character and
    is purely alphanumeric (so entries containing spaces or punctuation are
    dropped).

    Args:
        keywords_list: Raw keyword entries; may be empty or ``None``.

    Returns:
        The surviving keywords, in their original order.
    """
    if not keywords_list:
        return []

    def _is_usable(candidate: str) -> bool:
        # Same short-circuit order as the checks are described above.
        return (
            bool(candidate)
            and is_mostly_english(candidate)
            and len(candidate) > 1
            and candidate.isalnum()
        )

    cleaned = (str(raw).strip() for raw in keywords_list)
    return [candidate for candidate in cleaned if _is_usable(candidate)]

def generate_persian_summary(text: str, book_title: str) -> str:
    """Build an extractive summary by picking the sentences closest to the title.

    The text is split into sentences, sentences with fewer than three words
    are discarded, and the remaining ones are embedded with the module-level
    ``summary_model``. The three sentences whose embeddings are most similar
    (cosine) to a Persian context string containing *book_title* are joined
    with spaces, most similar first.

    Args:
        text: Source text to summarise.
        book_title: Title inserted into the context string.

    Returns:
        The joined sentences; a fixed Persian "no summary" message when there
        is nothing to summarise; a fixed Persian error message if any step
        raises (the exception is printed).
    """
    no_summary_message = "خلاصه‌ای موجود نیست."
    if not text:
        return no_summary_message

    try:
        # Break after ., !, ?, the CJK full stop and the Persian question mark,
        # then keep only pieces with at least three whitespace-separated words.
        usable_sentences = [
            piece
            for piece in re.split(r'(?<=[.!?。؟])\s+', text)
            if len(piece.split()) >= 3
        ]
        if not usable_sentences:
            return no_summary_message

        sentence_vectors = summary_model.encode(usable_sentences)
        title_vector = summary_model.encode(f"خلاصه‌ای از کتاب: {book_title}")

        from sklearn.metrics.pairwise import cosine_similarity
        scores = cosine_similarity(title_vector.reshape(1, -1), sentence_vectors)[0]

        # argsort is ascending: take the last top_n and flip to best-first.
        top_n = 3
        best_first = scores.argsort()[-top_n:][::-1]
        return " ".join(usable_sentences[position] for position in best_first)

    except Exception as e:
        print(f"Error during summarization: {e}")
        return "خطا در تولید خلاصه."


def get_unique_books_data(database, column_names):
    """Collect one display record per distinct ``metadata.doc_name``.

    The collection is scanned once; the first document seen for each book
    name supplies that book's references, keywords, summary text and chunk
    number. Later documents with the same name are ignored.

    Args:
        database: Database handle, or ``None`` when the connection failed.
        column_names: Field names of the collection; must include
            ``'metadata'``.

    Returns:
        A list of dicts with the keys ``doc_name``, ``references``,
        ``keywords``, ``summary`` and ``chunk_number``. An empty list is
        returned when there is no connection, no ``metadata`` field, or the
        scan raised (the error is printed).
    """
    if database is None:
        return []
    if 'metadata' not in column_names:
        print("The 'metadata' field does not exist.")
        return []

    try:
        collection = database.get_collection(ASTRA_DB_COLLECTION_NAME)
        distinct_books = []
        processed_names = set()

        for doc in collection.find(projection={"metadata": 1, "content": 1}):
            metadata = doc.get("metadata") if doc else None
            if not isinstance(metadata, dict):
                continue

            doc_name = metadata.get("doc_name")
            if doc_name is None or doc_name in processed_names:
                continue

            references = metadata.get("references", "N/A")
            if references == "" or references is None:
                references = "N/A"

            keywords = metadata.get("keywords", [])
            if isinstance(keywords, str):
                keywords = [keywords]  # a lone string is treated as one keyword
            filtered_keywords = filter_keywords(keywords)
            keywords_str = ", ".join(filtered_keywords) if filtered_keywords else "N/A"

            # Prefer the chunk text; fall back to the stored summary when the
            # content is missing *or empty*. (dict.get's default only covers a
            # missing key, so an empty/None content previously skipped the
            # fallback.)
            content_to_summarize = doc.get("content") or metadata.get("summary", "")
            persian_summary = generate_persian_summary(content_to_summarize, doc_name)

            distinct_books.append({
                "doc_name": doc_name,
                "references": references,
                "keywords": keywords_str,
                "summary": persian_summary,
                "chunk_number": metadata.get("chunk_number", "N/A"),
            })
            processed_names.add(doc_name)
        return distinct_books

    except Exception as e:
        print(f"Error retrieving/processing book data: {e}")
        return []



def format_for_html_hover(books):
    """Render book records as an HTML ``<ul>`` with per-book hover details.

    Every value taken from a record is HTML-escaped before insertion: the
    titles, keywords and summaries originate from stored documents and
    generated text, so characters such as ``<`` or ``&`` would otherwise
    break (or inject into) the page the markup is pasted into.

    Args:
        books: Iterable of dicts with the keys ``doc_name``, ``references``,
            ``keywords``, ``summary`` and ``chunk_number``.

    Returns:
        The markup as a single string. When *books* is empty a list with a
        single "No books found." item is returned.
    """
    # Local import so the function does not depend on the cell's import block.
    from html import escape

    if not books:
        return '<ul id="book-list"><li>No books found.</li></ul>'

    def field(book, key):
        # str() first: chunk_number may be an int.
        return escape(str(book[key]))

    # Collect lines and join once instead of repeated string concatenation.
    lines = ['<ul id="book-list">']
    for position, book in enumerate(books, start=1):
        lines.extend((
            '  <li class="book-item">',
            f'    <span class="book-title">{field(book, "doc_name")}</span>',
            f'    <span class="book-count">{position}</span>',
            '    <div class="book-details">',
            f'      <p><strong>References:</strong> {field(book, "references")}</p>',
            f'      <p><strong>Keywords:</strong> {field(book, "keywords")}</p>',
            f'      <p><strong>Summary:</strong> {field(book, "summary")}</p>',
            f'      <p><strong>Chunk Number:</strong> {field(book, "chunk_number")}</p>',
            '    </div>',
            '  </li>',
        ))
    lines.append('</ul>')
    return "\n".join(lines)


# --- Main Execution ---

if __name__ == "__main__":
    # Entry point for this cell: connect, build the per-book records
    # (including generated summaries) and print them as an HTML list.
    database = connect_to_astra()
    if database:
        column_names = get_column_names(database)
        print("Column Names:", column_names)

        # Gather the records, then turn them into hover-list markup.
        books_data = get_unique_books_data(database, column_names)
        html_output = format_for_html_hover(books_data)

        print("\nFormatted HTML Output:")
        print(html_output)

        # Tell the user where the generated markup belongs.
        for instruction in (
            "\n--- Instructions ---",
            "1. Copy the *ENTIRE* Formatted HTML Output above.",
            "2. Paste it inside the  `<div id=\"existing-books\" ...>`  element in your HTML,",
            "   REPLACING the existing content within that div.",
            "3. Ensure you have the CSS styles (dark theme) in your HTML's `<style>` tag.",
        ):
            print(instruction)



Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.89k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Connected to Astra DB.
Column Names: ['_id', 'content', 'metadata']

Formatted HTML Output:
<ul id="book-list">
  <li class="book-item">
    <span class="book-title">Georg Wilhelm Friedrich Hegel, Rolf-Peter Horstmann - Jenaer Systementwürfe III_ Naturphilosophie und Philosophie des Geistes (Philosophische Bibliothek)-F. Meiner (1986)</span>
    <span class="book-count">1</span>
    <div class="book-details">
      <p><strong>References:</strong> 3-7873-0684-6</p>
      <p><strong>Keywords:</strong> N/A</p>
      <p><strong>Summary:</strong> Copyrighted material Naturphilosophie 76-78 
dieser Kristall eben von jener  Einheit  durchdrungen, ist die  auf­
gelöste  /Materie; es ist die in  sich  selbst zersetzte Substanz. Verdopp­
lung 
2 Am  Rande: reine Abstraktionen  Einzelnheit; 
1 Am Rande: a)  zurückgekehrte  Gestalt, Einfachheit, Selbst­
gleichheit, Unmittelbarkeit des Seins. Sie als die einfache Qualitativität,  das einfache daseiende  Insich­
sein , ist das reine kalte Licht,  u

In [None]:
# Install necessary libraries
!pip install astrapy requests beautifulsoup4 lxml

from astrapy.db import AstraDB, AstraDBCollection
from astrapy import DataAPIClient
from google.colab import userdata
import requests
from bs4 import BeautifulSoup
import json
import re  # Import the regular expression module

# --- Astra DB Connection Setup ---

ASTRA_DB_APPLICATION_TOKEN = userdata.get("PHILO_API")
ASTRA_DB_API_ENDPOINT = userdata.get("PHILO_END")
ASTRA_DB_COLLECTION_NAME = "philosophical_texts"

def connect_to_astra():
    """Return a database handle for the configured Astra DB endpoint.

    Uses the module-level ``ASTRA_DB_APPLICATION_TOKEN`` and
    ``ASTRA_DB_API_ENDPOINT``. Any exception is printed and turned into a
    ``None`` return value instead of being propagated.
    """
    try:
        token, api_endpoint = ASTRA_DB_APPLICATION_TOKEN, ASTRA_DB_API_ENDPOINT
        database = DataAPIClient(token).get_database(api_endpoint)
    except Exception as e:
        print(f"Error connecting to Astra DB: {e}")
        return None

    print("Connected to Astra DB.")
    return database

def get_existing_book_names(database):
    """Collect the lower-cased ``metadata.doc_name`` of every stored document.

    Args:
        database: Database handle, or ``None`` when the connection failed.

    Returns:
        A set of lower-cased names. An empty set is returned when there is
        no connection or when the scan raised (the error is printed and any
        names gathered so far are discarded).
    """
    if database is None:
        return set()

    try:
        cursor = database.get_collection(ASTRA_DB_COLLECTION_NAME).find(
            projection={"metadata.doc_name": 1}
        )
        known_names = set()
        for record in cursor:
            if not record or "metadata" not in record:
                continue
            meta = record["metadata"]
            if not isinstance(meta, dict):
                continue
            name = meta.get("doc_name")
            if name:
                # Lower-case so later comparisons are case-insensitive.
                known_names.add(name.lower())
        return known_names
    except Exception as e:
        print(f"Error retrieving existing book names: {e}")
        return set()
def scrape_hegel_books(url, timeout=30):
    """Fetch *url* and collect candidate book titles from the page.

    Several independent heuristics are applied to the same page and their
    results merged; none of them is tailored to a confirmed page layout, so
    the class names and ids below are placeholders to adapt per site.

    Args:
        url: Page to fetch.
        timeout: Seconds to wait for the server before the request is
            abandoned. Without a timeout ``requests`` can block indefinitely
            on an unresponsive host.

    Returns:
        A set of lower-cased title strings. An empty set is returned when
        the request fails (including timeouts and 4xx/5xx responses) or
        parsing raises; the error is printed.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)

        soup = BeautifulSoup(response.content, 'lxml')
        book_titles = set()

        def add_title(raw_title):
            # Normalise and skip blanks so empty strings never count as titles.
            title = raw_title.strip()
            if title:
                book_titles.add(title.lower())

        # Heuristic 1: <h2> headings inside a div with class "content".
        content_div = soup.find('div', class_='content')  # Replace 'content' with the actual class
        if content_div:
            for h2 in content_div.find_all('h2'):
                add_title(h2.text)

        # Heuristic 2: <li> items inside a <ul> with id "bookList".
        book_list = soup.find('ul', id='bookList')  # Replace 'bookList' with the actual ID
        if book_list:
            for li in book_list.find_all('li'):
                add_title(li.text)

        # Heuristic 3: link texts inside the first <table>.
        table = soup.find('table')
        if table:
            for a in table.find_all('a'):
                add_title(a.text)

        # Heuristic 4: regex over paragraph text, for pages without a
        # consistent structure. Intended for "Book Title (Year)" shapes.
        # NOTE(review): the pattern accepts any run of text that starts with
        # a capital letter, so ordinary sentence fragments are captured too;
        # treat the results as candidates only.
        for p in soup.find_all('p'):
            text = p.get_text()
            matches = re.findall(r"([A-Z][a-zA-Z\s',.-]+)(?:\s\((\d{4})\))?", text)
            for match in matches:
                # match[0] is the title, match[1] is the year (if present)
                add_title(match[0])

        # Heuristic 5: product listing markup, applied only to meiner.de URLs.
        if "meiner.de" in url:
            product_list = soup.find('div', class_="products-listing")
            if product_list:
                for item in product_list.find_all('div', class_="product-item"):
                    title_element = item.find('a', class_="product-item__title")  # class might be incorrect
                    if title_element:
                        add_title(title_element.text)

        return book_titles

    except requests.exceptions.RequestException as e:
        print(f"Request error: {e}")
        return set()
    except Exception as e:
        print(f"Scraping error: {e}")
        return set()

def find_new_books(existing_books, scraped_books):
    """Return the scraped titles that are not among the existing ones.

    Both arguments are used as sets; the result is their set difference
    (``scraped_books`` minus ``existing_books``).
    """
    unseen_titles = scraped_books - existing_books
    return unseen_titles

def main():
    """Compare scraped Hegel titles against the names stored in Astra DB.

    Connects to the database (returning silently if that fails), loads the
    stored book names, scrapes each configured URL, and prints the titles
    that were scraped but are not stored.
    """
    database = connect_to_astra()
    if not database:
        return

    existing_books = get_existing_book_names(database)
    print(f"Found {len(existing_books)} existing books in Astra DB.")

    # Pages to scrape; extend this list with further sources as needed.
    urls_to_scrape = [
        "https://www.projekt-gutenberg.org/autoren/namen/hegel.html",
        "https://www.fischerverlage.de/autor/georg-wilhelm-friedrich-hegel-100486",
        "https://meiner.de/personen/hegel-georg-wilhelm-friedrich-500000185/",
        "https://www.suhrkamp.de/autoren/georg_wilhelm_friedrich_hegel_1655.html",
    ]

    # Union of the titles found across all pages.
    all_scraped_books = set()
    for url in urls_to_scrape:
        print(f"Scraping {url}...")
        titles_on_page = scrape_hegel_books(url)
        print(f"  Found {len(titles_on_page)} potential books.")
        all_scraped_books |= titles_on_page

    new_books = find_new_books(existing_books, all_scraped_books)

    print("\n--- Results ---")
    if not new_books:
        print("No new books by Hegel found that are not already in your collection.")
        return

    print(f"Found {len(new_books)} new books by Hegel:")
    for book in new_books:
        print(f"  - {book}")  # titles are already lower-cased
    print("\nThese books are not currently in your Astra DB collection.")
    print("Consider adding them to your vector database.")

# Run the scrape-and-compare workflow only when this code is executed
# directly (script or notebook cell), not when it is imported.
if __name__ == "__main__":
    main()

Connected to Astra DB.
Found 12 existing books in Astra DB.
Scraping https://www.projekt-gutenberg.org/autoren/namen/hegel.html...
  Found 41 potential books.
Scraping https://www.fischerverlage.de/autor/georg-wilhelm-friedrich-hegel-100486...
Request error: 404 Client Error: Not Found for url: https://www.fischerverlage.de/autor/georg-wilhelm-friedrich-hegel-100486
  Found 0 potential books.
Scraping https://meiner.de/personen/hegel-georg-wilhelm-friedrich-500000185/...
Request error: 404 Client Error: Not Found for url: https://meiner.de/personen/hegel-georg-wilhelm-friedrich-500000185/
  Found 0 potential books.
Scraping https://www.suhrkamp.de/autoren/georg_wilhelm_friedrich_hegel_1655.html...
Request error: 404 Client Error: Not Found for url: https://www.suhrkamp.de/person/georg-wilhelm-friedrich-hegel-p-1655
  Found 0 potential books.

--- Results ---
Found 41 new books by Hegel:
  - philosophie an der berliner universit
  - jena
  - shop
  - wgs
  - frankfurt am main, arbeitete

In [None]:
import os
import json
import uuid
from pathlib import Path
from typing import Any, List, Tuple, Optional
from dataclasses import dataclass
from pypdf import PdfReader
from langchain.docstore.document import Document
from langchain.vectorstores import VectorStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import Docx2txtLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import TextLoader
from astrapy import DataAPIClient
import re
from google.colab import userdata
from astrapy.constants import VectorMetric
from astrapy.database import Database
from astrapy.collection import Collection
from astrapy.exceptions import CollectionAlreadyExistsException
import time
import google.generativeai as genai

# Constants and configurations
TEXT_FILE_TYPES = ["txt", "docx", "pdf"]
ASTRA_DB_APPLICATION_TOKEN = userdata.get("PHILO_API")
ASTRA_DB_API_ENDPOINT = userdata.get("PHILO_END")
GEMINI_API_KEY = userdata.get("GOOGLE_API_KEY_BHR1").strip()
GEMINI_EMBEDDING_MODEL_NAME = "models/text-embedding-004"
GEMINI_TEXT_MODEL_NAME = "gemini-1.5-flash-8b"

class PDFLoader:
    """Load a PDF into one ``Document`` per page using ``pypdf`` directly."""

    def __init__(self, file_path):
        # Path of the PDF to read; stored as given.
        self.file_path = file_path

    def load(self) -> List[Document]:
        """Extract the text of every page.

        The file is opened in a ``with`` block so the handle is closed once
        all pages have been read (previously the handle passed to
        ``PdfReader`` was never closed).

        Returns:
            One ``Document`` per page, with ``source`` (the file path) and a
            1-based ``page`` number in its metadata. An empty list is
            returned if the file cannot be opened or parsed; the error is
            printed.
        """
        try:
            with open(self.file_path, 'rb') as pdf_file:
                pdf_reader = PdfReader(pdf_file)
                # Text must be extracted while the file is still open.
                return [
                    Document(
                        # Guard against extractors that yield None for empty pages.
                        page_content=page.extract_text() or "",
                        metadata={'source': self.file_path, 'page': page_number}
                    )
                    for page_number, page in enumerate(pdf_reader.pages, start=1)
                ]

        except Exception as e:
            print(f"Error loading PDF: {e}")
            return []

@dataclass
class Data:
    """Plain container for the text extracted from one input file."""
    content: str = ""  # extracted text; DocumentProcessor.load_file fills this in
    references: str = ""  # reference string; defaults to empty

class DocumentProcessor:
    """Load a single txt, docx or pdf file into a ``Data`` object."""

    def __init__(self, path: str, silent_errors: bool = False):
        self.path = path
        self.silent_errors = silent_errors
        self.status = ""

    def resolve_path(self, path: str) -> str:
        """Return *path* as an absolute path."""
        absolute = os.path.abspath(path)
        return absolute

    def load_file(self) -> Tuple[Data, str]:
        """Read the file at ``self.path`` and return its text and base name.

        The loader is chosen from the lower-cased file extension, which must
        be listed in ``TEXT_FILE_TYPES``. The text of all returned documents
        (pages, for PDFs) is joined with single spaces.

        Returns:
            ``(Data(content=...), stem)`` where ``stem`` is the file name
            without directory or extension; ``(Data(), "")`` when the loader
            returned nothing.

        Raises:
            ValueError: If no path was given or the extension is unsupported.
        """
        if not self.path:
            raise ValueError("Please upload a file to use this component.")

        resolved_path = self.resolve_path(self.path)
        target = Path(resolved_path)
        extension = target.suffix[1:].lower()
        if extension not in TEXT_FILE_TYPES:
            raise ValueError(f"Unsupported file type: {extension}")

        # Pick the loader for this file type; anything else is plain text.
        if extension == "pdf":
            loader = PDFLoader(resolved_path)
        elif extension == "docx":
            from langchain.document_loaders import Docx2txtLoader
            loader = Docx2txtLoader(resolved_path)
        else:
            from langchain.document_loaders import TextLoader
            loader = TextLoader(resolved_path)

        loaded_docs = loader.load()
        if not (isinstance(loaded_docs, list) and loaded_docs):
            return Data(), ""

        combined_text = " ".join(doc.page_content for doc in loaded_docs)
        return Data(content=combined_text), target.stem

class RecursiveCharacterTextSplitterComponent:
    """Wrapper around ``RecursiveCharacterTextSplitter`` with fixed settings."""

    def __init__(self):
        # Split on paragraph breaks first, then lines, sentences and spaces;
        # chunks are up to 5000 characters with a 200-character overlap.
        splitter_options = {
            "separators": ["\n\n", "\n", ". ", " "],
            "chunk_size": 5000,
            "chunk_overlap": 200,
        }
        self.splitter = RecursiveCharacterTextSplitter(**splitter_options)

    def split_text(self, text: str) -> List[str]:
        """Return *text* split into chunks by the configured splitter."""
        pieces = self.splitter.split_text(text)
        return pieces
class GeminiEmbeddingsComponent:
    """Gemini-backed keyword extraction, chunk summaries and embeddings.

    All calls retry on quota errors (detected by the word "quota" in the
    exception text) up to ``max_retries`` times, waiting ``retry_delay``
    seconds between attempts; any other error ends the attempt loop.
    """

    # Leading list markers such as "1.", "2)", "-", "*" or a bullet.
    _LIST_MARKER = re.compile(r'^\s*(?:\d+[.)]|[-*\u2022])\s*')

    def __init__(self, gemini_api_key: str):
        genai.configure(api_key=gemini_api_key)
        self.embedding_model = "models/text-embedding-004"
        self.text_model = "gemini-1.5-pro"
        self.max_retries = 3
        self.retry_delay = 60  # seconds

    @staticmethod
    def _parse_keywords(raw: str) -> List[str]:
        """Turn a model reply into a flat keyword list.

        Handles comma-separated replies as well as numbered or bulleted
        lists, and skips blank lines and introductory lines ending in ":".
        """
        keywords = []
        for line in raw.splitlines():
            line = line.strip()
            if not line or line.endswith(':'):
                continue  # blank, or a preamble such as "Here are the keywords:"
            line = GeminiEmbeddingsComponent._LIST_MARKER.sub('', line)
            for part in line.split(','):
                part = part.strip().strip('*').strip()  # drop markdown emphasis
                if part:
                    keywords.append(part)
        return keywords

    def _generate_text(self, prompt: str, task_label: str) -> Optional[str]:
        """Run *prompt* through the text model; return the reply or None."""
        for attempt in range(1, self.max_retries + 1):
            try:
                model = genai.GenerativeModel(self.text_model)
                return model.generate_content(prompt).text
            except Exception as e:
                print(f"{task_label} attempt {attempt} failed: {e}")
                if 'quota' not in str(e).lower():
                    break
                # No point in waiting after the final attempt.
                if attempt < self.max_retries:
                    print(f"Waiting {self.retry_delay} seconds before retry...")
                    time.sleep(self.retry_delay)
        return None

    def extract_keywords(self, text: str) -> List[str]:
        """Extract keywords from the first 1000 characters of *text*.

        Returns:
            The parsed keywords, or an empty list if the model call failed.
        """
        # Ask explicitly for a comma-separated line: the reply is parsed, and
        # free-form replies (numbered lists with an introduction) used to be
        # stored as one single "keyword".
        prompt = (
            "Extract 5-7 most significant keywords from the text. "
            "Return only the keywords as a single comma-separated line, "
            "with no numbering and no introduction.\n\n"
            f"Text:\n{text[:1000]}"  # Limit text to prevent rate limiting
        )
        raw = self._generate_text(prompt, "Keyword extraction")
        return self._parse_keywords(raw) if raw else []

    def generate_chunk_summary(self, text: str) -> str:
        """Summarise the first 1000 characters of *text*.

        Returns:
            The stripped model reply, or ``""`` if the model call failed.
        """
        prompt = f"""Provide a concise summary of the text:
                {text[:1000]}"""  # Limit text to prevent rate limiting
        raw = self._generate_text(prompt, "Summary generation")
        return raw.strip() if raw else ""

    def _embed_batch(self, batch: List[str], expected_dim: int) -> Optional[List[List[float]]]:
        """Embed one batch; return one vector per text, or None on failure."""
        for attempt in range(1, self.max_retries + 1):
            try:
                result = genai.embed_content(
                    model=self.embedding_model,
                    content=batch,
                    task_type="retrieval_document"
                )
            except Exception as e:
                print(f"Embedding error (attempt {attempt}): {e}")
                if 'quota' not in str(e).lower():
                    print("Breaking out of retry loop due to non-quota error")
                    return None
                if attempt < self.max_retries:
                    print(f"Waiting {self.retry_delay} seconds before retry...")
                    time.sleep(self.retry_delay)
                continue

            vectors = result['embedding'] if 'embedding' in result else None
            if not vectors:
                print("No embeddings returned")
                continue

            if len(vectors) == len(batch) and all(len(v) == expected_dim for v in vectors):
                return [list(v) for v in vectors]

            # A wrong count or dimension will not change on retry; give up on
            # this batch instead of looping.
            print(f"Warning: Skipping embedding with incorrect dimension")
            return None
        return None

    def build_embeddings(self, texts: List[str], expected_dim: int = 768) -> List[List[float]]:
        """Embed *texts* in order, two at a time.

        At most the first 50 texts are processed and each is truncated to
        1000 characters before embedding. Processing stops at the first
        batch that cannot be embedded, so the returned list always
        corresponds to ``texts[:len(result)]``. Callers pair embeddings with
        chunks by position, and skipping a batch in the middle would shift
        every later vector onto the wrong text.

        Args:
            texts: Texts to embed.
            expected_dim: Required vector length.

        Returns:
            Embeddings for a leading prefix of *texts* (possibly empty).
        """
        if not texts:
            print("No texts provided for embedding")
            return []

        if len(texts) > 50:
            print(f"Large number of texts detected. Sampling {50} texts.")
            texts = texts[:50]

        embeddings = []
        batch_size = 2
        for start in range(0, len(texts), batch_size):
            batch = [text[:1000] for text in texts[start:start + batch_size]]
            batch_embeddings = self._embed_batch(batch, expected_dim)
            if batch_embeddings is None:
                print(f"Stopping after {len(embeddings)} embeddings: batch at index {start} failed")
                break
            embeddings.extend(batch_embeddings)

            # Pause between batches to reduce rate limit pressure
            if start + batch_size < len(texts):
                time.sleep(5)

        if not embeddings:
            print("WARNING: No embeddings could be generated")

        return embeddings

class AstraDBManager:
    """Create/look up an Astra DB collection and insert embedded chunks."""

    # Vector length every stored embedding must have.
    EXPECTED_DIMENSION = 768
    # Maximum number of characters stored in one document's "content".
    MAX_CONTENT_CHARS = 8000

    def __init__(self, api_endpoint: str, token: str, collection_name: str, namespace: Optional[str] = None):
        self.api_endpoint = api_endpoint
        self.token = token
        self.collection_name = collection_name
        self.namespace = namespace or "default_namespace"
        self.client = DataAPIClient(token)
        self.database = self.client.get_database(api_endpoint)

    def get_or_create_collection(self, collection_name: str, dimension: int = 768):
        """Return the named collection, creating it if it does not exist.

        Args:
            collection_name: Collection to look up or create.
            dimension: Vector dimension used when the collection is created.

        Returns:
            The collection object.

        Raises:
            Exception: Any error other than "collection already exists" is
                printed and re-raised.
        """
        try:
            print(f"Checking for collection {collection_name} in database.")
            # Compare against names: list_collections() yields descriptor
            # objects, so a string membership test on it never matched.
            if collection_name in self.database.list_collection_names():
                print(f"* Collection {collection_name} already exists.")
                return self.database.get_collection(collection_name)

            print(f"* Collection {collection_name} does not exist. Creating...")
            collection = self.database.create_collection(
                name=collection_name,
                dimension=dimension,
                metric=VectorMetric.COSINE,
            )
            print(f"* Collection {collection_name} created successfully.")
            return collection
        except CollectionAlreadyExistsException:
            print(f"* Collection {collection_name} already exists. Skipping creation.")
            return self.database.get_collection(collection_name)
        except Exception as e:
            print(f"Error handling collection {collection_name}: {e}")
            raise

    def add_documents(self, embeddings: List[List[float]], doc_name: str, references: str,
                  chunks: List[str], keywords_list: List[List[str]], summaries: List[str]):
        """Insert one document per chunk (or per 8000-character piece of it).

        The four per-chunk lists are paired by position and truncated to the
        shortest one. Chunks longer than ``MAX_CONTENT_CHARS`` are stored as
        several documents that share the chunk's vector and metadata;
        ``chunk_number`` is the piece's position within its chunk.

        Args:
            embeddings: One vector per chunk.
            doc_name: Name stored in every document's metadata.
            references: Reference string stored in every document's metadata.
            chunks: Chunk texts.
            keywords_list: Keywords per chunk.
            summaries: Summary per chunk.
        """
        # Nothing to insert; also avoids indexing into an empty list below.
        if not embeddings:
            print("No embeddings provided. Nothing to insert.")
            return

        # Convert embeddings to list if not already
        if not isinstance(embeddings[0], list):
            embeddings = [list(emb) for emb in embeddings]

        collection = self.get_or_create_collection(collection_name=self.collection_name)

        insertion_errors = 0
        successful_insertions = 0

        # zip() stops at the shortest list, which is exactly min_length rows.
        min_length = min(len(embeddings), len(chunks), len(keywords_list), len(summaries))
        rows = zip(embeddings, chunks, keywords_list, summaries)

        for index, (vector, chunk, keywords, summary) in enumerate(rows):
            try:
                if len(vector) != self.EXPECTED_DIMENSION:
                    print(f"Warning: Skipping document {index} - incorrect embedding dimension")
                    continue

                # enumerate() numbers the pieces directly; looking the piece
                # up with list.index() returned the first match and so gave
                # identical pieces the same chunk_number.
                piece_starts = range(0, len(chunk), self.MAX_CONTENT_CHARS)
                for chunk_number, start in enumerate(piece_starts):
                    doc_id = str(uuid.uuid4())
                    document = {
                        "_id": doc_id,
                        "content": chunk[start:start + self.MAX_CONTENT_CHARS],
                        "$vector": vector,
                        "metadata": {
                            "doc_name": doc_name,
                            "references": references,
                            "keywords": keywords,
                            "summary": summary,
                            "chunk_number": chunk_number
                        }
                    }

                    result = collection.insert_one(document)

                    if result.inserted_id:
                        successful_insertions += 1
                        print(f"Successfully inserted document {doc_id}")
                    else:
                        insertion_errors += 1
                        print(f"Failed to insert document {doc_id}")

            except Exception as e:
                insertion_errors += 1
                print(f"Insertion error for document {index}: {e}")

        print(f"Insertion Summary:")
        print(f"Total documents: {min_length}")
        print(f"Successful insertions: {successful_insertions}")
        print(f"Failed insertions: {insertion_errors}")



def main(file_paths: List[str]):
    """Ingest each file: load, split, embed, annotate and store in Astra DB.

    A failure in one file is printed and does not stop the remaining files.
    Keywords, summaries and stored chunks are limited to the chunks that
    received an embedding.

    NOTE: a second ``main`` is defined further down in this cell and rebinds
    the name, so this version is not the one called by the ``__main__`` guard.

    Args:
        file_paths: Paths of the txt/docx/pdf files to ingest.
    """
    splitter = RecursiveCharacterTextSplitterComponent()
    gemini = GeminiEmbeddingsComponent(gemini_api_key=GEMINI_API_KEY)
    store = AstraDBManager(
        api_endpoint=ASTRA_DB_API_ENDPOINT,
        token=ASTRA_DB_APPLICATION_TOKEN,
        collection_name="philosophical_texts"
    )

    for file_path in file_paths:
        print(f"Processing file: {file_path}")
        try:
            data, doc_name = DocumentProcessor(file_path).load_file()
            if not data.content:
                print(f"Warning: No content extracted from {file_path}")
                continue

            chunks = splitter.split_text(data.content)
            print(f"Total chunks extracted: {len(chunks)}")
            if not chunks:
                print("Error: No chunks extracted from the document")
                continue

            embeddings = gemini.build_embeddings(chunks)
            if not embeddings:
                print("No embeddings could be generated. Skipping document.")
                continue

            # Only the chunks that have an embedding are annotated and stored.
            embedded_chunks = chunks[:len(embeddings)]
            keywords_list = [gemini.extract_keywords(piece) for piece in embedded_chunks]
            summaries = [gemini.generate_chunk_summary(piece) for piece in embedded_chunks]

            references = extract_references(data.content)

            store.add_documents(embeddings, doc_name, references, embedded_chunks, keywords_list, summaries)
            print(f"Finished processing file: {file_path}")

        except Exception as e:
            print(f"Error processing {file_path}: {e}")

def extract_references(text: str) -> str:
    """Return the first hyphenated number sequence (e.g. an ISBN) in *text*.

    A match is four or more digit groups joined by hyphens. The earlier
    pattern matched exactly four groups, so a five-group ISBN-13 such as
    ``978-3-7873-0684-6`` was returned without its last group.

    Args:
        text: Text to search.

    Returns:
        The first match, or ``"No references"`` when there is none.
    """
    found = re.search(r'\d+(?:-\d+){3,}', text)
    return found.group(0) if found else "No references"

def main(file_paths: List[str]):
    """Ingest each file: load, split, embed, annotate and store in Astra DB.

    Each file is processed independently; an error in one file is printed
    and the loop moves on to the next. Files that yield no text, no chunks
    or no embeddings are skipped instead of being passed on to the store.

    ``build_embeddings`` may return fewer vectors than there are chunks, so
    keywords and summaries are generated only for the chunks that actually
    received an embedding. This avoids model calls for chunks that would
    never be stored and keeps all per-chunk lists the same length.

    Args:
        file_paths: Paths of the txt/docx/pdf files to ingest.
    """
    text_splitter = RecursiveCharacterTextSplitterComponent()
    embeddings_component = GeminiEmbeddingsComponent(gemini_api_key=GEMINI_API_KEY)
    astradb_manager = AstraDBManager(
        api_endpoint=ASTRA_DB_API_ENDPOINT,
        token=ASTRA_DB_APPLICATION_TOKEN,
        collection_name="philosophical_texts"
    )

    for file_path in file_paths:
        print(f"Processing file: {file_path}")
        try:
            processor = DocumentProcessor(file_path)
            data, doc_name = processor.load_file()
            if not data.content:
                print(f"Warning: No content extracted from {file_path}")
                continue

            # Split into chunks
            chunks = text_splitter.split_text(data.content)
            if not chunks:
                print("Error: No chunks extracted from the document")
                continue

            # Embeddings for the chunks
            embeddings = embeddings_component.build_embeddings(chunks)
            if not embeddings:
                print("No embeddings could be generated. Skipping document.")
                continue

            # Restrict everything downstream to the embedded chunks.
            embedded_chunks = chunks[:len(embeddings)]
            keywords_list = [embeddings_component.extract_keywords(chunk) for chunk in embedded_chunks]
            summaries = [embeddings_component.generate_chunk_summary(chunk) for chunk in embedded_chunks]

            # Extract references
            references = extract_references(data.content)

            # Send to AstraDB
            astradb_manager.add_documents(embeddings, doc_name, references, embedded_chunks, keywords_list, summaries)
            print(f"Finished processing file: {file_path}")

        except Exception as e:
            # Keep going with the remaining files.
            print(f"Error processing {file_path}: {e}")

# Script entry point: ingest the listed files when this code is run directly.
if __name__ == "__main__":
    # Files to ingest. NOTE(review): the path points under /content/drive,
    # presumably a mounted Google Drive in Colab — confirm it is mounted first.
    file_paths = [
        "/content/drive/MyDrive/(Philosophische Bibliothek 385) Georg Wilhelm Friedrich Hegel-Wissenschaft der Logik-Meiner (2008).pdf"
    ]
    main(file_paths)