In [1]:
import os
from pathlib import Path
import ollama
import pypdf
import weaviate
from weaviate.connect import ConnectionParams
from weaviate.classes.init import AdditionalConfig, Timeout, Auth
import weaviate.classes.config as wc
import weaviate.classes.query as wq
from dotenv import load_dotenv
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [2]:
# Load configuration from a .env file into the process environment. Later
# cells read PDF_DIR, CHUNK_SIZE and CHUNK_OVERLAP from the environment.
# NOTE(review): override=True presumably lets values from the .env file
# replace variables that are already set — confirm against python-dotenv docs.
load_dotenv(override=True)

True

In [3]:
# Build a Weaviate client for the instance reachable under the "weaviate"
# host name (HTTP on 8081, gRPC on 50051, both without TLS).
# Ports are given as integers to match the int parameters of
# ConnectionParams.from_params instead of relying on implicit str -> int
# coercion.
client_weaviate = weaviate.WeaviateClient(
    connection_params=ConnectionParams.from_params(
        http_host="weaviate",
        http_port=8081,
        http_secure=False,
        grpc_host="weaviate",
        grpc_port=50051,
        grpc_secure=False,
    ),
    additional_config=AdditionalConfig(
        timeout=Timeout(init=30, query=60, insert=120),  # Values in seconds
    ),
    skip_init_checks=False,
)

client_weaviate.connect()  # When directly instantiating, you need to connect manually

# Last expression of the cell: displays whether the server reports ready.
client_weaviate.is_ready()

True

In [4]:
# Recreate the "FCA" collection from scratch so that re-running the notebook
# does not accumulate duplicate chunks.
client_weaviate.collections.delete("FCA")
client_weaviate.collections.create(
    name="FCA",
    properties=[
        # Position of the chunk within its source document; metadata only.
        wc.Property(name="idx", data_type=wc.DataType.INT, skip_vectorization=True),
        # File name of the source PDF. The ingestion cell inserts it and the
        # query cell returns it, so it is declared explicitly rather than
        # relying on server-side auto-schema. Like idx it is metadata, so it
        # is excluded from the embedding.
        wc.Property(name="title", data_type=wc.DataType.TEXT, skip_vectorization=True),
        # The chunk text itself; this is what gets embedded.
        wc.Property(name="text", data_type=wc.DataType.TEXT),
    ],
    # Define the vectorizer module
    vectorizer_config=[
        wc.Configure.NamedVectors.text2vec_ollama(
            name="ollama_vectorizer",
            api_endpoint="http://ollama:11434",
            model="nomic-embed-text",
        )
    ],
)

<weaviate.collections.collection.sync.Collection at 0x7fe522a58610>

In [5]:
def extract_text_from_pdf(pdf_file_name: Path) -> str:
    """Read a PDF and return the text of all its pages as one string.

    Args:
        pdf_file_name: Location of the PDF file to read.

    Returns:
        The text extracted from every page, in page order, joined with a
        single space.
    """
    reader = pypdf.PdfReader(pdf_file_name)
    page_texts = [page.extract_text() for page in reader.pages]
    return " ".join(page_texts)

In [6]:
def chunk_text(input_text: str, chunk_size: int, chunk_overlap: int) -> list:
    """Split a text into chunks with a RecursiveCharacterTextSplitter.

    Args:
        input_text: The full text to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``; lengths are
            measured with ``len``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The list returned by the splitter's ``create_documents`` for the
        single input text.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        "is_separator_regex": False,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    documents = splitter.create_documents([input_text])
    return documents

In [7]:
def _require_env(name: str) -> str:
    """Return the value of environment variable ``name``.

    Raises:
        RuntimeError: If the variable is unset or empty, so the failure names
            the missing setting instead of surfacing as a TypeError later.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(f"Environment variable {name} is not set (check the .env file)")
    return value


root_dir = Path(_require_env("PDF_DIR"))
print(root_dir)
if not root_dir.is_dir():
    # Globbing a missing directory yields nothing and would report success.
    raise FileNotFoundError(f"PDF_DIR does not point to a directory: {root_dir}")

chunk_size = int(_require_env("CHUNK_SIZE"))
chunk_overlap = int(_require_env("CHUNK_OVERLAP"))

collection = client_weaviate.collections.get("FCA")

# Sorted so that repeated runs process the files in the same order.
pdf_paths = sorted(root_dir.glob("*.pdf"))
if not pdf_paths:
    print(f"No PDF files found in {root_dir}")

for pdf_file_name in pdf_paths:
    print(f"Processing {pdf_file_name.name}...")
    text = extract_text_from_pdf(pdf_file_name)
    chunked_text = chunk_text(text, chunk_size, chunk_overlap)
    # idx restarts at 0 for every PDF; together with title it identifies a chunk.
    for idx, chunk in enumerate(chunked_text):
        content = chunk.page_content
        collection.data.insert(
            {
                "idx": idx,
                "title": pdf_file_name.name,
                "text": content,
            },
        )
print("FINISHED - create embeddings")

/app/PDF
Processing FCA1327-26_ADS_SysAdmin.pdf...
FINISHED - create embeddings


In [8]:
# Fetch the collection handle again and report how many objects it holds.
collection = client_weaviate.collections.get("FCA")
object_count = len(collection)
print(f"Inserted {object_count} embeddings")

Inserted 1971 embeddings


In [9]:
def get_near_vectors(collection_name: str, query_text: str, limit: int):
    """Run a near-text search against a Weaviate collection.

    Args:
        collection_name: Name of the collection to search.
        query_text: Text passed to the search as ``query``.
        limit: Passed to the search as ``limit``.

    Returns:
        The result of ``collection.query.near_text``. The call requests the
        ``idx``, ``text`` and ``title`` properties plus distance metadata.
    """
    target = client_weaviate.collections.get(collection_name)
    distance_metadata = wq.MetadataQuery(distance=True)
    wanted_properties = ["idx", "text", "title"]
    return target.query.near_text(
        query=query_text,
        limit=limit,
        return_metadata=distance_metadata,
        return_properties=wanted_properties,
    )

In [10]:
# Retrieve the 4 closest chunks for the search term and reduce each hit to
# the title/snippet pair that is later interpolated into the prompt.
near_vectors = get_near_vectors("FCA", "PermissionModelActive", 4)
context = [
    {
        "title": str(hit.properties["title"]),
        "snippet": hit.properties["text"],
    }
    for hit in near_vectors.objects
]

In [11]:
# The user question; it is interpolated into the prompt together with the
# retrieved context snippets.
query = "Explain the setting PermissionModelActive. List the title of the documents."

# Prompt template: task description, style guide, then the retrieved
# documents and the question itself.
prompt = f"""
## Task & Context
You help people answer their questions and other requests interactively. You will
be asked a very wide array of requests on all kinds of topics. You will be
equipped with a wide range of search engines or similar tools to help you,
together with snippets from documents. You will use these to research your answer.
You should focus on serving the user's needs as best you can.

## Style Guide
Unless the user asks for a different style of answer, you should answer in
full sentences, using proper grammar and spelling.

## Use these documents: {context}. Respond to this prompt: {query}. Cite the 
title of the documents you used for the answer.
"""

# NOTE(review): the module-level ollama.generate is used without an explicit
# host; presumably the server address comes from the library default or the
# environment — confirm it points at the intended Ollama instance.
output = ollama.generate(model="mistral", prompt=prompt)

print(output["response"])

 The `PermissionModelActive` setting is not explicitly mentioned in the provided documents. However, I can explain its possible context based on the snippets you've provided.

   In the Adobe Campaign Automation (ADS) system, permissions are managed to control user access to various resources. The command `entitlement_enable_all` is used to grant certain permissions for specified users across all servers. This suggests that settings like `PermissionModelActive` might be related to enabling or disabling permission models, allowing or denying specific actions on the ADM database.

   I've used the following documents for my answer:
   1. FCA1327-26_ADS_SysAdmin.pdf

   While I could not find the exact setting `PermissionModelActive` in these documents, I hope this explanation helps you better understand its possible context. For more precise information, I recommend checking the official documentation or contacting Adobe support directly.
