In [31]:
import os
from pathlib import Path
import ollama
import pypdf
import weaviate
from weaviate.connect import ConnectionParams
from weaviate.classes.init import AdditionalConfig, Timeout, Auth
import weaviate.classes.config as wc
import weaviate.classes.query as wq
from dotenv import load_dotenv
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [32]:
load_dotenv()

True

In [33]:
client_weaviate = weaviate.WeaviateClient(
    connection_params=ConnectionParams.from_params(
        http_host="weaviate",
        http_port="8081",
        http_secure=False,
        grpc_host="weaviate",
        grpc_port="50051",
        grpc_secure=False,
    ),
    additional_config=AdditionalConfig(
        timeout=Timeout(init=30, query=60, insert=120),  # Values in seconds
    ),
    skip_init_checks=False
)

client_weaviate.connect()  # When directly instantiating, you need to connect manually

client_weaviate.is_ready()

True

In [34]:
client_weaviate.collections.delete("FCA")
client_weaviate.collections.create(
    name="FCA",
    properties=[
        wc.Property(name="idx", data_type=wc.DataType.INT, skip_vectorization=True),
        wc.Property(name="text", data_type=wc.DataType.TEXT),
    ],
    # Define the vectorizer module
    vectorizer_config=[
        wc.Configure.NamedVectors.text2vec_ollama(
            name="ollama_vectorizer",
            api_endpoint="http://ollama:11434",
            model="nomic-embed-text",
        )
    ],
)

<weaviate.collections.collection.sync.Collection at 0x7f6af02754c0>

In [35]:
def extract_text_from_pdf(pdf_file_name: Path) -> str:
    pdf_file = pypdf.PdfReader(pdf_file_name)
    return " ".join((page.extract_text() for page in pdf_file.pages))

In [36]:
def chunk_text(input_text: str, chunk_size: int, chunk_overlap: int) -> list:
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.create_documents([input_text])

In [37]:
root_dir = Path(os.getenv("PDF_DIR"))
chunk_size = int(os.getenv("CHUNK_SIZE"))
chunk_overlap = int(os.getenv("CHUNK_OVERLAP"))

collection = client_weaviate.collections.get("FCA")

for pdf_file_name in root_dir.glob("*.pdf"):
    print(f"Processing {pdf_file_name.name}...")
    text = extract_text_from_pdf(pdf_file_name)
    chunked_text = chunk_text(text, chunk_size, chunk_overlap)
    for idx, chunk in enumerate(chunked_text):
        content = chunk.page_content
        collection.data.insert(
            {
                "idx": idx,
                "title": pdf_file_name.name,
                "text": content,
            },
        )
print("FINISHED - create embeddings")

FINISHED - create embeddings


In [38]:
collection = client_weaviate.collections.get("FCA")
print(f"Inserted {len(collection)} embeddings")

Inserted 0 embeddings


In [39]:
def get_near_vectors(collection_name: str, query_text: str, limit: int):
    collection = client_weaviate.collections.get(collection_name)
    return collection.query.near_text(
        query=query_text,
        limit=limit,
        return_metadata=wq.MetadataQuery(distance=True),
        return_properties=["idx", "text", "title", ],
    )

In [40]:
near_vectors = get_near_vectors("FCA", "PermissionModelActive", 4)
context = [
    {"title": str(vector.properties["title"]), "snippet": vector.properties["text"]}
    for vector in near_vectors.objects
]

WeaviateQueryError: Query call with protocol GRPC search failed with message <AioRpcError of RPC that terminated with:
	status = StatusCode.UNKNOWN
	details = "no such prop with name 'title' found in class 'FCA' in the schema. Check your schema files for which properties in this class are available"
	debug_error_string = "UNKNOWN:Error received from peer  {created_time:"2024-08-09T13:29:13.090576172+00:00", grpc_status:2, grpc_message:"no such prop with name \'title\' found in class \'FCA\' in the schema. Check your schema files for which properties in this class are available"}"
>.

In [12]:
query = "Explain the setting PermissionModelActive. List the title of the documents."

prompt = f"""
## Task & Context
You help people answer their questions and other requests interactively. You will
be asked a very wide array of requests on all kinds of topics. You will be
equipped with a wide range of search engines or similar tools to help you,
together with snippets from documents. You will use these to research your answer.
You should focus on serving the user's needs as best you can.

## Style Guide
Unless the user asks for a different style of answer, you should answer in
full sentences, using proper grammar and spelling.

## Use these documents: {context}. Respond to this prompt: {query}. Cite the 
title of the documents you used for the answer.
"""

output = ollama.generate(
  model="llama2",
  prompt=prompt,
)

print(output['response'])

Explaining the Setting PermissionModelActive:
PermissionModelActive is a configuration setting in Active Directory Management Services (ADMS) that controls which permission model must be initialized when starting ADMS. This setting is important because it determines whether ADS or AMS will be used for user permissions. The two models serve different purposes, with ADS providing more detailed control over user permissions and AMS providing a simpler, more intuitive interface.

List of Documents:

1. FCA1190-18.pdf
2. FCA1190-18.pdf
3. FCA1190-18.pdf

Cited Documents:
The answers provided in this prompt were based on the following documents:

* FCA1190-18.pdf: This document provides detailed information on the AMS and ADS models, including their differences and how they are used in ADMS. It also explains the PermissionModelActive setting and its impact on ADMS initialization.
* FCA1190-18.pdf: This document provides additional information on the PermissionModelActive setting, including h