In [1]:
import os
from pathlib import Path

import cohere
import pypdf
import weaviate
import weaviate.classes.config as wc
import weaviate.classes.query as wq
from dotenv import load_dotenv
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Load settings from a local .env file (python-dotenv) into the process
# environment; later cells read PDF_DIR, CHUNK_SIZE, CHUNK_OVERLAP and
# COHERE_API_KEY from the environment.
load_dotenv()

True

In [3]:
# Extra HTTP headers passed to the Weaviate connection; empty by default.
headers = {}
# In case the cohere vectorizer is used - watch out for limits
# headers.update({"X-Cohere-Api-Key": os.getenv("COHERE_API_KEY")})
# Connect to the locally running Weaviate instance.
client_weaviate = weaviate.connect_to_local(headers=headers)
# Last expression of the cell: its result is shown as the cell output.
client_weaviate.is_ready()

True

In [4]:
# (Re)create the "FCA" collection from scratch so that re-running the notebook
# does not pile new objects on top of the ones from a previous run.
client_weaviate.collections.delete("FCA")
client_weaviate.collections.create(
    name="FCA",
    properties=[
        # Position of the chunk inside its source document (metadata only).
        wc.Property(name="idx", data_type=wc.DataType.INT, skip_vectorization=True),
        # File name of the source PDF. The ingestion cell writes this property
        # and the query helper reads it back, so it is declared explicitly
        # instead of relying on Weaviate's auto-schema to add it on first insert.
        wc.Property(name="title", data_type=wc.DataType.TEXT),
        # The chunk text itself.
        wc.Property(name="text", data_type=wc.DataType.TEXT),
    ],
    # Define the vectorizer module.
    # Embeddings are computed in the notebook with sentence-transformers and
    # passed in together with each object, so no server-side vectorizer is used.
    vectorizer_config=wc.Configure.Vectorizer.none(),
    # Server-side alternatives:
    # vectorizer_config=wc.Configure.Vectorizer.text2vec_cohere(),
    # vectorizer_config=wc.Configure.Vectorizer.text2vec_huggingface(model="sentence-transformers/all-MiniLM-L6-v2"),
    # Define the generative module
    generative_config=wc.Configure.Generative.cohere(),
)

<weaviate.collections.collection.Collection at 0x7f51c0410390>

In [5]:
def extract_text_from_pdf(pdf_file_name: Path) -> str:
    """Return the text of a PDF as one string.

    Args:
        pdf_file_name: Location of the PDF file to read.

    Returns:
        The extracted text of every page, in page order, joined by a single
        space.
    """
    reader = pypdf.PdfReader(pdf_file_name)
    page_texts = [page.extract_text() for page in reader.pages]
    return " ".join(page_texts)

In [6]:
def chunk_text(input_text: str, chunk_size: int, chunk_overlap: int) -> list:
    """Split a text into overlapping chunks.

    Args:
        input_text: The text to split.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Overlap between consecutive chunks, measured with ``len``.

    Returns:
        The documents produced by ``RecursiveCharacterTextSplitter`` for the
        given text.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        "is_separator_regex": False,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    documents = splitter.create_documents([input_text])
    return documents

In [7]:
# Use sentence-transformer which does the computation locally
#
# Ingest every PDF in PDF_DIR: extract its text, split it into chunks, embed
# the chunks and store each chunk together with its vector in the "FCA"
# collection.

# os.environ[...] makes a missing setting fail with a KeyError that names the
# variable, instead of an opaque TypeError from Path(None) / int(None).
root_dir = Path(os.environ["PDF_DIR"])
chunk_size = int(os.environ["CHUNK_SIZE"])
chunk_overlap = int(os.environ["CHUNK_OVERLAP"])

model = SentenceTransformer("all-MiniLM-L6-v2")
collection = client_weaviate.collections.get("FCA")

# sorted(): glob order depends on the file system; sorting keeps runs reproducible.
for pdf_file_name in sorted(root_dir.glob("*.pdf")):
    print(f"Processing {pdf_file_name.name}...")
    text = extract_text_from_pdf(pdf_file_name)
    chunked_text = chunk_text(text, chunk_size, chunk_overlap)
    contents = [chunk.page_content for chunk in chunked_text]
    if not contents:
        # Nothing extractable in this file (e.g. a scan without a text layer).
        continue

    # One encode call per document instead of one per chunk.
    embeddings = model.encode(contents)

    # Batched import instead of one request per chunk.
    with collection.batch.dynamic() as batch:
        for idx, (content, embedding) in enumerate(zip(contents, embeddings)):
            batch.add_object(
                properties={
                    "idx": idx,
                    "title": pdf_file_name.name,
                    "text": content,
                },
                # Plain list of floats, same form as the query helper sends.
                vector=embedding.tolist(),
            )

    # Batch imports collect errors instead of raising; surface them here so a
    # partially imported document does not go unnoticed.
    failed_objects = collection.batch.failed_objects
    if failed_objects:
        raise RuntimeError(
            f"{len(failed_objects)} chunk(s) of {pdf_file_name.name} could not be "
            f"imported; first error: {failed_objects[0].message}"
        )
print("FINISHED - create embeddings")



Processing FCA1189-04.pdf...
Processing FCA1190-18.pdf...
Processing FCA1192-57.pdf...
Processing FCA1312-08.pdf...
Processing FCA1319-16.pdf...
Processing FCA1382-06.pdf...
Processing FCA1537-03.pdf...
Processing FCA1725-25.pdf...
Processing FCA3947-13.pdf...
Processing FCA4431-27.pdf...
Processing FCA4520-50.pdf...
Processing FCA4853-07.pdf...
Processing FCA4854-11.pdf...
Processing Overview Introducing AMS Components (FCA1318).pdf...
FINISHED - create embeddings


In [8]:
# Fetch the "FCA" collection handle again and report the value of
# len(collection) as the number of inserted embeddings.
collection = client_weaviate.collections.get("FCA")
print(f"Inserted {len(collection)} embeddings")

Inserted 11685 embeddings


In [9]:
def get_near_vectors(collection_name: str, query_text: str, limit: int):
    """Run a near-vector search for a query text on a collection.

    The query text is embedded with the notebook-level ``model``; the
    collection is looked up through the notebook-level ``client_weaviate``.

    Args:
        collection_name: Name of the collection to search.
        query_text: Text that is embedded and used as the search vector.
        limit: Maximum number of results requested.

    Returns:
        The result of ``collection.query.near_vector``, requested with the
        ``idx``, ``text`` and ``title`` properties and the distance metadata.
    """
    target = client_weaviate.collections.get(collection_name)
    query_vector = model.encode(query_text).tolist()
    wanted_properties = ["idx", "text", "title"]
    distance_metadata = wq.MetadataQuery(distance=True)
    return target.query.near_vector(
        near_vector=query_vector,
        limit=limit,
        return_metadata=distance_metadata,
        return_properties=wanted_properties,
    )

In [26]:
# System preamble passed to the Cohere chat call: describes the assistant's
# task and the expected answer style.
preamble = """
## Task & Context
You help people answer their questions and other requests interactively. You will
be asked a very wide array of requests on all kinds of topics. You will be
equipped with a wide range of search engines or similar tools to help you,
which you use to research your answer. You should focus on serving the user's
needs as best you can, which will be wide-ranging.

## Style Guide
Unless the user asks for a different style of answer, you should answer in
full sentences, using proper grammar and spelling.
"""

# Question sent to the chat model.
query = "Explain the setting PermissionModelActive. List the title of the documents."

# Retrieve the 4 nearest stored chunks for the search term and reshape each
# result into the title/snippet dict handed to the chat call as a document.
near_vectors = get_near_vectors("FCA", "PermissionModelActive", 4)
context = [
    {
        "title": str(hit.properties["title"]),
        "snippet": hit.properties["text"],
    }
    for hit in near_vectors.objects
]

# Ask the chat model, passing the retrieved chunks as documents.
client_cohere = cohere.Client(os.getenv("COHERE_API_KEY"))
chat_response = client_cohere.chat(
    model="command-r",
    preamble=preamble,
    message=query,
    documents=context,
    temperature=0.3,
)

print("Final answer:")
print(chat_response.text)

Final answer:
The setting PermissionModelActive is mentioned in the following documents:
- FCA1190-18.pdf
- FCA1725-25.pdf

FCA1190-18.pdf explains that PermissionModelActive remains static and cannot be altered at runtime. It also provides information on how the PermissionModelActive flag interacts with the PermissionModel flag: the combination of the two flags' settings has some forbidden outcomes, which are illustrated in a table. Meanwhile, FCA1725-25.pdf indicates that the setting is useful when some users adopt the AMS model and others use the ADS model. It also states that the PermissionModelActive value affects the PrimaryDomain value:
- If PermissionModelActive is 0 or 2, the PrimaryDomain value must be ignored and left empty.
- If PermissionModelActive is 1 or 3, the PrimaryDomain value must be defined.
