In [3]:
import os
import openai
import sys
# Add the directory two levels up to the module search path.
# NOTE(review): no visible cell imports anything from there — confirm it is still needed.
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

# Raises KeyError at this line if OPENAI_API_KEY is not present in the environment.
openai.api_key  = os.environ['OPENAI_API_KEY']

In [10]:
from langchain.document_loaders import PyPDFLoader

# Load each PDF and flatten everything the loaders return into one list.
loaders = [PyPDFLoader("docs/Collibra-Output-Module-2022.06.pdf")]
docs = [document for pdf_loader in loaders for document in pdf_loader.load()]

In [11]:
# Split
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Chunking parameters, named so they are easy to find and tune.
CHUNK_SIZE = 1500
CHUNK_OVERLAP = 150

text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

In [12]:
# Split the loaded documents into chunks with the splitter configured earlier.
splits = text_splitter.split_documents(docs)

In [13]:
# Number of chunks produced by the split.
len(splits)

114

In [22]:
# Spot-check a single chunk. The index is hard-coded, so this raises IndexError
# if the split yields fewer than 101 chunks.
print(splits[100])

page_content='linkType Thisparameter allowsspecifying thepaththatshouldbe\nfollowed fromtheparentresource toauser.Whenthe\nparentresource isresponsibility orgroup,linkType is\nnotusedandtherelationship definedforresponsibility or\ngroupisused.Whenauseristheparentnode,linkType\ndetermines therelationship withthechildresources that\nhaveacreatedorlastmodified kindofrelationship. See\nrelations above.Thepossible valuesareCREATED,\nMODIFIED, "CREATED_ OR_MODIFIED orCREATED\nORMODIFIED. CREATED_ OR_MODIFIED isthedefault\nvalue,butcanonlybeusedwhenUserisrootofthequery\ntree.\xa0CREATED_ OR_MODIFIED turnsintoasimple\nCREATED whenUserisnottherootofthequery.\nEmail\nExtends Resource\nEmailrepresents oneoftheuser\'semailaddresses. Itcanonlybeusedasachildofthe\nuserser.\nProperties\nemailAddress Text Theemailaddress.\nPhone\nExtends Resource\nPhonerepresents oneoftheuser\'sphonenumbers. Itcanonlybeusedasachildofthe\nuser.\nPropertiesChapter 6\n95' metadata={'source': 'docs/Collibra-Output-Module-

In [19]:
from langchain.embeddings.openai import OpenAIEmbeddings
# Embedding object handed to the Chroma vector store when it is built.
embedding = OpenAIEmbeddings()

In [20]:
from langchain.vectorstores import Chroma

In [21]:
# Directory passed to Chroma as its persist_directory.
persist_directory = 'docs/chroma/'

In [23]:
# Remove old database files, if any, before the vector store is rebuilt.
# shutil.rmtree replaces the shell-only `rm -rf`, so the cell does not depend on
# a POSIX shell, and it reuses persist_directory instead of repeating the path.
# ignore_errors=True keeps the "-f" behaviour: a missing directory is not an error.
import shutil

shutil.rmtree(persist_directory, ignore_errors=True)

In [26]:
# Build a Chroma vector store from the chunks, using the embedding object created
# earlier and persist_directory as the on-disk location.
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=persist_directory
)

In [27]:
# Sanity check: number of entries in the underlying collection (reads the
# private _collection attribute); the recorded output equals len(splits).
print(vectordb._collection.count())

114


In [32]:
# Query text used for the similarity search that follows.
question = "What is the output module in Collibra?"

In [33]:
# Fetch the 3 stored chunks most similar to the question.
# NOTE(review): this rebinds `docs`, which until now held the documents loaded
# from the PDF; re-running the split cell after this one would split these
# search hits instead. Consider a distinct name.
docs = vectordb.similarity_search(question,k=3)

In [34]:
# Number of hits returned; matches the k requested in the search.
len(docs)

3

In [35]:
# Text of the first search hit.
docs[0].page_content

'iTheHitchhiker’s GuidetotheOutputModule\nRevision: 02Jun2022\nYoucanfindthemostup-to-datetechnical documentation onourDeveloper portalat\nhttps://developer.collibra.com/rest/output- module/\n©2022Collibra.AllRightsReserved.'

In [37]:
# Dump all hits, including their metadata, in raw form.
print(docs)

[Document(page_content='iTheHitchhiker’s GuidetotheOutputModule\nRevision: 02Jun2022\nYoucanfindthemostup-to-datetechnical documentation onourDeveloper portalat\nhttps://developer.collibra.com/rest/output- module/\n©2022Collibra.AllRightsReserved.', metadata={'page': 1, 'source': 'docs/Collibra-Output-Module-2022.06.pdf'}), Document(page_content='2Introduction\nTheOutputModuleisalightweight graphqueryengineexposed throughthepublicAPI.It\nallowsdifferent outputformats, suchasJSON,XML,Excel,andCSV.Italsoprovides a\nsingleAPItoquerymostoftheCollibraentities,suchasassets,communities, domains\nandtypes,usingSQL-likefilteringcapabilities. Youcansortentitiesusinganyofthe\navailable properties andpageresultsandviewpermissions forauthenticated userswho\nissueRESTcalls.Chapter 2', metadata={'page': 7, 'source': 'docs/Collibra-Output-Module-2022.06.pdf'}), Document(page_content='3Prerequisites\nBeforeyoubeginusingthequerylanguage usedintheOutputModule, youmust\nunderstand theCollibraAPImodelandho

In [38]:
# Explicitly persist the vector store.
# NOTE(review): presumably writes to persist_directory — confirm against the installed Chroma version.
vectordb.persist()

In [46]:
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.llms import OpenAI

In [43]:
# Reopen the vector store from persist_directory with a newly constructed
# embedding object.
embedding = OpenAIEmbeddings()
vectordb = Chroma(embedding_function=embedding, persist_directory=persist_directory)

In [44]:
def pretty_print_docs(docs):
    """Print each document's text under a numbered heading.

    Headings are numbered from 1 and consecutive documents are separated by a
    line of 100 dashes.

    Args:
        docs: Iterable of objects exposing a string ``page_content`` attribute.
    """
    divider = "\n" + "-" * 100 + "\n"
    sections = []
    for number, doc in enumerate(docs, start=1):
        sections.append(f"Document {number}:\n\n" + doc.page_content)
    print(divider.join(sections))


In [47]:
# Completion LLM with temperature set to 0, used to build the extractor.
llm = OpenAI(temperature=0)
# Compressor built from that LLM; passed as base_compressor to the retriever.
compressor = LLMChainExtractor.from_llm(llm)

In [53]:
# The vector store's retriever, configured with the "mmr" search type, is
# combined with the LLM-based compressor in one retriever object.
mmr_retriever = vectordb.as_retriever(search_type="mmr")
compression_retriever = ContextualCompressionRetriever(
    base_retriever=mmr_retriever,
    base_compressor=compressor,
)

In [54]:
# Ask the compression retriever for documents relevant to the question and
# print whatever it returns with the helper defined earlier.
question = "What is the output module?"
compressed_docs = compression_retriever.get_relevant_documents(question)
pretty_print_docs(compressed_docs)

Document 1:

OutputModule
----------------------------------------------------------------------------------------------------
Document 2:

"You can find the most up-to-date technical documentation on our Developer portal at https://developer.collibra.com/rest/output-module/"
----------------------------------------------------------------------------------------------------
Document 3:

"The Output Module is a lightweight graph query engine exposed through the public API."


In [55]:
import datetime

# Choose the chat model name by today's date: "gpt-3.5-turbo-0301" before
# 2 September 2023, "gpt-3.5-turbo" from that date onwards.
current_date = datetime.date.today()
llm_name = "gpt-3.5-turbo-0301" if current_date < datetime.date(2023, 9, 2) else "gpt-3.5-turbo"
print(llm_name)

gpt-3.5-turbo-0301


In [56]:
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
# Reopen the persisted vector store for the question-answering section.
persist_directory = 'docs/chroma/'
embedding = OpenAIEmbeddings()
vectordb = Chroma(
    embedding_function=embedding,
    persist_directory=persist_directory,
)

In [57]:
from langchain.chat_models import ChatOpenAI
# Chat model using the name selected by the date check, with temperature 0.
# NOTE(review): rebinds `llm`, which previously held the OpenAI completion model
# used to build the compressor.
llm = ChatOpenAI(model_name=llm_name, temperature=0)

In [58]:
from langchain.chains import RetrievalQA

In [62]:
# Question sent to the retrieval QA chain.
question = "Create an output query that can find the communities with the name JP test"

In [63]:
# Build a RetrievalQA chain from the chat model and the vector store's
# default retriever.
default_retriever = vectordb.as_retriever()
qa_chain = RetrievalQA.from_chain_type(llm, retriever=default_retriever)

In [64]:
# Run the chain; the question is supplied under the "query" key.
result = qa_chain({"query": question})

In [65]:
# Show the chain's answer, stored under the "result" key.
result["result"]

'Here\'s an example of an output query that can find the communities with the name "JP test":\n\nJSON\n{\n  "ViewConfig": {\n    "Columns": [\n      {\n        "name": "Community ID",\n        "property": "communityId"\n      },\n      {\n        "name": "Community Name",\n        "property": "communityName"\n      }\n    ],\n    "Resources": {\n      "Community": {\n        "name": "Communities",\n        "filters": [\n          {\n            "property": "communityName",\n            "operator": "equals",\n            "value": "JP test"\n          }\n        ],\n        "Id": {\n          "name": "communityId"\n        },\n        "Name": {\n          "name": "communityName"\n        }\n      }\n    }\n  }\n}\n\nThis query will return a table with the Community ID and Community Name columns for all communities with the name "JP test". You can use this query with the following endpoints:\n\n- {{domain}}/rest/2.0/outputModule/export/json\n- {{domain}}/rest/2.0/outputModule/export/csv\n