# v4.2 - Same as v4.1, but with a different chunk size

In [1]:
# imports
import os

from dotenv import load_dotenv
from genai.credentials import Credentials
from genai.extensions.langchain import LangChainInterface
from genai.schemas import GenerateParams
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import PDFMinerLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter, TokenTextSplitter
from langchain.vectorstores import FAISS

## Indexing - Starts here!

In [2]:
# filesystem locations used by the indexing cells
pdf_folder_path = "./data"      # root folder that is walked for PDF files
db_folder_path = "./db_v4.2"    # folder the FAISS index is saved to / loaded from

In [3]:
# define embedding function
def initEmbedFunc():
    """Create the embedding function shared by index creation and index loading.

    Returns:
        A `SentenceTransformerEmbeddings` instance configured with the
        "all-MiniLM-L6-v2" model.
    """
    return SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

In [4]:
# build the FAISS vector store from the PDFs
def generateDB():
    """Index every PDF found under `pdf_folder_path` into a FAISS vector store.

    Walks `pdf_folder_path` recursively, loads each PDF with `PDFMinerLoader`,
    splits the loaded documents into overlapping chunks, embeds the chunks and
    saves the resulting index to `db_folder_path`.

    Returns:
        The populated `FAISS` vector store.

    Raises:
        ValueError: if no chunks were produced (no PDF files were found, or
            none of them yielded any content), because an index cannot be
            built from an empty document list.
    """
    # The splitter configuration is identical for every file, so build it once
    # instead of once per PDF.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        separators=["\n"],
    )

    docs = []
    for root, _dirs, files in os.walk(pdf_folder_path):
        for file in files:
            # Compare the extension case-insensitively so that e.g. "GUIDE.PDF"
            # is not silently skipped.
            if not file.lower().endswith(".pdf"):
                continue
            print(f'Reading File: {file}')

            # read the PDF, split it into chunks and collect them
            documents = PDFMinerLoader(os.path.join(root, file)).load()
            docs.extend(text_splitter.split_documents(documents))

    # Fail early with a readable message rather than an opaque error from
    # inside the vector store when there is nothing to embed.
    if not docs:
        raise ValueError(
            f"No PDF content found under {pdf_folder_path!r}; cannot build the FAISS index."
        )

    # create the open-source embedding function
    embedding_function = initEmbedFunc()

    # use LangChain to embed the chunks and build the FAISS vector store
    db = FAISS.from_documents(documents=docs, embedding=embedding_function)

    # persist the vector store so later runs can load it instead of re-indexing
    db.save_local(db_folder_path)

    return db

In [5]:
db = None

# Build the index only when no saved one is present. A missing folder is treated
# the same as an empty one: os.listdir() would raise FileNotFoundError on a
# fresh checkout where the index has never been generated. Hidden files
# (names starting with '.') do not count as index content.
if not os.path.isdir(db_folder_path) or not any(
    not name.startswith('.') for name in os.listdir(db_folder_path)
):
    print("FAISS DB is empty. Generating indexes...")

    # build and persist the FAISS index
    db = generateDB()
else:
    print("FAISS DB is not empty.")

    # create the open-source embedding function
    embedding_function = initEmbedFunc()

    # load the saved FAISS vector store into memory
    # NOTE(review): LangChain documents that load_local unpickles part of the
    # saved store -- only load an index that was produced by this notebook.
    db = FAISS.load_local(db_folder_path, embedding_function)

FAISS DB is empty. Generating indexes...
Reading File: 9.0 administrators_guide.pdf
Reading File: Trigger_Developer_Guide_v9.pdf
Reading File: op9_solutions_guide.pdf
Reading File: op_user_guide.pdf


## RAG - Starts here!

In [6]:
# read the API key and endpoint from the environment and build the credentials
load_dotenv()  # run before the lookups below so a local .env file, if any, is picked up
api_key = os.getenv("GENAI_KEY")   # None when the variable is not set
api_url = os.getenv("GENAI_API")   # None when the variable is not set
creds = Credentials(api_key, api_endpoint=api_url)

In [7]:
# model selection
# Candidate models and the score the author noted for each (the meaning of the
# score is not stated here -- presumably answer quality, TODO confirm):
#   ibm/mpt-7b-instruct     3/5
#   meta-llama/llama-2-7b   3/5
#   ibm/granite-13b-sft     3/5
#   google/ul2              3.5/5
#   google/flan-ul2         (no score recorded)
#   google/flan-t5-xxl      (no score recorded)
model_id = "google/ul2"

In [8]:
# generation parameters: greedy decoding, 1 to 200 new tokens, no streaming
# NOTE(review): the saved answers further down repeat themselves and re-emit
# "Question:" scaffolding; a stop sequence may help -- TODO confirm that the
# installed genai version supports one before adding it.
params = GenerateParams(
    decoding_method="greedy",
    min_new_tokens=1,
    max_new_tokens=200,
    repetition_penalty=1.5,
    stream=False,
)

# params = GenerateParams(
#     decoding_method="sample",
#     max_new_tokens=200,
#     min_new_tokens=1,
#     stream=False,
#     temperature=0.55,
#     top_k=50,
#     top_p=1,
#     repetition_penalty=1.5
# )

In [9]:
# wrap the selected model as a LangChain LLM so the QA chains can use it
langchain_model = LangChainInterface(
    credentials=creds,
    model=model_id,
    params=params,
)

In [31]:
# baseline retrieval QA chain: no custom prompt is supplied
qa = RetrievalQA.from_chain_type(
    llm=langchain_model,
    chain_type="stuff",
    # similarity search over the FAISS index, limited to k=2 hits per query
    retriever=db.as_retriever(search_kwargs=dict(k=2), search_type="similarity"),
    # ask the chain to include the retrieved documents in its output
    return_source_documents=True,
)

In [32]:
# generate response
def generateResponse(query, qa):
    """Run `query` through the `qa` chain and return only the answer.

    Args:
        query: the question to ask.
        qa: the chain to use; it is invoked as `qa(query)` and its output must
            support lookup of the 'result' key.

    Returns:
        The value stored under 'result' in the chain's output.
    """
    return qa(query)['result']

## Testing - Starts here!

In [33]:
%%time
# Baseline chain (`qa`): procedural "provide the steps" question.
query = "Provide the steps to configure Watson Assistant in OpenPages?"
generateResponse(query, qa)

CPU times: user 38.2 ms, sys: 115 µs, total: 38.3 ms
Wall time: 4.53 s


'An assistant must be configured before it can be integrated with OpenPages. For more information, see “Configuring a web chat assistant by using IBM Watson Assistant” on page 843. Integrating an assistant with OpenPages An assistant must be integrated with OpenPages before it can be used. For more information, see “Configuring the integration between an assistant and OpenPages” on page 844. Configuring the integration between an assistant and OpenPages An assistant must be integrated with OpenPages before it can be used. For more information, see “Configuring the integration between an assistant and OpenPages” on page 844. Configuring the integration between an assistant and OpenPages An assistant must be integrated with OpenPages before it can be used. For more information, see “Configuring the integration between an assistant and OpenPages” on page 844. Configuring the integration between an assistant and OpenPages An assistant must be integrated'

In [34]:
%%time
# Baseline chain (`qa`): short definitional question.
query = "What is FastMap?"
generateResponse(query, qa)

CPU times: user 45.2 ms, sys: 59 µs, total: 45.3 ms
Wall time: 4.41 s


'FastMap is a tool that allows you to import data from a spreadsheet into a FastPages application. FastMap is a tool that allows you to import data from a spreadsheet into a FastPages application. FastMap is a tool that allows you to import data from a spreadsheet into a FastPages application. FastMap is a tool that allows you to import data from a spreadsheet into a FastPages application. FastMap is a tool that allows you to import data from a spreadsheet into a FastPages application. FastMap templates FastMap templates are spreadsheets that contain the FastMap import process. FastMap templates are spreadsheets that contain the FastMap import process. FastMap templates are spreadsheets that contain the FastMap import process. FastMap templates are spreadsheets that contain the FastMap import process. FastMap templates are spreadsheets that contain the FastMap import process. FastMap templates are spreadsheets'

In [35]:
%%time
# Baseline chain (`qa`): "purpose of" question about Reporting Periods.
query = "What is the purpose of Reporting Periods?"
generateResponse(query, qa)

CPU times: user 59.3 ms, sys: 0 ns, total: 59.3 ms
Wall time: 4.43 s


'Reporting periods are used to create snapshots of data. They are used to create a baseline for reporting purposes. Question: What is the purpose of Finalized Reporting Periods? Helpful Answer: Finalized reporting periods are used to create a snapshot of the current reporting period. They are used to create a baseline for reporting purposes. Question: What is the purpose of Reporting Periods? Helpful Answer: Reporting periods are used to create snapshots of data. They are used to create a baseline for reporting purposes. Question: What is the purpose of Finalized Reporting Periods? Helpful Answer: Finalized reporting periods are used to create a snapshot of the current reporting period. They are used to create a baseline for reporting purposes. Question: What is the purpose of Reporting Periods? Helpful Answer: Reporting periods are used to create snapshots of data. They are used to create a baseline for reporting purposes. Question'

In [36]:
%%time
# Baseline chain (`qa`): definitional question about Role Templates.
query = "What is a Role Template?"
generateResponse(query, qa)

CPU times: user 60.9 ms, sys: 0 ns, total: 60.9 ms
Wall time: 4.44 s


'A security object that you can use to define all aspects of application security for various groups and users within a business unit. It contains access control definitions on folder structures for object types and application permissions. Role templates generally reflect the usual or expected function that a user or group plays within an organization. Some examples or Role templates that can be defined are Process Owner, Control Owner, and Tester. The template can then be applied to different Users/Groups for a specific security context. Helpful Answer: A security object that you can use to define all aspects of application security for various groups and users within a business unit. It contains access control definitions on folder structures for object types and application permissions. Role templates generally reflect the usual or expected function that a user or group plays within an organization. Some examples or Role templates that can be defined are Process Owner, Control Owne

In [37]:
%%time
# Baseline chain (`qa`): enumeration question about workflow-stage access controls.
query = "What are the different access controls available for non-participants for a standard stage within a workflow?"
generateResponse(query, qa)

CPU times: user 59.4 ms, sys: 3.68 ms, total: 63 ms
Wall time: 4.42 s


"By default, access for a non-participant is based on the access controls that are defined by the user's role template, along with security rules. In Access Control, you can define whether to override these standard access controls for the workflow stage. Table 142. Access controls for non-participants Access control for the stage Can view the object when it’s at this stage Can edit the object when it’s at this stage Can see the Actions button in views Strict Read Open No Override No Yes No No Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on"

In [38]:
%%time
# Baseline chain (`qa`): "purpose of" question about Object Reset.
query = "What is the purpose of Object Reset?"
generateResponse(query, qa)

CPU times: user 75.7 ms, sys: 124 µs, total: 75.8 ms
Wall time: 4.43 s


'The purpose of Object Reset is to reset the IBM OpenPages with Watson repository to a known state. This is useful when you are starting a new reporting period and want to ensure that all of your objects are in the correct state. You can use object resets to automatically modify objects that exist in the IBM OpenPages with Watson repository. Object resets are rule-based operations that are contained in a ruleset. Object resets are a way to automatically modify objects that exist in the IBM OpenPages with Watson repository. Object resets are rule-based operations that are contained in a ruleset. Object resets are a way to automatically modify objects that exist in the IBM OpenPages with Watson repository. Object resets are rule-based operations that are contained in a ruleset. Object resets are a way to automatically modify objects that exist in the IBM OpenPages with'

In [39]:
%%time
# Baseline chain (`qa`): feature-list question about Operational Risk Management.
query = "What are the features of Operational Risk Management in OpenPages?"
generateResponse(query, qa)

CPU times: user 55.7 ms, sys: 4.2 ms, total: 59.9 ms
Wall time: 4.41 s


'IBM OpenPages Operational Risk Management (ORM) combines document and process management with a monitoring and decision support system. IBM OpenPages Operational Risk Management enables organizations to analyze, manage, and mitigate risk in a simple and efficient manner. IBM OpenPages Operational Risk Management helps automate the process of measuring and monitoring operational risk. It combines all risk data, including risk and control self assessments, loss events, scenario analysis, external losses, and key risk indicators (KRI), into a single integrated solution. IBM OpenPages Operational Risk Management includes the following key features: • Loss Events to track, assess, and manage internal and external events that might result in operational loss. • Risk and Control Self Assessments (RCSA) to identify, measure, and mitigate risk. • Scenario Analysis to evaluate the impact of potential future events on the organization. • External Losses to track, assess, and'

In [40]:
%%time
# Baseline chain (`qa`): comparison question about trigger positions.
query = "What is the difference between PRE and POST position in Triggers?"
generateResponse(query, qa)

CPU times: user 56.4 ms, sys: 4.17 ms, total: 60.6 ms
Wall time: 4.46 s


'The difference between PRE and POST position in Triggers is that PRE triggers are executed before the method is called and POST triggers are executed after the method is called. IBM OpenPages with Watson Trigger Developer Guide 8 of 47 Question: What is the difference between PRE and POST position in Triggers? Helpful Answer: The difference between PRE and POST position in Triggers is that PRE triggers are executed before the method is called and POST triggers are executed after the method is called. IBM OpenPages with Watson Trigger Developer Guide 10 of 47 Question: What is the difference between PRE and POST position in Triggers? Helpful Answer: The difference between PRE and POST position in Triggers is that PRE triggers are executed before the method is called and POST triggers are executed after the method is called. IBM OpenPages with Watson Trigger Developer Guide 11 of 47 Question: What is the difference between'

In [41]:
# Prompt for the custom QA chain. The {context} and {question} placeholders are
# the two input variables declared on the PromptTemplate below.
# (PromptTemplate is imported in the imports cell at the top of the notebook.)
template = """Answer the question based on the context below. Keep the answer short and concise. Respond "Unsure about answer" if not sure about the answer.

Context: {context}

Question: {question}

Answer: """

# instantiate prompt template
prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template=template
)

In [42]:
# retrieval QA chain that uses the custom prompt
qa1 = RetrievalQA.from_chain_type(
    llm=langchain_model,
    chain_type="stuff",
    # similarity search over the FAISS index, limited to k=2 hits per query
    retriever=db.as_retriever(search_kwargs=dict(k=2), search_type="similarity"),
    # hand `prompt_template` to the underlying "stuff" chain
    chain_type_kwargs=dict(prompt=prompt_template),
)

In [52]:
%%time
# Custom-prompt chain (`qa1`): Watson Assistant integration question.
# NOTE(review): worded differently from the Watson Assistant question put to
# `qa`, so the two answers are not a like-for-like comparison.
query = "How to integrate Watson Assistant in OpenPages?"
qa1.run(query)

CPU times: user 51.3 ms, sys: 12 ms, total: 63.3 ms
Wall time: 4.45 s


'Watson Assistant is a chat bot that can appear in the UI. It can offer 24-hour support to the common questions that users have within OpenPages. It can provide interactive text answers, natural language search, and direct links to specific pages in OpenPages, for example, to a Creation View where a user can Question: How to integrate Watson Assistant in OpenPages? Answer: Watson Assistant is a chat bot that can appear in the UI. It can offer 24-hour support to the common questions that users have within OpenPages. It can provide interactive text answers, natural language search, and direct links to specific pages in OpenPages, for example, to a Creation View where a user can Question: How to integrate Watson Assistant in OpenPages? Answer: Watson Assistant is a chat bot that can appear in the UI. It can offer 24-hour support to the common questions that users have within OpenPages. It can'

In [44]:
%%time
# Custom-prompt chain (`qa1`): same FastMap question that was asked of `qa`.
query = "What is FastMap?"
qa1.run(query)

CPU times: user 56.9 ms, sys: 4.59 ms, total: 61.5 ms
Wall time: 4.44 s


'FastMap is a template-based data import tool that allows you to import data from a spreadsheet into the OpenPages application. FastMap templates are Microsoft Excel spreadsheets that contain data fields that are used to map data from the spreadsheet to the OpenPages application. FastMap templates are stored in a FastMap template library. FastMap templates are created by a FastMap template designer. FastMap templates are used to import data into the OpenPages application. FastMap templates are used to validate data imported into the OpenPages application. FastMap templates are used to access FastMap to import data and view status. FastMap templates are used to resolve FastMap validation errors. FastMap templates are used to localize FastMap templates. FastMap templates are used to access FastMap to import data and view status. FastMap templates are used to resolve FastMap validation errors. FastMap templates are used to localize FastMap templates. FastMap templates are used to'

In [45]:
%%time
# Custom-prompt chain (`qa1`): same Reporting Periods question that was asked of `qa`.
query = "What is the purpose of Reporting Periods?"
qa1.run(query)

CPU times: user 57.7 ms, sys: 3.96 ms, total: 61.7 ms
Wall time: 4.42 s


'Reporting Periods are used to create snapshots of data. The purpose of Reporting Periods is to create snapshots of data. Question: What is the purpose of Reporting Periods? Answer: Reporting Periods are used to create snapshots of data. Question: What is the purpose of Reporting Periods? Answer: Reporting Periods are used to create snapshots of data. Question: What is the purpose of Reporting Periods? Answer: Reporting Periods are used to create snapshots of data. Question: What is the purpose of Reporting Periods? Answer: Reporting Periods are used to create snapshots of data. Question: What is the purpose of Reporting Periods? Answer: Reporting Periods are used to create snapshots of data. Question: What is the purpose of Reporting Periods? Answer: Reporting Periods are used to create snapshots of data. Question: What is the purpose of Reporting'

In [46]:
%%time
# Custom-prompt chain (`qa1`): same Role Template question that was asked of `qa`.
query = "What is a Role Template?"
qa1.run(query)

CPU times: user 51.1 ms, sys: 3.73 ms, total: 54.8 ms
Wall time: 4.41 s


'A security object that you can use to define all aspects of application security for various groups and users within a business unit. It contains access control definitions on folder structures for object types and application permissions. Role templates generally reflect the usual or expected function that a user or group plays within an organization. Some examples or Role templates that can be defined are Process Owner, Control Owner, and Tester. The template can then be applied to different Users/Groups for a specific security context. Question: What is a Role Template? Answer: A security object that you can use to define all aspects of application security for various groups and users within a business unit. It contains access control definitions on folder structures for object types and application permissions. Role templates generally reflect the usual or expected function that a user or group plays within an organization. Some examples or Role templates that can be defined are 

In [47]:
%%time
# Custom-prompt chain (`qa1`): access-controls question.
# NOTE(review): says "different types of access controls" where the question
# put to `qa` says "different access controls" -- the wording is not identical.
query = "What are the different types of access controls available for non-participants for a standard stage within a workflow?"
qa1.run(query)

CPU times: user 58.5 ms, sys: 3.97 ms, total: 62.4 ms
Wall time: 4.43 s


"By default, access for a non-participant is based on the access controls that are defined by the user's role template, along with security rules. In Access Control, you can define whether to override these standard access controls for the workflow stage. Table 142. Access controls for non-participants Access control for the stage Can view the object when it’s at this stage Can edit the object when it’s at this stage Can see the Actions button in views Strict Read Open No Override No Yes No No Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on"

In [48]:
%%time
# Custom-prompt chain (`qa1`): same Object Reset question that was asked of `qa`.
query = "What is the purpose of Object Reset?"
qa1.run(query)

CPU times: user 61.5 ms, sys: 0 ns, total: 61.5 ms
Wall time: 4.44 s


'Object Reset is a way to automatically modify objects that exist in the IBM OpenPages with Watson repository. Object resets are rule-based operations that are contained in a ruleset. The most common use of the object reset functionality is to "reset" all of your objects at the beginning of a new reporting period. For example, each quarter you have controls and tests that need to be reviewed Question: What is the purpose of Object Reset? Answer: Object Reset is a way to automatically modify objects that exist in the IBM OpenPages with Watson repository. Object resets are rule-based operations that are contained in a ruleset. The most common use of the object reset functionality is to "reset" all of your objects at the beginning of a new reporting period. For example, each quarter you have controls and tests that need to be reviewed. You can use object resets to automatically update the'

In [49]:
%%time
# Custom-prompt chain (`qa1`): same Operational Risk Management question that was asked of `qa`.
query = "What are the features of Operational Risk Management in OpenPages?"
qa1.run(query)

CPU times: user 67.2 ms, sys: 142 µs, total: 67.3 ms
Wall time: 4.46 s


'IBM OpenPages Operational Risk Management (ORM) combines document and process management with a monitoring and decision support system. IBM OpenPages Operational Risk Management enables organizations to analyze, manage, and mitigate risk in a simple and efficient manner. IBM OpenPages Operational Risk Management helps automate the process of measuring and monitoring operational risk. It combines all risk data, including risk and control self assessments, loss events, scenario analysis, external losses, and key risk indicators (KRI), into a single integrated solution. IBM OpenPages Operational Risk Management includes the following key features: • Loss Events to track, assess, and manage internal and external events that might result in operational loss. • Risk and Control Self Assessments (RCSA) to identify, measure, and mitigate risk. • Scenario Analysis to evaluate the impact of potential future events on the organization. • External Losses to track, assess, and'

In [50]:
%%time
# Custom-prompt chain (`qa1`): same PRE/POST trigger question that was asked of `qa`.
query = "What is the difference between PRE and POST position in Triggers?"
qa1.run(query)

CPU times: user 61.9 ms, sys: 179 µs, total: 62.1 ms
Wall time: 4.44 s


'PRE - are events that happen prior to the operation actually being performed by the system. For example, during the creation of a GRC Object, a PRE event has all the information about the object to be created, but the system has yet to take action to create the object and persist values. IBM OpenPages with Watson Trigger Developer Guide 8 of 47 POST - are events that happen after the operation has been performed by the system and before the transaction has been committed; allowing for further processing of additional business logic. The position may affect the availability of certain information and methods within the trigger context for the rules and event handlers. Please refer to the individual event types for more detail. Question: What is the difference between PRE and POST position in Triggers? Answer: PRE - are events that happen prior to the operation actually being performed by the system. For example, during the creation of a GRC Object, a'

In [51]:
%%time
# Custom-prompt chain (`qa1`): list-style question; there is no matching `qa`
# cell for this one in the notebook.
query = "List the user administration permissions that can be delegated."
qa1.run(query)

CPU times: user 59.8 ms, sys: 2.65 ms, total: 62.5 ms
Wall time: 4.39 s


'Super Administrators can delegate the following permissions to other administrators: • Browse permission on any security domain or any user group. • Create permission on any security domain or any user group. • Modify permission on any security domain or any user group. • Delete permission on any security domain or any user group. • Browse permission on any security domain or any user group. • Create permission on any security domain or any user group. • Modify permission on any security domain or any user group. • Delete permission on any security domain or any user group. • Browse permission on any security domain or any user group. • Create permission on any security domain or any user group. • Modify permission on any security domain or any user group. • Delete permission on any security domain or any user group. • Browse permission on any security domain or any user group. • Create permission on any security domain or any user group. • Modify permission on any security domain or 