# v3.2 - same as v3, but each PDF is converted PDF -> HTML -> Text before indexing

In [1]:
# imports
import os
from dotenv import load_dotenv
from genai.schemas import GenerateParams
from genai.credentials import Credentials
from langchain.vectorstores import Chroma
from genai.extensions.langchain import LangChainInterface
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter, TokenTextSplitter
from langchain.document_loaders import PDFMinerLoader
from langchain.chains.question_answering import load_qa_chain
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA

import html2text
import markdownify
from langchain.document_loaders import PDFMinerPDFasHTMLLoader
from langchain.schema.document import Document

## Indexing - Starts here!

In [2]:
# Locations used by the indexing step: where the Chroma index is persisted,
# and the folder that is walked for source PDFs.
db_folder_path = './db_v3.2'
pdf_folder_path = './data'

In [3]:
# define embedding function
def initEmbedFunc():
    """Return a SentenceTransformerEmbeddings object for the "all-MiniLM-L6-v2" model."""
    return SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

In [4]:
# populate chroma db
def generateDB():
    """Build the Chroma index from every ``.pdf`` file found under ``pdf_folder_path``.

    Each PDF is loaded as HTML, flattened to text with ``html2text``, split into
    chunks (target size 500 characters, 50 overlap, split on newlines only) and
    embedded with the function returned by ``initEmbedFunc``.

    Every chunk carries a ``source`` metadata entry holding the path of the PDF
    it came from, so retrieved documents can be traced back to a file.

    Returns:
        The store returned by ``Chroma.from_documents``, created with
        ``persist_directory=db_folder_path``.
    """
    # The splitter does not depend on the file being read, so build it once
    # instead of once per PDF.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
        separators=["\n"],
    )

    docs = []
    for root, _dirs, files in os.walk(pdf_folder_path):
        for file in files:
            if not file.endswith(".pdf"):
                continue
            print(f'Reading File: {file}')

            pdf_path = os.path.join(root, file)

            # convert PDF to HTML; the entire PDF is loaded as a single Document
            html = PDFMinerPDFasHTMLLoader(pdf_path).load()[0]

            # convert HTML to Text
            text = html2text.html2text(html.page_content)

            # split into chunks and remember which file each chunk came from
            docs.extend(
                Document(page_content=chunk, metadata={"source": pdf_path})
                for chunk in text_splitter.split_text(text)
            )

    # create the open-source embedding function
    embedding_function = initEmbedFunc()

    # save to disk
    db = Chroma.from_documents(docs, embedding_function, persist_directory=db_folder_path)

    return db

In [5]:
# Load the persisted Chroma index, or build it when there is nothing on disk yet.
db = None

# os.listdir() raises FileNotFoundError for a directory that does not exist,
# so a missing directory (e.g. the very first run) is treated the same as an
# empty one. Hidden entries (names starting with '.') are ignored.
db_has_files = os.path.isdir(db_folder_path) and any(
    not f.startswith('.') for f in os.listdir(db_folder_path)
)

if not db_has_files:
    print("Chroma DB is empty. Generating indexes...")

    # generate chroma db
    db = generateDB()
else:
    print("Chroma DB is not empty.")

    # create the open-source embedding function
    embedding_function = initEmbedFunc()

    # load from disk
    db = Chroma(persist_directory=db_folder_path, embedding_function=embedding_function)

Chroma DB is not empty.


## RAG - Starts here!

In [6]:
# watsonx.ai credentials: load any .env file, then read the API key and
# endpoint from the environment (each is None when the variable is unset).
load_dotenv()
api_key = os.getenv("GENAI_KEY")
api_url = os.getenv("GENAI_API")
creds = Credentials(api_key, api_endpoint=api_url)

In [7]:
# Model used for generation. Candidates listed by the author
# (the "-> n/5" annotations are the author's own notes):
#   ibm/mpt-7b-instruct   -> 3/5
#   meta-llama/llama-2-7b -> 3/5
#   ibm/granite-13b-sft   -> 3/5
#   google/ul2            -> 3.5/5
#   google/flan-ul2
#   google/flan-t5-xxl
model_id = 'google/ul2'

In [8]:
# LLM generation parameters (sampling-based decoding).
# An earlier greedy configuration, kept for reference:
#   GenerateParams(decoding_method='greedy', min_new_tokens=1, max_new_tokens=200,
#                  stream=False, repetition_penalty=1.5)
params = GenerateParams(
    # decoding strategy and its sampling knobs
    decoding_method="sample",
    temperature=0.5,
    top_k=50,
    top_p=1,
    # output length bounds
    min_new_tokens=1,
    max_new_tokens=200,
    # remaining options
    repetition_penalty=1.5,
    stream=False,
)

In [9]:
# LangChain interface around the selected model, used with the retrieved content
langchain_model = LangChainInterface(
    model=model_id,
    params=params,
    credentials=creds,
)

In [10]:
# Retrieval QA chain ("stuff" chain type) over the Chroma store.
# The retriever runs a similarity search with k=7, the query is supplied under
# the "question" input key, and the retrieved documents are returned with the answer.
qa = RetrievalQA.from_chain_type(
    llm=langchain_model,
    chain_type="stuff",
    input_key="question",
    return_source_documents=True,
    retriever=db.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 7},
    ),
)

In [11]:
# generate response
def generateResponse(query, qa):
    """Run ``query`` through the ``qa`` callable and print the outcome.

    ``qa`` is called with a dict holding the query under ``'question'`` (plus an
    ``'include_run_info'`` entry). The returned mapping's ``'question'``,
    ``'result'`` and ``'source_documents'`` values are printed; nothing is returned.
    """
    payload = {'question': query, 'include_run_info': True}
    outcome = qa(payload)

    # question and answer first, then a blank gap, then the retrieved sources
    for label, key in (('Q:', 'question'), ('A:', 'result')):
        print(label, outcome[key])
    print('\n')
    print('Resources:', outcome['source_documents'])

In [12]:
# generate response
# def generateResponse(query, qa):    
#     generated_text = qa(query)
#     answer = generated_text['result']
#     return answer   

## Testing - Starts here!

In [13]:
%%time
# Timed run: pass this question to generateResponse with the `qa` chain,
# which prints the question, the answer and the source documents.
query = "Provide the steps to configure Watson Assistant in OpenPages?"
generateResponse(query, qa)

Q: Provide the steps to configure Watson Assistant in OpenPages?
A: Watson Assistant is a chatbot that you can use to engage with your customers and help them find the information they need. By integrating Watson Assistant with OpenPages, you can configure IBM Watson Assistant to work with OpenPages. For more information, see “Configuring the integration between an assistant and OpenPages ” on page 844. OpenPages integration. What to do next Integrate the assistant with OpenPages. For more information, see “Configuring the integration between an assistant and OpenPages ” on page 844. Configuring IBM Watson Assistant is an iterative process. As users work with it, you can improve and expand the skills. You might need to change or expand the skills as they change over time. You can also download a skills data usage report and improve it. OpenPages integration. What to do next Integrate the assistant with OpenPages. For more information, see “Configuring the integration between an


Resou

In [14]:
%%time
# Timed run: pass this question to generateResponse with the `qa` chain,
# which prints the question, the answer and the source documents.
query = "What is FastMap?"
generateResponse(query, qa)

Q: What is FastMap?
A: FastMap is a data load utility that allows you to load data from an external system into the application. FastMap can also be used to export data from the application and import it into an external system. FastMap uses Microsoft Excel worksheets to define the data load template. This template is then imported into the application. The worksheet can be used to load data into the application. • The FastMap template import process...........................................................................................................................................................769 FastMap Page 816 • The FastMap template export process.............................................................................................................................................................769 FastMap Page 817 • The FastMap data validation process......................................................................................................................

In [15]:
%%time
# Timed run: pass this question to generateResponse with the `qa` chain,
# which prints the question, the answer and the source documents.
query = "What is the purpose of Reporting Periods?"
generateResponse(query, qa)

Q: What is the purpose of Reporting Periods?
A: They are a way to freeze the data in the current reporting period. Finalized reporting periods are useful when you are creating reports. You can also use them to keep a snapshot of the data from previous reporting periods. About this task When you disable a finalized reporting period, it is not available to users through the UI, such as through > Change Reporting Period. Reporting period permissions and settings To manage reporting periods, the user performing the reporting period operation must belong to a group with the specific application permissions. The amount of time after a reporting period is created in which can also use Reporting Period to select the reporting period. > Change Note: You might need to refresh the screen to see the Change Reporting Period menu item. Creating a finalized a reporting period You can create finalized reporting periods. When you finalize the current reporting period, a snapshot of the current reportin

In [16]:
%%time
# Timed run: pass this question to generateResponse with the `qa` chain,
# which prints the question, the answer and the source documents.
query = "What is a Role Template?"
generateResponse(query, qa)

Q: What is a Role Template?
A: A role template is a security object that you can use to define all aspects of application security for various groups and users within a business unit. It contains access control definitions on folder structures for object types and application permissions. Role templates generally reflect the usual or expected function that a user or group plays within an organization. Some examples or Role templates that can be defined are Process Owner, Control Owner, and Tester. The template can then be applied to different Users/Groups for a specific security context. Helpful Answer: A security context point is a point defined in the OpenPages security model that you can use to assign under that folder. role An instance of a role template that is applied to a set of Users/Groups for a specific security context. Roles are granted to Users/Groups which allows them access to objects with certain permissions. Some examples of roles are: Process Owner, Control Owner, and

In [17]:
%%time
# Timed run: pass this question to generateResponse with the `qa` chain,
# which prints the question, the answer and the source documents.
query = "What are the different access controls available for non-participants for a standard stage within a workflow?"
generateResponse(query, qa)

Q: What are the different access controls available for non-participants for a standard stage within a workflow?
A: Dependent on the stage access control Strict Read Open No Override No Yes No No Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on s

In [18]:
%%time
# Timed run: pass this question to generateResponse with the `qa` chain,
# which prints the question, the answer and the source documents.
query = "What is the purpose of Object Reset?"
generateResponse(query, qa)

Q: What is the purpose of Object Reset?
A: The purpose of the object reset is to automatically modify objects that exist in the IBM OpenPages with Watson repository. The most common use of the object reset functionality is to "reset" all of your objects at the beginning of a new reporting period. For example, each quarter you have controls and tests that need to be reviewed and performed. The results of those tasks are recorded by updating the properties and attachments of the appropriate objects. After all quarterly tasks are completed, and the attestation. Object resets The most common use of the object reset functionality is to "reset" all of your objects at the beginning of a new reporting period. For example, each quarter you have controls and tests that need to be reviewed and performed. The results of those tasks are recorded by updating the properties and attachments of the appropriate objects. After all quarterly tasks are completed, and the attestation. Object resets The most

In [19]:
%%time
# Timed run: pass this question to generateResponse with the `qa` chain,
# which prints the question, the answer and the source documents.
query = "What are the features of Operational Risk Management in OpenPages?"
generateResponse(query, qa)

Q: What are the features of Operational Risk Management in OpenPages?
A: IBM OpenPages Operational Risk Management enables organizations to analyze, manage, and mitigate risk in a simple and efficient manner. IBM OpenPages Operational Risk Management helps automate the process of measuring and monitoring operational risk. It combines all risk data, including risk and control self assessments, loss events, scenario analysis, external losses, and key risk indicators (KRI), into a single integrated solution. IBM OpenPages Operational Risk Management includes the following key features: • The IBM OpenPages RiskLens integration enables an OpenPages user to push a record for inclusion within a Risk Assessment in RiskLens. The object can be associated to one or more Scenarios within RiskLens and Monte Carlo simulations are performed. After completing the simulation, and in accordance with the scheduled job in OpenPages, the loss exposure metrics generated by the Monte Carlo simulation are sen

In [20]:
%%time
# Timed run: pass this question to generateResponse with the `qa` chain,
# which prints the question, the answer and the source documents.
query = "What is the difference between PRE and POST position in Triggers?"
generateResponse(query, qa)

Q: What is the difference between PRE and POST position in Triggers?
A: The difference between PRE and POST is that PRE events happen prior to the operation actually being performed by the system. For example, during the creation of a GRC Object, a PRE event has all the information about the object to be created, but the system has yet to take action to create the object and persist values. POST events happen after the operation has been performed by the system and before the transaction has been committed; allowing for further processing of additional business logic. The position may affect the availability of certain information and methods within the trigger context for the rules and event handlers. Please refer to the individual event types for more detail. Page 7 IBM OpenPages with Watson Trigger Developer Guide 7 of 47 Transactions • • • • event=”create.object” position=”pre” event=”create.object” position=”post” event=”update.object” position=”pre” event=”update.object


Resourc

In [21]:
%%time
# Timed run: pass this question to generateResponse with the `qa` chain,
# which prints the question, the answer and the source documents.
query = "List the user administration permissions that can be delegated."
generateResponse(query, qa)

Q: List the user administration permissions that can be delegated.
A: Create Security Domains and User Groups. Helpful Answer: To create a new security domain or user group, you must be a Super Administrator or a delegated administrator with the appropriate permissions. To access the > Users and Security > Domains & Groups menu item, you must have Browse permission on any security domain or any user group. • To access the > Users and Security > Users menu item, you must have Browse permission on any security domain or any user group. • To access the > Users and Security > Domains & Groups menu item, you must be a Super Administrator or a delegated administrator with Super Administrator permission on the Security Domains security domain group. For information about delegating and assigning administrator permissions, see “Delegate administrator permissions” on page 42. • For information about the administrative permissions that are required for specific user-provisioning functions, see “

In [22]:
from langchain import PromptTemplate

# Prompt text: answer only from the supplied context, keep it short, and reply
# "Unsure about answer" when not sure. {context} and {question} are the placeholders.
template = """Answer the question based on the context below. Keep the answer short and concise. Respond "Unsure about answer" if not sure about the answer.

Context: {context}

Question: {question}

Answer: """

# Wrap the text in a PromptTemplate that declares both placeholders.
prompt_template = PromptTemplate(
    template=template,
    input_variables=["context", "question"],
)

In [23]:
# Second retrieval QA chain: same LLM, "stuff" chain type and k=7 similarity
# retriever, but the chain is given the custom prompt template.
qa1 = RetrievalQA.from_chain_type(
    llm=langchain_model,
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt_template},
    retriever=db.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 7},
    ),
)

In [24]:
%%time
# Timed run of this question through `qa1`; the value returned by run() is the
# last expression, so the notebook displays it as the cell result.
query = "Provide the steps to configure Watson Assistant in OpenPages?"
qa1.run(query)

CPU times: user 33.5 ms, sys: 797 µs, total: 34.3 ms
Wall time: 5.61 s


'Complete the following steps to configure IBM Watson Assistant: 1. Configure an assistant. For more information, see “Configuring a web chat assistant by using IBM Watson Assistant” on page 843. 2. Integrate the assistant with OpenPages. For more information, see “Configuring the integration between an assistant and OpenPages” on page 844. OpenPages integration. What to do next Integrate the assistant with OpenPages. For more information, see “Configuring the integration between an assistant and OpenPages” on page 844. Configuring IBM Watson Assistant is an iterative process. As users work with it, you can improve and expand the skills. You might need to change or expand the skills as they change over time. You can also download a skills data usage report and improve it. OpenPages integration. What to do next Integrate the assistant with OpenPages. For more information, see “Configuring'

In [25]:
%%time
# Timed run of this question through `qa1`; the value returned by run() is the
# last expression, so the notebook displays it as the cell result.
query = "What is FastMap?"
qa1.run(query)

CPU times: user 40.2 ms, sys: 0 ns, total: 40.2 ms
Wall time: 4.71 s


'FastMap is a tool that allows you to import and export data from an external system into the PeopleSoft Enterprise system. FastMap is a tool that allows you to import and export data from an external system into the PeopleSoft Enterprise system. FastMap is a tool that allows you to import and export data from an external system into the PeopleSoft Enterprise system. FastMap is a tool that allows you to import and export data from an external system into the PeopleSoft Enterprise system. FastMap is a tool that allows you to import and export data from an external system into the PeopleSoft Enterprise system. FastMap is a tool that allows you to import and export data from an external system into the PeopleSoft Enterprise system. FastMap is a tool that allows you to import and export data from an external system into the PeopleSoft Enterprise system. FastMap is a tool that allows you to import and export data from an external system into the PeopleSoft Enterprise system'

In [26]:
%%time
# Timed run of this question through `qa1`; the value returned by run() is the
# last expression, so the notebook displays it as the cell result.
query = "What is the purpose of Reporting Periods?"
qa1.run(query)

CPU times: user 34.4 ms, sys: 501 µs, total: 34.9 ms
Wall time: 4.47 s


'Reporting Periods are used to create snapshots of data at a specific point in time. When you create a finalized reporting period, a snapshot of the current reporting period is created. You then have the current reporting period and a finalized (or past) reporting period. You can have multiple finalized reporting periods. Reporting periods can be enabled or disabled. When a finalized reporting period is disabled, it is not available to users through the UI, such as through > Change Reporting Period. Using System Admin Mode with reporting periods When you create, finalize, or delete reporting periods, you need to be in System Admin Mode (SAM). For more information, see “Enabling and disabling System Admin Mode” on page 37. Reporting period permissions and settings To manage reporting periods, the user performing the reporting period operation must belong to a group with the specific application permissions. The amount of'

In [27]:
%%time
# Timed run of this question through `qa1`; the value returned by run() is the
# last expression, so the notebook displays it as the cell result.
query = "What is a Role Template?"
qa1.run(query)

CPU times: user 34 ms, sys: 0 ns, total: 34 ms
Wall time: 4.45 s


'A role template is a security object that you can use to define all aspects of application security for various groups and users within a business unit. It contains access control definitions on folder structures for object types and application permissions. Role templates generally reflect the usual or expected function that a user or group plays within an organization. Some examples or Role templates that can be defined are Process Owner, Control Owner, and Tester. The template can then be applied to different Users/Groups for a specific security context. Chapter 6. Security 76 Page 98 security context point A point defined in the OpenPages security model that you can use to assign under that folder. role An instance of a role template that is applied to a set of Users/Groups for a specific security context. Roles are granted to Users/Groups which allows them access to objects with certain permissions. Some examples of roles are: Process Owner, Control Owner, and'

In [28]:
%%time
# Timed run of this question through `qa1`; the value returned by run() is the
# last expression, so the notebook displays it as the cell result.
query = "What are the different access controls available for non-participants for a standard stage within a workflow?"
qa1.run(query)

CPU times: user 35.5 ms, sys: 261 µs, total: 35.7 ms
Wall time: 4.46 s


'There are three different access controls available for non-participants for a standard stage within a workflow. They are: Strict Read Open No Override Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard

In [29]:
%%time
# Timed run of this question through `qa1`; the value returned by run() is the
# last expression, so the notebook displays it as the cell result.
query = "What is the purpose of Object Reset?"
qa1.run(query)

CPU times: user 34.8 ms, sys: 0 ns, total: 34.8 ms
Wall time: 4.49 s


'The purpose of Object Reset is to automatically modify objects that exist in the IBM OpenPages with Watson repository. The most common use of the object reset functionality is to "reset" all of your objects at the beginning of a new reporting period. For example, each quarter you have controls and tests that need to be reviewed and performed. The results of those tasks are recorded by updating the properties and attachments of the appropriate objects. After all quarterly tasks are completed, and the attestation is complete, you can "reset" the objects to start a new reporting period. The object reset functionality can be used to perform other types of operations as well. For example, you can use it to update the status of objects to "In Progress" or "Completed" after they have been reviewed and performed, or to update the status of objects after they have been marked as "In Progress." Object resets are a way to automatically modify objects that'

In [30]:
%%time
# Timed run of this question through `qa1`; the value returned by run() is the
# last expression, so the notebook displays it as the cell result.
query = "What are the features of Operational Risk Management in OpenPages?"
qa1.run(query)

CPU times: user 35.2 ms, sys: 835 µs, total: 36 ms
Wall time: 4.48 s


'IBM OpenPages Operational Risk Management enables organizations to analyze, manage, and mitigate risk in a simple and efficient manner. IBM OpenPages Operational Risk Management helps automate the process of measuring and monitoring operational risk. It combines all risk data, including risk and control self assessments, loss events, scenario analysis, external losses, and key risk indicators (KRI), into a single integrated solution. IBM OpenPages Operational Risk Management includes the following key features: • Operational risk analysis – Issue identification • Operational risk measurement • Operational risk monitoring • Operational risk reporting • Operational risk monitoring • Operational risk monitoring • Operational risk monitoring • Operational risk monitoring • Operational risk monitoring • Operational risk monitoring • Operational risk monitoring • Operational risk monitoring • Operational risk monitoring • Operational risk monitoring • Operational risk monitoring • Operat'

In [31]:
%%time
# Timed run of this question through `qa1`; the value returned by run() is the
# last expression, so the notebook displays it as the cell result.
query = "What is the difference between PRE and POST position in Triggers?"
qa1.run(query)

CPU times: user 40.9 ms, sys: 5.77 ms, total: 46.6 ms
Wall time: 4.47 s


'PRE and POST positions are used for Triggers. PRE - are events that happen prior to the operation actually being performed by the system. For example, during the creation of a GRC Object, a PRE event has all the information about the object to be created, but the system has yet to take action to create the object and persist values. POST - are events that happen after the operation has been performed by the system and before the transaction has been committed; allowing for further processing of additional business logic. The position may affect the availability of certain information and methods within the trigger context for the rules and event handlers. Please refer to the individual event types for more detail. Page 7 IBM OpenPages with Watson Trigger Developer Guide 7 of 47 Context: The trigger can be configured to listen for either one or other position. The possible values are: • PRE – Events that happen prior to the operation actually being performed by'

In [32]:
%%time
# Timed run of this question through `qa1`; the value returned by run() is the
# last expression, so the notebook displays it as the cell result.
query = "List the user administration permissions that can be delegated."
qa1.run(query)

CPU times: user 32.2 ms, sys: 8.57 ms, total: 40.7 ms
Wall time: 4.5 s


'Create Manage Lock Unlock Browse Browse Groups Browse Users Browse Groups Browse Users Super Administrators can delegate administrator permissions to other administrators. For example, a Super Administrator can delegate user provisioning functions to other administrators. For more information, see “Delegate administrator permissions” on page 42. • For information about the administrative permissions that are required for specific user-provisioning functions, see “Types of administrator permissions” on page 43. Users and groups are organized under the following top-level groups: • Security Domains This group is a container for the security domain groups that are permissions, see “Delegate administrator permissions” on page 42. For example, a Super Administrator can delegate user provisioning functions to other administrators. For more information, see “Administrator permissions for user-provisioning functions” on page 43. A Super Administrator is specified during the installation proce