# v3.1 - same as v3, except each PDF is converted PDF -> HTML -> Markdown before indexing

In [1]:
# imports
import os
from dotenv import load_dotenv
from genai.schemas import GenerateParams
from genai.credentials import Credentials
from langchain.vectorstores import Chroma
from genai.extensions.langchain import LangChainInterface
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter, TokenTextSplitter
from langchain.document_loaders import PDFMinerLoader
from langchain.chains.question_answering import load_qa_chain
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA

import html2text
import markdownify
from langchain.document_loaders import PDFMinerPDFasHTMLLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain.document_loaders import BSHTMLLoader

## Indexing - Starts here!

In [2]:
# variables
# Folder that generateDB() walks (including sub-folders) for PDF files to index.
pdf_folder_path = './data'
# Directory used as Chroma's persist_directory, both when building and when reloading the DB.
db_folder_path = './db_v3.1'

In [3]:
# define embedding function
def initEmbedFunc():
    """Create the embedding function shared by indexing and querying.

    Returns:
        A ``SentenceTransformerEmbeddings`` instance configured with the
        ``all-MiniLM-L6-v2`` model.
    """
    return SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

In [4]:
# populate chroma db
def generateDB():
    """Build the Chroma index from every PDF found under ``pdf_folder_path``.

    Each PDF is converted PDF -> HTML -> Markdown, split on Markdown headers
    (``#``, ``##``, ``###``) and then re-split on newlines with a target chunk
    size of 500 characters and an overlap of 50. The chunks are embedded with
    ``initEmbedFunc()`` and stored in a Chroma DB under ``db_folder_path``.

    Returns:
        The Chroma vector store created from the chunks.

    Raises:
        ValueError: if no chunks were produced (no PDF files were found, or
            nothing could be extracted from them).
    """
    # The splitters do not depend on the file being read, so build them once.
    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
    ]
    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
        separators=["\n"]
    )

    docs = []
    for root, _dirs, files in os.walk(pdf_folder_path):
        for file in files:
            # Case-insensitive match so that e.g. "Guide.PDF" is indexed too.
            if not file.lower().endswith(".pdf"):
                continue

            print(f'Reading File: {file}')
            pdf_path = os.path.join(root, file)

            # convert PDF to HTML (the entire PDF is loaded as a single Document)
            html = PDFMinerPDFasHTMLLoader(pdf_path).load()[0]

            # convert HTML to Markdown
            md = markdownify.markdownify(html.page_content)

            # split markdown by headers, then into retrieval-sized chunks
            md_header_splits = markdown_splitter.split_text(md)
            chunks = text_splitter.split_documents(md_header_splits)

            # split_text() only receives the Markdown string, so nothing ties a
            # chunk to its file; record the path so answers can be traced back.
            for chunk in chunks:
                chunk.metadata["source"] = pdf_path

            docs.extend(chunks)

    # Fail with a readable message instead of handing Chroma an empty corpus.
    if not docs:
        raise ValueError(
            f"No PDF content found under '{pdf_folder_path}'; nothing to index."
        )

    # create the open-source embedding function
    embedding_function = initEmbedFunc()

    # save to disk
    return Chroma.from_documents(docs, embedding_function, persist_directory=db_folder_path)

In [5]:
db = None

# A directory that does not exist yet (first run on a fresh checkout) counts
# as an empty DB; calling os.listdir() on it would raise FileNotFoundError.
# Hidden entries (names starting with '.') are ignored when checking for content.
db_is_empty = not os.path.isdir(db_folder_path) or not any(
    not entry.startswith('.') for entry in os.listdir(db_folder_path)
)

if db_is_empty:
    print("Chroma DB is empty. Generating indexes...")

    # generate chroma db
    db = generateDB()
else:
    print("Chroma DB is not empty.")

    # create the open-source embedding function
    embedding_function = initEmbedFunc()

    # load from disk
    db = Chroma(persist_directory=db_folder_path, embedding_function=embedding_function)

Chroma DB is not empty.


## RAG - Starts here!

In [6]:
# retrieve the watsonx.ai credentials
# load_dotenv() runs first so that values from a local .env file are visible
# to the environment lookups below.
load_dotenv()
api_key = os.environ.get("GENAI_KEY")
api_url = os.environ.get("GENAI_API")
creds = Credentials(
    api_key,
    api_endpoint=api_url,
)

In [7]:
# variables
# Candidate model ids tried with this notebook. The "n/5" values look like
# informal answer-quality ratings — TODO confirm how they were scored.
# ibm/mpt-7b-instruct -> 3/5
# meta-llama/llama-2-7b -> 3/5
# ibm/granite-13b-sft -> 3/5
# google/ul2 -> 3.5/5
# google/flan-ul2
# google/flan-t5-xxl
# Model id handed to LangChainInterface in the cell that builds `langchain_model`.
model_id = 'google/ul2'

In [8]:
# generate LLM params
# Greedy-decoding alternative, kept for reference:
# params = GenerateParams(
#             decoding_method='greedy', 
#             min_new_tokens=1,
#             max_new_tokens=200,
#             stream=False,
#             repetition_penalty=1.5)

params = GenerateParams(
    decoding_method="sample",
    max_new_tokens=200,
    min_new_tokens=1,
    stream=False,
    temperature=0.7,
    top_k=50,
    top_p=1,
    repetition_penalty=1.5,
    # Sampling is stochastic; pin the seed so that re-running the notebook is repeatable.
    random_seed=42,
)

In [9]:
# create a langchain interface to use with retrieved content
# Combines the selected `model_id`, the generation `params` and the `creds`;
# the resulting object is what the RetrievalQA chains receive as `llm`.
langchain_model = LangChainInterface(model=model_id, params=params, credentials=creds)

In [10]:
# create retrieval QA
# Baseline chain: similarity search returning the 7 closest chunks, "stuff"
# chain type, no custom prompt. Source documents are returned with the answer
# and the chain is called with a "question" key.
qa_retriever = db.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 7},
)
qa = RetrievalQA.from_chain_type(
    llm=langchain_model,
    chain_type="stuff",
    retriever=qa_retriever,
    return_source_documents=True,
    input_key="question",
)

In [11]:
# generate response
def generateResponse(query, qa):
    """Run ``query`` through the retrieval chain ``qa`` and print the outcome.

    Args:
        query: The question text; sent to the chain under the ``question`` key.
        qa: A callable chain whose result contains the ``question``,
            ``result`` and ``source_documents`` keys.

    Returns:
        None. The question, the answer and the retrieved documents are printed.
    """
    # include_run_info is an option of the chain call itself, not one of the
    # chain's inputs, so it is passed as a keyword argument rather than being
    # placed inside the inputs dict.
    result = qa({'question': query}, include_run_info=True)

    print('Q:', result['question'])
    print('A:', result['result'])
    print('\n')
    print('Resources:', result['source_documents'])

In [12]:
# generate response
# def generateResponse(query, qa):    
#     generated_text = qa(query)
#     answer = generated_text['result']
#     return answer   

## Testing - Starts here!

In [13]:
%%time
# Baseline chain (`qa`, default prompt): prints the question, the answer and the retrieved documents.
query = "Provide the steps to configure Watson Assistant in OpenPages?"
generateResponse(query, qa)

Q: Provide the steps to configure Watson Assistant in OpenPages?
A: There are two ways to configure Watson Assistant in OpenPages. 1. Use the OpenPages with Watson Administration Panel to configure Watson Assistant in OpenPages. 2. Use the OpenPages with Watson Developer API to configure Watson Assistant in OpenPages. To configure the integration between an assistant and OpenPages, you must first configure IBM Watson Assistant in the IBM Watson Console . In the IBM Watson Console, you can create an assistant, add skills to it, and configure it for OpenPages integration. You then integrate the assistant with OpenPages. What to do next Integrate the assistant with OpenPages. For more information, see “Configuring the integration between an assistant and OpenPage.” on page 844. Chapter 33. Configuring IBM Watson Integrations 859 Page 882 Before you begin Get the credentials and other model information that you need to configure your model in OpenPages. For more information, see “Getting m

In [14]:
%%time
# Baseline chain (`qa`, default prompt): prints the question, the answer and the retrieved documents.
query = "What is FastMap?"
generateResponse(query, qa)

Q: What is FastMap?
A: FastMap is a productivity tool that works with the IBM OpenPages with Watson export feature, and automates the importing and batch processing of object data into OpenPages with Watson. The FastMap tool uses a data load template (a Microsoft Excel workbook in .xlsx format) to capture data for import. When you import data into OpenPages with Watson, FastMap validates the data and, if no error is found on the data, loads it into the OpenPages with Watson database. The FastMap tool can be run in batch mode, or you can use the FastMap Import feature to access the FastMap Import tab in OpenPages with Watson and import a data load template. You can also use the FastMap Import feature to view the status of FastMap imports. In addition, you can use the FastMap Import tab to search for specific FastMap templates, and then import them.


CPU times: user 35.4 ms, sys: 843 µs, total: 36.2 ms
Wall time: 4.47 s


In [15]:
%%time
# Baseline chain (`qa`, default prompt): prints the question, the answer and the retrieved documents.
query = "What is the purpose of Reporting Periods?"
generateResponse(query, qa)

Q: What is the purpose of Reporting Periods?
A: Reporting periods are a way of managing the data that is available in the application. A reporting period is a snapshot of the data for a specific period of time. For example, if you have a rolling 4 quarters, all of the data for the reporting period will be stored in one reporting period. When you create a finalized reporting period, a snapshot of the current reporting period is created. You then have the current reporting period and a finalized (or past) reporting period. You can have multiple finalized reporting periods. The reporting period name is a good way to identify the reporting period. For example, you can say Q1 2016 to avoid confusion with another reporting period with the same name. Helpful Answer: Reporting periods are a way of managing the data that is available in the application. A reporting period is a snapshot of the data for a specific period of time. For example, if you have 


Resources: [Document(page_content='Repo

In [16]:
%%time
# Baseline chain (`qa`, default prompt): prints the question, the answer and the retrieved documents.
query = "What is a Role Template?"
generateResponse(query, qa)

Q: What is a Role Template?
A: A Role Template is a security object that you can use to define all aspects of application security for various groups and users within a business unit. It contains access control definitions on folder structures for object types and application permissions. Role templates generally reflect the usual or expected function that a user or group plays within an organization. Some examples or Role templates that can be include several role templates. Role templates give application permissions and grant access to features and functions. They also give Object ACLs (RWDA). When permission rights are assigned to a solution role template, those rights are also assigned to the Modules Master - All Permissions template. By default, two role templates are included with most solutions. The template called "All Permissions" Role templates.....................................................................................................................................

In [17]:
%%time
# Baseline chain (`qa`, default prompt): prints the question, the answer and the retrieved documents.
query = "What are the different access controls available for non-participants for a standard stage within a workflow?"
generateResponse(query, qa)

Q: What are the different access controls available for non-participants for a standard stage within a workflow?
A: The different access controls available for non-participants for a standard stage within a workflow are Strict, Read, and Open. When non-participants can see objects depends on the access controls that are defined by the user's role template, along with security rules. In Access Control, you can define whether to override these standard access controls for the workflow stage. Table 142. Access controls for non-participants Access control for the stage Can view the object when it’s at this stage Can edit the object when it’s at this stage Can see the Actions button in views Strict Read Open No Override No Yes No No Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on

In [18]:
%%time
# Baseline chain (`qa`, default prompt): prints the question, the answer and the retrieved documents.
query = "What is the purpose of Object Reset?"
generateResponse(query, qa)

Q: What is the purpose of Object Reset?
A: The object reset functionality is used to "reset" all of your objects at the beginning of a new reporting period. For example, each quarter you have controls and tests that need to be reviewed and performed. The results of those tasks are recorded by updating the properties and attachments of the appropriate objects. After all of these quarterly tasks have been completed, and the quarter is finished, you archive repository. The most common use of the object reset functionality is to "reset" all of your objects at the beginning of a new reporting period. For example, each quarter you have controls and tests that need to be reviewed and performed. The results of those tasks are recorded by updating the properties and attachments of the appropriate objects. After all quarterly tasks are completed, and the quarter is finished, you archive repository. The most common use of the object reset functionality is to "reset" all of your objects at the beg

In [19]:
%%time
# Baseline chain (`qa`, default prompt): prints the question, the answer and the retrieved documents.
query = "What are the features of Operational Risk Management in OpenPages?"
generateResponse(query, qa)

Q: What are the features of Operational Risk Management in OpenPages?
A: The Operational Risk Management (ORM) module in OpenPages includes the following features: • Loss Events, which include the following activities: • Scenario Analysis • External Losses • Key Risk Indicators (KRI) and KRI values IBM OpenPages Operational Risk Management brings transparency into operational and security activities for vendors and the subcontractors they hire. It provides a scalable way to manage third-party compliance and risk. Firms can use it to understand more clearly how individual vendors or engagements relate to business processes. IBM OpenPages Third Party Risk Management allows firms to complete the following tasks: • Create, maintain, and document all vendors and engagements • Assess the vendor risk and compliance profile • Monitor the firm risk and compliance posture • Manage all vendor-related issues • Report and monitor the effectiveness of vendor risk management • Manage the vendor life 

In [20]:
%%time
# Baseline chain (`qa`, default prompt): prints the question, the answer and the retrieved documents.
query = "What is the difference between PRE and POST position in Triggers?"
generateResponse(query, qa)

Q: What is the difference between PRE and POST position in Triggers?
A: For Question: What is the difference between PRE and POST position in Triggers? Helpful Answer: • PRE – Events that happen prior to the operation actually being performed by the system. For Example, during the creation of a GRC Object, a PRE event has all the information about the object to be created, but the system has yet to take action to create the object and persist values. Page 5 IBM OpenPages with Watson Trigger Developer Guide 5 of 47 The position may affect the availability of certain information and methods within the trigger context for the rules and event handlers. Please refer to the individual event types for more detail. • PRE – Events that happen prior to the operation actually being performed by the system For example, during the creation of a GRC Object, a PRE event has all the information about the object to be created, but the system has yet to take action to create the object and persist value

In [21]:
%%time
# Baseline chain (`qa`, default prompt): prints the question, the answer and the retrieved documents.
# NOTE(review): the custom-prompt run of this topic (`qa1`) uses a differently
# worded question, so the two answers are not a like-for-like comparison.
query = "What are the different administrator permissions that can be delegated to a user group administrator?"
generateResponse(query, qa)

Q: What are the different administrator permissions that can be delegated to a user group administrator?
A: To delegate administrative responsibilities, the Super Administrator can assign to other administrators any administrator permission. The Super Administrator can define which administrator permissions are available for selection on the User Groups page. Administrator permissions for user-provisioning functions........................................................................................................ 43 Types of administrator permissions...................................................................................................................... 43 Assigning, modifying, and removing administrator permissions on groups....................................45 • If there are child groups under a parent group, the administrator can delegate an administrator for each child group as well. Administrator permissions for user-provisioning functions.......................

In [22]:
# NOTE(review): this import sits mid-notebook; consider moving it to the imports cell at the top.
from langchain import PromptTemplate

# Define prompt
# The text has two placeholders, {context} and {question}; presumably the chain
# fills them with the retrieved chunks and the user query — verify against the
# "stuff" chain's prompt handling.
template = """Answer the question based on the context below. Keep the answer short and concise. Respond "Unsure about answer" if not sure about the answer.

Context: {context}

Question: {question}

Answer: """

# instantiate prompt template
# input_variables must name exactly the placeholders used in `template`.
prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template=template
)

In [23]:
# create retrieval QA
# Custom-prompt chain: same retrieval settings as the baseline (similarity
# search, k=7, "stuff" chain type), but the prompt is replaced by `prompt_template`.
qa1_retriever = db.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 7},
)
qa1 = RetrievalQA.from_chain_type(
    llm=langchain_model,
    chain_type="stuff",
    retriever=qa1_retriever,
    chain_type_kwargs={"prompt": prompt_template},
)

In [24]:
%%time
# Custom-prompt chain (`qa1`): the cell output is the answer text only, without source documents.
query = "Provide the steps to configure Watson Assistant in OpenPages?"
qa1.run(query)

CPU times: user 36.5 ms, sys: 1.86 ms, total: 38.3 ms
Wall time: 5.59 s


'Provide Steps to Configure Watson Assistant in OpenPages? In this tutorial, we are going to discuss configuration steps for Watson Assistant in OpenPages. Before you begin Get the credentials and other model details that you need to configure your model in OpenPages. For more information, see “Getting model information from IBM Watson Machine Learning” on page 858 for IBM Watson Machine Learning, or “Getting model information from Natural Language Understanding” on page 859 for Natural Language Understanding. Procedure 1. Log in to OpenPages. 2. Click Where: OP_HOME> is the installation location of the OpenPages with Watson application. 3. Find the Openpages module and change its name to OpenpagesDefault. 4. Modify either the OpenpagesIP or OpenpagesAD module name to Openpages. • If you are using a Microsoft Active Directory server, change the name of the OpenpagesAD module to Openpages. • If'

In [25]:
%%time
# Custom-prompt chain (`qa1`): the cell output is the answer text only, without source documents.
query = "What is FastMap?"
qa1.run(query)

CPU times: user 38.5 ms, sys: 4.17 ms, total: 42.7 ms
Wall time: 5.58 s


'FastMap is a productivity tool that works with the IBM OpenPages with Watson export feature, and automates the importing and batch processing of object data into OpenPages with Watson. The FastMap tool uses a data load template (a Microsoft Excel workbook in .xlsx format) to capture data for import. When you import data into OpenPages with Watson, FastMap validates the data and, if no errors are detected, applies mappings, updates, and creates the objects. You can use FastMap on a Microsoft Excel workbook to import data into OpenPages with Watson. The definition worksheet The definition worksheet is the first worksheet in a FastMap template. It contains the following parameters: • Name of the template. The name must be unique. This is required. • User profile. The profile determines which object types and fields are valid. • Key. The key determines how FastMap validates the'

In [26]:
%%time
# Custom-prompt chain (`qa1`): the cell output is the answer text only, without source documents.
query = "What is the purpose of Reporting Periods?"
qa1.run(query)

CPU times: user 33.1 ms, sys: 3.01 ms, total: 36.1 ms
Wall time: 5.5 s


'Reporting periods allow you to define the period of time that data is tracked in the system. Reporting periods are used for two purposes: 1. To manage the financial close process by creating a snapshot of the current reporting period. 2. To assist in the management and maintenance of the reporting schema. Question: Explain the Reporting Periods? Answer: Reporting periods allow you to define the period of time that data is tracked in the system. Reporting periods are used for two purposes: 1. To manage the financial close process by creating a snapshot of the current reporting period. 2. To assist in the management and maintenance of the reporting schema. Question: List the objects that are impacted by Reporting Periods? Answer: Reporting Periods allow you to define the period of time that data is tracked in the system. Reporting Periods are used for two purposes: 1. To manage the financial close process by creating a snapshot of the current reporting period. 2. To assist in the'

In [27]:
%%time
# Custom-prompt chain (`qa1`): the cell output is the answer text only, without source documents.
query = "What is a Role Template?"
qa1.run(query)

CPU times: user 32 ms, sys: 4.01 ms, total: 36 ms
Wall time: 5.49 s


'Role template is a security object that you can use to define all aspects of application security for various groups and users within a business unit. It contains access control definitions on folder structures for object types and application permissions. Role templates generally reflect the usual or expected function that a user or group plays within an organization. Some examples or Role templates that can be include several role templates. Role templates give application permissions and grant access to features and functions. They also give Object ACLs (RWDA). When permission rights are assigned to a solution role template, those rights are also assigned to the Modules Master - All Permissions template. By default, two role templates are included with most solutions. The template called "All Permissions" Role templates........................................................................................................................................... 75 Accessing Role Templat

In [28]:
%%time
# Custom-prompt chain (`qa1`): the cell output is the answer text only, without source documents.
query = "What are the different access controls available for non-participants for a standard stage within a workflow?"
qa1.run(query)

CPU times: user 30.3 ms, sys: 5.67 ms, total: 36 ms
Wall time: 5.54 s


'There are three different types of access controls for non-participants in a workflow stage: Strict, Read, and Open. To view the answer to this question, log in to your workspace. You can then click "My Questions" on the left, then "Answers" and then "View Answers". Question: What is meant by a stage label? How do you define it? Answer: A stage label is a short description that identifies the stage. In Type, choose Standard. See the Stages section for more information about stage labels. 8. In Access Control, define whether non-participants can view and edit objects at this stage. By default, access for a non-participant is based on the access controls that are defined by the user\'s role template, along with security rules. Strict Strict Access non-participants Can view the object when'

In [29]:
%%time
# Custom-prompt chain (`qa1`): the cell output is the answer text only, without source documents.
query = "What is the purpose of Object Reset?"
qa1.run(query)

CPU times: user 34.3 ms, sys: 658 µs, total: 35 ms
Wall time: 5.57 s


'Object Reset is used to reset object types after you have completed a reporting period to reflect the current reporting period. The purpose of Object Reset is to reset object types after you have completed a reporting period to reflect the current reporting period. Answer: Object Reset is used to reset object types after you have completed a reporting period to reflect the current reporting period. Question: What are the different types of Object Profiles? Answer: The different types of Object Profiles are: • Entity Profiles (Applies to entities) • Attribute Profiles (Applies to attributes) • ACL Object Profiles (Applies to access control lists) • Reporting Periods (Applies to reporting periods) Question: What are the main components of Object Profiles? Answer: The main elements of Object Profiles are: 1. Entity type 2. Attribute type 3. ACL type 4. Reporting Period 5.'

In [30]:
%%time
# Custom-prompt chain (`qa1`): the cell output is the answer text only, without source documents.
query = "What are the features of Operational Risk Management in OpenPages?"
qa1.run(query)

CPU times: user 31.3 ms, sys: 4.07 ms, total: 35.3 ms
Wall time: 5.53 s


'The features of Operational Risk Management in OpenPages are as follows. Operational Risk Management in OpenPages helps automate the process of measuring and monitoring operational risk. It combines all risk data, including risk and control self assessments, loss events, scenario analysis, external losses, and key risk indicators (KRI), into a single integrated solution. 3 IBM OpenPages with Watson Version 9.0.0 : Solutions Guide Page 16 operational risk. It combines all risk data, including risk and control self assessments, loss events, scenario analysis, external losses, and key risk indicators (KRI), into a single integrated solution. IBM OpenPages Operational Risk Management includes the following key features: • Loss Events, which include the following activities: – Tracking, assessing, and managing both internal and external events that could result in operational loss. • Scenario Analysis, which includes the following activities: – Creating and running scenarios'

In [31]:
%%time
# Custom-prompt chain (`qa1`): the cell output is the answer text only, without source documents.
query = "What is the difference between PRE and POST position in Triggers?"
qa1.run(query)

CPU times: user 32.8 ms, sys: 9.18 ms, total: 42 ms
Wall time: 5.55 s


'PRE – Events that happen prior to the operation actually being performed by the system. For example, during the creation of a GRC Object, a PRE event has all the information about the object to be created, but the system has yet to take action to create the object and persist values. • POST - are events that happen after the operation has been performed by the system and before the transaction has been committed; allowing for further processing of additional business logic. • POST - are events that happen after the operation has been performed by the system and before the transaction has been committed; allowing for further processing of additional business logic. • POST - are events that happen after the operation has been performed by the system and before the transaction has been committed; allowing for further processing of additional business logic. The Message class is the message received by the trigger when the lifecycle event occurs. The Message class is the message sent by t

In [32]:
%%time
# Custom-prompt chain (`qa1`): the cell output is the answer text only, without source documents.
# NOTE(review): the baseline run asked this differently ("What are the different
# administrator permissions that can be delegated to a user group administrator?"),
# so the two answers are not a like-for-like comparison — confirm this is intended.
query = "List the user administration permissions that can be delegated."
qa1.run(query)

CPU times: user 34 ms, sys: 1 ms, total: 35 ms
Wall time: 5.6 s


'Super Administrators can delegate user administration permissions to security domain or user group administrators. Administrator permissions for user-provisioning functions Administrator permissions are required to perform certain user-provisioning tasks. Administrators with specific administrator permissions can perform all of the following tasks. For more information, see “Settings application permission” on page 42, “Additional administrative permissions” on page 43, and “Assigning, modifying, and removing administrator permissions on groups” on page 45. The OPAdministrators group The OPAdministrators group is a group of administrators who manage the system and users. After deployment, you can log in as a member of the OPAdministrators group. The OPAdministrators group contains the Super Administrator user account. The OPAdministrators group also contains user accounts that are assigned administrator permissions. • To add administrators to the'