# Generate regulatory documents for compliance submission with OCI Generative AI
### This notebook demonstrates
##### 1. Loading all the clinical trials data, creating embeddings on the content and metadata fields, and loading them into an OpenSearch index
##### 2. Use OCI Generative AI Embedding function to perform the embeddings
##### 3. Perform a vector search on the page content based on the best Title received from retriever interface
##### 4. Use expert prompting techniques to instruct OCI Generative AI LLM to summarize the document section by section
##### 5. Use langchain output parser to extract summarized output for various sections
##### 6. Use a pre-determined form template to fill in langchain outputs and create a compliance submission form


In [1]:
# Uncomment to install the necessary pip libraries if required
#!pip install opensearch-py 
#!pip install sentence-transformers
#!pip install langchain
#!pip install langchain_community
#!pip install pypdf
#!pip3 install fillpdf

In [2]:
import os
import pandas as pd
import json
from langchain_community.vectorstores import OpenSearchVectorSearch
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [3]:
# Read every PDF under the clinical-trials directory into `data`.
# Each loaded item exposes `page_content` and a `metadata` mapping with
# 'source' and 'page' entries, which the next cell flattens into a DataFrame.
# NOTE(review): the directory is an absolute, environment-specific path.
directory_path = "/home/datascience/conda/data/pdfs/"
loader = PyPDFDirectoryLoader(directory_path)
data = loader.load()

In [4]:
from tabulate import tabulate

# Flatten the loaded documents into one row each: the text, the path of the
# source file, and the page number.
# The rows are built up front instead of being written one cell at a time with
# chained assignment (`df.col[i] = value`); that pattern raises pandas'
# SettingWithCopy warning and does not write through to the frame when
# copy-on-write is enabled.
df_trials = pd.DataFrame(
    [
        {
            "page_content": doc.page_content,
            "metadata": doc.metadata["source"],
            "page": doc.metadata["page"],
        }
        for doc in data
    ],
    # Explicit columns keep the schema stable even when `data` is empty.
    columns=["page_content", "metadata", "page"],
)

#### Create the OCI embeddings model and generate embeddings

In [5]:
# OCI Generative AI embedding client
from langchain_community.embeddings import OCIGenAIEmbeddings

# Compartment and inference endpoint used for every OCI Generative AI call.
compartment_id = "ocid1.compartment.oc1..aaaaaaaa452ndb7pt7ygkd2l75ws5k6sxihp2f5xq5m3d6v4lb6lgzj2jniq"
service_endpoint = 'https://inference.generativeai.us-chicago-1.oci.oraclecloud.com'

# Model identifiers: text generation and embedding, respectively.
model_id = "cohere.command"
embed_model_id = "cohere.embed-english-v3.0"

# Gather the constructor arguments first, then build the embedding client.
_embedding_settings = dict(
    model_id=embed_model_id,
    service_endpoint=service_endpoint,
    compartment_id=compartment_id,
    model_kwargs={"truncate": True},
)
oci_embeddings = OCIGenAIEmbeddings(**_embedding_settings)

In [6]:
# Embed the page text and the source-file path of every row.
_page_texts = df_trials["page_content"].to_list()
_source_paths = df_trials["metadata"].to_list()

pg_embeddings = oci_embeddings.embed_documents(texts=_page_texts)
mt_embeddings = oci_embeddings.embed_documents(texts=_source_paths)

# Vector length of one embedding; reused as the knn_vector dimension of the index.
EMBEDDING_DIM = len(pg_embeddings[0])

#### Connect and load all documents and file metadata into an opensearch index

In [7]:
import os
import warnings
# Silence every Python warning for the remainder of the notebook.
# NOTE(review): this also hides deprecation and TLS-verification warnings.
warnings.filterwarnings('ignore')
import logging
# Only log records at ERROR level or above are emitted, as "LEVEL:message".
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)

In [8]:
# Connect to the OpenSearch cluster.
import getpass
import os

from opensearchpy import OpenSearch, helpers
from opensearchpy.client.http import HttpClient

# Connection settings are read from the environment so that credentials are not
# stored in the notebook (or in its saved outputs). Host, port and user fall
# back to the values this notebook was written against; the password has no
# default and is prompted for when OPENSEARCH_PASSWORD is not set.
host = [{
    'host': os.environ.get('OPENSEARCH_HOST', '10.0.3.229'),
    'port': int(os.environ.get('OPENSEARCH_PORT', '9200')),
}]
_os_user = os.environ.get('OPENSEARCH_USER', 'osmaster')
_os_password = os.environ.get('OPENSEARCH_PASSWORD') or getpass.getpass(
    f'OpenSearch password for {_os_user}: '
)
auth = (_os_user, _os_password)

client = OpenSearch(
    hosts=host,
    http_compress=True,
    http_auth=auth,
    use_ssl=True,
    # NOTE(review): TLS is on but certificate verification is off, so the
    # server identity is not checked. Enable verification outside of a demo.
    verify_certs=False
)
client

<OpenSearch([{'host': '10.0.3.229', 'port': 9200}])>

In [9]:
# Definition of the document index: two k-NN vector fields (page text and file
# metadata embeddings) plus the raw text columns.
index_name = 'idx_oci_genai_clinical_trials'


def _knn_vector_field(dimension):
    """Return a fresh mapping for one HNSW / cosine-similarity knn_vector field."""
    return {
        "type": "knn_vector",
        "dimension": dimension,
        "method": {
            "name": "hnsw",
            "space_type": "cosinesimil",
            "engine": "lucene",
            "parameters": {"ef_construction": 128, "m": 24},
        },
    }


index_body = {
    "settings": {
        "index": {"knn": True, "knn.algo_param.ef_search": 100},
    },
    "mappings": {
        "properties": {
            # Both vector fields share the dimension of the embedding model.
            "pg_embeddings": _knn_vector_field(EMBEDDING_DIM),
            "mt_embeddings": _knn_vector_field(EMBEDDING_DIM),
            "page_content": {"type": "text"},
            "metadata": {"type": "text"},
            "page": {"type": "text"},
        },
    },
}

In [10]:
# Delete and create/re-create the index.
# A missing index is the only expected condition on the delete side, so it is
# checked for explicitly; any other failure (connection, authentication, ...)
# now surfaces instead of being silently swallowed.
if client.indices.exists(index=index_name):
    response = client.indices.delete(index=index_name)
    print(f"Index {index_name} deleted successfully")

try:
    response = client.indices.create(index=index_name, body=index_body)
    print(f'Index {index_name} created successfully')
except Exception as e:
    print(f"Error creating os settings: {e}")
    # Re-raise after reporting: the following cells rely on this index and its
    # knn_vector mapping, so continuing would only fail later and less clearly.
    raise

Index idx_oci_genai_clinical_trials deleted successfully
Index idx_oci_genai_clinical_trials created successfully


In [11]:
# Bulk-load the page text, source path, page number and both embedding vectors
# into the index, one document per DataFrame row.
from opensearchpy import helpers

_columns = {
    name: df_trials[name].tolist()
    for name in ("page_content", "metadata", "page")
}
_columns["pg_embeddings"] = pg_embeddings
_columns["mt_embeddings"] = mt_embeddings
df = pd.DataFrame(_columns)

# One dict per row, keyed by column name.
docs = df.to_dict(orient="records")

# The returned tuple is displayed as the cell output.
helpers.bulk(client, docs, index=index_name, raise_on_error=True, refresh=True)

(39, [])

### Query the OpenSearch index on document content based on the titles
##### 1. This query performs a vector search on page content and pulls up the relevant page content and the metadata
##### 2. The file metadata is then used to pull the actual content from the clinical trials file
##### 3. Optionally the code can be modified to query on metadata based embeddings as well

In [12]:
# Free-text query (here, a trial title) to look up in the index.
user_query = "A Randomized, Controlled Trial of Lifestyle Intervention and Drug 'N' in Non-Alcoholic Fatty Liver Disease (NAFLD)"
# Number of nearest neighbours requested as "k" in the k-NN query that follows.
num_docs = 2
# Embed the query with the same OCI embedding client used for the documents.
query_embedding = oci_embeddings.embed_query(text=user_query)

In [13]:
# k-NN retrieval query against the page-content embeddings.
query_body = {
    # Cap the number of returned hits at the requested neighbour count.
    "size": num_docs,
    # "k" is passed as an integer; the original sent it as a formatted string.
    "query": {"knn": {"pg_embeddings": {"vector": query_embedding, "k": num_docs}}},
    # Skip the full _source (it contains the embedding vectors) and return only
    # the two fields needed downstream.
    "_source": False,
    "fields": ["page_content", "metadata"],
}

In [14]:
# Run the vector search and collect, per hit, the page text, the source file
# path and the similarity score rounded to two decimals.
results = client.search(index=index_name, body=query_body)

_hits = results["hits"]["hits"]
pg_list = [hit['fields']['page_content'][0] for hit in _hits]
mt_list = [hit['fields']['metadata'][0] for hit in _hits]
score_list = [round(hit['_score'], 2) for hit in _hits]

# Show which files matched and how strongly.
mt_table = {'Metadata': mt_list, 'Score': score_list}
print(tabulate(mt_table, headers='keys', tablefmt='pretty', stralign='left', numalign='center'))

+---------------------------------------------------------------------+-------+
| Metadata                                                            | Score |
+---------------------------------------------------------------------+-------+
| /home/datascience/conda/data/pdfs/NonAlcoholicFattyLiverDisease.pdf | 0.85  |
| /home/datascience/conda/data/pdfs/Non-AlcoholicSteatohepatitis.pdf  | 0.79  |
+---------------------------------------------------------------------+-------+


### Load Clinical trial content and generate Compliance submission documents
##### 1. Load full document from the actual clinical trials file.
##### 2. Note that the same content can also be loaded from the page_content stored in OpenSearch
##### 3. Use OCI Generative AI chat interface to summarize various sections with a targeted expert prompt
##### 4. Format the response with langchain output parser
##### 5. Create the compliance submission template

In [15]:
# Load the full text of the best-matching trial from the PDF path stored in the
# metadata of the first search hit.
# Imported from langchain_community, consistent with the other loaders in this
# notebook (the `langchain.document_loaders` path is deprecated).
from langchain_community.document_loaders import PyPDFLoader

if not mt_list:
    raise ValueError("The vector search returned no documents; there is no PDF to load")

file_path = mt_list[0]
# Base name without directory or extension; later used to name the output PDF.
file_name = os.path.splitext(os.path.basename(file_path))[0]
loader = PyPDFLoader(file_path)
clinical_trial_text = loader.load()

In [16]:
# Import langchain pydantic output parser
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field, validator
from typing import List

In [17]:
# Schema of the sectional summary requested from the LLM: one string field per
# report section. The Field descriptions are rendered into the format
# instructions sent to the model, so they act as per-field prompts.
# Documented with comments rather than a class docstring on purpose: pydantic
# typically copies a model's docstring into its schema, which would alter the
# generated prompt.
# Fixes relative to the original: `results` carried the description of the
# trial-design field, and `outcomes` misspelled "outcome".
# NOTE(review): several descriptions still read "... selection section", which
# looks carried over from `patient_selection`; left unchanged, confirm intent.
class TrialInfo(BaseModel):
    title:str = Field(description="This is the name of the Title")
    introduction:str = Field(description="Summary of the introduction section of the report")
    methods:str = Field(description="Summary of the methods section of the report")
    patient_selection:str = Field(description="Summary of the patient selection section of the report")
    preparation:str = Field(description="Summary of the cell Preparation selection section of the report")
    trial_study_design:str = Field(description="Summary of the Trial Design selection section of the report")
    data_collection:str = Field(description="Summary of the data collection section of the report")
    dietary_and_physical:str = Field(description="Summary of the dietary and physical section of the report")
    endpoints:str = Field(description="Summary of the end point section of the report")
    inclusion:str = Field(description="Summary of the inclusion criteria section of the report")
    exclusion:str = Field(description="Summary of the exclusion criteria section of the report")
    outcomes:str = Field(description="Summary of the outcome measures section of the report")
    results:str = Field(description="Summary of the results section of the report")
    primary_outcome:str = Field(description="Summary of the Primary outcome section of the report")
    secondary_outcome:str = Field(description="Summary of the Secondary outcome selection section of the report")
    statistical_analysis:str = Field(description="Summary of any statistical analysis on report section of the report")
    safety:str = Field(description="Summary of the Safety selection section of the report")
    efficacy:str = Field(description="Summary of the Efficacy selection section of the report")
    characterization:str = Field(description="Summary of any characterization section of the report")
    discussion:str = Field(description="Summary of the discussion section of the report")
    conclusion:str = Field(description="Summary of the conclusion section of the report")
    registration:str = Field(description="Value in the Clinical Trial registration section in the report")

In [18]:
# Build an output parser for the TrialInfo schema and obtain its formatting
# instructions, which are injected into the prompt template as
# {format_instructions}.
pydantic_parser = PydanticOutputParser(pydantic_object=TrialInfo)
format_instructions = pydantic_parser.get_format_instructions()

In [19]:
# Prompt template with two placeholders: {clinical_trial_report} (the report
# text) and {format_instructions} (the output-format description).
# The trailing backslash inside the triple-quoted string is a line
# continuation, so the first two source lines form one line in the prompt.
# NOTE(review): the persona names cancer and diabetes, while the example query
# in this notebook concerns fatty liver disease; confirm this is intended.
template_string = """You are a master clinical trial analyst who specializes in clinical trials for cancer and diabetes. \
Take the clinical trial report below and then summarize each of the sections in a single line within 150 characters
clinical trial report: ```{clinical_trial_report}```
{format_instructions}
"""

In [20]:
from langchain.prompts import ChatPromptTemplate

# `clinical_trial_text` is the list returned by the PDF loader. Interpolating
# the list directly would put its Python repr (object wrappers, metadata,
# escaped newlines) into the prompt, so the page text is joined into one plain
# string first.
clinical_trial_report_text = "\n".join(
    page.page_content for page in clinical_trial_text
)

prompt = ChatPromptTemplate.from_template(template=template_string)
messages = prompt.format_messages(clinical_trial_report=clinical_trial_report_text,
                                  format_instructions=format_instructions)

In [21]:
from langchain_community.llms import OCIGenAI

# OCI Generative AI text-generation client; it reuses the endpoint, compartment
# and generation model id defined alongside the embedding client.
_llm_settings = {
    "model_id": model_id,
    "service_endpoint": service_endpoint,
    "compartment_id": compartment_id,
    "model_kwargs": {"max_tokens": 1000},
}
llm = OCIGenAI(**_llm_settings)

In [22]:
# Send the formatted prompt to the model; temperature 0.0 is passed per call.
response = llm.invoke(messages,temperature=0.0)

In [23]:
# Extract the JSON summary from the LLM response into `data` (a dict).
import json


def _extract_last_json_object(text):
    """Return the last top-level JSON object embedded in ``text`` as a dict.

    The response may wrap the JSON in prose or code fences, and may contain
    more than one object; the last complete one is taken as the answer. Unlike
    splitting on the final '{', this tolerates braces inside string values and
    nested objects.

    Raises:
        ValueError: if ``text`` contains no decodable JSON object.
    """
    decoder = json.JSONDecoder()
    found = None
    pos = text.find('{')
    while pos != -1:
        try:
            candidate, end = decoder.raw_decode(text, pos)
        except json.JSONDecodeError:
            # Not the start of a valid object; try the next opening brace.
            pos = text.find('{', pos + 1)
            continue
        found = candidate
        # Resume after this object so its nested braces are not re-scanned.
        pos = text.find('{', end)
    if found is None:
        raise ValueError("No JSON object found in the LLM response")
    return found


data = _extract_last_json_object(str(response))

In [24]:
from langchain.prompts import ChatPromptTemplate

# NOTE(review): this cell repeats the prompt construction performed before the
# LLM call; nothing after it in the visible notebook reads `messages`.
# The page text is joined into one plain string so that the prompt contains the
# report itself rather than the Python repr of the loader's list of documents.
clinical_trial_report_text = "\n".join(
    page.page_content for page in clinical_trial_text
)

prompt = ChatPromptTemplate.from_template(template=template_string)
messages = prompt.format_messages(clinical_trial_report=clinical_trial_report_text,
                                  format_instructions=format_instructions)

In [25]:
# PDF template generation - https://www.sejda.com/pdf-forms#results-KFINKPTF-202407230002
# Read the names of the fillable fields from the form template; with sort=True
# the names are requested in sorted order, and the next cell addresses them by
# position in this list.
import fillpdf
from fillpdf import fillpdfs
form_template='/home/datascience/conda/data/templates/template_all.pdf'
form_fields=list(fillpdfs.get_form_fields(form_template, 
                                     sort=True, page_number=None).keys())

In [26]:
# Reference video: https://www.youtube.com/watch?v=TcBX2kb6g3o
# Map each form field to its summary. Fields are addressed by position in
# `form_fields`, so the order of this tuple must match the template's field order.
# NOTE(review): 'primary_outcome' precedes 'preparation' here, which is not
# alphabetical; confirm against the template's actual field names.
_summary_keys_in_form_order = (
    'characterization',
    'conclusion',
    'data_collection',
    'dietary_and_physical',
    'discussion',
    'efficacy',
    'endpoints',
    'exclusion',
    'inclusion',
    'introduction',
    'methods',
    'outcomes',
    'patient_selection',
    'primary_outcome',
    'preparation',
    'registration',
    'results',
    'safety',
    'secondary_outcome',
    'statistical_analysis',
    'title',
    'trial_study_design',
)

data_dict = {
    form_fields[position]: data[key]
    for position, key in enumerate(_summary_keys_in_form_order)
}

In [27]:
# Fill the template with the summaries and write a flattened PDF named after
# the source trial file.
output_path = '/home/datascience/conda/data/outputs/trial_'
_output_pdf = f'{output_path}{file_name}.pdf'
fillpdfs.write_fillable_pdf(form_template, _output_pdf, data_dict, flatten=True)
