# Generate regulatory documents for compliance submission with OCI Generative AI
##### 1. Connecting to an Opensearch vector store.
##### 2. Use an open-source, pre-trained Hugging Face sentence-transformer model for vector encoding
##### 3. Load Clinical Trials Title and Description metadata, along with their embeddings, into an OpenSearch cluster index
##### 4. Search Clinical Trials based on Title and description embeddings and view scores


In [22]:
# Uncomment to install the necessary pip libraries if required
#!pip install opensearch-py 
#!pip install sentence-transformers
#!pip install langchain
#!pip install langchain_community
#!pip install tabulate

In [23]:
# Session setup: standard-library imports plus warning and logging configuration.
import os
import warnings
# Suppress every Python warning for the rest of the session to keep cell output readable.
warnings.filterwarnings('ignore')
import logging
# Emit only records at ERROR level or above, formatted as "LEVEL:message".
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)

In [24]:
# Connection settings and identifiers used by the cells below.
# Replace each '<...>' placeholder with your own value before running the notebook.
# NOTE: It is recommended that you delete the host name, user and password from the notebook once the workshop is over.
# Keeping credentials in a notebook is for illustration purposes only, not for production workloads.
os_host='<Opensearch host API Private IP>'
os_user='<Opensearch username>' # e.g. osmaster
os_pass='<Opensearch password>'
compartment_id = '<your compartment ocid>'
service_endpoint='https://inference.generativeai.us-chicago-1.oci.oraclecloud.com'
# Debug aid only: uncommenting the next line prints the password in clear text into the cell output.
#print(f'Host: {os_host} user: {os_user} pass: {os_pass}') 

In [58]:
# Open a client connection to the OpenSearch cluster
from opensearchpy import OpenSearch, helpers
from opensearchpy.client.http import HttpClient

# Basic-auth credentials and the single cluster node to contact on port 9200
auth = (os_user, os_pass)
host = [dict(host=os_host, port=9200)]

# TLS is enabled, but certificate verification is switched off (verify_certs=False)
client = OpenSearch(
    hosts=host,
    http_auth=auth,
    http_compress=True,
    use_ssl=True,
    verify_certs=False,
)
# Leave the client as the last expression so the notebook displays its repr
client

<OpenSearch([{'host': '10.0.3.229', 'port': 9200}])>

## Using sentence transformer open source external pre-trained models
###### Sentence transformer models are open source hugging face pre-trained models that can be used as an embedding function. You may use any pre-trained models available for the embedding function.
###### In this section, the embedding function is applied to pre-create embeddings and then loaded into the opensearch cluster index
###### Embeddings are created both on the Title and descriptions of the clinical trials
###### Any open source models can be used and retriever scores can be compared
###### Refer to [page here](https://opensearch.org/docs/latest/ml-commons-plugin/pretrained-models/) for any additional model names

In [59]:
from sentence_transformers import SentenceTransformer

# Name of the pre-trained sentence-transformer checkpoint to load
model_name = "all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)

# Measure the embedding width by encoding one throw-away sentence
sample_embedding = model.encode(["Sample sentence"])[0]
EMBEDDING_DIM = sample_embedding.shape[0]
#print(EMBEDDING_DIM)

In [60]:
# Load all clinical trial metadata (Filename, Titles and Descriptions) into a dataframe
import pandas as pd
# Path of the CSV that holds the clinical trial titles and descriptions
file_path = '/home/datascience/conda/data/clinical_trial_titles.csv'
# Only these three columns are read from the CSV
columns = ['Filename', 'Title','Description']
df_trials = pd.read_csv(file_path, usecols=columns)
#len(df_trials)

In [61]:
# Define an OpenSearch index with k-NN vector fields for the title and description embeddings.
# Retrieval scores are later viewed separately for the title and the description vectors.
# The embeddings are created in this notebook and stored in this OpenSearch index.
index_name = f'idx-{model_name}'.lower()

# Hand encode() plain Python lists rather than pandas Series, so the call does not
# depend on the Series carrying a default 0..n-1 index.
title_embeddings = model.encode(df_trials["Title"].tolist())
description_embeddings = model.encode(df_trials["Description"].tolist())

index_body = {
    "settings": {
        "index": {
            "knn": True,
            "knn.algo_param.ef_search": 100,
        }
    },
    "mappings": {
        "properties": {
            # Both vector fields share the same HNSW / L2 / nmslib configuration,
            # so the mapping is written once and applied to each field name.
            **{
                vector_field: {
                    "type": "knn_vector",
                    "dimension": EMBEDDING_DIM,
                    "method": {
                        "name": "hnsw",
                        "space_type": "l2",
                        "engine": "nmslib",
                        "parameters": {
                            "ef_construction": 128,
                            "m": 24,
                        },
                    },
                }
                for vector_field in ("title_embeddings", "description_embeddings")
            },
            # The original text is stored next to the vectors so searches can return it
            "Title": {"type": "text"},
            "Description": {"type": "text"},
        }
    },
}

In [62]:
# Delete the index if it already exists, then create it from index_body
from opensearchpy.exceptions import NotFoundError

try:
    response = client.indices.delete(index=index_name)
    print("Index deleted successfully")
except NotFoundError:
    # Nothing to delete (for example on the first run): carry on and create the index
    pass
except Exception as e:
    # Any other failure (connectivity, authorization, ...) is reported instead of hidden
    print(f"Error deleting index {index_name}: {e}")

try:
    response = client.indices.create(index=index_name, body=index_body)
    print(f'Index {index_name} created successfully')
except Exception as e:
    print(f"Error creating os settings: {e}")

Index deleted successfully
Index idx-all-minilm-l6-v2 created successfully


In [63]:
# Bulk-load the titles, descriptions and their embeddings into the model's index
from opensearchpy import helpers

# One column per index field; the embedding arrays are converted to nested lists
trial_columns = {
    "Title": df_trials["Title"].tolist(),
    "Description": df_trials["Description"].tolist(),
    "title_embeddings": title_embeddings.tolist(),
    "description_embeddings": description_embeddings.tolist(),
}
df = pd.DataFrame(trial_columns)

# Each dataframe row becomes one document
docs = df.to_dict(orient="records")

# Last expression of the cell, so the notebook displays the value returned by bulk()
helpers.bulk(client, docs, index=index_name, raise_on_error=True, refresh=True)

(30, [])

### Run Queries and check retriever results
##### In this section, query the Opensearch index based on Title and description embeddings separately
##### The user can then choose the best top n retriever results based on the score

In [64]:
# User question and the number of nearest titles to retrieve
num_titles = 4
user_query = "Find all diabetic clinical trials"
# Embed the question with the same sentence-transformer model used for the index
query_embedding = model.encode(user_query)

In [65]:
# Retriever query: k-NN vector search against the title embeddings
query_body = {
    # "k" is passed as the integer num_titles instead of its string form
    "query": {"knn": {"title_embeddings": {"vector": query_embedding, "k": num_titles}}},
    "_source": False,
    "fields": ["Title", "Description"],
}

In [66]:
from tabulate import tabulate

# Run the query built in the previous cell against the current index
results = client.search(
    body=query_body,
    index=index_name
)

# Collect the title and the score (rounded to two decimals) of every hit
hits = results["hits"]["hits"]
title_list = [hit['fields']['Title'][0] for hit in hits]
score_list = [round(hit['_score'], 2) for hit in hits]

table = {'Title': title_list, 'Score': score_list}
print(tabulate(table, headers='keys', tablefmt='pretty', stralign='left', numalign='center'))

+-----------------------------------------------------------------------------------------------+-------+
| Title                                                                                         | Score |
+-----------------------------------------------------------------------------------------------+-------+
| Once-Daily, Dual-Action Diabetes Drug (AG-200) Trial                                          | 0.57  |
| A Randomized Controlled Trial of Diet and Drug Intervention for Type 2 Diabetes Management    | 0.52  |
| A Phase IV Cardiovascular Outcomes Trial of Diabetes Drug X": Safety and Efficacy Evaluation" | 0.48  |
| Evaluation of ND-300 for Type 3 Diabetes Management                                           | 0.47  |
+-----------------------------------------------------------------------------------------------+-------+


In [67]:
# Retriever query: k-NN vector search against the description embeddings
query_body = {
    # "k" is passed as the integer num_titles instead of its string form
    "query": {"knn": {"description_embeddings": {"vector": query_embedding, "k": num_titles}}},
    "_source": False,
    "fields": ["Title", "Description"],
}

In [68]:
# Run the description-embedding query against the current index
results = client.search(
    body=query_body,
    index=index_name
)

# Collect the title and the score (rounded to two decimals) of every hit
hits = results["hits"]["hits"]
title_list = [hit['fields']['Title'][0] for hit in hits]
score_list = [round(hit['_score'], 2) for hit in hits]

table = {'Title': title_list, 'Score': score_list}
print(tabulate(table, headers='keys', tablefmt='pretty', stralign='left', numalign='center'))

+--------------------------------------------------------------------------------------------+-------+
| Title                                                                                      | Score |
+--------------------------------------------------------------------------------------------+-------+
| A Randomized Controlled Trial of Diet and Drug Intervention for Type 2 Diabetes Management | 0.49  |
| Once-Daily, Dual-Action Diabetes Drug (AG-200) Trial                                       | 0.49  |
| Evaluation of AGI-101 in Glycemic Control of Type 2 Diabetes                               | 0.48  |
| Evaluation of MET-200 for Improving Insulin Sensitivity and Metabolic Control              | 0.48  |
+--------------------------------------------------------------------------------------------+-------+


### Vector embeddings and search with OCI Generative AI
##### In this section, we will create embeddings for Titles and Descriptions with OCI Generative AI Embedding model
##### Load the Titles, description and their embeddings into an Opensearch Index

In [69]:
# Create the title and description embeddings with an OCI Generative AI embedding model
from langchain_community.embeddings import OCIGenAIEmbeddings
from ads.llm import GenerativeAIEmbeddings

embed_model_id="cohere.embed-english-v3.0"

# NOTE(review): confirm that the "truncate" entry in model_kwargs is honoured by
# OCIGenAIEmbeddings and that a boolean is the value it expects.
oci_embeddings = OCIGenAIEmbeddings(
    service_endpoint=service_endpoint,
    compartment_id=compartment_id,
    model_id=embed_model_id,
    model_kwargs={"truncate":True}
)

# Embed every title first, then every description
trial_titles = df_trials["Title"].to_list()
title_embeddings = oci_embeddings.embed_documents(texts=trial_titles)
trial_descriptions = df_trials["Description"].to_list()
description_embeddings = oci_embeddings.embed_documents(texts=trial_descriptions)

# The vector width is taken from the first title embedding
first_title_embedding = title_embeddings[0]
EMBEDDING_DIM = len(first_title_embedding)
#print(EMBEDDING_DIM)

In [70]:
# Index for the OCI Generative AI embeddings, named after the embedding model
model_name = 'cohere.embed-english-v3.0'
index_name = f'idx-{model_name}'.lower()

index_body = {
    "settings": {
        "index": {
            "knn": True,
            "knn.algo_param.ef_search": 100,
        }
    },
    "mappings": {
        "properties": {
            # Both vector fields share the same HNSW / cosinesimil / lucene configuration,
            # so the mapping is written once and applied to each field name.
            **{
                vector_field: {
                    "type": "knn_vector",
                    "dimension": EMBEDDING_DIM,
                    "method": {
                        "name": "hnsw",
                        "space_type": "cosinesimil",
                        "engine": "lucene",
                        "parameters": {
                            "ef_construction": 128,
                            "m": 24,
                        },
                    },
                }
                for vector_field in ("title_embeddings", "description_embeddings")
            },
            # The original text is stored next to the vectors so searches can return it
            "Title": {"type": "text"},
            "Description": {"type": "text"},
        }
    },
}

In [71]:
# Delete the index if it already exists, then create it from index_body
from opensearchpy.exceptions import NotFoundError

try:
    response = client.indices.delete(index=index_name)
    print("Index deleted successfully")
except NotFoundError:
    # Nothing to delete (for example on the first run): carry on and create the index
    pass
except Exception as e:
    # Any other failure (connectivity, authorization, ...) is reported instead of hidden
    print(f"Error deleting index {index_name}: {e}")

try:
    response = client.indices.create(index=index_name, body=index_body)
    print(f'Index {index_name} created successfully')
except Exception as e:
    print(f"Error creating os settings: {e}")

Index deleted successfully
Index idx-cohere.embed-english-v3.0 created successfully


In [72]:
# Bulk-load the titles, descriptions and their OCI embeddings into the model's index
from opensearchpy import helpers

# One column per index field; the embed_documents() results are passed through
# unchanged (no .tolist() conversion is applied in this cell)
trial_columns = {
    "Title": df_trials["Title"].tolist(),
    "Description": df_trials["Description"].tolist(),
    "title_embeddings": title_embeddings,
    "description_embeddings": description_embeddings,
}
df = pd.DataFrame(trial_columns)

# Each dataframe row becomes one document
docs = df.to_dict(orient="records")

# Last expression of the cell, so the notebook displays the value returned by bulk()
helpers.bulk(client, docs, index=index_name, raise_on_error=True, refresh=True)

(30, [])

### Run Queries and check retriever results
##### In this section, query the Opensearch index based on Title and description embeddings separately
##### The user can then choose the best top n retriever results based on the score

In [73]:
# Define the user query and set the query embedding to its OCI Generative AI embedding
user_query = "Find all liver transplantation trials"
# Number of nearest neighbours ("k") requested by the k-NN queries below
num_titles = 4
query_embedding = oci_embeddings.embed_query(text=user_query)

In [74]:
# Retriever query: k-NN vector search against the title embeddings
query_body = {
    # "k" is passed as the integer num_titles instead of its string form
    "query": {"knn": {"title_embeddings": {"vector": query_embedding, "k": num_titles}}},
    "_source": False,
    "fields": ["Title", "Description"],
}

In [75]:
# Run the title-embedding query against the current index
results = client.search(
    body=query_body,
    index=index_name
)

# Collect the title and the score (rounded to two decimals) of every hit
hits = results["hits"]["hits"]
title_list = [hit['fields']['Title'][0] for hit in hits]
score_list = [round(hit['_score'], 2) for hit in hits]

table = {'Title': title_list, 'Score': score_list}
print(tabulate(table, headers='keys', tablefmt='pretty', stralign='left', numalign='center'))

+-----------------------------------------------------------------------------------------------------+-------+
| Title                                                                                               | Score |
+-----------------------------------------------------------------------------------------------------+-------+
| A Prospective, Single-Arm Trial of an Innovative Liver Transplantation Technique                    | 0.81  |
| A Randomized, Double-Blind, Placebo-Controlled Trial of 'Cirrinol' for Liver Cirrhosis Treatment    | 0.81  |
| A Randomized, Open-Label Trial of Stem Cell Therapy for Acute Liver Failure                         | 0.79  |
| A Phase 3 Randomized Clinical Trial of an Investigational Drug for Newly Diagnosed Multiple Myeloma | 0.78  |
+-----------------------------------------------------------------------------------------------------+-------+


In [76]:
# Retriever query: k-NN vector search against the description embeddings
query_body = {
    # "k" is passed as the integer num_titles instead of its string form
    "query": {"knn": {"description_embeddings": {"vector": query_embedding, "k": num_titles}}},
    "_source": False,
    "fields": ["Title", "Description"],
}

In [77]:
# Run the description-embedding query against the current index
results = client.search(
    body=query_body,
    index=index_name
)

# Collect the title and the score (rounded to two decimals) of every hit
hits = results["hits"]["hits"]
title_list = [hit['fields']['Title'][0] for hit in hits]
score_list = [round(hit['_score'], 2) for hit in hits]

table_2 = {'Title': title_list, 'Score': score_list}
print(tabulate(table_2, headers='keys', tablefmt='pretty', stralign='left', numalign='center'))

+-------------------------------------------------------------------------------------------------------------------------+-------+
| Title                                                                                                                   | Score |
+-------------------------------------------------------------------------------------------------------------------------+-------+
| A Prospective, Single-Arm Trial of an Innovative Liver Transplantation Technique                                        | 0.81  |
| A Randomized, Double-Blind, Placebo-Controlled Trial of the Effects of Drug 'M' on Non-Alcoholic Steatohepatitis (NASH) | 0.76  |
| A Randomized, Double-Blind, Placebo-Controlled Trial of 'Cirrinol' for Liver Cirrhosis Treatment                        | 0.75  |
| A Randomized, Controlled Trial of Lifestyle Intervention and Drug 'N' in Non-Alcoholic Fatty Liver Disease (NAFLD)      | 0.74  |
+---------------------------------------------------------------------------

#### TAKEAWAY NOTES: 
##### 1. You can use this notebook to compare retriever results from different models on the same Title and Description metadata 
##### 2. You can create retrieval scores by loading any other detail text columns into an index