# Generate regulatory documents for compliance submission with OCI Generative AI
### Hybrid search illustration

Hybrid search combines lexical search and neural search in OpenSearch. This requires defining a search pipeline with weights corresponding to the various techniques. The hybrid search illustration is developed with a Hugging Face sentence-transformer model on OCI OpenSearch. Note, however, that the associated indexes, pipelines, models and model groups are built externally with the OpenSearch Dev Tools interface.

In [1]:
# Uncomment to install the necessary pip libraries if required
#!pip install opensearch-py 
#!pip install sentence-transformers
#!pip install langchain
#!pip install langchain_community

In [2]:
import os
import warnings
# Suppress every Python warning raised from this point on (this hides all categories, not only noisy ones).
warnings.filterwarnings('ignore')
import logging
# Root logger: emit only ERROR and above, formatted as "LEVEL:message".
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)

In [3]:
# Connect to the OpenSearch (OS) cluster.
import getpass

from opensearchpy import OpenSearch, helpers
from opensearchpy.client.http import HttpClient

# Connection settings are read from the environment so that the password is
# never stored in the notebook. Host, port and user fall back to the demo
# cluster values; the password has no default and is prompted for when the
# OPENSEARCH_PASSWORD variable is not set.
host = [{
    'host': os.environ.get('OPENSEARCH_HOST', '10.0.3.229'),
    'port': int(os.environ.get('OPENSEARCH_PORT', '9200')),
}]
auth = (
    os.environ.get('OPENSEARCH_USER', 'osmaster'),
    os.environ.get('OPENSEARCH_PASSWORD') or getpass.getpass('OpenSearch password: '),
)
client = OpenSearch(
    hosts=host,
    http_compress=True,
    http_auth=auth,
    use_ssl=True,
    # NOTE(review): TLS is on but certificate verification is off; enable
    # verify_certs (with the cluster CA) outside of a demo environment.
    verify_certs=False
)
client

<OpenSearch([{'host': '10.0.3.229', 'port': 9200}])>

### Opensearch registered model embeddings to load Clinical data
##### To use OpenSearch neural search or hybrid search, the model must be registered in OpenSearch, or connected through a connector for remote, API-enabled models
##### The Hugging Face pre-trained sentence-transformer model msmarco-distilbert-base-tas-b (768 dimensions) is used in this case.  
##### Since model loading and deployment involve asynchronous tasks in OpenSearch, the model is registered with OpenSearch Dev Tools (for easier coding)
##### The model name and the retrieved model_id are then hardcoded for creating the OpenSearch ingest pipeline and index below

In [158]:
# Embedding model used throughout the notebook.
# model_name feeds the pipeline and index names; model_id is the identifier
# that the ingest pipeline and the neural queries pass to OpenSearch.
model_name = "msmarco-distilbert-base-tas-b"
model_id = "1wvDKZEBWEIPcnamv7eS"

In [159]:
# Ingest pipeline name, derived from the embedding model name.
ingest_pl = "pl-{}".format(model_name)

# text_embedding processor: for each document, embed the source field (key)
# with the registered model and store the vector in the target field (value).
text_embedding_processor = {
    "text_embedding": {
        "model_id": str(model_id),
        "field_map": {
            "Title": "title_embeddings",
            "Description": "description_embeddings",
        },
    }
}

ingest_pl_body = {
    "description": "Pipeline to ingest clinical trials data for model msmarco-distilbert-base-tas-b",
    "processors": [text_embedding_processor],
}

In [160]:
# Delete and create/re-create the ingest pipeline.
# The pipeline id lives in `ingest_pl`; the previous code printed an undefined
# `pipeline_name`, which raises NameError on a fresh kernel.
try:
    response = client.ingest.delete_pipeline(id=ingest_pl)
    print(f"Pipeline {ingest_pl} deleted successfully")
except Exception as e:
    # Best effort: on a first run there is nothing to delete. Report the
    # reason instead of hiding it, then carry on with the creation.
    print(f"Pipeline {ingest_pl} not deleted (it may not exist yet): {e}")
try:
    response = client.ingest.put_pipeline(id=ingest_pl, body=ingest_pl_body)
    # NOTE: the wording says "Index" although an ingest pipeline is created;
    # it is left unchanged so that it matches the recorded cell output.
    print(f'Index {ingest_pl} created successfully')
except Exception as e:
    print(f"Error creating ingest pipeline {ingest_pl}: {e}")

Pipeline pl-msmarco-distilbert-base-tas-b deleted successfully
Index pl-msmarco-distilbert-base-tas-b created successfully


In [161]:
#Create the index with msmarco-distilbert-base-tas-b
def _knn_vector_field(dimension=768):
    """Return a fresh mapping for one k-NN embedding field.

    Both embedding fields share the same settings: HNSW on the Lucene engine
    with L2 distance. `dimension` must equal the embedding size produced by
    the model (768 for the model used in this notebook).
    """
    return {
        "type": "knn_vector",
        "dimension": dimension,
        "method": {
            "name": "hnsw",
            "engine": "lucene",
            "space_type": "l2",
            "parameters": {
                "m": 512,
                "ef_construction": 245
            }
        }
    }


index_name = f'idx-{model_name}'
index_body = {
    "settings": {
        "index.knn": True,
        # Run the ingest pipeline on every indexed document. The pipeline id
        # is held in `ingest_pl` (the previous `pipeline_name` was undefined).
        "default_pipeline": ingest_pl
    },
    "mappings": {
        "properties": {
            # Vector fields filled in by the ingest pipeline
            "title_embeddings": _knn_vector_field(),
            "description_embeddings": _knn_vector_field(),
            # Raw text fields used by the lexical part of the hybrid query
            "Title": {
                "type": "text"
            },
            "Description": {
                "type": "text"
            }
        }
    }
}

In [162]:
# Delete and create/re-create the index
try:
    response = client.indices.delete(index=index_name)
    print(f"Index {index_name} deleted successfully")
except Exception as e:
    # Best effort: on a first run the index does not exist yet. Report the
    # reason instead of hiding it, then carry on with the creation.
    print(f"Index {index_name} not deleted (it may not exist yet): {e}")
try:
    response = client.indices.create(index=index_name, body=index_body)
    print(f'Index {index_name} created successfully')
except Exception as e:
    print(f"Error creating index {index_name}: {e}")

Index idx-msmarco-distilbert-base-tas-b deleted successfully
Index idx-msmarco-distilbert-base-tas-b created successfully


In [163]:
# Load clinical trial metadata (file names, titles and descriptions) into a dataframe
import pandas as pd

# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
file_path = '/home/datascience/conda/data/clinical_trial_titles.csv'
# Only these three columns are read from the CSV file.
columns = ['Filename', 'Title', 'Description']
df_trials = pd.read_csv(filepath_or_buffer=file_path, usecols=columns)
#len(df_trials)

In [164]:
# Load the title and description data into the model index
from opensearchpy import helpers

# Keep only the two text fields declared in the index mapping.
df = pd.DataFrame(
    {field: df_trials[field].tolist() for field in ("Title", "Description")}
)

# One dict per row: {"Title": ..., "Description": ...}
docs = df.to_dict(orient="records")

# refresh=True is passed so the documents are searchable right away;
# raise_on_error=True asks the helper to raise when a document fails.
helpers.bulk(client, docs, index=index_name, raise_on_error=True, refresh=True)

(30, [])

#### Develop the hybrid search pipeline and search query
##### Since hybrid search pipelines are not available in the OpenSearch client interface used above, the HTTP client interface is used instead

In [165]:
#Create OS objects with HttpClient (search pipelines for hybrid search not supported)
from opensearchpy import RequestsHttpConnection
from opensearchpy.client.http import HttpClient
import json, requests

# Custom HTTP client: an HttpClient subclass that logs each request
class CustomHttpClient(HttpClient):
    """HttpClient variant that prints every request and JSON-encodes its body."""

    def perform_request(self, method, url, params=None, body=None, headers=None):
        """Print the request line, serialize `body` to JSON and delegate to the parent.

        A `body` of None is forwarded untouched; anything else is passed
        through json.dumps first. The parent's return value is returned as is.

        NOTE(review): the parent implementation of perform_request is not
        visible in this notebook; confirm that HttpClient defines it.
        """
        print(f"Performing {method} request to {url}")
        payload = body if body is None else json.dumps(body)
        return super().perform_request(method, url, params=params, body=payload, headers=headers)

    # Function to pretty-print JSON responses
def print_response(response):
    """Pretty-print a JSON response with a two-space indent.

    Accepted inputs:
      * a (status, body) pair whose second item holds the raw JSON bytes
        (the form this helper originally supported);
      * the body itself as bytes or str containing JSON text;
      * an already-decoded JSON value such as a dict or list.

    NOTE(review): the client calls in this notebook appear to return decoded
    dicts rather than (status, bytes) pairs, which is why decoded values are
    accepted here; confirm against the client version in use.
    """
    body = response
    # Original contract: raw bytes in the second position of a pair.
    if (
        isinstance(response, (tuple, list))
        and len(response) > 1
        and isinstance(response[1], (bytes, bytearray))
    ):
        body = response[1]
    if isinstance(body, (bytes, bytearray)):
        body = body.decode('utf-8')
    if isinstance(body, str):
        body = json.loads(body)
    print(json.dumps(body, indent=2))

In [166]:
# Define the OpenSearch client using the custom HTTP client.
# The endpoint and credentials are the `host` and `auth` objects defined in
# the first connection cell, so they are declared in a single place instead
# of being repeated here in clear text.
client = OpenSearch(
    hosts=host,
    http_auth=auth,
    use_ssl = True,
    # NOTE(review): certificate verification is off (and the related warning
    # is silenced below); acceptable for a demo cluster only.
    verify_certs = False,
    ssl_show_warn = False,
    connection_class=RequestsHttpConnection,
    http_compress=True,
    # NOTE(review): confirm that OpenSearch() honours `client_class`; it is
    # passed through unchanged from the original cell.
    client_class=CustomHttpClient
)

In [167]:
# Perform a basic GET request to check the cluster health.
# This goes through the client's low-level transport with a raw method and path;
# the result is kept in `response` but not displayed.
response = client.transport.perform_request('GET', '/_cluster/health')
#response

In [168]:
# Define the hybrid search pipeline
srch_pl_name = 'hybrid-search-pipeline'

# Sub-query scores are normalized with min-max ...
score_normalization = {"technique": "min_max"}

# ... and then merged with a weighted arithmetic mean.
# NOTE(review): presumably one weight per sub-query of the hybrid query, in
# the order the sub-queries are listed (lexical match first, neural second);
# confirm against the normalization-processor documentation.
score_combination = {
    "technique": "arithmetic_mean",
    "parameters": {"weights": [0.3, 0.7]},
}

srch_pl_body = {
    "description": "Post processor for hybrid search",
    "phase_results_processors": [
        {
            "normalization-processor": {
                "normalization": score_normalization,
                "combination": score_combination,
            }
        }
    ],
}

In [169]:
#Create the hybrid search pipeline (delete any previous definition first)
try:
    response = client.transport.perform_request('DELETE', f'/_search/pipeline/{srch_pl_name}')
    print(f'Search Pipeline {srch_pl_name} deleted successfully')
except Exception as e:
    # Best effort: on a first run there is nothing to delete. Report the
    # reason instead of hiding it, then carry on with the creation.
    print(f'Search Pipeline {srch_pl_name} not deleted (it may not exist yet): {e}')
try:
    response = client.transport.perform_request('PUT', f'/_search/pipeline/{srch_pl_name}', body=srch_pl_body)
    print(f'Search Pipeline {srch_pl_name} created successfully')
except Exception as e:
    print(f"Error creating pipeline: {e}")

Search Pipeline hybrid-search-pipeline deleted successfully
Search Pipeline hybrid-search-pipeline created successfully


In [170]:
# User query for the hybrid indexed search using the hybrid search pipeline
user_query: str = "Find all liver transplantation trials"
# Used as `k` in the neural sub-query and as the number of hits printed.
num_titles: int = 4
#model_id='1wvDKZEBWEIPcnamv7eS'

In [171]:
#Query body using the above hybrid search pipeline based on Description column
query_body = {
    # Keep the large embedding vectors out of the returned documents.
    "_source": {
        "excludes": [
            "title_embeddings", "description_embeddings"
        ]
    },
    "query": {
        "hybrid": {
            "queries": [
                # Lexical sub-query on the raw description text.
                {
                    "match": {
                        "Description": {
                            "query": user_query
                        }
                    }
                },
                # Neural sub-query: the query text is embedded with the same
                # model that produced description_embeddings at ingest time.
                {
                    "neural": {
                        "description_embeddings": {
                            "query_text": user_query,
                            "model_id": model_id,
                            # Sent as an integer (it was previously
                            # formatted into a string).
                            "k": num_titles
                        }
                    }
                }
            ]
        }
    }
}

In [172]:
# Run the hybrid query through the search pipeline whose name is held in
# `srch_pl_name` (the previous `pl_name` was never defined in this notebook).
results = client.search(
    body=query_body,
    index=index_name,
    search_pipeline=srch_pl_name
)
#results['hits']['hits'][0]['_source']['Title']

In [173]:
# Print at most `num_titles` hits. Slicing the hit list (instead of indexing
# a fixed range) avoids an IndexError when the search returns fewer hits.
for i, hit in enumerate(results['hits']['hits'][:num_titles]):
    Title = hit['_source']['Title']
    score = hit['_score']
    print(f"{i+1}. Title: {Title}, Score: {score}")

1. Title: A Prospective, Single-Arm Trial of an Innovative Liver Transplantation Technique, Score: 1.0
2. Title: A Randomized, Double-Blind, Placebo-Controlled Trial of 'Cirrinol' for Liver Cirrhosis Treatment, Score: 0.1641508
3. Title: Phase 2 Clinical Trial of an Investigational CAR T-Cell Therapy for Relapsed or Refractory B-Cell Acute Lymphoblastic Leukemia (B-ALL), Score: 0.10026089
4. Title: A Phase 3 Randomized Clinical Trial of an Investigational Drug for Newly Diagnosed Multiple Myeloma, Score: 0.097346365


In [174]:
#Query body using the above hybrid search pipeline based on Title column
query_body = {
    # Keep the large embedding vectors out of the returned documents.
    "_source": {
        "excludes": [
            "title_embeddings", "description_embeddings"
        ]
    },
    "query": {
        "hybrid": {
            "queries": [
                # Lexical sub-query on the raw title text.
                {
                    "match": {
                        "Title": {
                            "query": user_query
                        }
                    }
                },
                # Neural sub-query: the query text is embedded with the same
                # model that produced title_embeddings at ingest time.
                {
                    "neural": {
                        "title_embeddings": {
                            "query_text": user_query,
                            "model_id": model_id,
                            # Sent as an integer (it was previously
                            # formatted into a string).
                            "k": num_titles
                        }
                    }
                }
            ]
        }
    }
}

In [175]:
# Run the title-based hybrid query through the search pipeline whose name is
# held in `srch_pl_name` (the previous `pl_name` was never defined).
results = client.search(
    body=query_body,
    index=index_name,
    search_pipeline=srch_pl_name
)

In [176]:
# Print at most `num_titles` hits. Slicing the hit list (instead of indexing
# a fixed range) avoids an IndexError when the search returns fewer hits.
for i, hit in enumerate(results['hits']['hits'][:num_titles]):
    Title = hit['_source']['Title']
    score = hit['_score']
    print(f"{i+1}. Title: {Title}, Score: {score}")

1. Title: A Prospective, Single-Arm Trial of an Innovative Liver Transplantation Technique, Score: 1.0
2. Title: A Randomized, Open-Label Trial of Stem Cell Therapy for Acute Liver Failure, Score: 0.35522744
3. Title: A Randomized, Double-Blind, Placebo-Controlled Trial of 'Cirrinol' for Liver Cirrhosis Treatment, Score: 0.12978652
4. Title: Phase 2 Clinical Trial of an Investigational CAR T-Cell Therapy for Relapsed or Refractory B-Cell Acute Lymphoblastic Leukemia (B-ALL), Score: 0.05730572
