# Introduction

This notebook uses the embeddings to create a search engine. This notebook shows how to prepare a search to understand natural language and return relevant results. In the next notebook, we will use this tto enhance the response from the large language model. 

In [11]:
# Import basic computation libraries
import pandas as pd 

## vector database search
from qdrant_client import models, QdrantClient

## vector computing framework
from sentence_transformers import SentenceTransformer

# tensor computation library
from torch import mps

## Data Processing

Load the data and remove null values.

In [17]:
## Load 'Patient0level Clinical Drug Trial Data' 
df = pd.read_csv('./data/Bells_Palsy_Clinical_Trial.csv')

In [16]:
## Check if any of the cells are empty. Missing values cause errors in LLM. We will remove them before processing further
# Count empty cells in each column
print(df.isnull().sum())

Patient ID                                               0
Sex                                                      0
Age                                                      0
Baseline Score on House–Brackmann scale                  0
Time between onset of symptoms and start of treatment    0
Treatment Group                                          0
Received Prednisolone                                    0
Received Acyclovir                                       0
3-Month Score on House–Brackmann scale                   0
Full Recovery in 3 Months                                0
9-Month Score on House–Brackmann scale                   0
Full Recovery in 9 Months                                0
dtype: int64


In [19]:
## dataset stats like total count and data field distributions (std/mean)
df.describe()

Unnamed: 0,Patient ID,Age,Baseline Score on House–Brackmann scale,3-Month Score on House–Brackmann scale,9-Month Score on House–Brackmann scale
count,494.0,494.0,494.0,494.0,494.0
mean,247.5,44.868421,3.680162,1.340081,1.143725
std,142.749781,14.550357,1.131752,0.609037,0.46105
min,1.0,16.0,2.0,1.0,1.0
25%,124.25,34.0,3.0,1.0,1.0
50%,247.5,44.0,4.0,1.0,1.0
75%,370.75,55.0,4.0,2.0,1.0
max,494.0,90.0,6.0,4.0,4.0


In [24]:
## Maps data fields to the format needed for vectorisation
data = df.to_dict('records')

## Process Embeddings 
Embeddings are representation of the text data (in our case the wine csv file) as vectors in a high-dimentional space. We use embeddings to be able to complare the simarify between sentences. Vectors allow us to represent the text in matematical terms. In this notebook, I use cosine similarify that allows to compute and measure the cosine of the angle between two vectors, effectively quantifying how similar two sentences regardless of their lenght. 

In [21]:
## encode using the 'all-MiniLM-L6-v2' model. 
encoder = SentenceTransformer('all-MiniLM-L6-v2') # model: download ML model locally

## database to store the vectors. Since the data is in a small size, we can store the data in memory. 
qdrant = QdrantClient(":memory:")

In [27]:
# create a collection that will be stored in the database. The collection stored the params 
# size: takes the size from the input data
# distance function: cosine

qdrant.recreate_collection(
    collection_name = "patient_level_ct",
    vectors_config = models.VectorParams(
        size = encoder.get_sentence_embedding_dimension(),
        distance = models.Distance.COSINE
    )
)

  qdrant.recreate_collection(


True

In [28]:
# creates an index and uploads all the data into the in-memory database
# payload holds the metadata 
qdrant.upload_points(
    collection_name = "patient_level_ct",
    points = [
        models.PointStruct(
            id = idx,
            vector = encoder.encode(doc['Treatment Group']).tolist(),
            payload = doc
        ) 
        for idx, doc in enumerate(data)
    ]
)

## Search with given input text

Let's search! 

In [34]:
user_prompt = "What is the recovery rate of patients that received Acyclovir?"
hits = qdrant.search(
    collection_name = "patient_level_ct",
    query_vector = encoder.encode(user_prompt).tolist(),
    limit = 5
)
for hit in hits:
    print(hit.payload, "score:", hit.score)

{'Patient ID': 372, 'Sex': 'Male', 'Age': 35, 'Baseline Score on House–Brackmann scale': 6, 'Time between onset of symptoms and start of treatment': '>24 to ≤48 hr', 'Treatment Group': 'Acyclovir–Placebo', 'Received Prednisolone': 'No', 'Received Acyclovir': 'Yes', '3-Month Score on House–Brackmann scale': 2, 'Full Recovery in 3 Months': 'No', '9-Month Score on House–Brackmann scale': 1, 'Full Recovery in 9 Months': 'Yes'} score: 0.6791415876624874
{'Patient ID': 309, 'Sex': 'Female', 'Age': 60, 'Baseline Score on House–Brackmann scale': 4, 'Time between onset of symptoms and start of treatment': '>24 to ≤48 hr', 'Treatment Group': 'Acyclovir–Placebo', 'Received Prednisolone': 'No', 'Received Acyclovir': 'Yes', '3-Month Score on House–Brackmann scale': 1, 'Full Recovery in 3 Months': 'Yes', '9-Month Score on House–Brackmann scale': 1, 'Full Recovery in 9 Months': 'Yes'} score: 0.6791415876624874
{'Patient ID': 310, 'Sex': 'Male', 'Age': 35, 'Baseline Score on House–Brackmann scale': 2,

In [32]:
search_result = [hit.payload for hit in hits]

In [36]:
## Connect to LLM from OpenAI 
from openai import OpenAI

client = OpenAI(
    base_url = "http://127.0.0.1:8080/v1",
    api_key = "sk_no_key_required"
)
completion = client.chat.completions.create(
    model = "LLaMA_CPP",
    messages = [
        {"role": "system", "content": "I am chatbot that can tell you about patient level clinical trial results"},
        {"role": "user", "content": "What is the recovery rate of patients that received Acyclovir?"},
        {"role": "assistant", "content": str(search_result)}
    ]
)

APIConnectionError: Connection error.