<h1>IMPORT LIBRARY</h1>

In [1]:
#Import Library
import pandas as pd
import numpy as np
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams, PointStruct
import tensorflow as tf
from transformers import TFAutoModel
from transformers import AutoTokenizer
import time

2024-11-26 08:01:55.184548: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-26 08:01:55.425904: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-26 08:01:55.479591: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-26 08:01:56.263656: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; 

<h1>DATA CLEANING</h1>

In [2]:
dataset = pd.read_csv('medquad.csv')
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16412 entries, 0 to 16411
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   question    16412 non-null  object
 1   answer      16407 non-null  object
 2   source      16412 non-null  object
 3   focus_area  16398 non-null  object
dtypes: object(4)
memory usage: 513.0+ KB


In [3]:
dataset.head()

Unnamed: 0,question,answer,source,focus_area
0,What is (are) Glaucoma ?,Glaucoma is a group of diseases that can damag...,NIHSeniorHealth,Glaucoma
1,What causes Glaucoma ?,"Nearly 2.7 million people have glaucoma, a lea...",NIHSeniorHealth,Glaucoma
2,What are the symptoms of Glaucoma ?,Symptoms of Glaucoma Glaucoma can develop in ...,NIHSeniorHealth,Glaucoma
3,What are the treatments for Glaucoma ?,"Although open-angle glaucoma cannot be cured, ...",NIHSeniorHealth,Glaucoma
4,What is (are) Glaucoma ?,Glaucoma is a group of diseases that can damag...,NIHSeniorHealth,Glaucoma


In [4]:
print("number of duplications : ", dataset.duplicated().sum())

number of duplications :  48


In [5]:
dataset.drop_duplicates(inplace=True)
print("number of duplications after cleaning : ", dataset.duplicated().sum())

number of duplications after cleaning :  0


In [6]:
dataset.isna().sum()

question       0
answer         5
source         0
focus_area    14
dtype: int64

In [7]:
print('number of NaN : '), dataset.dropna(inplace=True)

number of NaN : 


(None, None)

In [8]:
#Save Dataset
dataset.to_csv('cleaned_medquad.csv', index=False)

<h1>TOKENIZATION & EMBEDDING</h1>

In [9]:
class TFSentenceTransformer(tf.keras.layers.Layer):
    def __init__(self, model_name_or_path, **kwargs):
        super(TFSentenceTransformer, self).__init__()
        #Load transformers model
        self.model = TFAutoModel.from_pretrained(model_name_or_path, **kwargs)

    def call(self, inputs, normalize=True):
        #Run model on inputs
        model_output = self.model(inputs)
        #Perform pooling.
        embeddings = self.mean_pooling(model_output, inputs['attention_mask'])
        #Normalize the embeddings
        if normalize:
            embeddings = self.normalize(embeddings)
        return embeddings

    def mean_pooling(self, model_output, attention_mask):
        token_embeddings = model_output[0]
        input_mask_expanded = tf.cast(
            tf.broadcast_to(tf.expand_dims(attention_mask, -1), tf.shape(token_embeddings)),
            tf.float32
        )
        return tf.math.reduce_sum(token_embeddings * input_mask_expanded, axis=1) / tf.clip_by_value(tf.math.reduce_sum(input_mask_expanded, axis=1), 1e-9, tf.float32.max)

    def normalize(self, embeddings):
        embeddings, _ = tf.linalg.normalize(embeddings, 2, axis=1)
        return embeddings

In [10]:
#Model ID
model_id = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
#Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = TFSentenceTransformer(model_id)

2024-11-26 08:02:03.268963: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib/cuda/include:
2024-11-26 08:02:03.269019: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1934] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2024-11-26 08:02:03.269854: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropria

In [11]:
#Combine question and answer columns
dataset['question_answer'] = dataset['question'].fillna('') + ' ' + dataset['answer'].fillna('')

batch_size = 32

#Function to process the data in batches
def process_in_batches(data, batch_size):
    for i in range(0, len(data), batch_size):
        yield data[i:i + batch_size]

#Create a list from the question_answer column
qa = dataset['question_answer'].tolist()

#Tokenization
tokenized_qa = tokenizer(qa, padding=True, truncation=True, return_tensors='tf')

qa_dataset = tf.data.Dataset.from_tensor_slices(tokenized_qa)
qa_dataset = qa_dataset.batch(batch_size)
qa_dataset = qa_dataset.prefetch(tf.data.AUTOTUNE)

#Get embeddings from the model
all_embeddings = []
batch_num = 1

#Start measuring processing time
start_time = time.time()

for batch in qa_dataset:
    batch_embeddings = model(batch)
    embeddings_list = [embedding.numpy().tolist() for embedding in batch_embeddings]
    all_embeddings.extend(embeddings_list)
    #Displays the results of the batch being processed
    print(f"Uploaded Batch {batch_num}")
    batch_num += 1

#Calculates total time
total_time = time.time() - start_time
print(f"Total Processing Time: {total_time:.2f} seconds")

Uploaded Batch 1
Uploaded Batch 2
Uploaded Batch 3
Uploaded Batch 4
Uploaded Batch 5
Uploaded Batch 6
Uploaded Batch 7
Uploaded Batch 8
Uploaded Batch 9
Uploaded Batch 10
Uploaded Batch 11
Uploaded Batch 12
Uploaded Batch 13
Uploaded Batch 14
Uploaded Batch 15
Uploaded Batch 16
Uploaded Batch 17
Uploaded Batch 18
Uploaded Batch 19
Uploaded Batch 20
Uploaded Batch 21
Uploaded Batch 22
Uploaded Batch 23
Uploaded Batch 24
Uploaded Batch 25
Uploaded Batch 26
Uploaded Batch 27
Uploaded Batch 28
Uploaded Batch 29
Uploaded Batch 30
Uploaded Batch 31
Uploaded Batch 32
Uploaded Batch 33
Uploaded Batch 34
Uploaded Batch 35
Uploaded Batch 36
Uploaded Batch 37
Uploaded Batch 38
Uploaded Batch 39
Uploaded Batch 40
Uploaded Batch 41
Uploaded Batch 42
Uploaded Batch 43
Uploaded Batch 44
Uploaded Batch 45
Uploaded Batch 46
Uploaded Batch 47
Uploaded Batch 48
Uploaded Batch 49
Uploaded Batch 50
Uploaded Batch 51
Uploaded Batch 52
Uploaded Batch 53
Uploaded Batch 54
Uploaded Batch 55
Uploaded Batch 56
U

<h1>QDRANT</h1>

In [12]:
#Database Initialization
client = QdrantClient("http://34.101.137.149:6333")

In [13]:
#Input Data to Qdrant
client.recreate_collection(
    collection_name='Healthcare',
    vectors_config=VectorParams(
        size=(len(all_embeddings[0])),
        distance=Distance.COSINE
    )
)

points = [
    PointStruct(
        id=i,
        vector=all_embeddings[i],
        payload={"question" : dataset['question'].iloc[i], 'answer' : dataset['answer'].iloc[i]}
    )
    for i in range(len(all_embeddings))
]

batch_size = 500

#Split data to smaller batches
for i in range(0, len(points), batch_size):
    batch_points = points[i:i+batch_size]
    
    client.upsert(
        collection_name='Healthcare',
        wait=True,
        points=batch_points
    )
    print(f'Uploaded batch {i // batch_size + 1}')

  client.recreate_collection(


Uploaded batch 1
Uploaded batch 2
Uploaded batch 3
Uploaded batch 4
Uploaded batch 5
Uploaded batch 6
Uploaded batch 7
Uploaded batch 8
Uploaded batch 9
Uploaded batch 10
Uploaded batch 11
Uploaded batch 12
Uploaded batch 13
Uploaded batch 14
Uploaded batch 15
Uploaded batch 16
Uploaded batch 17
Uploaded batch 18
Uploaded batch 19
Uploaded batch 20
Uploaded batch 21
Uploaded batch 22
Uploaded batch 23
Uploaded batch 24
Uploaded batch 25
Uploaded batch 26
Uploaded batch 27
Uploaded batch 28
Uploaded batch 29
Uploaded batch 30
Uploaded batch 31
Uploaded batch 32
Uploaded batch 33


In [14]:
def search(query):
    # Tokenize query
    query_vector = tokenizer(query, padding=True, truncation=True, return_tensors="tf")

    # Generate embeddings using the model
    query_vector = model(query_vector).numpy().tolist()

    # Perform search in Qdrant
    results = client.search(
        collection_name='Healthcare',
        query_vector=query_vector[0],  # Use the first embedding in the batch
        limit=3
    )

    # Sort results by score
    sorted_result = sorted(results, key=lambda x: x.score, reverse=True)

    # Return formatted results
    return [res.payload['question'] + ' ' + res.payload['answer'] for res in sorted_result]

query = 'I have blurred vision, eye pain and redness, seeing flashes of light. What disease do I suffer from?'
results = search(query)
for result in results:
    print(result)

What is (are) Coats disease ? Coats disease is an eye disorder characterized by abnormal development of the blood vessels in the retina (retinal telangiectasia). Most affected people begin showing symptoms of the condition in childhood. Early signs and symptoms vary but may include vision loss, crossed eyes (strabismus), and a white mass in the pupil behind the lens of the eye (leukocoria). Overtime, coats disease may also lead to retinal detachment, glaucoma, and clouding of the lens of the eye (cataracts) as the disease progresses. In most cases, only one eye is affected (unilateral). The exact underlying cause is not known but some cases may be due to somatic mutations in the NDP gene. Treatment depends on the symptoms present and may include cryotherapy, laser therapy, and/or surgery.
What is (are) Eales disease ? Eales disease is a rare vision disorder that appears as an inflammation and white haze around the outercoat of the veins in the retina. This condition is most common amon