----

# **Medical Chatbot Assistant**

----

### **Import Libraries**

In [1]:
import os
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore

  from tqdm.autonotebook import tqdm


### **Set up Environment Variables**

In [2]:
# Pull variables from a local .env file (if present) into the process environment.
load_dotenv()

# Fail fast with a readable message if any credential is absent. Without this
# check a missing key surfaces as an opaque TypeError from assigning None into
# os.environ, or much later as a None `api_key` handed to the Pinecone client.
required_env_vars = ("OPENAI_API_KEY", "HF_TOKEN", "PINECONE_API_KEY")
missing_env_vars = [name for name in required_env_vars if os.getenv(name) is None]
if missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(missing_env_vars)
        + ". Define them in your .env file or shell before running this notebook."
    )

# OPENAI_API_KEY and HF_TOKEN are already in os.environ at this point (os.getenv
# reads from os.environ), so re-assigning them to themselves is unnecessary.
# Only the Pinecone key is needed as a Python variable by later cells.
api_key = os.getenv("PINECONE_API_KEY")

### **Load Data**

In [3]:
def load_pdf_file(directory_path):
    """Load the PDF files found in ``directory_path``.

    Args:
        directory_path: Folder to scan for files matching the ``*.pdf`` glob.

    Returns:
        The documents returned by ``DirectoryLoader.load()``, with each
        matched file parsed by ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(directory_path, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

- This book consists of more than 700 pages, so loading it is going to take some time.

In [4]:
# Folder holding the source PDF(s). Set the MEDICAL_CHATBOT_DATA_DIR environment
# variable (e.g. in .env) to run this notebook on another machine; when it is
# unset, the original local path is used so existing setups keep working.
directory_path = os.getenv("MEDICAL_CHATBOT_DATA_DIR", r"E:\Medical Chatbot\data")

# Parse every PDF in the folder; this is the slow step for a large book.
extracted_data = load_pdf_file(directory_path)

### **Split the Data Into Chunks**

In [5]:
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split, as returned by ``load_pdf_file``.
        chunk_size: Maximum size of each chunk, passed straight to
            ``RecursiveCharacterTextSplitter``. Defaults to 500, the value
            this notebook originally hard-coded.
        chunk_overlap: Overlap between consecutive chunks, passed straight to
            ``RecursiveCharacterTextSplitter``. Defaults to 20, the value
            this notebook originally hard-coded.

    Returns:
        The list of chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [6]:
# Chunk every loaded document, then report how many pieces were produced.
text_chunks = text_split(extracted_data)
chunk_count = len(text_chunks)
print(f"Length of the Chunks: {chunk_count}")

Length of the Chunks: 7020


### **Load Embedding Model**

In [7]:
def download_huggingface_embedding(model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Build the HuggingFace embedding model used to vectorise text.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            Defaults to the model this notebook originally hard-coded.
            NOTE: the Pinecone index in this notebook is created with
            ``dimension=384``; a different model must produce vectors of
            that length or the index dimension must be changed to match.

    Returns:
        A ``HuggingFaceEmbeddings`` instance for ``model_name``.
    """
    return HuggingFaceEmbeddings(model_name=model_name)

In [8]:
# Instantiate the embedding model once; later cells reuse this `embedding` object.
embedding = download_huggingface_embedding()

In [9]:
# Bare expression: displays the model's repr as the cell output for inspection.
embedding

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

#### **Let's Try to Convert a Sentence into an Embedding**

In [10]:
# Embed a sample sentence and report the vector length; this length is the
# dimension the Pinecone index must be created with.
querry_result = embedding.embed_query("Hello World")
embedding_length = len(querry_result)
print(f"The Length of Embeddings is {embedding_length}.")

The Length of Embeddings is 384.


### **Setup Pinecone Vector DB**

In [11]:
# Name of the Pinecone index that stores this notebook's vectors.
index_name = "medical-chatbot-implementation"

# Pinecone client authenticated with the key read from the environment.
pc = Pinecone(api_key=api_key)

# Create the index only if it is not already listed, so re-running this cell
# does not attempt to create it twice.
existing_index_names = pc.list_indexes().names()
if index_name not in existing_index_names:
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=384,        # must equal the embedding vector length
        metric="cosine",      # similarity metric used by the index
        spec=serverless_spec,
    )

### **Upsert Embeddings in Pinecone**

- Embed each chunk and upsert the embeddings into your Pinecone index.

In [12]:
# Embed and upload the chunks only when the index is still empty. Calling
# from_documents unconditionally re-embeds and re-uploads every chunk on each
# "Restart & Run All", which is slow and adds the same content to the index
# again. To force a rebuild (e.g. after changing the source PDFs), delete the
# index first.
# NOTE(review): index stats can lag briefly after writes — confirm this guard
# is acceptable for your workflow.
index_stats = pc.Index(index_name).describe_index_stats()
if index_stats.total_vector_count == 0:
    docsearch = PineconeVectorStore.from_documents(
        documents=text_chunks,  # The list of text chunks to be stored in the vector store.
        index_name=index_name,  # The name of the index where the vectors will be stored.
        embedding=embedding,    # The embedding model used to convert documents into vectors.
    )
else:
    # Index already populated: just connect to it.
    docsearch = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embedding,
    )

In [13]:
# NOTE(review): this whole cell is disabled (commented-out) code — a manual,
# batched alternative to PineconeVectorStore.from_documents. It cannot simply be
# uncommented: `embeddings_list` is not defined in any visible cell, and `tqdm`
# is not imported in the notebook's import cell. Either finish it (compute the
# embeddings list, import tqdm) or delete the cell.
# def batch_upsert(index, text_chunks, embedding, batch_size=100):
#     # Iterate over the embeddings in batches of specified size.
#     for i in tqdm(range(0, len(embedding), batch_size)):
#         # Determine the end index for the current batch.
#         i_end = min(i + batch_size, len(embedding))
        
#         # Create a batch of tuples containing the ID, embedding vector, and associated text.
#         batch = list(zip(
#             [str(j) for j in range(i, i_end)],  # Generate a list of string IDs for each embedding
#             embedding[i:i_end],                 # Get the current batch of embedding vectors
#             [{"text": chunk.page_content} for chunk in text_chunks[i:i_end]]  # Extract the text content for each chunk
#         ))
        
#         # Upsert (update or insert) the batch of vectors into the specified index.
#         index.upsert(vectors=batch)

# # Initialize the index using the specified index name.
# index = pc.Index(index_name)

# # Call the batch_upsert function to insert the text chunks and their embeddings into the index.
# batch_upsert(index, text_chunks, embeddings_list)

### **Create a PineconeVectorStore instance using an existing index**

In [14]:
# Connect to the index by name; no documents are passed here, only the
# embedding model that the store will use.
docsearch = PineconeVectorStore.from_existing_index(index_name=index_name, embedding=embedding)