
-----

# **Source Code Analysis Experimentation**

-----

### **Import Libraries**

In [1]:
import os
from git import Repo
from dotenv import load_dotenv
from langchain.vectorstores import Chroma  # Updated import
from langchain.text_splitter import Language
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import LanguageParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.memory import ConversationSummaryMemory
from langchain.chains import ConversationalRetrievalChain
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

### **Set up Environment Variables**

In [2]:
# Load variables from a local .env file (if one exists) into the process environment.
load_dotenv()

# Read the key once and fail fast with an actionable message. Without this check a
# missing key would only surface further down, in the cells that talk to OpenAI.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
if not OPENAI_API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to a .env file or export it "
        "before running this notebook."
    )

### **OpenAI Model**

In [3]:
# Chat model shared by the conversation memory and the retrieval chain further down.
# No arguments are passed, so the model name and sampling settings are the library defaults.
llm = ChatOpenAI()

### **Clone Repository**

In [4]:
# Local checkout location and remote URL of the repository under analysis.
repo_path = "test_repo/"
repo_url = "https://github.com/entbappy/End-to-end-Medical-Chatbot-Generative-AI"

# Cloning into an existing, non-empty directory fails, which would break
# "Restart Kernel -> Run All" on every run after the first. Reuse the checkout
# when one is already present; clone only when it is missing.
if os.path.isdir(os.path.join(repo_path, ".git")):
    repo = Repo(repo_path)
else:
    repo = Repo.clone_from(repo_url, to_path=repo_path)

### **Load and Parse Python Code**

In [5]:
# Build a loader that walks the cloned repository and picks up Python sources only.
#   - glob "**/*" matches every file recursively; `suffixes` narrows that to *.py
#   - LanguageParser is configured for Python; per the LangChain docs, `parser_threshold`
#     is the minimum number of lines a file needs before the parser splits it into
#     separate code segments (shorter files are kept as a single document)
loader = GenericLoader.from_filesystem(
    repo_path,
    parser=LanguageParser(parser_threshold=500, language=Language.PYTHON),
    suffixes=[".py"],
    glob="**/*",
)

In [6]:
# Run the loader configured above and keep its result for the following cells.
document = loader.load()  # despite the singular name this is a collection: later cells call len() on it and index it

In [7]:
# Number of documents the loader produced.
len(document)

7

In [8]:
# Inspect the full text content of the first loaded document (not just its first line).
document[0].page_content

'from flask import Flask, render_template, jsonify, request\nfrom src.helper import download_hugging_face_embeddings\nfrom langchain_pinecone import PineconeVectorStore\nfrom langchain_openai import OpenAI\nfrom langchain.chains import create_retrieval_chain\nfrom langchain.chains.combine_documents import create_stuff_documents_chain\nfrom langchain_core.prompts import ChatPromptTemplate\nfrom dotenv import load_dotenv\nfrom src.prompt import *\nimport os\n\napp = Flask(__name__)\n\nload_dotenv()\n\nPINECONE_API_KEY=os.environ.get(\'PINECONE_API_KEY\')\nOPENAI_API_KEY=os.environ.get(\'OPENAI_API_KEY\')\n\nos.environ["PINECONE_API_KEY"] = PINECONE_API_KEY\nos.environ["OPENAI_API_KEY"] = OPENAI_API_KEY\n\nembeddings = download_hugging_face_embeddings()\n\n\nindex_name = "medicalbot"\n\n# Embed each chunk and upsert the embeddings into your Pinecone index.\ndocsearch = PineconeVectorStore.from_existing_index(\n    index_name=index_name,\n    embedding=embeddings\n)\n\nretriever = docsearc

### **Split Code Into Chunks**

In [9]:
# Splitter configured for Python source: chunks are capped at 500 characters and
# consecutive chunks share 20 characters of overlap.
documents_splitter = RecursiveCharacterTextSplitter.from_language(
    chunk_overlap=20,
    chunk_size=500,
    language=Language.PYTHON,
)

In [10]:
# Apply the splitter to everything the loader returned; `texts` holds the resulting chunks.
texts = documents_splitter.split_documents(document)  # consumed by Chroma.from_documents further down

### **Initialize Embedding Model**

In [11]:
# Embedding model handed to the vector store below.
# disallowed_special=() passes an empty tuple, i.e. no special tokens are disallowed —
# presumably so that source text containing tokenizer special-token strings does not
# raise during tokenization; TODO confirm against the OpenAIEmbeddings docs.
embeddings = OpenAIEmbeddings(disallowed_special=())

### **Store Data in ChromaDB**

In [12]:
# Embed every chunk with `embeddings` and index it in a Chroma store kept on disk under ./db.
# NOTE(review): re-running this cell against an already populated ./db may add the same
# chunks a second time — confirm, and clear the directory between runs if so.
vectordb = Chroma.from_documents(
    documents=texts,
    persist_directory='./db',
    embedding=embeddings,
)

In [13]:
# Ask the vector store to write itself to its persist_directory.
# NOTE(review): the recorded output of this cell echoes this line the way Python warnings do;
# recent Chroma releases reportedly persist automatically and deprecate this call —
# confirm against the installed version before removing it.
vectordb.persist() 

  vectordb.persist()


In [14]:
# Conversation memory for the QA chain. It is given the same `llm` as the chain and
# exposes the history under the "chat_history" key, returned as message objects
# (return_messages=True).
memory = ConversationSummaryMemory(
    return_messages=True,
    memory_key="chat_history",
    llm=llm,
)

  memory = ConversationSummaryMemory(


In [15]:
# Wire the pieces together into a conversational question-answering chain:
#   - retriever: the Chroma store queried with Maximum Marginal Relevance ("mmr"),
#     asking for k=8 results per question
#   - memory: carries the conversation context from one question to the next
#   - llm: generates the final answer
qa = ConversationalRetrievalChain.from_llm(
    memory=memory,
    retriever=vectordb.as_retriever(
        search_kwargs={"k": 8},
        search_type="mmr",
    ),
    llm=llm,
)

In [16]:
# Question put to the retrieval chain in the next cell.
# NOTE(review): "funtion" is misspelled; the string is left untouched here because it is
# the exact text handed to the chain at runtime.
question = "what is download_hugging_face_embeddings funtion?"

In [17]:
# Use `invoke`, the supported entry point for chains; calling the chain object directly
# (`qa(question)`) is deprecated and emits a warning on every run.
# Only the question is passed: the chain's memory supplies `chat_history`.
result = qa.invoke({"question": question})
print(result['answer'])

  result = qa(question)
Number of requested results 20 is greater than number of elements in index 13, updating n_results = 13


The `download_hugging_face_embeddings` function is a function that downloads embeddings from the Hugging Face model "sentence-transformers/all-MiniLM-L6-v2," which returns embeddings with 384 dimensions. These embeddings are used for natural language processing tasks like similarity calculations, clustering, or retrieval in the context of the application.


------

------