In [2]:
from git import Repo
from langchain.text_splitter import Language
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import LanguageParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationSummaryMemory
from langchain.chains import ConversationalRetrievalChain

In [3]:
# Show the kernel's working directory; the relative paths used later ("test_repo/", 'db') resolve against it.
%pwd

'c:\\NLP\\End-to-End-Source-Code-Analysis-Generative-AI-Project\\research'

In [26]:
# Create the clone target in pure Python: this behaves the same on every OS
# and is a no-op when the folder already exists, so the cell can be re-run.
os.makedirs("test_repo", exist_ok=True)

In [27]:
repo_path = "test_repo/"
repo_url = "https://github.com/mehboobme/Medical_Chatbot.git"

# Clone only when there is no checkout yet. Cloning again into a directory
# that already holds the repository makes git fail, which would break a
# fresh "Restart & Run All"; in that case just reopen the existing checkout.
if os.path.isdir(os.path.join(repo_path, ".git")):
    repo = Repo(repo_path)
else:
    repo = Repo.clone_from(repo_url, to_path=repo_path)

In [28]:
# Build a loader over the cloned repository: every path is globbed, but only
# files ending in ".py" are kept and handed to the Python language parser.
loader = GenericLoader.from_filesystem(
    repo_path,
    glob="**/*",
    suffixes=[".py"],
    parser=LanguageParser(
        language=Language.PYTHON,
        parser_threshold=500,
    ),
)

In [29]:
# Run the loader and echo the resulting Document objects as the cell output.
documents = loader.load()
documents

[Document(metadata={'source': 'test_repo\\app.py', 'language': <Language.PYTHON: 'python'>}, page_content='from flask import Flask, request, jsonify, render_template\nfrom src.helper import download_hugging_face_embeddings\nfrom langchain_pinecone import PineconeVectorStore\nfrom langchain_openai import OpenAI\nfrom langchain.chains import create_retrieval_chain\nfrom langchain.chains.combine_documents import create_stuff_documents_chain\nfrom langchain_core.prompts import ChatPromptTemplate\nfrom dotenv import load_dotenv\nfrom src.prompt import prompt\nimport os\n\napp = Flask(__name__)\n\nload_dotenv()\n\nPINECONE_API_KEY= os.environ.get("PINECONE_API_KEY")\nOPENAI_API_KEY=os.environ.get("OPENAI_API_KEY")\n\nembeddings = download_hugging_face_embeddings()\n\nindex_name = "medicalbot"\n\n#Embed each chunk and upsert the embeddings into your Pinecone index\ndocsearch = PineconeVectorStore.from_existing_index(\n    index_name=index_name,\n    embedding = embeddings\n)\n\n# define retri

In [30]:
# Number of documents the loader produced.
len(documents)

7

In [31]:
# Python-aware splitter: chunks of at most 500 characters, with 20 characters
# shared between neighbouring chunks.
documents_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    chunk_size=500,
    chunk_overlap=20,
)

In [32]:
# Break the loaded documents into chunks with documents_splitter and echo them.
texts = documents_splitter.split_documents(documents)
texts

[Document(metadata={'source': 'test_repo\\app.py', 'language': <Language.PYTHON: 'python'>}, page_content='from flask import Flask, request, jsonify, render_template\nfrom src.helper import download_hugging_face_embeddings\nfrom langchain_pinecone import PineconeVectorStore\nfrom langchain_openai import OpenAI\nfrom langchain.chains import create_retrieval_chain\nfrom langchain.chains.combine_documents import create_stuff_documents_chain\nfrom langchain_core.prompts import ChatPromptTemplate\nfrom dotenv import load_dotenv\nfrom src.prompt import prompt\nimport os\n\napp = Flask(__name__)\n\nload_dotenv()'),
 Document(metadata={'source': 'test_repo\\app.py', 'language': <Language.PYTHON: 'python'>}, page_content='load_dotenv()\n\nPINECONE_API_KEY= os.environ.get("PINECONE_API_KEY")\nOPENAI_API_KEY=os.environ.get("OPENAI_API_KEY")\n\nembeddings = download_hugging_face_embeddings()\n\nindex_name = "medicalbot"\n\n#Embed each chunk and upsert the embeddings into your Pinecone index\ndocse

In [33]:
# Number of chunks produced by the splitter.
len(texts)

13

In [34]:
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

# Fail early with an actionable message. Writing None back into os.environ
# would only raise an opaque "str expected, not NoneType" TypeError.
# The value is already in os.environ when present, so no re-assignment is needed.
if not OPENAI_API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to a .env file or export it "
        "before running the OpenAI cells."
    )

In [35]:
# OpenAI embedding client used to vectorise the code chunks.
# NOTE(review): disallowed_special=() presumably stops the tokenizer from rejecting
# special-token strings that may appear in source text — confirm against the OpenAIEmbeddings docs.
embeddings= OpenAIEmbeddings(disallowed_special=())

In [36]:
# Embed every chunk in `texts` and build a Chroma store that persists to the local 'db' directory.
# NOTE(review): re-running this cell against an existing 'db' directory may add the same
# chunks again — confirm, and consider reusing the persisted store instead.
vectorDB= Chroma.from_documents(texts, embedding=embeddings, persist_directory='db')

In [37]:
# Explicitly persist the vector store.
# NOTE(review): newer Chroma releases are reported to persist automatically — confirm whether this call is still needed.
vectorDB.persist()

In [38]:
# Chat model, created with ChatOpenAI's default settings; it is handed to both
# the conversation memory and the retrieval chain below.
llm = ChatOpenAI()

In [39]:
# Summary-style conversation memory driven by `llm`; its contents are exposed
# under the 'chat_history' key and returned as message objects.
memory = ConversationSummaryMemory(llm=llm, memory_key='chat_history', return_messages=True)

In [40]:
# Conversational retrieval chain over the code index.
# - The retriever uses search_type='mmr' with k=8.
# - `memory` is attached so the chain itself keeps track of earlier turns;
#   without it the summary memory is built but never consulted.
qa = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=vectorDB.as_retriever(search_type='mmr', search_kwargs={"k": 8}),
    memory=memory,
)

In [53]:
# Question to send to the retrieval chain.
question = "what is langchain?"

In [54]:
# No earlier turns yet, so start from an empty history.
chat_history = []

# Use invoke(): calling the chain object directly (qa({...})) is deprecated in LangChain.
result = qa.invoke({"question": question, "chat_history": chat_history})
print(result['answer'])

Langchain is a framework that combines different AI models and tools to perform complex natural language processing tasks. It includes components like retrievers for accessing relevant information, language models for generating answers, and chains that link these components together to provide accurate and efficient question-answering capabilities.
