## LLM RAG (Retrieval Augmented Generation) application that can load and query a GitHub repository

In [1]:
# [REQUEST]
#   Create LLM Rag APP asking GitHub repo url?
# [RESPONSE]
#   I'll help you create a LLM RAG (Retrieval Augmented Generation) application
#   that can load and query a GitHub repository. Based on the LangChain documentation,
#   here's a comprehensive example:

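# [Install required packages]
# GitLoader needs GitPython; the .env loading cell below needs python-dotenv.
# pip install langchain langchain-community langchain-openai chromadb gitpython python-dotenv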

In [2]:
import os
from langchain_community.document_loaders import GitLoader
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

### Set up OpenAI API Key

In [3]:
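# Set up OpenAI API Key (interactive alternative to loading it from a .env file)
# NOTE: add `import getpass` before uncommenting the lines below.
# if "OPENAI_API_KEY" not in os.environ:
#     os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter OpenAI API Key: ")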

In [4]:
from dotenv import load_dotenv, find_dotenv

# Load variables from the nearest .env file (if one is found) into os.environ.
# The result is deliberately ignored: the key may also come from the real
# environment, so the variable itself is checked below.
# Use load_dotenv(find_dotenv(), override=True) if values from .env should
# replace variables that are already set.
_ = load_dotenv(find_dotenv())

# Fail fast with a clear message here instead of a less obvious error later,
# when the OpenAI embedding and chat clients are created.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to a .env file or export it "
        "in the environment before running this notebook."
    )

# Name of the chat model passed to ChatOpenAI further down.
MODEL_GPT = "gpt-4o-mini"

### Setup steps

In [5]:
# Repository to index. `repo_path` is also used by is_valid_file below to
# build the path prefix it matches against, so keep the two in sync.
repo_url = "https://github.com/pavelklos/ai-llm"
repo_path = "ai-llm-repo"  # Local path to clone repo

# Load only .py files from '/LangChain/Chatbot/' folder
# ai-llm-repo/LangChain/Chatbot/chatbot-database.py
# ai-llm-repo/LangChain/Chatbot/chatbot-github.py
# ai-llm-repo/LangChain/Chatbot/chatbot-rag-pdf-faiss.py
def is_valid_file(file_path, repo_root=None, subdir="LangChain/Chatbot", suffix=".py"):
    """Return True when ``file_path`` is a wanted file inside the repository.

    Intended as the ``file_filter`` callable for ``GitLoader``: it is called
    with a single path argument that starts with the local repository path
    (see the printed paths in the cell output).

    Args:
        file_path: Path of the candidate file; ``\\`` and ``/`` separators
            are both accepted.
        repo_root: Local repository directory. Defaults to the notebook-level
            ``repo_path``. Backslashes and trailing slashes are tolerated.
        subdir: Folder inside the repository whose files are accepted.
        suffix: Required file-name ending (anything ``str.endswith`` accepts).

    Returns:
        bool: True if the file lies under ``<repo_root>/<subdir>/`` and ends
        with ``suffix``; False otherwise.

    Side effects:
        Prints the normalized path of every accepted file.
    """
    root = repo_path if repo_root is None else repo_root

    # Normalize both sides to forward slashes; the original only normalized
    # the file path, so a Windows-style or slash-terminated root never matched.
    normalized_path = file_path.replace('\\', '/')
    normalized_root = str(root).replace('\\', '/').rstrip('/')
    normalized_subdir = subdir.replace('\\', '/').strip('/')

    prefix_parts = [normalized_root]
    if normalized_subdir:
        prefix_parts.append(normalized_subdir)
    # Trailing slash so "Chatbot" does not also match a sibling like "Chatbot2".
    required_prefix = "/".join(prefix_parts) + "/"

    if normalized_path.startswith(required_prefix) and normalized_path.endswith(suffix):
        print(normalized_path)
        return True
    return False

# Loader for the "main" branch of `repo_url`, using `repo_path` as the local
# directory and is_valid_file to decide which files become documents.
loader = GitLoader(
    repo_path=repo_path,
    clone_url=repo_url,
    file_filter=is_valid_file,  # Callable[[str], bool]
    branch="main",
)

In [6]:
# Run the loader; is_valid_file prints every path it accepts (output below).
documents = loader.load()

ai-llm-repo/LangChain/Chatbot/chatbot-database.py
ai-llm-repo/LangChain/Chatbot/chatbot-github.py
ai-llm-repo/LangChain/Chatbot/chatbot-rag-pdf-faiss.py


In [7]:
# Sanity check: number of documents returned by the loader.
print(len(documents))

3


In [8]:
# Show where each loaded document came from, followed by a short content preview.
for loaded_doc in documents:
    origin = loaded_doc.metadata['source']
    print(f"Loaded: {origin}")
    preview = loaded_doc.page_content[:200]  # first 200 characters only
    print(f"{preview}\n")

Loaded: LangChain\Chatbot\chatbot-database.py
# [REQUEST]
#   Create sample code to query SQLite database by LLM, LangChain and SQLDatabaseChain?
# [RESPONSE]
#   I'll provide a comprehensive example of querying a SQLite database using
#   La

Loaded: LangChain\Chatbot\chatbot-github.py
# [REQUEST]
#   Create LLM Rag APP asking GitHub repo url?
# [RESPONSE]
#   I'll help you create a LLM RAG (Retrieval Augmented Generation) application
#   that can load and query a GitHub reposit

Loaded: LangChain\Chatbot\chatbot-rag-pdf-faiss.py
# [REQUEST]
#   Create basic RAG App with PDF locally and FAISS vector store, save and load vector store?
# [RESPONSE]
#   I'll provide a comprehensive example of creating a RAG application with a 



In [9]:
# Embedding model handed to the vector store below. No key is passed here, so
# it presumably picks up OPENAI_API_KEY from the environment — confirm.
embeddings = OpenAIEmbeddings()

In [10]:
# Use each file's repository path as a stable Chroma id. Without explicit ids,
# re-running this cell in the same kernel can add the same files again to the
# default in-memory collection, and the retriever then returns duplicates.
document_ids = [doc.metadata["source"] for doc in documents]

vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=embeddings,
    ids=document_ids,
)

In [11]:
# retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
# Ask for as many results as there are loaded documents, so a question about
# "each document" can see every file instead of a smaller top-k subset.
retriever = vectorstore.as_retriever(search_kwargs={"k": len(documents)})

In [12]:
# Chat model used as the generation step of the RAG chain; the model name
# comes from the MODEL_GPT constant defined earlier.
llm = ChatOpenAI(model=MODEL_GPT)

In [13]:
# Prompt wording kept in its own constant; {context} and {question} are the
# two placeholders filled in by the RAG chain.
RAG_PROMPT_TEMPLATE = """
You are an expert code and documentation assistant. 
Answer the question based ONLY on the following context:

{context}

Question: {question}

If the information is not in the context, say "I cannot find the answer in the repository."
"""

prompt = ChatPromptTemplate.from_template(RAG_PROMPT_TEMPLATE)

In [14]:
def format_docs(docs):
    """Join retrieved documents into one plain-text context string.

    Each document is rendered as a ``Source: <path>`` header followed by its
    raw content, and documents are separated by a ``---`` line.

    Args:
        docs: Iterable of retrieved documents exposing ``metadata`` (a dict)
            and ``page_content`` (a string).

    Returns:
        str: The concatenated context; an empty string when ``docs`` is empty.
    """
    return "\n\n---\n\n".join(
        f"Source: {doc.metadata.get('source', 'unknown')}\n{doc.page_content}"
        for doc in docs
    )


# Pipe the retriever through format_docs before it reaches the prompt.
# Otherwise the list of Document objects is interpolated as its Python repr,
# so the model sees escaped newlines instead of readable source code.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

### Example usage

In [15]:
# Questions to put to the RAG chain; append more strings to ask more.
questions = ["Summarize the code in each document, shortly (200 words)"]

for question in questions:
    print(f"\nQuestion: {question}")
    # Retrieval, prompting and generation all happen inside this single call.
    answer = rag_chain.invoke(question)
    print(f"Answer: {answer}")


Question: Summarize the code in each document, shortly (200 words)
Answer: 1. **chatbot-rag-pdf-faiss.py**: This script implements a Retrieval-Augmented Generation (RAG) application that loads a PDF document, processes it, and utilizes a FAISS vector store for efficient querying. The process includes loading a PDF, splitting its content into manageable chunks, creating a vector store from these chunks using OpenAI embeddings, and providing functionality to save and load the vector store locally. The main execution flow sets up the OpenAI API key, processes a specified PDF, saves the vector store, and then demonstrates querying the document via a RAG chain that retrieves relevant context and generates answers based on user queries.

2. **chatbot-github.py**: This code outlines the creation of a RAG application that queries a GitHub repository. It involves cloning the repository, loading relevant documents (e.g., Python files, markdown), and creating embeddings. A Chroma vector store is

## [chatbot-github.py]

In [16]:
# # Required imports
# import os
# from langchain_community.document_loaders import GitLoader
# from langchain_openai import OpenAIEmbeddings
# from langchain_community.vectorstores import Chroma
# from langchain_openai import ChatOpenAI
# from langchain_core.prompts import ChatPromptTemplate
# from langchain_core.runnables import RunnablePassthrough
# from langchain_core.output_parsers import StrOutputParser

# # Setup steps
# def create_github_rag_app(repo_url, branch="master"):
#     # 1. Clone the repository
#     repo_path = "./example_data/github_repo/"
    
#     # Load repository
#     loader = GitLoader(
#         clone_url=repo_url,
#         repo_path=repo_path,
#         branch=branch,
#         # Optional: filter only specific file types
#         file_filter=lambda file_path: file_path.endswith((".py", ".md", ".txt"))
#     )
    
#     # 2. Load documents
#     documents = loader.load()
    
#     # 3. Create embeddings
#     embeddings = OpenAIEmbeddings()
    
#     # 4. Create vector store
#     vectorstore = Chroma.from_documents(
#         documents=documents, 
#         embedding=embeddings
#     )
    
#     # 5. Create retriever
#     retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
    
#     # 6. Create LLM
#     llm = ChatOpenAI(model="gpt-3.5-turbo")
    
#     # 7. Create prompt template
#     prompt = ChatPromptTemplate.from_template("""
#     You are an expert code and documentation assistant. 
#     Answer the question based ONLY on the following context:
    
#     {context}
    
#     Question: {question}
    
#     If the information is not in the context, say "I cannot find the answer in the repository."
#     """)
    
#     # 8. Create RAG chain
#     rag_chain = (
#         {"context": retriever, "question": RunnablePassthrough()}
#         | prompt
#         | llm
#         | StrOutputParser()
#     )
    
#     return rag_chain

# # Example usage
# def main():
#     # Set your OpenAI API key
#     os.environ["OPENAI_API_KEY"] = "your-openai-api-key"
    
#     # Create RAG app for a specific GitHub repo
#     repo_url = "https://github.com/langchain-ai/langchain"
#     rag_app = create_github_rag_app(repo_url)
    
#     # Ask questions
#     questions = [
#         "What is LangChain used for?",
#         "Explain the main components of LangChain",
#         "How do I create a basic RAG application?"
#     ]
    
#     for question in questions:
#         print(f"\nQuestion: {question}")
#         answer = rag_app.invoke(question)
#         print(f"Answer: {answer}")

# if __name__ == "__main__":
#     main()
