<a href="https://colab.research.google.com/github/parky-sood/codebase-rag/blob/main/RAG_Chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RAG Process Overview

![Screenshot 2024-11-25 at 7 12 58 PM](https://github.com/user-attachments/assets/0bd67cf0-43d5-46d2-879c-a752cae4c8e3)

# Library Dependencies

In [None]:
! pip install pygithub langchain langchain-community openai tiktoken pinecone-client langchain_pinecone sentence-transformers tree-sitter



In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from langchain_pinecone import PineconeVectorStore
from langchain.embeddings import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from google.colab import userdata
from pinecone import Pinecone
import os
import tempfile
from github import Github, Repository
from git import Repo
from openai import OpenAI
from pathlib import Path
from langchain.schema import Document
from pinecone import Pinecone
import re
import ast


# Clone Repo Locally

In [None]:
def clone_repo(repo_url):
  """Clone a Git repository into Colab's ``/content`` directory.

  Args:
    repo_url: URL of the repository to clone. Trailing slashes are tolerated.

  Returns:
    The local path of the clone (``/content/<last URL segment>``) as a string.
  """
  # Strip trailing slashes first; otherwise the last segment is empty and the
  # clone would target "/content/" itself.
  repo_name = repo_url.rstrip("/").split("/")[-1]
  repo_path = f"/content/{repo_name}"

  # Re-running the cell must not fail just because the clone already exists.
  if os.path.isdir(os.path.join(repo_path, ".git")):
    return repo_path

  Repo.clone_from(repo_url, repo_path)
  return repo_path

In [None]:
# File extensions whose contents are read and indexed.
SUPPORTED_EXTENSIONS = {
    '.c', '.cpp', '.go', '.h', '.ipynb', '.java', '.js', '.jsx',
    '.py', '.rs', '.swift', '.ts', '.tsx', '.vue',
}

# Directory names used to skip paths while walking a repository.
IGNORED_DIRS = {
    '.git', '.next', '.vscode', '__pycache__', 'build', 'dist',
    'env', 'node_modules', 'vendor', 'venv',
}

# Get file content using relative path from repo root

In [None]:
def _format_parameters(arguments):
  """Render an ``ast.arguments`` node as a comma-separated parameter list."""
  parts = [arg.arg for arg in arguments.posonlyargs]
  if arguments.posonlyargs:
    parts.append("/")

  parts.extend(arg.arg for arg in arguments.args)

  if arguments.vararg:
    parts.append(f"*{arguments.vararg.arg}")
  elif arguments.kwonlyargs:
    # A bare "*" is what introduces keyword-only parameters without *args.
    parts.append("*")

  parts.extend(arg.arg for arg in arguments.kwonlyargs)

  if arguments.kwarg:
    parts.append(f"**{arguments.kwarg.arg}")

  return ", ".join(parts)


def get_function_py(content):
  """Extract every function definition from Python source text.

  Args:
    content: Python source code as a string.

  Returns:
    A list of dicts, one per ``def`` / ``async def`` found anywhere in the
    module (including methods and nested functions). Each dict has a
    ``"header"`` (the reconstructed signature line, without annotations or
    defaults) and a ``"body"`` (the exact source text of the definition).

  Raises:
    SyntaxError: If ``content`` is not valid Python.
  """
  tree = ast.parse(content)

  functions = []

  for node in ast.walk(tree):
    if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
      continue

    # Keep the header faithful to the source: coroutines are "async def".
    keyword = "async def" if isinstance(node, ast.AsyncFunctionDef) else "def"
    header = f"{keyword} {node.name}({_format_parameters(node.args)}):"

    functions.append({
        "header": header,
        "body": ast.get_source_segment(content, node),
    })

  return functions


In [None]:
def get_file_content(file_path, repo_path, file_extension):
  """Read one source file and package its contents for indexing.

  Python files are split into one entry per function. A Python file that
  defines no functions, or that cannot be parsed, falls back to a single
  whole-file entry (the same shape used for every other language) instead of
  being dropped.

  Args:
    file_path: Path of the file to read.
    repo_path: Repository root; entry names are relative to it.
    file_extension: The file's extension including the dot, e.g. ``".py"``.

  Returns:
    A dict ``{"name", "function", "content"}`` for a whole-file entry, a list
    of such dicts for per-function entries, or ``None`` if the file could not
    be read.
  """
  try:
    with open(file_path, "r", encoding="utf-8") as f:
      content = f.read()

    rel_path = os.path.relpath(file_path, repo_path)
    whole_file = {
        "name": rel_path,
        "function": None,
        "content": content
    }

    if file_extension != ".py":
      return whole_file

    try:
      functions = get_function_py(content)
    except (SyntaxError, ValueError) as e:
      # Unparseable source (e.g. Python 2 code) is still worth indexing as text.
      print(f"Could not parse {file_path}, indexing it as plain text: {e}")
      return whole_file

    if not functions:
      # An empty list is falsy and would make the caller skip the file.
      return whole_file

    return [
        {"name": rel_path, "function": item["header"], "content": item["body"]}
        for item in functions
    ]

  except Exception as e:
    # Best effort: one unreadable file must not abort the whole repository scan.
    print(f"Error reading file {file_path}: {e}")
    return None

def get_main_files_content(repo_path: str):
  """
  Get content of supported code files from local repository.

  Directories named in IGNORED_DIRS are pruned from the walk, and only files
  whose extension is in SUPPORTED_EXTENSIONS are read.

  Args:
    repo_path: Path to local repo

  Returns:
    Flat list of entry dictionaries as produced by get_file_content
    (whole-file entries and per-function entries mixed together)
  """

  files_content = []

  try:
    for root, dirs, files in os.walk(repo_path):
      # Prune ignored directories in place so os.walk never descends into
      # them. Matching on exact directory names avoids the false positives of
      # a substring test on the path (e.g. "env" inside "environment").
      dirs[:] = [d for d in dirs if d not in IGNORED_DIRS]

      # Process each file in current directory
      for file in files:
        file_extension = os.path.splitext(file)[1]
        if file_extension not in SUPPORTED_EXTENSIONS:
          continue

        file_path = os.path.join(root, file)
        file_content = get_file_content(file_path, repo_path, file_extension)
        if not file_content:
          continue

        # A dict is a single whole-file entry; a list holds per-function entries.
        if isinstance(file_content, dict):
          files_content.append(file_content)
        else:
          files_content.extend(file_content)

  except Exception as e:
    print(f"Error reading repository: {str(e)}")

  return files_content


In [None]:
# NOTE(review): `path` is not assigned in any visible cell; presumably it holds
# the value returned by clone_repo(...) — confirm before running.
file_content = get_main_files_content(path)

In [None]:
file_content

[{'name': 'src/prompts.ts',
  'function': None,
  'content': 'import { encode, encodeChat } from "gpt-tokenizer";\nimport type { ChatCompletionMessageParam } from "groq-sdk/resources/chat/completions";\nimport type { PRFile } from "./constants";\nimport {\n  rawPatchStrategy,\n  smarterContextPatchStrategy,\n} from "./context/review";\nimport { GROQ_MODEL, type GroqChatModel } from "./llms/groq";\n\nconst ModelsToTokenLimits: Record<GroqChatModel, number> = {\n  "mixtral-8x7b-32768": 32768,\n  "gemma-7b-it": 32768,\n  "llama3-70b-8192": 8192,\n  "llama3-8b-8192": 8192,\n};\n\nexport const REVIEW_DIFF_PROMPT = `You are PR-Reviewer, a language model designed to review git pull requests.\nYour task is to provide constructive and concise feedback for the PR, and also provide meaningful code suggestions.\n\nExample PR Diff input:\n\'\n## src/file1.py\n\n@@ -12,5 +12,5 @@ def func1():\ncode line that already existed in the file...\ncode line that already existed in the file....\n-code line t

# Embeddings

In [None]:
# Loaded models keyed by name, so each model is instantiated only once instead
# of on every embedding call.
_EMBEDDING_MODELS = {}


def get_huggingface_embeddings(text, model_name="sentence-transformers/all-mpnet-base-v2"):
    """Embed text with a SentenceTransformer model.

    Args:
        text: Input passed straight to ``SentenceTransformer.encode``.
        model_name: Name of the SentenceTransformer model to use.

    Returns:
        Whatever ``model.encode(text)`` returns for the given input.
    """
    model = _EMBEDDING_MODELS.get(model_name)
    if model is None:
        model = _EMBEDDING_MODELS[model_name] = SentenceTransformer(model_name)
    return model.encode(text)

In [None]:
# Sample sentence used to try out the embedding helper with its default model.
text = "I am a software developer"

embeddings = get_huggingface_embeddings(text)

In [None]:
embeddings

array([ 2.99871378e-02, -3.49624036e-03, -5.04141226e-02, -1.71993636e-02,
        2.91443281e-02,  5.41470340e-03,  2.40509808e-02, -4.93179560e-02,
        3.84774548e-03,  7.70971470e-04,  4.75548804e-02,  2.41776858e-03,
        4.31840457e-02,  6.98979497e-02,  5.69172241e-02,  2.79243593e-03,
        8.82731527e-02, -5.15238568e-02, -1.64406952e-02, -9.74418037e-03,
       -2.47234083e-03,  3.34711112e-02, -4.13725749e-02, -5.85587288e-04,
       -6.20065890e-02, -3.46554480e-02,  2.37078834e-02,  4.14449431e-04,
        1.73848569e-02,  9.77241918e-02,  1.75682064e-02, -2.84734219e-02,
       -1.97771229e-02, -1.85787752e-02,  1.77097274e-06,  1.31393252e-02,
       -1.40710464e-02,  3.42960618e-02, -4.63347547e-02,  3.13603133e-02,
        3.60352024e-02,  5.10438196e-02, -3.50978202e-03,  6.25244826e-02,
        1.27911661e-02,  1.77411048e-03,  6.79705590e-02,  1.44812288e-02,
        1.27403915e-03,  2.58476529e-02, -7.06234109e-03, -4.50194022e-03,
       -2.00529415e-02, -

# Using Pinecone
**1. Create an account on [Pinecone.io](https://app.pinecone.io/)**

**2. Create a new index called "codebase-rag" and set the dimensions to 768. Leave the rest of the settings as they are.**

![Screenshot 2024-11-24 at 10 58 50 PM](https://github.com/user-attachments/assets/f5fda046-4087-432a-a8c2-86e061005238)



**3. Create an API Key for Pinecone**

![Screenshot 2024-11-24 at 10 44 37 PM](https://github.com/user-attachments/assets/e7feacc6-2bd1-472a-82e5-659f65624a88)


**4. Store your Pinecone API Key within Google Colab's secrets section, and then enable access to it (see the blue checkmark)**

![Screenshot 2024-11-24 at 10 45 25 PM](https://github.com/user-attachments/assets/eaf73083-0b5f-4d17-9e0c-eab84f91b0bc)



In [None]:
# Read the Pinecone API key once from Colab's secret store and set it as an
# environment variable.
pinecone_api_key = userdata.get("PINECONE_API_KEY")
os.environ['PINECONE_API_KEY'] = pinecone_api_key

# Initialize Pinecone with the key fetched above instead of looking the
# secret up a second time.
pc = Pinecone(api_key=pinecone_api_key)

# Connect to your Pinecone index
pinecone_index = pc.Index("codebase-rag")

In [None]:
# Vector store handle for the "codebase-rag" index.
# NOTE(review): HuggingFaceEmbeddings() is constructed with its default model;
# presumably that matches the model used by get_huggingface_embeddings and the
# index dimension — confirm.
vectorstore = PineconeVectorStore(index_name="codebase-rag", embedding=HuggingFaceEmbeddings())

  vectorstore = PineconeVectorStore(index_name="codebase-rag", embedding=HuggingFaceEmbeddings())


In [None]:
documents = []

for entry in file_content:
    # Every document is tagged with its source file; per-function entries
    # additionally record the function header.
    metadata = {"source": entry['name']}
    if entry['function']:
        metadata["function"] = entry['function']

    documents.append(
        Document(
            page_content=f"{entry['name']}\n{entry['content']}",
            metadata=metadata,
        )
    )


# Build the vector store from the documents, under a namespace named after the
# source repository.
vectorstore = PineconeVectorStore.from_documents(
    documents=documents,
    embedding=HuggingFaceEmbeddings(),
    index_name="codebase-rag",
    namespace="https://github.com/parky-sood/ai-pr-reviews",
)

  embedding=HuggingFaceEmbeddings(),


In [None]:
# TODO: Embed individual functions instead of entire files, so that retrieval returns more specific and less "clouded" context.

# Perform RAG

1. Get your Groq API Key [here](https://console.groq.com/keys)

2. Paste your Groq API Key into your Google Colab secrets, and make sure to enable permissions for it

![Screenshot 2024-11-25 at 12 00 16 AM](https://github.com/user-attachments/assets/e5525d29-bca6-4dbd-892b-cc770a6b281d)


In [None]:
# OpenAI SDK client pointed at Groq's OpenAI-compatible endpoint; the API key
# is read from Colab's secret store.
client = OpenAI(
    base_url="https://api.groq.com/openai/v1",
    api_key=userdata.get("GROQ_API_KEY")
)

In [None]:
# Example question used for the step-by-step walkthrough in the cells below.
query = "How are python files parsed?"

In [None]:
# Embed the question with the embedding helper and its default model.
raw_query_embedding = get_huggingface_embeddings(query)

raw_query_embedding

array([ 5.29357232e-02, -6.24647178e-02, -2.87437718e-02,  1.83179416e-02,
       -4.33840672e-04,  4.03239094e-02, -7.76652806e-03, -2.74394872e-03,
        2.53445264e-02, -8.10819939e-02, -8.44583288e-03, -6.59269514e-03,
        4.16187495e-02,  3.98627296e-02,  2.82911733e-02,  2.84344628e-02,
        2.65303329e-02, -2.60126498e-02,  4.16299142e-02,  3.92820686e-02,
       -5.15580364e-02,  5.83349541e-02,  5.88829117e-03,  3.46064568e-02,
       -2.46872660e-03,  2.72809248e-02,  1.07212560e-02,  4.55760621e-02,
       -1.69188846e-02, -4.85301390e-02, -3.02424375e-02, -3.29698175e-02,
        2.46010050e-02,  3.23601812e-02,  1.16030503e-06,  9.71379410e-03,
       -3.70800160e-02,  1.84200946e-02, -1.39834182e-02,  4.25723344e-02,
        6.78140819e-02, -6.66246563e-02,  2.11651716e-02, -1.11712900e-03,
       -1.80115458e-02, -7.90139660e-02,  5.93152717e-02, -5.23733571e-02,
        5.63013554e-02,  4.31280173e-02,  7.77091645e-03, -2.30586994e-02,
       -2.94572674e-02,  

In [None]:
# Query the index with the raw embedding, restricted to the repository's
# namespace; metadata is requested so the stored text can be read back.
# Feel free to change the "top_k" parameter to be a higher or lower number
top_matches = pinecone_index.query(vector=raw_query_embedding.tolist(), top_k=5, include_metadata=True, namespace="https://github.com/parky-sood/ai-pr-reviews")

In [None]:
top_matches

{'matches': [],
 'namespace': 'https://github.com/parky-sood/ai-pr-reviews',
 'usage': {'read_units': 1}}

In [None]:
# Collect the 'text' metadata field of every match (presumably the document
# text stored by the vector store — confirm). Empty when there are no matches.
contexts = [item['metadata']['text'] for item in top_matches['matches']]

In [None]:
contexts

[]

In [None]:
# Wrap at most the first ten retrieved snippets in a <CONTEXT> block, separated
# by dashed lines, and append the question after it.
augmented_query = "<CONTEXT>\n" + "\n\n-------\n\n".join(contexts[ : 10]) + "\n-------\n</CONTEXT>\n\n\n\nMY QUESTION:\n" + query

In [None]:
print(augmented_query)

<CONTEXT>

-------
</CONTEXT>



MY QUESTION:
How are python files parsed?


In [None]:
# System prompt for the walkthrough call. The f-string has no placeholders, so
# it is used verbatim.
system_prompt = f"""You are a Senior Software Engineer, specializing in TypeScript, Python, Java, C++, Go, Rust, C, and Swift.

Answer any questions I have about the codebase, based on the code provided. Always consider all of the context provided when forming a response.
"""

# Send the system prompt plus the context-augmented question as one chat turn.
# NOTE(review): confirm that this model id is still served by Groq; hosted
# model ids are retired over time.
llm_response = client.chat.completions.create(
    model="llama-3.1-70b-versatile",
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": augmented_query}
    ]
)

# Keep only the text of the first returned choice.
response = llm_response.choices[0].message.content

In [None]:
response

'Python files are parsed as follows:\n\n1. **Lexical Analysis**: The Python interpreter reads the Python script and breaks it down into a series of tokens. These tokens can be keywords, identifiers, literals, operators, etc.\n\n2. **Syntax Analysis**: The tokens are then passed through a syntax analyzer, also known as a parser. The parser checks whether the tokens form a valid Python program according to the language\'s syntax rules. The parser creates a parse tree, which is a hierarchical representation of the Python program\'s syntax.\n\n3. **Abstract Syntax Tree (AST) Creation**: After the syntax analysis, an Abstract Syntax Tree (AST) is created. The AST is a tree representation of the source code, where each node represents a construct in the source code. This is the data structure that will be executed by the Python interpreter.\n\n4. **Bytecode Generation**: The AST is then compiled into bytecode. This is platform-independent, intermediate code that can be executed by the Python

# Putting it all together

In [None]:
def perform_rag(
    query,
    *,
    namespace="https://github.com/parky-sood/ai-pr-reviews",
    top_k=5,
    model="llama-3.1-8b-instant",
):
    """Answer a question about the indexed codebase using retrieved context.

    The question is embedded, the nearest entries are fetched from the
    Pinecone index, and their text is placed in a <CONTEXT> block ahead of the
    question before it is sent to the chat model.

    Args:
        query: The question to ask.
        namespace: Pinecone namespace to search.
        top_k: Number of nearest matches to retrieve and include as context.
        model: Model name passed to the chat completions API.

    Returns:
        The text content of the first completion choice.
    """
    raw_query_embedding = get_huggingface_embeddings(query)

    top_matches = pinecone_index.query(
        vector=raw_query_embedding.tolist(),
        top_k=top_k,
        include_metadata=True,
        namespace=namespace,
    )

    # Get the list of retrieved texts
    contexts = [item['metadata']['text'] for item in top_matches['matches']]

    # top_k already bounds the number of contexts, so no extra slicing is needed.
    augmented_query = (
        "<CONTEXT>\n"
        + "\n\n-------\n\n".join(contexts)
        + "\n-------\n</CONTEXT>\n\n\n\nMY QUESTION:\n"
        + query
    )

    # Modify the prompt below as needed to improve the response quality
    system_prompt = """You are a Senior Software Engineer, specializing in TypeScript.

    Answer any questions I have about the codebase, based on the code provided. Always consider all of the context provided when forming a response.
    """

    llm_response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": augmented_query}
        ]
    )

    return llm_response.choices[0].message.content

In [None]:
# Run the full retrieve-then-generate pipeline for a sample question and show
# the model's answer.
response = perform_rag("How is the javascript parser used?")

print(response)

The JavascriptParser is used in the review.ts file, specifically in the function `diffContextPerHunk` and `functionContextPatchStrategy`.

Here's a simplified explanation of how it's used:

1. The function `diffContextPerHunk` gets a PRFile (pull request file) and an AbstractParser (which can be an instance of JavascriptParser).
2. It applies the patch to the file contents using the diff library and gets the patches.
3. It loops through each hunk of the patches and uses the parser to find the enclosing context (i.e., the function, class, or block that contains the changes) of each change.
4. It then uses this enclosing context to build a string that represents the context of the changes.
5. This context string is created by injecting the changes into the surrounding code of the enclosing context.
6. This is repeated for each hunk in the patches.
7. Finally, this context is used to build the patch string.

In the function `functionContextPatchStrategy`, a specific strategy is chosen:

1

# Making an API for RAG
