In [1]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from langchain_pinecone import PineconeVectorStore
from langchain.embeddings import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from pinecone import Pinecone
import os
import tempfile
from github import Github, Repository
from git import Repo
from openai import OpenAI
from pathlib import Path
from langchain.schema import Document
from pinecone import Pinecone

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def clone_repository(repo_url):
    """Clones a GitHub repository to a temporary directory.

    Args:
        repo_url: The URL of the GitHub repository.

    Returns:
        The path to the cloned repository.
    """
    repo_name = repo_url.split("/")[-1]  # Extract repository name from URL
    repo_path = f"{repo_name}"
    Repo.clone_from(repo_url, str(repo_path))
    return str(repo_path)

In [4]:
path = clone_repository("https://github.com/CoderAgent/SecureAgent")

In [10]:
SUPPORTED_EXTENSIONS = {
    ".py",
    ".js",
    ".tsx",
    ".jsx",
    ".ipynb",
    ".java",
    ".cpp",
    ".ts",
    ".go",
    ".rs",
    ".vue",
    ".swift",
    ".c",
    ".h",
}

IGNORED_DIRS = {
    "node_modules",
    "venv",
    "env",
    "dist",
    "build",
    ".git",
    "__pycache__",
    ".next",
    ".vscode",
    "vendor",
}

In [5]:
def get_file_content(file_path, repo_path):
    """
    Get content of a single file.

    Args:
        file_path (str): Path to the file

    Returns:
        Optional[Dict[str, str]]: Dictionary with file name and content
    """
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()

        # Get relative path from repo root
        rel_path = os.path.relpath(file_path, repo_path)

        return {"name": rel_path, "content": content}
    except Exception as e:
        print(f"Error processing file {file_path}: {str(e)}")
        return None


def get_main_files_content(repo_path: str):
    """
    Get content of supported code files from the local repository.

    Args:
        repo_path: Path to the local repository

    Returns:
        List of dictionaries containing file names and contents
    """
    files_content = []

    try:
        for root, _, files in os.walk(repo_path):
            # Skip if current directory is in ignored directories
            if any(ignored_dir in root for ignored_dir in IGNORED_DIRS):
                continue

            # Process each file in current directory
            for file in files:
                file_path = os.path.join(root, file)
                if os.path.splitext(file)[1] in SUPPORTED_EXTENSIONS:
                    file_content = get_file_content(file_path, repo_path)
                    if file_content:
                        files_content.append(file_content)

    except Exception as e:
        print(f"Error reading repository: {str(e)}")

    return files_content

In [6]:
file_content = get_main_files_content(path)

Error reading repository: name 'IGNORED_DIRS' is not defined


In [7]:
file_content

[]

In [8]:
def get_huggingface_embeddings(
    text, model_name="sentence-transformers/all-mpnet-base-v2"
):
    model = SentenceTransformer(model_name)
    return model.encode(text)

In [9]:
text = "I am a programmer"

embeddings = get_huggingface_embeddings(text)

In [10]:
embeddings
embeddings.shape

(768,)

In [None]:
from pinecone import ServerlessSpec
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
cloud = os.environ.get("PINECONE_CLOUD") or "aws"
region = os.environ.get("PINECONE_REGION") or "us-east-1"

spec = ServerlessSpec(cloud=cloud, region=region)
index_name = 'codebase-rag'
# check if index already exists (it shouldn't if this is first time)
if index_name not in pc.list_indexes().names():
    # if does not exist, create index
    pc.create_index(
        index_name,
        dimension=768,  # dimensionality of text-embedding-ada-002
        metric="cosine",
        spec=spec,
    )
# connect to index
pinecone_index = pc.Index(index_name)
# view index stats
pinecone_index.describe_index_stats()

In [25]:
documents = []

for file in file_content:
    doc = Document(
        page_content=f"{file['name']}\n{file['content']}",
        metadata={"source": file["name"]},
    )

    documents.append(doc)


vectorstore = PineconeVectorStore.from_documents(
    documents=documents,
    embedding=HuggingFaceEmbeddings(),
    index_name="codebase-rag",
    namespace="https://github.com/CoderAgent/SecureAgent",
)

  embedding=HuggingFaceEmbeddings(),


In [26]:
client = OpenAI(
    base_url="https://api.groq.com/openai/v1", api_key=os.environ.get("GROQ_API_KEY")
)

In [27]:
query = "How are python files parsed?"

In [28]:
raw_query_embedding = get_huggingface_embeddings(query)

raw_query_embedding

array([ 5.29357493e-02, -6.24646954e-02, -2.87437644e-02,  1.83179360e-02,
       -4.33842099e-04,  4.03239131e-02, -7.76652619e-03, -2.74391589e-03,
        2.53445171e-02, -8.10819641e-02, -8.44586920e-03, -6.59264717e-03,
        4.16187868e-02,  3.98627222e-02,  2.82911900e-02,  2.84344498e-02,
        2.65302975e-02, -2.60126423e-02,  4.16299067e-02,  3.92820314e-02,
       -5.15580326e-02,  5.83349802e-02,  5.88832982e-03,  3.46065387e-02,
       -2.46875291e-03,  2.72808522e-02,  1.07212598e-02,  4.55761440e-02,
       -1.69189125e-02, -4.85301316e-02, -3.02425046e-02, -3.29697691e-02,
        2.46010013e-02,  3.23601738e-02,  1.16030515e-06,  9.71375313e-03,
       -3.70800309e-02,  1.84201226e-02, -1.39834331e-02,  4.25723009e-02,
        6.78140894e-02, -6.66247234e-02,  2.11651586e-02, -1.11712888e-03,
       -1.80115104e-02, -7.90140182e-02,  5.93152903e-02, -5.23733832e-02,
        5.63013740e-02,  4.31280062e-02,  7.77091179e-03, -2.30586510e-02,
       -2.94572152e-02,  

In [29]:
# Feel free to change the "top_k" parameter to be a higher or lower number
top_matches = pinecone_index.query(
    vector=raw_query_embedding.tolist(),
    top_k=5,
    include_metadata=True,
    namespace="https://github.com/CoderAgent/SecureAgent",
)
top_matches

{'matches': [{'id': 'bf11b77d-efbb-4515-aba2-d8b7ff9b7218',
              'metadata': {'source': 'src/context/language/python-parser.ts',
                           'text': 'src/context/language/python-parser.ts\n'
                                   'import { AbstractParser, EnclosingContext '
                                   '} from "../../constants";\n'
                                   'export class PythonParser implements '
                                   'AbstractParser {\n'
                                   '  findEnclosingContext(\n'
                                   '    file: string,\n'
                                   '    lineStart: number,\n'
                                   '    lineEnd: number\n'
                                   '  ): EnclosingContext {\n'
                                   '    // TODO: Implement this method for '
                                   'Python\n'
                                   '    return null;\n'
                          

In [30]:
contexts = [item["metadata"]["text"] for item in top_matches["matches"]]
contexts

['src/context/language/python-parser.ts\nimport { AbstractParser, EnclosingContext } from "../../constants";\nexport class PythonParser implements AbstractParser {\n  findEnclosingContext(\n    file: string,\n    lineStart: number,\n    lineEnd: number\n  ): EnclosingContext {\n    // TODO: Implement this method for Python\n    return null;\n  }\n  dryRun(file: string): { valid: boolean; error: string } {\n    // TODO: Implement this method for Python\n    return { valid: false, error: "Not implemented yet" };\n  }\n}\n',
 'src/context/language/javascript-parser.ts\nimport { AbstractParser, EnclosingContext } from "../../constants";\nimport * as parser from "@babel/parser";\nimport traverse, { NodePath, Node } from "@babel/traverse";\n\nconst processNode = (\n  path: NodePath<Node>,\n  lineStart: number,\n  lineEnd: number,\n  largestSize: number,\n  largestEnclosingContext: Node | null\n) => {\n  const { start, end } = path.node.loc;\n  if (start.line <= lineStart && lineEnd <= end.li

In [31]:
augmented_query = (
    "<CONTEXT>\n"
    + "\n\n-------\n\n".join(contexts[:10])
    + "\n-------\n</CONTEXT>\n\n\n\nMY QUESTION:\n"
    + query
)
print(augmented_query)

<CONTEXT>
src/context/language/python-parser.ts
import { AbstractParser, EnclosingContext } from "../../constants";
export class PythonParser implements AbstractParser {
  findEnclosingContext(
    file: string,
    lineStart: number,
    lineEnd: number
  ): EnclosingContext {
    // TODO: Implement this method for Python
    return null;
  }
  dryRun(file: string): { valid: boolean; error: string } {
    // TODO: Implement this method for Python
    return { valid: false, error: "Not implemented yet" };
  }
}


-------

src/context/language/javascript-parser.ts
import { AbstractParser, EnclosingContext } from "../../constants";
import * as parser from "@babel/parser";
import traverse, { NodePath, Node } from "@babel/traverse";

const processNode = (
  path: NodePath<Node>,
  lineStart: number,
  lineEnd: number,
  largestSize: number,
  largestEnclosingContext: Node | null
) => {
  const { start, end } = path.node.loc;
  if (start.line <= lineStart && lineEnd <= end.line) {
    const

In [32]:
system_prompt = f"""You are a Senior Software Engineer, specializing in TypeScript.

Answer any questions I have about the codebase, based on the code provided. Always consider all of the context provided when forming a response.
"""

llm_response = client.chat.completions.create(
    model="llama-3.1-70b-versatile",
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": augmented_query},
    ],
)

response = llm_response.choices[0].message.content

In [33]:
response

'According to the provided code, Python files are not fully parsed yet. \n\nThere is a `PythonParser` class in `src/context/language/python-parser.ts`, but its methods, `findEnclosingContext` and `dryRun`, are still in the TODO state and return null or an error message.\n\nTo parse Python files, you would need to implement these methods according to your requirements, using a Python parser library such as the `ast` module from the Python standard library or a third-party library like `pyesprima`.\n\nIt\'s also worth noting that the `JavascriptParser` in `src/context/language/javascript-parser.ts` is using `@babel/parser` to parse JavaScript files. You could follow a similar approach and use a Python parser library to parse Python files.\n\nHere\'s a basic example of how you could use the `ast` module to parse a Python file:\n\n```typescript\nimport * as pythonAst from "ast";\n\nexport class PythonParser implements AbstractParser {\n  findEnclosingContext(\n    file: string,\n    lineSt

In [34]:
def perform_rag(query):
    raw_query_embedding = get_huggingface_embeddings(query)

    top_matches = pinecone_index.query(
        vector=raw_query_embedding.tolist(),
        top_k=5,
        include_metadata=True,
        namespace="https://github.com/CoderAgent/SecureAgent",
    )

    # Get the list of retrieved texts
    contexts = [item["metadata"]["text"] for item in top_matches["matches"]]

    augmented_query = (
        "<CONTEXT>\n"
        + "\n\n-------\n\n".join(contexts[:10])
        + "\n-------\n</CONTEXT>\n\n\n\nMY QUESTION:\n"
        + query
    )

    # Modify the prompt below as need to improve the response quality
    system_prompt = f"""You are a Senior Software Engineer, specializing in TypeScript.

    Answer any questions I have about the codebase, based on the code provided. Always consider all of the context provided when forming a response.
    """

    llm_response = client.chat.completions.create(
        model="llama-3.1-8b-instant",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": augmented_query},
        ],
    )

    return llm_response.choices[0].message.content

In [35]:
response = perform_rag("How is the javascript parser used?")

print(response)

The JavaScript parser, specifically the `JavascriptParser` class in `./context/language/javascript-parser.ts`, is used in the `review.ts` file within the `diffContextPerHunk` function.

Here's a simplified breakdown of how it's used:

1. In the `diffContextPerHunk` function, it constructs a `Parser` object based on the file extension using the `getParserForExtension` function from the `constants.ts` file. If the file extension is supported (JavaScript, TypeScript, or JSX), it gets an instance of the `JavascriptParser` class.
   
   ```typescript
const parser: AbstractParser = getParserForExtension(file.filename);
```
   
   The `getParserForExtension` function uses the `EXTENSIONS_TO_PARSERS` map in `constants.ts` to get the parser based on the file extension.

2. Once the `parser` object is obtained, it passes the parser and the PR file (`file`) to the `findEnclosingContext` method of the parser.

   ```typescript
const largestEnclosingFunction = parser.findEnclosingContext(
  updated

In [None]:
%%writefile app.py
import streamlit as st
import pinecone
import os
import numpy as np
from langchain_openai import OpenAI
from langchain_huggingface import HuggingFaceEmbeddings  
from langchain_community.vectorstores import Pinecone
from langchain_pinecone import PineconeVectorStore
from langchain.chains import ConversationalRetrievalChain
from pinecone import ServerlessSpec

# test
api_key = os.environ.get("PINECONE_API_KEY")
pc = pinecone.Pinecone(api_key=api_key)
cloud = os.environ.get("PINECONE_CLOUD") or "aws"
region = os.environ.get("PINECONE_REGION") or "us-east-1"

spec = ServerlessSpec(cloud=cloud, region=region)
index_name = 'codebase-rag'

pinecone_index = pc.Index(index_name)

client = OpenAI(
    base_url="https://api.groq.com/openai/v1",
    api_key=os.environ.get("GROQ_API_KEY")
)

# Initialize HuggingFace embeddings model
embeddings = HuggingFaceEmbeddings()

# Create a Pinecone vector store
vectorstore = PineconeVectorStore(index=pinecone_index, embedding=embeddings)

# Initialize the Conversational Retrieval Chain
qa_chain = ConversationalRetrievalChain.from_llm(client, vectorstore.as_retriever())

# Streamlit UI
st.title('🦜🔗 Codebase RAG')

def perform_rag(query):
    # Perform retrieval from Pinecone index using the query embedding
    raw_query_embedding = embeddings.embed_query(query)

    # Ensure the embedding is in the correct format (list of floats)
    if isinstance(raw_query_embedding, list):
        raw_query_embedding = np.array(raw_query_embedding)

    # Query Pinecone for relevant matches
    top_matches = pinecone_index.query(
        vector=raw_query_embedding.tolist(),
        top_k=5,
        include_metadata=True,
        namespace="https://github.com/CoderAgent/SecureAgent",
    )

    # Get the list of retrieved texts
    contexts = [item["metadata"]["text"] for item in top_matches["matches"]]

    # Augment the query with the retrieved context
    augmented_query = (
        "<CONTEXT>\n"
        + "\n\n-------\n\n".join(contexts[:10])
        + "\n-------\n</CONTEXT>\n\n\n\nMY QUESTION:\n"
        + query
    )

    # Modify the prompt for the LLM
    system_prompt = f"""You are a Senior Software Engineer, specializing in TypeScript.
    
    Answer any questions I have about the codebase, based on the code provided. Always consider all of the context provided when forming a response.
    """

    # Get the LLM response
    llm_response = client.call(
        model="gpt-3.5-turbo",  # Use a specific model, like gpt-3.5-turbo
        messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": augmented_query}],
    )

    print(llm_response)

    # Check if the response contains 'choices' and access the message content
    if 'choices' in llm_response and len(llm_response['choices']) > 0:
        return llm_response['choices'][0].get('message', {}).get('content', 'No response')
    else:
        return 'No valid response returned.'

# Streamlit form to capture user input
with st.form('my_form'):
    text = st.text_area('Enter your query:')
    submitted = st.form_submit_button('Submit')

    if submitted:
        if text:
            # Get the response from the perform_rag function
            response = perform_rag(text)
            st.write(response)
        else:
            st.write("Please enter a query.")

Overwriting app.py
