# Milestone Project: DevScribe v1.0.0
## The Enterprise-Grade RAG Architect

###1. Setup and Installations


In [1]:
!pip install -q langchain langchain-openai langchain-community pypdf faiss-cpu sentence-transformers chromadb python-dotenv langchain-experimental langchain-google-genai

In [2]:
!pip install -q chromadb
!pip install -q openai tiktoken
!pip install -q markdown unstructured
!pip install -q unstructured[md]
!pip install -q sentence-transformers
!pip install -q ragas
!pip install -q python-dotenv pydantic


###Environment Configuration

In [4]:
import os
from google.colab import userdata

# Set up Google API Key
os.environ['GOOGLE_API_KEY'] = userdata.get("GOOGLE_API_KEY")
if not os.environ['GOOGLE_API_KEY']:
    raise ValueError("GOOGLE_API_KEY not found. Please set it in your .env file.")

In [5]:
from langchain_google_genai import ChatGoogleGenerativeAI

llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0.2)

###The Ingestion Pipeline

In [6]:
!pip install -qU "langchain-chroma>=0.1.2"
!pip install -q langchain_huggingface

In [7]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
import os




#####Company Knowledge Base

In [8]:
os.makedirs('knowledge_base', exist_ok=True)

with open('knowledge_base/style_guide.md', 'w') as f:
    f.write("# Python Style Guide\n\n")
    f.write("## Naming Conventions\n")
    f.write("- All functions and variables must use snake_case naming convention.\n")
    f.write("## Documentation\n")
    f.write("- All functions must have docstrings in Google Format.\n")

with open('knowledge_base/security_policy.md', 'w') as f:
    f.write("# Security Protocols\n\n")
    f.write("## Logging\n")
    f.write("- Never use the print() function for production logs. Always use the internal 'app_logger'.\n")
    f.write("## Secret Management\n")
    f.write("- API keys and credentials must be loaded from environment variables using os.getenv().\n")

with open('knowledge_base/legacy_deprecation.md', 'w') as f:
    f.write("# Library Deprecation List\n\n")
    f.write("## Data Processing\n")
    f.write("- Do not use the 'pandas' library. It is deprecated for this project. Use 'polars' instead for all data manipulation.\n")

#####Document Splitting

In [9]:
# Define the headers we want to split on, and what name to give them in the metadata
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
]

markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

In [10]:
all_documents = []
file_names = ['style_guide.md', 'security_policy.md', 'legacy_deprecation.md']

for file_name in file_names:
    with open(f'knowledge_base/{file_name}', 'r') as f:
        content = f.read()
        chunks = markdown_splitter.split_text(content)

        for chunk in chunks:
            chunk.metadata["source"] = file_name
            all_documents.append(chunk)

print(f"Total chunks created: {len(all_documents)}")

Total chunks created: 5


In [11]:
# Print the resulting chunks to see how the document was split
for i, doc in enumerate(all_documents):
    print(f"--- Chunk {i+1} ---")
    print(f"Content: '{doc.page_content.strip()}'")
    print(f"Metadata: {doc.metadata}\n")

--- Chunk 1 ---
Content: '- All functions and variables must use snake_case naming convention.'
Metadata: {'Header 1': 'Python Style Guide', 'Header 2': 'Naming Conventions', 'source': 'style_guide.md'}

--- Chunk 2 ---
Content: '- All functions must have docstrings in Google Format.'
Metadata: {'Header 1': 'Python Style Guide', 'Header 2': 'Documentation', 'source': 'style_guide.md'}

--- Chunk 3 ---
Content: '- Never use the print() function for production logs. Always use the internal 'app_logger'.'
Metadata: {'Header 1': 'Security Protocols', 'Header 2': 'Logging', 'source': 'security_policy.md'}

--- Chunk 4 ---
Content: '- API keys and credentials must be loaded from environment variables using os.getenv().'
Metadata: {'Header 1': 'Security Protocols', 'Header 2': 'Secret Management', 'source': 'security_policy.md'}

--- Chunk 5 ---
Content: '- Do not use the 'pandas' library. It is deprecated for this project. Use 'polars' instead for all data manipulation.'
Metadata: {'Header 1

#####Vector Store and Embeddings

In [12]:
import numpy as np
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

In [13]:
embeddings = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")


##### `FAISS` (In-Memory)


In [14]:
from langchain_community.vectorstores import FAISS

faiss_vectorstore = FAISS.from_documents(
    documents=all_documents,
    embedding=embeddings
)

In [15]:
# query = "Is using print allowed in production code?"
# query = """
# This function prints messages to the console,
# uses pandas for reading CSV files,
# and the function name is written in CamelCase.
# Is this acceptable for production?
# """
query = """
Refactor a Python function that:
- uses print() for logging
- imports pandas
- has a CamelCase function name
"""


results = faiss_vectorstore.similarity_search(query, k=5)

for i, doc in enumerate(results, 1):
    print(f"Result {i}")
    print("Source:", doc.metadata["source"])
    print(doc.page_content)
    print("-" * 60)


Result 1
Source: security_policy.md
- Never use the print() function for production logs. Always use the internal 'app_logger'.
------------------------------------------------------------
Result 2
Source: legacy_deprecation.md
- Do not use the 'pandas' library. It is deprecated for this project. Use 'polars' instead for all data manipulation.
------------------------------------------------------------
Result 3
Source: style_guide.md
- All functions and variables must use snake_case naming convention.
------------------------------------------------------------
Result 4
Source: style_guide.md
- All functions must have docstrings in Google Format.
------------------------------------------------------------
Result 5
Source: security_policy.md
- API keys and credentials must be loaded from environment variables using os.getenv().
------------------------------------------------------------


#####`Chroma` (Persistent Storage)




In [16]:
!pip install -qU "langchain-chroma>=0.1.2"

In [17]:
from langchain_chroma import Chroma
persist_directory = "./chroma_db"

chroma_vectorstore = Chroma.from_documents(
    documents=all_documents,
    embedding=embeddings,
    persist_directory=persist_directory
)

print("ChromaDB has been created and saved to disk.")

ChromaDB has been created and saved to disk.


In [18]:
#  Test the similarity search
query = """
Refactor a Python function that:
- uses print() for logging
- imports pandas
- has a CamelCase function name
"""
retrieved_docs_chroma = chroma_vectorstore.similarity_search(query, k=2)

print("\n--- Chroma Similarity Search Results ---")
for doc in retrieved_docs_chroma:
    print(f"Content: '{doc.page_content[:300]}...'")
    print(f"Source (Page): {doc.metadata.get('page', 'N/A')}\n")


--- Chroma Similarity Search Results ---
Content: '- Never use the print() function for production logs. Always use the internal 'app_logger'....'
Source (Page): N/A

Content: '- Never use the print() function for production logs. Always use the internal 'app_logger'....'
Source (Page): N/A



In [19]:
print("\n--- Loading ChromaDB from disk ---")
new_db_instance = Chroma(
    persist_directory=persist_directory,
    embedding_function=embeddings
)

# Test the loaded instance
retrieved_docs_loaded = new_db_instance.similarity_search(query, k=2)
print(f"Search results from the loaded instance are the same: {len(retrieved_docs_loaded) == len(retrieved_docs_chroma)}")


--- Loading ChromaDB from disk ---
Search results from the loaded instance are the same: True


In [20]:
vectorstore = Chroma.from_documents(collection_name="initial_knowledge_base_1",
                                    documents=all_documents,
                                    embedding=embeddings,
                                    collection_metadata={"hnsw:space": "cosine"}
                                    )
print("Knowledge base created successfully!")

Knowledge base created successfully!


In [21]:
# !pip install -U langchain langchain-community langchain-openai

In [22]:
retriever = vectorstore.as_retriever(search_kwargs={"k": 10})

query = [
    "uses print() for logging",
    "imports pandas",
    "has a CamelCase function name",
    "avoid global variables; use dependency injection"

]

for q in query:
    print("\nQuery:", q)
    results = retriever.invoke(q)
    for i, r in enumerate(results, 1):
        print(f"- [{i}] source={r.metadata.get('source')}, title={r.metadata.get('Header 1')}, section={r.metadata.get('Header 2')}")
        print("  snippet:", r.page_content[:100].replace("\n"," "))



Query: uses print() for logging
- [1] source=security_policy.md, title=Security Protocols, section=Logging
  snippet: - Never use the print() function for production logs. Always use the internal 'app_logger'.
- [2] source=legacy_deprecation.md, title=Library Deprecation List, section=Data Processing
  snippet: - Do not use the 'pandas' library. It is deprecated for this project. Use 'polars' instead for all d
- [3] source=style_guide.md, title=Python Style Guide, section=Naming Conventions
  snippet: - All functions and variables must use snake_case naming convention.
- [4] source=security_policy.md, title=Security Protocols, section=Secret Management
  snippet: - API keys and credentials must be loaded from environment variables using os.getenv().
- [5] source=style_guide.md, title=Python Style Guide, section=Documentation
  snippet: - All functions must have docstrings in Google Format.

Query: imports pandas
- [1] source=legacy_deprecation.md, title=Library Deprecation List, secti

###Advanced Retrieval

#####Cross-Encoder Reranking

In [23]:
from sentence_transformers import CrossEncoder
from langchain_core.documents import Document

In [24]:
reranker = CrossEncoder(
    "cross-encoder/ms-marco-MiniLM-L-6-v2"
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [25]:
def rerank_documents(query, documents, top_k=3):
    """
    Rerank retrieved documents using a Cross-Encoder.
    """
    pairs = [(query, doc.page_content) for doc in documents]

    scores = reranker.predict(pairs )

    scored_docs = list(zip(documents, scores))
    scored_docs.sort(key=lambda x: x[1], reverse=True)

    return [doc for doc, score in scored_docs[:top_k]]

print("Cross-Encoder reranker ready.")



Cross-Encoder reranker ready.


#####MultiQueryRetriever

In [26]:
from langchain_classic.retrievers.multi_query import MultiQueryRetriever
from langchain_google_genai import ChatGoogleGenerativeAI


llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0)

In [27]:
from langchain_core.prompts import PromptTemplate


question= "Refactor this code for production use"

QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""
You are an AI assistant that generates multiple search queries
to retrieve relevant internal engineering rules.

Given the user question:
{question}

Generate 3 different search queries.
"""
)


In [28]:
multiquery_retriever = MultiQueryRetriever.from_llm(
    retriever=retriever,
    llm=llm,
    prompt=QUERY_PROMPT
)


In [30]:
unique_docs = multiquery_retriever.invoke(question)

print(f"\nRetrieved {len(unique_docs)} unique documents after multi-query expansion.")


Retrieved 5 unique documents after multi-query expansion.


In [31]:
for doc in unique_docs:
    print(doc.page_content)

- All functions must have docstrings in Google Format.
- Never use the print() function for production logs. Always use the internal 'app_logger'.
- Do not use the 'pandas' library. It is deprecated for this project. Use 'polars' instead for all data manipulation.
- API keys and credentials must be loaded from environment variables using os.getenv().
- All functions and variables must use snake_case naming convention.


In [32]:
def build_context(docs):
    return "\n\n".join(
        f"[SOURCE: {doc.metadata.get('source')}]\n{doc.page_content}"
        for doc in docs
    )

context = build_context(unique_docs)


###RAG-Driven Refactoring

In [33]:
from pydantic import BaseModel, Field
from typing import List

class RAGRefactorSuggestion(BaseModel):
    critique: str
    refactored_code: str
    changes_made: List[str]
    policy_citations: List[str]

In [34]:

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import PydanticOutputParser

In [35]:
from langchain_core.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_template("""
You are an enterprise-grade code refactoring assistant.

RULES:
- Use ONLY the provided context.
- Do NOT invent rules or best practices.
- Every change MUST be justified by a policy citation.
- If the context does not mention a rule, DO NOT apply it.

Context:
{context}

Code:
{code}

Return a JSON object with the following fields:
- critique
- refactored_code
- changes_made
- policy_citations
""")


In [36]:
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0
)

In [37]:
refactor_chain = prompt | llm.with_structured_output(RAGRefactorSuggestion)

In [38]:
test_code = """
def Load_Data(filename):
    print(f"Loading {filename}...")
    import pandas as pd
    df = pd.read_csv(filename)
    return df
"""
response = refactor_chain.invoke({
    "context": context,
    "code": test_code
})

In [39]:
print("\nRefactored Code:\n", response.refactored_code)
print("\nChanges Made:\n", response.changes_made)
print("\nPolicy Citations:\n", response.policy_citations)


Refactored Code:
 import polars as pl
# Assuming app_logger is configured and available globally or imported
# from your application's logging module.
# import logging
# app_logger = logging.getLogger(__name__)

def load_data(filename):
    """Loads data from a CSV file using Polars.

    Args:
        filename (str): The path to the CSV file.

    Returns:
        polars.DataFrame: The loaded DataFrame.
    """
    app_logger.info(f"Loading {filename}...")
    df = pl.read_csv(filename)
    return df

Changes Made:
 ['Renamed the function `Load_Data` to `load_data` to follow snake_case.', 'Added a docstring in Google Format to the `load_data` function.', 'Replaced `print()` with `app_logger.info()` for production logging.', 'Replaced the `pandas` library import (`import pandas as pd`) with `polars` (`import polars as pl`).', 'Replaced `pd.read_csv()` with `pl.read_csv()` to use the Polars library.']

Policy Citations:
 ['style_guide.md', 'style_guide.md', 'security_policy.md', 'legac

In [40]:
class EvalScore(BaseModel):
    score: float
    reasoning: str

judge_prompt = ChatPromptTemplate.from_template("""
You are an impartial evaluator for a Retrieval-Augmented Generation (RAG) system.

Evaluate the assistant's answer using ONLY the provided context.

Context:
{context}

Original Code:
{code}

Assistant Answer:
{answer}

Answer the following:

1. Faithfulness:
Did the assistant make ONLY claims and changes that are explicitly supported by the context?
Score from 0.0 (not faithful) to 1.0 (fully faithful).

2. Context Relevancy:
Were the retrieved documents relevant to the code issues addressed?
Score from 0.0 (irrelevant) to 1.0 (highly relevant).

Return your response strictly as JSON:
{{
  "faithfulness_score": float,
  "context_relevancy_score": float,
  "explanation": string
}}
""")


In [41]:
judge_llm = llm.with_structured_output(EvalScore)
eval_chain = judge_prompt | judge_llm

In [42]:
eval_result = eval_chain.invoke({
    "context": context,
    "code": test_code,
    "answer": response.refactored_code
})


print(eval_result)

score=1.0 reasoning="The assistant's answer is fully faithful to the provided context. Every change made by the assistant (function name to snake_case, replacing print() with app_logger, replacing pandas with polars, and adding a Google-formatted docstring) is directly supported by a specific rule in the context. There are no claims or changes that go beyond the information given. The context provided was highly relevant to the issues present in the original code, as the assistant addressed all identified violations using the provided documentation. Although one piece of information (about API keys) in the overall context was not applicable to this specific code, the documents that *were* relevant perfectly addressed the code issues."


###Agantic Rag

In [43]:
from langchain.agents import create_agent
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.documents import Document
from typing import List, Tuple

llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0)

#####1: Style Guide Search

In [51]:
from langchain.tools import tool

@tool(
    description="Search the company style guide for formatting and naming rules",
    response_format="content_and_artifact"
)

def search_style_guide(query: str) -> Tuple[str, List[Document]]:
    """
    Search the internal company style guide.
    """
    print(f"--- Agent: Searching STYLE guide for '{query}' ---")
    docs = multiquery_retriever.invoke(query)
    content = "\n\n".join(d.page_content for d in docs)
    return content, docs

#####2: Security Policy Search



In [52]:
@tool(
    description="Search the company security database for security and logging policies",
    response_format="content_and_artifact"
)
def search_security_db(query: str) -> Tuple[str, List[Document]]:
    """
    Search the internal company security policies.
    """
    print(f"--- Agent: Searching SECURITY DB for '{query}' ---")
    docs = multiquery_retriever.invoke(query)
    content = "\n\n".join(d.page_content for d in docs)
    return content, docs

In [53]:
tools = [search_style_guide,search_security_db]

print("Agent tools initialized and ready.")

Agent tools initialized and ready.


In [54]:
system_prompt = (
    "You are a strict enterprise-grade code refactoring agent. "
    "You have access to the following tools: search_style_guide and search_security_db. "
    "You must analyze the code and decide which tool(s) to use based on the type of violations detected. "
    "You are ONLY allowed to refactor the code using rules retrieved from these tools. "
    "Do NOT apply general best practices or external knowledge. "
    "For every change you make, you MUST cite the exact source file that justifies the change."
)


In [55]:
agent = create_agent(llm, tools=tools, system_prompt=system_prompt)
print("Agent with Style & Security tools is ready.")


Agent with Style & Security tools is ready.


In [56]:
test_code = """
def Load_Data(filename):
    print(f"Loading {filename}...")
    import pandas as pd
    df = pd.read_csv(filename)
    return df
"""
print("Test Input Ready")

Test Input Ready


In [57]:
events = []
for event in agent.stream({"messages": [("user", test_code)]}, stream_mode="values"):
    events.append(event)
    event["messages"][-1].pretty_print()

final_answer = events[-1]["messages"][-1].content
print("\n--- Agent's Final Refactored Output ---")
print(final_answer)




def Load_Data(filename):
    print(f"Loading {filename}...")
    import pandas as pd
    df = pd.read_csv(filename)
    return df

Tool Calls:
  search_style_guide (c6a3368b-6668-45e7-9496-0b8fe18ae7f3)
 Call ID: c6a3368b-6668-45e7-9496-0b8fe18ae7f3
  Args:
    query: function naming convention
  search_style_guide (28621688-1fd0-4ddd-86c4-7c06e81a64b9)
 Call ID: 28621688-1fd0-4ddd-86c4-7c06e81a64b9
  Args:
    query: import statements placement
--- Agent: Searching STYLE guide for 'function naming convention' ---
--- Agent: Searching STYLE guide for 'import statements placement' ---
Name: search_style_guide

- All functions must have docstrings in Google Format.

- Never use the print() function for production logs. Always use the internal 'app_logger'.

- Do not use the 'pandas' library. It is deprecated for this project. Use 'polars' instead for all data manipulation.

- API keys and credentials must be loaded from environment variables using os.getenv().

- All functions and vari