In [23]:
# Install dependencies (uncomment on first run; `%pip` installs into the active kernel's environment)
# %pip install langchain langchain-core langchain-community openai tiktoken python-dotenv tqdm

In [35]:
import os
import json
from pathlib import Path
from dotenv import load_dotenv
from tqdm import tqdm
import time

# LangChain components for LLM integration
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain
from langchain.docstore.document import Document

# Wall-clock start, used to report the total runtime once processing finishes.
start = time.time()

# ───────────────────────────────────────────────────────────────
# Step 1: Load API key securely from .env
# ───────────────────────────────────────────────────────────────
load_dotenv()
TOGETHER_API_KEY = os.getenv("TOGETHER_API_KEY")
# Raise explicitly rather than `assert`: assertions are removed when Python
# runs with -O, which would let a missing key through to the first API call.
if not TOGETHER_API_KEY:
    raise RuntimeError("❌ TOGETHER_API_KEY not found in .env")

# ───────────────────────────────────────────────────────────────
# Step 2: Define codebase path and file types to process
# ───────────────────────────────────────────────────────────────
codebase_path = Path("SakilaProject")  # Local directory path
# is_dir() (not exists()) so that a stray *file* with this name is rejected
# here instead of silently producing zero documents later.
if not codebase_path.is_dir():
    raise FileNotFoundError("❌ SakilaProject directory not found")

# File types relevant to SakilaProject (multi-language support)
file_extensions = [".java", ".sql", ".xml", ".properties"]

# ───────────────────────────────────────────────────────────────
# Step 3: Load and parse code files into document objects
# ───────────────────────────────────────────────────────────────
def load_code_files(base_path, extensions=None):
    """Recursively read source files under ``base_path`` into ``Document`` objects.

    Args:
        base_path: ``pathlib.Path`` of the directory to scan.
        extensions: Iterable of file suffixes (including the leading dot) to
            include. Defaults to the module-level ``file_extensions``.

    Returns:
        A list of ``Document`` objects, one per readable file. Each has the
        file's text as ``page_content`` and the file path (as a string) under
        ``metadata["source"]``. Files are grouped by extension in the order
        given and sorted by path within each group.
    """
    if extensions is None:
        extensions = file_extensions

    docs = []
    for ext in extensions:
        # rglob yields entries in arbitrary order; sorting keeps runs reproducible.
        for file in sorted(base_path.rglob(f"*{ext}")):
            # A directory name can match the pattern too; only regular files are read.
            if not file.is_file():
                continue
            try:
                content = file.read_text(encoding="utf-8")
            except (OSError, UnicodeDecodeError) as e:
                # Best-effort load: unreadable or non-UTF-8 files are reported and skipped.
                print(f"⚠️ Skipping {file} due to error: {e}")
                continue
            docs.append(Document(page_content=content, metadata={"source": str(file)}))
    return docs

# Read every matching file under the project root into memory.
docs = load_code_files(codebase_path)

# ───────────────────────────────────────────────────────────────
# Step 4: Chunk files to fit within LLM token limits
# ───────────────────────────────────────────────────────────────
# Sizes are measured by the splitter's length function
# (presumably characters by default — confirm for the installed LangChain version).
splitter_settings = {"chunk_size": 1500, "chunk_overlap": 200}
splitter = RecursiveCharacterTextSplitter(**splitter_settings)
chunks = splitter.split_documents(docs)

# ───────────────────────────────────────────────────────────────
# Step 5: Initialize Together.ai LLM via LangChain
# ───────────────────────────────────────────────────────────────
# ChatOpenAI is pointed at Together.ai by overriding the API base URL and key.
llm_settings = {
    "model": "mistralai/Mistral-7B-Instruct-v0.1",
    "openai_api_key": TOGETHER_API_KEY,
    "openai_api_base": "https://api.together.xyz/v1",
    "temperature": 0.1,
    "max_tokens": 1000,
}
llm = ChatOpenAI(**llm_settings)

# ───────────────────────────────────────────────────────────────
# Step 6: Prompt template to guide LLM in structured analysis
# ───────────────────────────────────────────────────────────────
# Template sent for every chunk. `{code}` is the only placeholder; the doubled
# braces render as literal braces in the JSON skeleton shown to the model.
analysis_template = """
You are a software architect analyzing a code snippet. The code below is from a Java, SQL, or XML file:

{code}

Extract the following:
1. What is the purpose of this file or component?
2. What methods/functions are defined? (Name, signature, purpose)
3. Any comments on structure or complexity?

Return your answer strictly in JSON format:
{{
  "summary": "...",
  "methods": [
    {{
      "name": "...",
      "signature": "...",
      "description": "..."
    }}
  ],
  "complexity_notes": "..."
}}
"""
prompt = PromptTemplate.from_template(analysis_template)

# Bind the prompt to the model so each call only has to supply `code`.
chain = LLMChain(prompt=prompt, llm=llm)

# ───────────────────────────────────────────────────────────────
# Step 7: Analyze each chunk and build structured output
# ───────────────────────────────────────────────────────────────
def _parse_llm_json(text):
    """Decode the first JSON object embedded in ``text``.

    Chat models often wrap JSON in markdown fences or add prose around it, so
    decoding starts at the first ``{`` and anything after the object is ignored.

    Raises:
        ValueError: if ``text`` has no ``{`` or the object is malformed
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    start = text.find("{")
    if start == -1:
        raise ValueError("no JSON object found in LLM output")
    parsed, _ = json.JSONDecoder().raw_decode(text[start:])
    return parsed


def _join_text(existing, addition):
    """Append ``addition`` to ``existing`` with a single space between them."""
    return f"{existing} {addition}" if existing else addition


def _merge_chunk_analysis(file_entry, parsed):
    """Fold one chunk's parsed analysis into the per-file entry, in place.

    Values of an unexpected type are ignored rather than merged, so a reply
    like ``"methods": "none"`` cannot corrupt the accumulated structure.
    """
    summary = parsed.get("summary")
    if isinstance(summary, str) and summary.strip():
        # Accumulate across chunks; assigning would keep only the last chunk's summary.
        file_entry["summary"] = _join_text(file_entry["summary"], summary.strip())

    methods = parsed.get("methods")
    if isinstance(methods, list):
        file_entry["methods"].extend(methods)

    notes = parsed.get("complexity_notes")
    if isinstance(notes, str) and notes.strip():
        file_entry["complexity_notes"] = _join_text(
            file_entry["complexity_notes"], notes.strip()
        )


# Maps each source file path to its accumulated summary, methods and notes.
results_by_file = {}

for chunk in tqdm(chunks, desc="🔍 Processing code chunks"):
    source = chunk.metadata["source"]
    result = chain.run(code=chunk.page_content)  # Send to LLM

    # Every file gets an entry, even if none of its chunks parse successfully.
    file_entry = results_by_file.setdefault(
        source, {"summary": "", "methods": [], "complexity_notes": ""}
    )

    # Attempt to parse the LLM response; a bad chunk is reported and skipped.
    try:
        parsed = _parse_llm_json(result)
    except ValueError as e:
        print(f"⚠️ Could not parse output for {source}: {e}")
        continue

    _merge_chunk_analysis(file_entry, parsed)

# ───────────────────────────────────────────────────────────────
# Step 8: Export structured results to JSON
# ───────────────────────────────────────────────────────────────
# Serialize the per-file results (2-space indent) and write them out as UTF-8.
output_path = "sakila_structured_knowledge.json"
with open(output_path, mode="w", encoding="utf-8") as f:
    f.write(json.dumps(results_by_file, indent=2))

# Stop the clock only after the output file has been written.
end = time.time()

print(
    f"\n✅ Knowledge extraction complete. Output saved to: {output_path}",
    f"Total runtime: {round(end - start)} seconds",
    sep="\n",
)

🔍 Processing code chunks:  33%|█████████████████▉                                     | 28/86 [00:58<03:56,  4.07s/it]

⚠️ Could not parse output for SakilaProject\src\main\java\com\sparta\engineering72\sakilaproject\entities\Film.java: Unterminated string starting at: line 112 column 22 (char 3579)


🔍 Processing code chunks:  41%|██████████████████████▍                                | 35/86 [01:21<02:50,  3.34s/it]

⚠️ Could not parse output for SakilaProject\src\main\java\com\sparta\engineering72\sakilaproject\entities\FilmText.java: Invalid \escape: line 2 column 126 (char 128)


🔍 Processing code chunks:  55%|██████████████████████████████                         | 47/86 [01:56<01:27,  2.26s/it]

⚠️ Could not parse output for SakilaProject\src\main\java\com\sparta\engineering72\sakilaproject\respositories\FilmRepository.java: Expecting ',' delimiter: line 9 column 1 (char 353)


🔍 Processing code chunks:  84%|██████████████████████████████████████████████         | 72/86 [02:42<00:24,  1.77s/it]

⚠️ Could not parse output for SakilaProject\src\test\java\com\sparta\engineering72\sakilaproject\MockTests.java: Expecting ',' delimiter: line 9 column 1 (char 422)


🔍 Processing code chunks:  86%|███████████████████████████████████████████████▎       | 74/86 [02:46<00:20,  1.73s/it]

⚠️ Could not parse output for SakilaProject\src\test\java\com\sparta\engineering72\sakilaproject\MockTests.java: Expecting ',' delimiter: line 9 column 1 (char 321)


🔍 Processing code chunks:  88%|████████████████████████████████████████████████▌      | 76/86 [02:50<00:18,  1.85s/it]

⚠️ Could not parse output for SakilaProject\src\test\java\com\sparta\engineering72\sakilaproject\MockTests.java: Expecting ',' delimiter: line 9 column 1 (char 383)


🔍 Processing code chunks: 100%|███████████████████████████████████████████████████████| 86/86 [03:05<00:00,  2.16s/it]


✅ Knowledge extraction complete. Output saved to: sakila_structured_knowledge.json
Total runtime: 186 seconds



