In [None]:
import os
import json
import pickle
import ast
import numpy as np
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
from tqdm.notebook import tqdm
from dotenv import load_dotenv
import faiss

# Load environment variables via python-dotenv before anything else runs.
# NOTE(review): none of the visible cells read an environment variable —
# confirm a later cell actually needs this.
load_dotenv()

print("Libraries imported and environment variables loaded.")

In [None]:
# --- Model -------------------------------------------------------------
# Name handed to SentenceTransformer(...) when embedding the code chunks.
EMBEDDING_MODEL = 'all-MiniLM-L6-v2'

# --- Locations ---------------------------------------------------------
# Root of the .py files to index, and the folder that receives the artifacts.
CORPUS_DIR = os.path.join('data', 'reference_corpus')
INDEX_DIR = 'indexes'

# Individual artifact files, all living under INDEX_DIR.
FAISS_INDEX_PATH = os.path.join(INDEX_DIR, 'faiss_index.bin')
BM25_INDEX_PATH = os.path.join(INDEX_DIR, 'bm25_index.pkl')
METADATA_PATH = os.path.join(INDEX_DIR, 'metadata.pkl')

# Make sure the output folder exists; a no-op when it is already there.
os.makedirs(INDEX_DIR, exist_ok=True)

print(f"Corpus directory: {CORPUS_DIR}\nIndex directory: {INDEX_DIR}")

In [None]:
def extract_functions_from_file(file_path):
    """Extract every function definition in a Python source file as a chunk.

    The file is read as UTF-8 and parsed with ``ast``. Each ``def`` and
    ``async def`` found anywhere in the tree (top level, methods, nested
    functions) becomes one chunk, in ``ast.walk`` order.

    Args:
        file_path: Path to the Python source file.

    Returns:
        A list of dicts with keys ``'file_path'``, ``'function_name'`` and
        ``'code'``. ``'code'`` is produced by ``ast.unparse``, so it is a
        normalized rendering of the function: comments and the original
        formatting are not preserved. Returns an empty list when the file
        cannot be decoded or parsed.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
    except UnicodeDecodeError as e:
        # One badly-encoded file must not abort indexing of the whole corpus;
        # treat it like an unparseable file and skip it.
        print(f"Could not read {file_path}: {e}")
        return []

    try:
        tree = ast.parse(content, filename=file_path)
    except (SyntaxError, ValueError) as e:
        # ValueError covers sources containing NUL bytes on Python versions
        # where ast.parse reports that case as ValueError.
        print(f"Could not parse {file_path}: {e}")
        return []

    # AsyncFunctionDef is a separate node class from FunctionDef, so it has to
    # be listed explicitly or every `async def` would be dropped.
    function_nodes = (ast.FunctionDef, ast.AsyncFunctionDef)
    return [
        {
            'file_path': file_path,
            'function_name': node.name,
            'code': ast.unparse(node),
        }
        for node in ast.walk(tree)
        if isinstance(node, function_nodes)
    ]

def load_and_chunk_corpus(directory):
    """Collect function chunks from every ``.py`` file under ``directory``.

    Walks the tree with ``os.walk`` and passes each file whose name ends in
    ``.py`` to ``extract_functions_from_file``, concatenating the results in
    traversal order.

    Args:
        directory: Root directory to search recursively.

    Returns:
        A flat list of the chunk dicts returned for each file; empty when no
        ``.py`` file is found.
    """
    collected = []
    for current_dir, _subdirs, filenames in os.walk(directory):
        python_paths = (
            os.path.join(current_dir, name)
            for name in filenames
            if name.endswith('.py')
        )
        for source_path in python_paths:
            collected.extend(extract_functions_from_file(source_path))
    return collected

# Build the in-memory list of function-level chunks from the reference corpus.
print("Loading and chunking source code from the reference corpus...")
code_chunks = load_and_chunk_corpus(CORPUS_DIR)

print(f"Successfully extracted {len(code_chunks)} functions (chunks) from the corpus.")

# Show the first chunk as a quick sanity check on the extracted structure.
if len(code_chunks) > 0:
    print("\n--- Sample Chunk ---", json.dumps(code_chunks[0], indent=2), sep="\n")

In [None]:
# Embed every code chunk and build a flat (exact) L2 FAISS index over them.
corpus_codes = [chunk['code'] for chunk in code_chunks]

# Fail fast with an actionable message: with no chunks there is nothing to
# embed, and reading `.shape[1]` from the empty result further down would
# otherwise fail with an opaque error after the model has already been loaded.
if not corpus_codes:
    raise ValueError(
        f"No code chunks were extracted from '{CORPUS_DIR}'; cannot build a FAISS index. "
        "Check that the corpus directory exists and contains parseable .py files."
    )

print("Initializing embedding model...")
embedding_model = SentenceTransformer(EMBEDDING_MODEL)

print(f"Generating embeddings for {len(corpus_codes)} code chunks. This may take a moment...")

# FAISS only accepts C-contiguous float32 matrices; make that explicit rather
# than relying on whatever dtype/layout the encoder happens to return.
corpus_embeddings = np.ascontiguousarray(
    embedding_model.encode(corpus_codes, show_progress_bar=False),
    dtype=np.float32,
)

# One row per chunk, one column per embedding dimension.
embedding_dim = corpus_embeddings.shape[1]
faiss_index = faiss.IndexFlatL2(embedding_dim)

faiss_index.add(corpus_embeddings)

# Vector i in the index is the embedding of code_chunks[i]; a size mismatch
# would mean search results can no longer be mapped back to their chunks.
assert faiss_index.ntotal == len(code_chunks), "FAISS index and code_chunks are out of sync"

print(f"FAISS index built. Total vectors in index: {faiss_index.ntotal}")

faiss.write_index(faiss_index, FAISS_INDEX_PATH)
print(f"FAISS index saved to: {FAISS_INDEX_PATH}")

In [None]:
# Diagnostic cell: reports whether `code_chunks` exists and what it contains.
print("--- Running Diagnostics on 'code_chunks' Variable ---")
if 'code_chunks' not in locals():
    # The chunking cell never ran (or failed before assigning the variable).
    print("\nERROR: The variable 'code_chunks' does NOT exist. The error happened earlier.")
else:
    print("The variable 'code_chunks' exists.")
    print(f"Type of code_chunks: {type(code_chunks)}")
    print(f"Length of code_chunks: {len(code_chunks)}")

    if len(code_chunks) == 0:
        # The variable exists, but chunking produced nothing.
        print("\nWARNING: 'code_chunks' is EMPTY. This means no functions were found in your corpus files.")
        print("Please check that you have .py files in your 'data/reference_corpus/' directory.")
    else:
        print("\nSample of the first chunk:")
        import json
        print(json.dumps(code_chunks[0], indent=2))

In [None]:
import pickle
import os

# Persist the chunk metadata so search hits can be mapped back to their
# source functions. METADATA_PATH comes from the configuration cell; it is
# deliberately not re-defined here, so the file always lands under INDEX_DIR
# even if that setting is changed.
print("--- Starting Metadata Save Process ---")

try:
    print(f"Attempting to save {len(code_chunks)} chunks to the file: {METADATA_PATH}")

    with open(METADATA_PATH, 'wb') as f:
        print("File opened successfully in write-binary mode ('wb').")

        pickle.dump(code_chunks, f)

        print("pickle.dump() command executed successfully.")

    print("File write operation complete. Checking if file exists now...")

    if os.path.exists(METADATA_PATH):
        print("\nSUCCESS: The file 'metadata.pkl' has been created successfully!")
    else:
        print("\nFAILURE: The file was NOT created, even though no error was thrown. This is unusual.")

except Exception as e:
    print("\n--- AN ERROR OCCURRED ---")
    print(f"An exception of type {type(e).__name__} was caught.")
    print(f"Error details: {e}")
    # Re-raise after reporting: swallowing the error would let "Run All"
    # continue with missing or stale metadata next to a fresh FAISS index.
    raise