In [2]:
import os
import sys
import pandas as pd
from time import sleep
from llama_cpp import Llama
from pathlib import Path

In [3]:
import xml.etree.ElementTree as ET
from src.prompt_generation import generate_prompt_files

# Parsing the Element Tree
tree = ET.parse("Design Pattern List v1.2.xml")
root = tree.getroot()
ET.indent(root)

generate_prompt_files(root, "adapter", True, True, 0)

3 - JRefactory v2.6.24 adapter source-codes/3 - JRefactory v2.6.24/src/org/acm/seguin/ide/jbuilder/RefactoryNodeViewerFactory
3 - JRefactory v2.6.24 adapter source-codes/3 - JRefactory v2.6.24/src/org/acm/seguin/print/xml/State
3 - JRefactory v2.6.24 adapter source-codes/3 - JRefactory v2.6.24/src/org/acm/seguin/ide/common/action/ExtractMethodAction
3 - JRefactory v2.6.24 adapter source-codes/3 - JRefactory v2.6.24/src/org/acm/seguin/parser/ast/ASTSwitchLabel
3 - JRefactory v2.6.24 adapter source-codes/3 - JRefactory v2.6.24/src/org/acm/seguin/ide/jbuilder/TextStructureDelegate
3 - JRefactory v2.6.24 adapter source-codes/3 - JRefactory v2.6.24/src/org/acm/seguin/print/PagePrinter
3 - JRefactory v2.6.24 adapter source-codes/3 - JRefactory v2.6.24/src/org/acm/seguin/uml/line/Segment
3 - JRefactory v2.6.24 adapter source-codes/3 - JRefactory v2.6.24/src/org/acm/seguin/refactor/type/RenameClassVisitor
3 - JRefactory v2.6.24 adapter source-codes/3 - JRefactory v2.6.24/src/org/acm/seguin/par

In [4]:
def find_file_in_subdir(parent_dir: str, extension: str = ".gguf"):
    """
    Find a file matching pattern in any subdirectory, given a parent directory
    """
    parent = Path(parent_dir)

    # Search Recursively for the file pattern
    files = list(parent.glob(f"*/*{extension}"))

    if not files:
        raise FileNotFoundError(
            f"No files found matching {extension} pattern in {parent_dir}"
        )

    if len(files) > 1:
        print(f"Warning: Found {len(files)} files, using first file")

    return str(files[0])

In [5]:
description_generator_model_path = os.path.join(
    "models",
    "--".join(["models"] + "Qwen/Qwen2.5-3B-Instruct-GGUF".split(os.path.sep)),
    "snapshots",
)

description_generator_model_path

'models/models--Qwen--Qwen2.5-3B-Instruct-GGUF/snapshots'

In [6]:
desc_gen_model = Llama(
    model_path = find_file_in_subdir(description_generator_model_path),
    chat_format="chatml",
    seed=42,
    n_ctx=35000,
    n_threads=os.cpu_count() // 2,
    n_threads_batch = os.cpu_count() // 2,
    use_mlock=True,
    use_mmap=True,
    verbose=False,
)

llama_context: n_ctx_per_seq (35000) > n_ctx_train (32768) -- possible training context overflow


In [7]:
model_output_path = os.path.join(
    "description-outputs",
)
model_output_path

'description-outputs'

In [8]:
import re

def remove_comments(string: str) -> str:
    """
    Function to remove comments from a provided string (using regex)

    INPUT :
        - string -> code to be uncommented

    OUTPUT :
        - uncommented code (str)
    """

    pattern = r"(\".*?\"|\'.*?\')|(/\*.*?\*/|//[^\r\n]*$)"
    # first group captures quoted strings (double or single)
    # second group captures comments (//single-line or /* multi-line */)
    regex = re.compile(pattern, re.MULTILINE | re.DOTALL)

    def _replacer(match):
        # if the 2nd group (capturing comments) is not None,
        # it means we have captured a non-quoted (real) comment string.
        if match.group(2) is not None:
            return ""  # so we will return empty to remove the comment
        else:  # otherwise, we will return the 1st group
            return match.group(1)  # captured quoted-string

    return regex.sub(_replacer, string)

In [13]:
if not os.path.exists(os.path.join(model_output_path)):
    os.makedirs(model_output_path)

codes_path = "codes"
for correctness in os.listdir(codes_path):
    correctness_path = os.path.join(codes_path, correctness)
    for pattern_name in os.listdir(correctness_path):
        pattern_path = os.path.join(correctness_path, pattern_name)
        for role in os.listdir(pattern_path):
            role_path = os.path.join(pattern_path, role)
            for code_filename in os.listdir(role_path):
                # Final Output Path
                output_dir = os.path.join(
                    model_output_path,
                    correctness, 
                    pattern_name,
                    role,
                )
                output_path = os.path.join(output_dir, code_filename)
                
                if os.path.isfile(output_path):
                    print(f"Skipping {output_path}")
                    continue
                
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                
                # Path for codes stored
                code_filepath = os.path.join(role_path, code_filename)
                print(f"Opening {code_filepath}...")
                with open(code_filepath, "r") as code_file:
                    code = code_file.read()
                    code = remove_comments(code)
                    # print(code)
                
                desc_model_response = desc_gen_model.create_chat_completion(
                    messages = [
                        {
                            "role": "system",
                            "content": "You are an AI code description generator, tasked with reading codes and generating descriptions of methods and variables."
                        },
                        {
                            "role": "user",
                            "content": f"Read the provided code and generate a brief yet complete description of all the methods (including the arguments, what it does and what the output is) and variables.\n\nCode: {code}"
                        }
                    ],
                    # max_tokens=512,
                    stream=False
                )
                
                # print(desc_model_response["choices"][0]["message"]["content"])
                with open(output_path, "w") as output_file:
                    output_file.write(desc_model_response["choices"][0]["message"]["content"])
                
                sleep(60)

Skipping description-outputs/correct/adapter/adapter/6 - JHotDraw v5.1- SouthHandle.txt
Skipping description-outputs/correct/adapter/adapter/6 - JHotDraw v5.1- RadiusHandle.txt
Skipping description-outputs/correct/adapter/adapter/10 - Nutch v0.4- Page.txt
Skipping description-outputs/correct/adapter/adapter/11 - PMD v1.8- AbstractRule.txt
Skipping description-outputs/correct/adapter/adapter/3 - JRefactory v2.6.24- SourceBrowserAdapter.txt
Skipping description-outputs/correct/adapter/adapter/3 - JRefactory v2.6.24- ReloadActionAdapter.txt
Skipping description-outputs/correct/adapter/adapter/3 - JRefactory v2.6.24- ZoomAdapter.txt
Skipping description-outputs/correct/adapter/adapter/6 - JHotDraw v5.1- PolyLineHandle.txt
Skipping description-outputs/correct/adapter/adapter/6 - JHotDraw v5.1- GroupHandle.txt
Skipping description-outputs/correct/adapter/adapter/8 - MapperXML v1.9.7- DOM_1_20000929_DocumentAdapter.txt
Skipping description-outputs/correct/adapter/adapter/6 - JHotDraw v5.1- Ea

In [None]:
("description-outputs/correct/adapter/adaptee".split(os.path.sep)[-1])

'adaptee'

In [None]:
def generate_desc_prompt_files():
    description_dir = "description-outputs"
    output_dir = "./prompts-summary"

    base_prompt = None
    with open("description-prompt.txt", "r") as base_prompt_file:
        base_prompt = base_prompt_file.read()

    for dir, dirnames, filenames in os.walk("description-outputs"):
        if filenames:
            for filename in filenames:
                desc_filepath = os.path.join(dir, filename)
                with open(desc_filepath, "r") as desc_file:
                    code_desc = desc_file.read()

                output_file_dir = os.path.join(
                    output_dir,
                    os.path.sep.join(dir.split(os.path.sep)[1:]),
                ) 
                if not os.path.exists(output_file_dir):
                    os.makedirs(output_file_dir)

                output_filepath = os.path.join(output_file_dir, filename)
                with open(output_filepath, "w") as output_file:
                    output_file.write(
                        base_prompt.format(
                            code=code_desc,
                            role=dir.split(os.path.sep)[-1],
                            pattern=dir.split(os.path.sep)[2],
                        )
                    )


generate_desc_prompt_files()

NameError: name 'os' is not defined