In [73]:
import sys
import pysqlite3

sys.modules['sqlite3'] = pysqlite3

import os
import time
import warnings
import ollama
from functools import cached_property
from langchain_community.llms import AzureOpenAI
from langchain_community.document_loaders import PyPDFLoader,TextLoader
from langchain_text_splitters import (Language,RecursiveCharacterTextSplitter)
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import Ollama
import google.generativeai as genai
from groq import Groq
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import requests
import voyageai
from langchain.retrievers import ContextualCompressionRetriever
from langchain_voyageai import VoyageAIEmbeddings,VoyageAIRerank
from tree_sitter_languages import get_language, get_parser
from llama_index.core.text_splitter import CodeSplitter
from dotenv import load_dotenv
import subprocess

from chunker import get_code_chunks

from dotenv import load_dotenv

load_dotenv(dotenv_path="../.env")

True

In [2]:
class Document:
    def __init__(self, page_content, metadata=None):
        self.page_content = page_content
        self.metadata = metadata

def doc_merger(splits):
    current = 0
    while True:
        doc_lines = len(splits[current].splitlines())
        if doc_lines < 3:
            # merge with next doc
            splits[current] += splits[current + 1]
            splits.pop(current + 1)
        else:
            current += 1
        
        if current == len(splits) - 1:
            return splits

command = ["clang-format","-style={ColumnLimit: 300, AllowShortFunctionsOnASingleLine: All, AllowShortIfStatementsOnASingleLine: true}","-i","original.txt"]

subprocess.run(command, check=True)

file_path = "original.txt"
with open(file_path, "r") as f:
    docs = f.read()

splits = get_code_chunks(docs)
new_splits = [split for split in splits if len(split) > 2]
new_splits2 = doc_merger(new_splits)
# documents = [Document(page_content=split) for split in splits]
documents = [Document(page_content=split) for split in new_splits2]
# save documents to files
# for i, doc in enumerate(documents):
#     with open(f"docs/doc_{i}.txt", "w") as f:
#         f.write(doc.page_content)
os.environ["GOOGLE_API_KEY"] = os.environ.get("GOOGLE_API_KEY")
embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
db = Chroma.from_documents(documents=documents, embedding=embeddings)


In [3]:
def combine_docs(docs):
    return "\n\n".join(f"Snippet.{i+1}:\n\n{doc.page_content}" for i, doc in enumerate(docs))


In [93]:
safe = [
{
    "category": "HARM_CATEGORY_HARASSMENT",
    "threshold": "BLOCK_NONE",
},
{
    "category": "HARM_CATEGORY_HATE_SPEECH",
    "threshold": "BLOCK_NONE",
},
{
    "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
    "threshold": "BLOCK_NONE",
},
{
    "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
    "threshold": "BLOCK_NONE",
},
]
genai.configure(api_key=os.environ.get("GENAI_API_KEY"))
generation_config = {
"temperature": 0.1,
"top_p": 0.95,
"top_k": 64,
"max_output_tokens": 8192,
"response_mime_type": "text/plain",
}
model = genai.GenerativeModel(
model_name="gemini-1.5-pro",
generation_config=generation_config,
safety_settings = safe
)
llm = model.start_chat(history=[])

In [5]:
import gc

from sklearn.metrics.pairwise import cosine_similarity

os.environ["GOOGLE_API_KEY"] = os.environ.get("GOOGLE_API_KEY")


def call_retrieval_sada(pretext, fifty_clean):

    # define retriever
    retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 4})

    retrieved_docs = retriever.invoke(pretext)

    formatted_context = combine_docs(retrieved_docs)

    formatted_context_2 = "Snippet.0: \n\n" + fifty_clean + "\n\n" + formatted_context

    return formatted_context_2

    # return formatted_context, [], retrieved_docs, []




### Test your cand sets below

##### Add `Pretext`, `Fifty_Text` and `Query` as variables

##### Original.txt, prompt, and sec_list must be in `manual_testing/` directory

In [1]:
llm = model.start_chat(history=[])

with open('prompt_in_coverage.txt', 'r') as file:
    prompt_template = file.read()

with open('sec_list.txt', 'r') as file:
    sec_list = file.read()


pretext = """
        }
        ntemps -= tmp___2;
        (files + out)->name = (char const *)(temp->name);
        (files + out)->temp = temp;
        in += num_merged;
        out++;
      }
    while_break___0:
      remainder = nfiles - in;
      cheap_slots = (unsigned long)nmerge - out % (unsigned long)nmerge;
      if (cheap_slots < remainder) {
        nshortmerge = (remainder - cheap_slots) + 1UL;
        tmp___3 = create_temp(&tfp___0);
        temp___0 = tmp___3;
        if (ntemps < nshortmerge) {
          tmp___4 = ntemps;
        } else {
          tmp___4 = nshortmerge;
        }
        tmp___5 = mergefiles(files + in, tmp___4, nshortmerge, tfp___0, (char const *)(temp___0->name));
        num_merged___0 = tmp___5;
        if (ntemps < num_merged___0) {
          tmp___6 = ntemps;
        } else {
          tmp___6 = num_merged___0;
        }
        ntemps -= tmp___6;
        (files + out)->name = (char const *)(temp___0->name);
        tmp___7 = out;
        out++;
        (files + tmp___7)->temp = temp___0;
        in += num_merged___0;
      }
      memmove((void *)(files + out), (void const *)(files + in), (nfiles - in) * sizeof(*files));
      ntemps += out;
      nfiles -= in - out;
    }
  while_break:
    avoid_trashing_input(files, ntemps, nfiles, output_file);
    while (1) {
      tmp___8 = open_input_files(files, nfiles, &fps);
      nopened = tmp___8;
      
static void sort(char *const *files, size_t nfiles, char const *output_file, size_t nthreads) {
  struct buffer buf___1;
  size_t ntemps;
  _Bool output_file_created;
  char const *temp_output;
  char const *file;"""

fifty_clean = """"""

query = """
  if (cheap_slots < remainder) {
    nshortmerge = (remainder - cheap_slots) + 1UL;
    tmp___3 = create_temp(&tfp___0);
    temp___0 = tmp___3;
    if (ntemps < nshortmerge) {
      tmp___4 = ntemps;
    } else {
      tmp___4 = nshortmerge;
    }
    tmp___5 = mergefiles(files + in, tmp___4, nshortmerge, tfp___0,
                         (char const *)(temp___0->name));
    num_merged___0 = tmp___5;
    if (ntemps < num_merged___0) {
      tmp___6 = ntemps;
    } else {
      tmp___6 = num_merged___0;
    }
    ntemps -= tmp___6;
    (files + out)->name = (char const *)(temp___0->name);
    tmp___7 = out;
    out++;
    (files + tmp___7)->temp = temp___0;
    in += num_merged___0;
  }
"""
formatted_context = call_retrieval_sada(pretext, fifty_clean)

prompt = prompt_template.format(sec_list=sec_list, formatted_context=formatted_context, query=query)

response = llm.send_message(prompt).text

print("\nRESPONSE:\n",response)


NameError: name 'model' is not defined

In [97]:
llm = model.start_chat(history=[])


import subprocess
import os
import re
# Run a command and capture its output

c_program = "tar-util.c"
test_oracle = "tar.sh"

# c_program = "chown-debloated.c"
# test_oracle = "chown_train.sh"

# c_program = "date-util.c"
# test_oracle = "date.sh"
binary_name = c_program.split(".")[0]
os.system(f'clang -w {c_program} -D __msan_unpoison\(s,z\) -lpcre -lpthread -o {binary_name}')

output = subprocess.check_output([f"""./{binary_name}""", '--help'])
functionality_list = output.decode('utf-8')

with open('functionality_prompt.txt', 'r') as file:
    func_template = file.read()

with open(test_oracle, 'r') as file:
    train_oracle = file.read()

func_prompt = func_template.format(functionality_list = functionality_list, train_oracle=train_oracle)

# print(func_prompt)
# print("##################")
response = llm.send_message(func_prompt).text

# response = re.sub(r"\*\*Reasoning:\*\*.*\n", "", response)


print(response)

## Required Functionality:

**Functionality 1:** 
    * **Command:** `tar -cf archive.tar -C temp .`
    * **Purpose:** Creates a new archive file named "archive.tar" in the current directory, containing all files and directories within the "temp" directory. 
    * **Reasoning:** This functionality is used in both test cases within the `run()` function to create an archive using the reduced binary.
        * Test case 1: Compresses "test1.txt" into "archive.tar".
        * Test case 2: Compresses "test2.txt", "test3.bin", and "test4.txt" into "archive.tar".

**Parsing and formats:** The reduced binary needs to understand the `-cf` flags for creating an archive and specifying the filename. It also needs to correctly interpret the directory structure provided by `-C temp .` to include all necessary files and directories in the archive.


## Unrequired Functionality (to be removed):

1. **File Extraction (`-xf`)**: The provided script only utilizes the `REDUCED_BINARY` for creating archiv

In [91]:
follow_up_prompt = f"""Now your job is to analyse the program code for the utility given below, and identify which parts of this programs can be removed or should be retained to maintain the given Required and Unrequired Functionality.

Program Code:


"""

llm.send_message("").text

'You are absolutely right! My apologies for the mistake. I incorrectly identified `-xf` decompression as a required functionality for the `REDUCED_BINARY`. \n\nThe `run()` function uses the `REDUCED_BINARY` only for the `-cf` compression operation. The `-xf` decompression is performed by the `ORG_BINARY` for comparison purposes. \n\nTherefore, the **required functionality** should only include:\n\n**Functionality:** `./tar-util -cf archive.tar -C temp .`\n**Purpose:** Compresses files located in the `temp` directory into a tar archive named `archive.tar`.\n**Reasoning:** This functionality is used in the `run()` function to compress the files in the `temp` directory using the `REDUCED_BINARY`.\n\nThe **unrequired functionality** remains the same as previously listed.\n\nI apologize for the confusion caused by my previous response. I am still under development and learning to better understand complex code structures. Thank you for pointing out my error! \n'

In [72]:
import re

def merge_lines_with_semicolon(text):
    lines = text.splitlines()
    result = []
    current_line = ""
    
    for line in lines:
        stripped_line = line.strip()
        
        if not current_line:
            current_line = line
        elif '=' in current_line and not current_line.strip().endswith(';'):
            current_line += ' ' + stripped_line
        else:
            result.append(current_line)
            current_line = line
        
        if current_line.strip().endswith(';'):
            result.append(current_line)
            current_line = ""
    
    if current_line:
        result.append(current_line)
    
    return '\n'.join(result)

# Test the function
text = '''tmp___11 =
 mktime_ok((struct tm const *)(&tm0), (struct tm const *)(&tm), Start);
 if (!tmp___11) {
 goto fail;
 }
}'''

result = merge_lines_with_semicolon(text)
print(result)

tmp___11 = mktime_ok((struct tm const *)(&tm0), (struct tm const *)(&tm), Start);
 if (!tmp___11) {
 goto fail;
 }
}
