In [1]:
import sys
import pysqlite3

sys.modules['sqlite3'] = pysqlite3

import os
import time
import warnings
import ollama
from functools import cached_property
from langchain_community.llms import AzureOpenAI
from langchain_community.document_loaders import PyPDFLoader,TextLoader
from langchain_text_splitters import (Language,RecursiveCharacterTextSplitter)
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import Ollama
import google.generativeai as genai
from groq import Groq
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import requests
import voyageai
from langchain.retrievers import ContextualCompressionRetriever
from langchain_voyageai import VoyageAIEmbeddings,VoyageAIRerank
from tree_sitter_languages import get_language, get_parser
from llama_index.core.text_splitter import CodeSplitter
from dotenv import load_dotenv
import subprocess

from chunker import get_code_chunks

from dotenv import load_dotenv

load_dotenv(dotenv_path="../.env")

  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
class Document:
    """Minimal document record: a chunk of text plus optional metadata.

    Exposes the two attributes (`page_content`, `metadata`) that the rest of
    this notebook reads from document objects.

    Args:
        page_content: The text of the chunk.
        metadata: Optional mapping of metadata for the chunk. When omitted,
            a new empty dict is created for this instance.
    """

    def __init__(self, page_content, metadata=None):
        self.page_content = page_content
        # Use a fresh dict per instance instead of None so consumers that
        # expect a mapping can call dict methods without a None check, and
        # instances never share one mutable default.
        self.metadata = {} if metadata is None else metadata

def doc_merger(splits):
    """Merge chunks shorter than three lines into the chunk that follows them.

    Walks the list left to right. While the current chunk has fewer than
    three lines (per `str.splitlines`), the next chunk is appended to it and
    removed from the list; otherwise the walk advances. The final chunk is
    never merged into anything, so a short trailing chunk is kept as is.

    Args:
        splits: List of strings. It is modified in place.

    Returns:
        The same list object, after merging. Empty and single-element lists
        are returned unchanged (they have no "next" chunk to merge with).
    """
    current = 0
    # Bounding the loop by the last index guarantees splits[current + 1]
    # exists, which also makes empty and one-element inputs safe.
    while current < len(splits) - 1:
        if len(splits[current].splitlines()) < 3:
            # Stay on the same index: the merged chunk may still be too short.
            following = splits.pop(current + 1)
            splits[current] += following
        else:
            current += 1
    return splits

# Source file to index. NOTE: clang-format is run with -i, so this file is
# rewritten in place before it is read.
file_path = "original.txt"

# Reformat the source with a very wide column limit and short functions /
# short if-statements allowed on a single line, then chunk the result.
command = [
    "clang-format",
    "-style={ColumnLimit: 300, AllowShortFunctionsOnASingleLine: All, AllowShortIfStatementsOnASingleLine: true}",
    "-i",
    file_path,
]
subprocess.run(command, check=True)

with open(file_path, "r") as f:
    docs = f.read()

splits = get_code_chunks(docs)
# Drop chunks of two characters or fewer, then fold chunks shorter than
# three lines into their successor.
new_splits = [split for split in splits if len(split) > 2]
new_splits2 = doc_merger(new_splits)
documents = [Document(page_content=split) for split in new_splits2]

# The previous self-assignment of os.environ["GOOGLE_API_KEY"] did nothing
# when the key was set and raised an opaque TypeError when it was not.
# Fail with an actionable message instead.
if "GOOGLE_API_KEY" not in os.environ:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; add it to ../.env or export it before running this cell."
    )
embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
db = Chroma.from_documents(documents=documents, embedding=embeddings)


In [3]:
def combine_docs(docs):
    """Render documents as numbered "Snippet.N" sections.

    Each document becomes ``Snippet.<n>:`` (numbered from 1), a blank line,
    then its ``page_content``. Sections are separated by a blank line.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The joined text; an empty string when ``docs`` is empty.
    """
    sections = []
    for number, doc in enumerate(docs, start=1):
        sections.append(f"Snippet.{number}:\n\n{doc.page_content}")
    return "\n\n".join(sections)


In [4]:
# Safety settings passed to the model: every listed harm category gets the
# "BLOCK_NONE" threshold.
safe = [
    {"category": category, "threshold": "BLOCK_NONE"}
    for category in (
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
]

# NOTE(review): this cell reads GENAI_API_KEY while the embeddings cell relies
# on GOOGLE_API_KEY — confirm both are meant to be set.
genai.configure(api_key=os.environ.get("GENAI_API_KEY"))

# Sampling / output options handed to the generative model.
generation_config = dict(
    temperature=0.1,
    top_p=0.95,
    top_k=64,
    max_output_tokens=8192,
    response_mime_type="text/plain",
)

model = genai.GenerativeModel(
    model_name="gemini-1.5-flash",
    generation_config=generation_config,
    safety_settings=safe,
)

# Chat session with an empty history.
llm = model.start_chat(history=[])

In [5]:
import gc

from sklearn.metrics.pairwise import cosine_similarity

# Assigning os.environ["GOOGLE_API_KEY"] to its own value is a no-op when the
# key exists and an opaque TypeError when it does not; check explicitly.
if "GOOGLE_API_KEY" not in os.environ:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; add it to ../.env or export it before running this cell."
    )


def call_retrieval_sada(pretext, fifty_clean, k=4):
    """Build the prompt context: a fixed leading snippet plus retrieved chunks.

    Runs a similarity search against the module-level vector store ``db``
    using ``pretext`` as the query, formats the hits with ``combine_docs``
    (numbered from ``Snippet.1``), and prepends ``fifty_clean`` under a
    ``Snippet.0`` heading.

    Args:
        pretext: Query text for the similarity search.
        fifty_clean: Text placed verbatim as ``Snippet.0``. An empty string
            still produces the ``Snippet.0`` heading with an empty body.
        k: Number of chunks to retrieve. Defaults to 4, the value that was
            previously hard-coded.

    Returns:
        The combined context string.
    """
    retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": k})
    retrieved_docs = retriever.invoke(pretext)
    formatted_context = combine_docs(retrieved_docs)

    return "Snippet.0: \n\n" + fifty_clean + "\n\n" + formatted_context




### Test your cand sets below

##### Set the `pretext`, `fifty_clean` and `query` variables in the cell below

##### `original.txt`, `prompt_in_coverage.txt`, and `sec_list.txt` must be in the `manual_testing/` directory

In [13]:
# Begin a new chat with an empty history for this test run.
llm = model.start_chat(history=[])

# Prompt template text; it is filled in later with str.format().
with open("prompt_in_coverage.txt") as template_file:
    prompt_template = template_file.read()

# Text substituted into the template's `sec_list` field.
with open("sec_list.txt") as sec_list_file:
    sec_list = sec_list_file.read()


# Retrieval query: passed to call_retrieval_sada(), which hands it to
# retriever.invoke() to fetch similar chunks from the vector store.
pretext = """while (1) {
              if (to_lowcase) {
                fwrite_lowcase(p, (char const *)(ubuf + 1), n__2);
              } else {
                if (to_uppcase) {
                  fwrite_uppcase(p, (char const *)(ubuf + 1), n__2);
                } else {
                  fwrite((void const *)(ubuf + 1), n__2, (size_t)1, p);
                }
              }
              goto while_break___22;
            }
fputc(' ', p);
                      i__6++;
                    }
                  while_break___21:;
                    goto while_break___20;
                  }
                while_break___20:;
                }
              }
            }
            while (1) {

              if (to_lowcase) {
                fwrite_lowcase(p, (char const *)(ubuf + 1), n__2);
              } else {
                if (to_uppcase) {
                  fwrite_uppcase(p, (char const *)(ubuf + 1), n__2);
                } else {
                  fwrite((void const *)(ubuf + 1), n__2, (size_t)1, p);
                }
              }
              goto while_break___22;
            }
          while_break___22:;
          }
          i += incr__2;
          goto while_break___17;
        }
      while_break___17:;
      }
      goto switch_break___1;
    case_67:
      if (modifier == 79) {
        goto bad_format;
      }"""

# Extra context that call_retrieval_sada() places verbatim under the
# "Snippet.0" heading; left empty for this test.
fifty_clean = """"""

# Code under evaluation: passed to prompt_template.format() as `query`.
query = """while (1) {
              if (to_lowcase) {
                fwrite_lowcase(p, (char const *)(ubuf + 1), n__2);
              } else {
                if (to_uppcase) {
                  fwrite_uppcase(p, (char const *)(ubuf + 1), n__2);
                } else {
                  fwrite((void const *)(ubuf + 1), n__2, (size_t)1, p);
                }
              }
              goto while_break___22;
            }"""
# Retrieve context for `pretext` and prepend `fifty_clean` as Snippet.0.
formatted_context = call_retrieval_sada(pretext, fifty_clean)

# Fill the template's three fields and send the result to the chat session.
prompt = prompt_template.format(
    sec_list=sec_list,
    formatted_context=formatted_context,
    query=query,
)
response = llm.send_message(prompt).text

print("\nRESPONSE:\n", response)



RESPONSE:
 Class 1: This code is not directly related to input preprocessing, parsing, or date calculations. It appears to be involved in output formatting.
Class 2: This code might be somewhat unnecessary if the program only supports a limited set of output formats. However, it is likely needed for more complex formatting options.
Class 3: This code is necessary for the program to handle different output formatting options, including case sensitivity.
Class 4: This code is not critical for the program's core functionality. The program would still function without it, but the output might not be formatted as intended.
Class 5: The context provided is sufficient to understand the purpose of this code.
Class 6: This code is used in the provided context.

Explanation: The code snippet is part of a larger loop that handles output formatting. It checks for flags indicating whether the output should be in lowercase, uppercase, or the original case. This code is necessary for the program to 