In [1]:
import os
import time
import warnings
import ollama
from functools import cached_property
from langchain_community.llms import AzureOpenAI
from langchain_community.document_loaders import PyPDFLoader,TextLoader
from langchain_text_splitters import (Language,RecursiveCharacterTextSplitter)
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import Ollama
import google.generativeai as genai
from groq import Groq
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import requests
import voyageai
from langchain.retrievers import ContextualCompressionRetriever
from langchain_voyageai import VoyageAIEmbeddings,VoyageAIRerank
from tree_sitter_languages import get_language, get_parser
from llama_index.core.text_splitter import CodeSplitter
from dotenv import load_dotenv
import subprocess

from chunker import get_code_chunks

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class Document:
    """Lightweight container pairing a text chunk with its metadata.

    Instances are what this notebook hands to ``Chroma.from_documents``.

    Args:
        page_content: The text of the chunk.
        metadata: Optional metadata for the chunk. When omitted, a new empty
            dict is used so that ``metadata`` is always usable as a mapping.
    """

    def __init__(self, page_content, metadata=None):
        self.page_content = page_content
        # Build the default per instance; sharing one dict between documents
        # would let a mutation on one leak into all the others.
        self.metadata = {} if metadata is None else metadata

def doc_merger(splits):
    """Merge chunks shorter than three lines into the chunk that follows them.

    Walks the list from the front. While the chunk at the current position has
    fewer than three lines, the next chunk is appended to it (plain string
    concatenation, no separator added) and removed from the list. The final
    chunk has no successor, so it is never inspected on its own and may remain
    short.

    Args:
        splits: List of string chunks. It is modified in place.

    Returns:
        The same list object. Empty and single-element lists come back
        unchanged.
    """
    current = 0
    # Bounding the loop by "has a successor" makes empty and one-element
    # inputs safe; indexing splits[current + 1] is only reached when it exists.
    while current < len(splits) - 1:
        if len(splits[current].splitlines()) < 3:
            # Still too short: absorb the next chunk and re-check this position.
            splits[current] += splits[current + 1]
            del splits[current + 1]
        else:
            current += 1
    return splits

# Reformat original.txt in place (-i) with clang-format before chunking. The
# inline style raises the column limit to 300 and allows short functions and
# short if-statements to stay on a single line.
command = ["clang-format","-style={ColumnLimit: 300, AllowShortFunctionsOnASingleLine: All, AllowShortIfStatementsOnASingleLine: true}","-i","original.txt"]

# check=True turns a non-zero clang-format exit status into CalledProcessError.
subprocess.run(command, check=True)

file_path = "original.txt"
with open(file_path, "r") as f:
    docs = f.read()

# Chunk the source, discard chunks of two characters or fewer, then fold
# chunks shorter than three lines into their successor.
splits = get_code_chunks(docs)
new_splits = [split for split in splits if len(split) > 2]
new_splits2 = doc_merger(new_splits)
documents = [Document(page_content=split) for split in new_splits2]
# Optional debugging aid: write every chunk to its own file.
# for i, doc in enumerate(documents):
#     with open(f"docs/doc_{i}.txt", "w") as f:
#         f.write(doc.page_content)

# Fail fast with a readable message when a key is missing. Copying
# os.environ.get(...) back into os.environ does nothing when the key exists
# and raises an opaque TypeError (None is not a str) when it does not.
for required_key in ("VOYAGE_API_KEY", "GOOGLE_API_KEY"):
    if required_key not in os.environ:
        raise RuntimeError(f"{required_key} must be set in the environment before running this cell")

embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
db = Chroma.from_documents(documents=documents, embedding=embeddings)


In [3]:
def combine_docs(docs):
    """Render documents as numbered snippets separated by blank lines.

    Each document becomes ``Snippet.<n>:`` (numbering starts at 1), a blank
    line, and then the document's ``page_content``. An empty input yields an
    empty string.
    """
    labelled = []
    for number, doc in enumerate(docs, start=1):
        labelled.append(f"Snippet.{number}:\n\n{doc.page_content}")
    return "\n\n".join(labelled)


In [4]:
# Safety settings: every listed harm category gets the BLOCK_NONE threshold.
safe = [
    {"category": harm_category, "threshold": "BLOCK_NONE"}
    for harm_category in (
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
]

# The Gemini client reads its key from GENAI_API_KEY (None is passed if unset).
genai.configure(api_key=os.environ.get("GENAI_API_KEY"))

# Sampling and output options handed to the model below.
generation_config = dict(
    temperature=0.1,
    top_p=0.95,
    top_k=64,
    max_output_tokens=8192,
    response_mime_type="text/plain",
)

model = genai.GenerativeModel(
    model_name="gemini-1.5-flash",
    generation_config=generation_config,
    safety_settings=safe,
)

# Chat session that starts with an empty history.
llm = model.start_chat(history=[])

In [5]:
import gc

from sklearn.metrics.pairwise import cosine_similarity

# Fail with a clear message if the key is absent. Re-assigning
# os.environ.get(...) into os.environ is a no-op when the key is set and a
# TypeError (None is not a str) when it is not.
if "GOOGLE_API_KEY" not in os.environ:
    raise RuntimeError("GOOGLE_API_KEY must be set in the environment before running this cell")


def call_retrieval_sada(pretext, fifty_clean, k=4):
    """Build the snippet context string for a prompt.

    Runs a similarity search over the module-level ``db`` vector store using
    ``pretext`` as the query, formats the hits with ``combine_docs``
    (Snippet.1 onwards), and prepends ``fifty_clean`` as ``Snippet.0``.

    Args:
        pretext: Text used as the similarity-search query.
        fifty_clean: Text inserted verbatim as Snippet.0, ahead of the
            retrieved snippets.
        k: Number of documents to request from the retriever. Defaults to 4,
            the value that was previously hard-coded.

    Returns:
        A single string: Snippet.0 followed by the retrieved snippets.
    """
    retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": k})
    retrieved_docs = retriever.invoke(pretext)
    formatted_context = combine_docs(retrieved_docs)

    # Snippet.0 is always the caller-supplied window; retrieved hits follow it.
    return "Snippet.0: \n\n" + fifty_clean + "\n\n" + formatted_context




### Test your candidate sets below

##### Set the `pretext`, `fifty_clean`, and `query` variables in the cell below

##### `original.txt`, the prompt file (`single_cand_prompt.txt`), and `sec_list.txt` must be in the `manual_testing/` directory

In [16]:
# Open a new chat session with an empty history for this run.
llm = model.start_chat(history=[])

# Prompt template; it is filled in later with str.format.
with open('single_cand_prompt.txt', 'r') as prompt_file:
    prompt_template = prompt_file.read()

# Text substituted into the template's sec_list field.
with open('sec_list.txt', 'r') as sec_file:
    sec_list = sec_file.read()


# Text passed to call_retrieval_sada as the similarity-search query.
# Continuation lines inside the literal keep their four-space indent.
pretext = """size_t w__5;
    int tmp___19;
    size_t incr__5;
    size_t tmp___20;
    size_t delta__5;
    size_t i__12;
    size_t i__13;
    size_t i__14;
    size_t n__6;
    size_t w__6;
    int tmp___21;
    size_t incr__6;
    size_t tmp___22;
    size_t delta__6;
    size_t i__15;
    size_t i__16;
    size_t n__7;
    size_t w__7;
    int tmp___23;
    size_t incr__7;"""

# Text passed to call_retrieval_sada, which inserts it verbatim as Snippet.0
# ahead of the retrieved snippets. Its first 20 lines repeat `pretext`.
fifty_clean = """size_t w__5;
    int tmp___19;
    size_t incr__5;
    size_t tmp___20;
    size_t delta__5;
    size_t i__12;
    size_t i__13;
    size_t i__14;
    size_t n__6;
    size_t w__6;
    int tmp___21;
    size_t incr__6;
    size_t tmp___22;
    size_t delta__6;
    size_t i__15;
    size_t i__16;
    size_t n__7;
    size_t w__7;
    int tmp___23;
    size_t incr__7;
    size_t tmp___24;
    size_t delta__7;
    size_t i__17;
    size_t i__18;
    int j;
    size_t n__8;
    size_t w__8;
    int tmp___25;
    size_t incr__8;
    size_t tmp___26;
    size_t delta__8;
    size_t i__19;
    size_t i__20;
    struct tm ltm;
    time_t t;
    int d;
    int tmp___27;
    size_t n__9;
    size_t w__9;
    int tmp___28;
    size_t incr__9;
    size_t tmp___29;
    size_t delta__9;
    size_t i__21;
    size_t i__22;
    int year___1;
    int tmp___30;
    int year_adjust;
    int days;
    int tmp___31;
    int tmp___32;
    int d___0;
    int tmp___33;
    int tmp___34;
    int yy;
    int tmp___35;
    int yy___0;
    size_t n__10;
    size_t tmp___36;
    size_t w__10;
    int tmp___37;
    size_t incr__10;
    size_t tmp___38;
    size_t delta__10;
    size_t i__23;
    size_t i__24;
    int diff;
    int hour_diff;
    int min_diff;
    int sec_diff;
    int flen;
    size_t n__11;
    size_t w__11;
    int tmp___39;
    size_t incr__11;
    size_t tmp___40;
    size_t delta__11;
    size_t i__25;
    size_t i__26;"""

# Statement substituted into the template's query field.
query = """int tmp___21;"""

# Retrieve the snippet context, fill the template, and send it to the chat.
formatted_context = call_retrieval_sada(pretext, fifty_clean)
prompt = prompt_template.format(
    sec_list=sec_list,
    formatted_context=formatted_context,
    query=query,
)
reply = llm.send_message(prompt)
response = reply.text

print("\nRESPONSE:\n",response)



RESPONSE:
 Class 1: This code is not directly involved in date formatting, calculation, input processing, parsing, or output formatting. It does not contribute to any of the required functionalities.
Class 2: This code is not directly involved in date formatting, calculation, input processing, parsing, or output formatting. It does not contribute to any of the required functionalities.
Class 3: This code is not directly involved in date formatting, calculation, input processing, parsing, or output formatting. It does not contribute to any of the required functionalities.
Class 4: This code is a variable declaration, which is crucial for defining the type and size of data, allocating memory, and preventing errors. While its specific purpose in this context is unclear, it is safer to keep it than to remove it.

Explanation: The code snippet is a variable declaration, which is essential for program structure and potentially for the overall functionality. However, without more context, it