In [1]:
import sys
import pysqlite3

sys.modules['sqlite3'] = pysqlite3

import os
import time
import warnings
import ollama
from functools import cached_property
from langchain_community.llms import AzureOpenAI
from langchain_community.document_loaders import PyPDFLoader,TextLoader
from langchain_text_splitters import (Language,RecursiveCharacterTextSplitter)
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import Ollama
import google.generativeai as genai
from groq import Groq
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import requests
import voyageai
from langchain.retrievers import ContextualCompressionRetriever
from langchain_voyageai import VoyageAIEmbeddings,VoyageAIRerank
from tree_sitter_languages import get_language, get_parser
from llama_index.core.text_splitter import CodeSplitter
from dotenv import load_dotenv
import subprocess

from chunker import get_code_chunks

from dotenv import load_dotenv

load_dotenv(dotenv_path="../.env")

  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
class Document:
    """Minimal container pairing a chunk of text with optional metadata.

    Attributes:
        page_content: The text of the chunk.
        metadata: Whatever the caller supplied, or ``None`` when omitted.
    """

    def __init__(self, page_content, metadata=None):
        # Both values are stored unchanged; no copying or validation is done.
        self.page_content, self.metadata = page_content, metadata

def doc_merger(splits):
    """Merge chunks shorter than three lines into the chunk that follows them.

    Walks the list from the front. Whenever the current chunk has fewer than
    three lines (per ``str.splitlines``) the next chunk is appended to it and
    removed from the list; the enlarged chunk is then re-checked before the
    cursor advances. The final chunk has no successor, so it is kept as-is
    even when it is short.

    Args:
        splits: List of string chunks. The list is modified in place.

    Returns:
        The same list object, after merging. Empty and single-element lists
        are returned unchanged.
    """
    current = 0
    # Stop once the cursor is on the last chunk (or the list is empty): there
    # is nothing left to merge into it. Checking before the body, rather than
    # after it, avoids indexing past the end for lists of length 0 or 1.
    while current < len(splits) - 1:
        if len(splits[current].splitlines()) < 3:
            # Too short to stand alone: absorb the next chunk and re-check.
            splits[current] += splits[current + 1]
            splits.pop(current + 1)
        else:
            current += 1
    return splits

# Source file to index. It is reformatted in place by clang-format (-i) below.
file_path = "original.txt"

# ColumnLimit: 300 plus the AllowShort* options are passed to clang-format
# before chunking — presumably so that short functions and `if` statements
# stay on one line; confirm against the chunker's expectations.
command = ["clang-format","-style={ColumnLimit: 300, AllowShortFunctionsOnASingleLine: All, AllowShortIfStatementsOnASingleLine: true}","-i",file_path]

# check=True: a non-zero clang-format exit status raises CalledProcessError.
subprocess.run(command, check=True)

with open(file_path, "r") as f:
    docs = f.read()

splits = get_code_chunks(docs)
# Drop chunks of two characters or fewer, then fold short chunks into the
# chunk that follows them.
new_splits = [split for split in splits if len(split) > 2]
new_splits2 = doc_merger(new_splits)
documents = [Document(page_content=split) for split in new_splits2]
# save documents to files
# for i, doc in enumerate(documents):
#     with open(f"docs/doc_{i}.txt", "w") as f:
#         f.write(doc.page_content)

# Fail with a clear message when the key is missing. The previous
# self-assignment (os.environ[k] = os.environ.get(k)) did nothing when the key
# was set and raised an opaque TypeError when it was not.
if not os.environ.get("GOOGLE_API_KEY"):
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; define it in ../.env or in the environment"
    )
embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
db = Chroma.from_documents(documents=documents, embedding=embeddings)


In [3]:
def combine_docs(docs):
    """Render documents as numbered snippets separated by blank lines.

    Each document becomes ``Snippet.<n>:`` followed by a blank line and its
    ``page_content``; numbering starts at 1.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        A single string; empty when ``docs`` yields nothing.
    """
    labelled = []
    for number, doc in enumerate(docs, start=1):
        labelled.append(f"Snippet.{number}:\n\n{doc.page_content}")
    return "\n\n".join(labelled)


In [4]:
# Safety settings: the same BLOCK_NONE threshold is applied to each of the
# four harm categories listed below.
safe = [
    {"category": harm_category, "threshold": "BLOCK_NONE"}
    for harm_category in (
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
]

# The Gemini client key comes from GENAI_API_KEY (a different variable from
# the GOOGLE_API_KEY checked elsewhere in this notebook).
genai.configure(api_key=os.environ.get("GENAI_API_KEY"))

# Sampling parameters handed to the model.
generation_config = {
    "temperature": 0.1,
    "top_p": 0.95,
    "top_k": 64,
    "max_output_tokens": 8192,
    "response_mime_type": "text/plain",
}

model = genai.GenerativeModel(
    model_name="gemini-1.5-flash",
    generation_config=generation_config,
    safety_settings=safe,
)

# Chat session that starts with an empty history.
llm = model.start_chat(history=[])

In [5]:
import gc

from sklearn.metrics.pairwise import cosine_similarity

# Fail early with a clear message if the Google API key is missing. The
# previous self-assignment did nothing when the key was set and raised an
# opaque TypeError (environment values must be str, not None) when it was not.
if not os.environ.get("GOOGLE_API_KEY"):
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; define it in ../.env or in the environment"
    )


def call_retrieval_sada(pretext, fifty_clean):
    """Build the snippet context for a prompt from the vector store.

    Queries the module-level ``db`` for the 4 chunks most similar to
    ``pretext``, formats them with ``combine_docs`` (Snippet.1 onwards), and
    prepends ``fifty_clean`` as ``Snippet.0``.

    Args:
        pretext: Text used as the similarity-search query.
        fifty_clean: Text placed under the ``Snippet.0`` heading; may be empty.

    Returns:
        The combined context string.
    """
    top_k_retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 4})
    neighbours = top_k_retriever.invoke(pretext)
    numbered_snippets = combine_docs(neighbours)
    return "Snippet.0: \n\n" + fifty_clean + "\n\n" + numbered_snippets




### Test your candidate sets below

##### Set the `pretext`, `fifty_clean` and `query` variables in the cell below

##### `original.txt`, `prompt_in_coverage.txt`, and `sec_list.txt` must be in the `manual_testing/` directory

In [20]:
# Open a new chat session with an empty history for this run.
llm = model.start_chat(history=[])

# The prompt template and the section list are read from the working directory.
with open("prompt_in_coverage.txt", "r") as template_file:
    prompt_template = template_file.read()

with open("sec_list.txt", "r") as sections_file:
    sec_list = sections_file.read()


# Text used as the similarity-search query: it is passed as the first argument
# to call_retrieval_sada(pretext, fifty_clean) further down in this cell.
# The literal is a fragment of C code followed by the opening lines of `sort`.
pretext = """
        }
        ntemps -= tmp___2;
        (files + out)->name = (char const *)(temp->name);
        (files + out)->temp = temp;
        in += num_merged;
        out++;
      }
    while_break___0:
      remainder = nfiles - in;
      cheap_slots = (unsigned long)nmerge - out % (unsigned long)nmerge;
      if (cheap_slots < remainder) {
        nshortmerge = (remainder - cheap_slots) + 1UL;
        tmp___3 = create_temp(&tfp___0);
        temp___0 = tmp___3;
        if (ntemps < nshortmerge) {
          tmp___4 = ntemps;
        } else {
          tmp___4 = nshortmerge;
        }
        tmp___5 = mergefiles(files + in, tmp___4, nshortmerge, tfp___0, (char const *)(temp___0->name));
        num_merged___0 = tmp___5;
        if (ntemps < num_merged___0) {
          tmp___6 = ntemps;
        } else {
          tmp___6 = num_merged___0;
        }
        ntemps -= tmp___6;
        (files + out)->name = (char const *)(temp___0->name);
        tmp___7 = out;
        out++;
        (files + tmp___7)->temp = temp___0;
        in += num_merged___0;
      }
      memmove((void *)(files + out), (void const *)(files + in), (nfiles - in) * sizeof(*files));
      ntemps += out;
      nfiles -= in - out;
    }
  while_break:
    avoid_trashing_input(files, ntemps, nfiles, output_file);
    while (1) {
      tmp___8 = open_input_files(files, nfiles, &fps);
      nopened = tmp___8;
      
static void sort(char *const *files, size_t nfiles, char const *output_file, size_t nthreads) {
  struct buffer buf___1;
  size_t ntemps;
  _Bool output_file_created;
  char const *temp_output;
  char const *file;"""

# Text placed under the "Snippet.0" heading by call_retrieval_sada; left empty
# for this run.
fifty_clean = ""

# Code snippet substituted into the prompt template's `query` placeholder by
# the prompt_template.format(...) call further down in this cell.
query = """
  if (cheap_slots < remainder) {
    nshortmerge = (remainder - cheap_slots) + 1UL;
    tmp___3 = create_temp(&tfp___0);
    temp___0 = tmp___3;
    if (ntemps < nshortmerge) {
      tmp___4 = ntemps;
    } else {
      tmp___4 = nshortmerge;
    }
    tmp___5 = mergefiles(files + in, tmp___4, nshortmerge, tfp___0,
                         (char const *)(temp___0->name));
    num_merged___0 = tmp___5;
    if (ntemps < num_merged___0) {
      tmp___6 = ntemps;
    } else {
      tmp___6 = num_merged___0;
    }
    ntemps -= tmp___6;
    (files + out)->name = (char const *)(temp___0->name);
    tmp___7 = out;
    out++;
    (files + tmp___7)->temp = temp___0;
    in += num_merged___0;
  }
"""
# Retrieve the snippet context for this pretext.
formatted_context = call_retrieval_sada(pretext, fifty_clean)

# Fill the three placeholders of the prompt template.
prompt = prompt_template.format(
    sec_list=sec_list,
    formatted_context=formatted_context,
    query=query,
)

# Send the prompt through the chat session and keep only the reply text.
response = llm.send_message(prompt).text

print("\nRESPONSE:\n", response)



RESPONSE:
 Class 1: This code is not directly involved in multi-threading, input processing, sorting, or output. It's part of the merging logic, which is a step in the sorting process, but not the core functionality itself.
Class 2: This code is not strictly necessary for the basic sorting functionality. It handles a specific edge case during merging, where the number of files to be merged doesn't divide evenly by the number of merge slots.
Class 3: This code is necessary for the efficient and correct merging of sorted files. It ensures that all files are merged in a way that minimizes the number of temporary files created and optimizes the merging process.
Class 4: This code is not critical for the program's functionality. The program would still function without it, albeit with potentially less efficient merging.
Class 5: The context provided is sufficient to understand the purpose of this code. It's part of the merging logic within the sorting process.
Class 6: This code is used in

In [21]:
# Show the retrieval context that was substituted into the prompt in the
# previous cell, so the model's answer can be checked against it.
print(formatted_context)

Snippet.0: 



Snippet.1:

  }
}
static size_t mergefiles(struct sortfile *files, size_t ntemps, size_t nfiles, FILE *ofp, char const *output_file) {
  FILE **fps;
  size_t nopened;
  size_t tmp;
  char *tmp___0;

  {
    tmp = open_input_files(files, nfiles, &fps);
    nopened = tmp;
    if (nopened < nfiles) {
      if (nopened < 2UL) {
        tmp___0 = gettext("open failed");
        die((char const *)tmp___0, (files + nopened)->name);
      }
    }
    mergefps(files, ntemps, nopened, ofp, output_file, fps);
    return (nopened);
  }
}
static void mergelines(struct line *__restrict t, size_t nlines, struct line const *__restrict lo) {
  size_t nlo;
  size_t nhi;
  struct line *hi;
  int tmp;

  {
    nlo = nlines / 2UL;
    nhi = nlines - nlo;
    hi = (struct line *)(t - nlo);
    while (1) {
      tmp = compare((struct line const *)(lo - 1), (struct line const *)(hi - 1));
      if (tmp <= 0) {
        t--;
        lo--;
        *t = (struct line) * lo;
        nlo--;
        if