In [2]:
import json
import re

from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import USearch
from langchain.document_loaders import ArxivLoader

from utils.paper_utils import tfidf_similarity

In [3]:
def reformat_text(doc_content):
    content = doc_content.replace('-\n', '')
    content = re.sub(r'(?<!\n)\n(?!\n)', ' ', content)
    content = re.sub(' +', ' ', content)
    return content
    
def preprocess(text):
    """ Clean and simplify text string. """
    text = ''.join(c.lower() if c.isalnum() else ' ' for c in text)
    return text

In [4]:
## Mapping file.
with open("../arxiv_code_map.json", 'r') as f:
    arxiv_code_map = json.load(f)

In [17]:
for k, v in arxiv_code_map.items():
    docs = ArxivLoader(query=preprocess(v), load_max_docs=4).load()
    docs = sorted(docs, key=lambda x: tfidf_similarity(v, x.metadata["Title"]),
                  reverse=True)
    new_title = docs[0].metadata["Title"]
    title_sim = tfidf_similarity(v, new_title)
    if title_sim < 0.7:
        print(f"No similar title name found for {v}.")
        continue
    ## Conteineria.
    doc_content = docs[0].page_content
    # doc_content = reformat_text(doc_content)
    with open(f"../data/arxiv/{k}.txt", 'w') as f:
        f.write(doc_content)

No similar title name found for One-for-All: Generalized LoRA for Parameter-Efficient Fine-tuning.
No similar title name found for Invariant Language Modeling.


TimeoutError: [Errno 60] Operation timed out

In [7]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    length_function = len,
    add_start_index = True,
    chunk_overlap=20
)
split_docs = text_splitter.create_documents([doc_content])

In [9]:
## Embeddings.
embeddings = HuggingFaceEmbeddings(model_name="all-mpnet-base-v2")

## Store.
db = USearch.from_documents(split_docs, embeddings)

In [13]:
db.similarity_search_with_score("example prompts")

[(Document(page_content='that reproduces this issue”, the start of a block of code in Markdown, (i.e., ```), and finally the partial code snippet public void test whose role is to induce the LLM to write a test method. TABLE II: Example bug report (Defects4J Math-63). Issue No. MATH-3701 Title NaN in “equals” methods Description In “MathUtils”, some “equals” methods will return true if both argument are NaN. Unless I’m mistaken, this contradicts the IEEE standard. If nobody objects, I’m going to make the changes. Listing 1: Example prompt without examples. 1 # NaN in "equals" methods 2 ## Description 3 In "MathUtils", some "equals" methods will return true if both argument are NaN. 4 Unless I\'m mistaken, this contradicts the IEEE standard. 5 If nobody objects, I\'m going to make the changes. 6 7 ## Reproduction 8 >Provide a self-contained example that reproduces this issue. 9 ``` 10 public void test We evaluate a range of variations of this basic prompt. Brown et al. [11] report that 

In [10]:
new_title

'Large Language Models are Few-shot Testers: Exploring LLM-based General Bug Reproduction'