In [30]:
%load_ext autoreload
import os

def chdir_to_project_root(marker='notebooks'):
    """Step out of the `marker` folder so relative paths resolve from its parent.

    Only the final component of the current working directory is compared
    with `marker`. A blanket ``str.replace`` on the whole path would also
    mangle unrelated components that merely contain the word (for example
    ``notebooks_archive``). When the working directory is not the `marker`
    folder nothing happens, so re-running the cell is harmless.
    """
    cwd = os.getcwd()
    if os.path.basename(cwd) == marker:
        os.chdir(os.path.dirname(cwd))


chdir_to_project_root()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [41]:
from src.utils import files_in_dir

In [10]:
# import
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Weaviate
from langchain.document_loaders import TextLoader, JSONLoader
import weaviate
# Load one story prompt file. The path is relative to the project root because
# the first cell of this notebook changes the working directory out of
# `notebooks/`; the other data paths in this notebook use the same `./data/...` form.
loader = TextLoader("./data/stories/donaldson1/04_2_section_plot_prompt.txt")
documents = loader.load()

# Split on paragraph breaks first, then on single newlines, into ~1000-character
# chunks that overlap by 200 characters.
text_splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n"], chunk_size=1000, chunk_overlap=200)
docs = text_splitter.split_documents(documents)

# Local sentence-transformers model used to embed the chunks.
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
WEAVIATE_URL = 'http://localhost:8080'

# Store the chunks in Weaviate (not Chroma).
client = weaviate.Client(url=WEAVIATE_URL)
wv = Weaviate(client, index_name = 'test', text_key = 'body')
# NOTE(review): `from_documents` is a classmethod on LangChain vector stores, so
# the index_name/text_key given to `wv` presumably do not apply to `db` —
# confirm which index the chunks actually land in.
db = Weaviate.from_documents(docs, embedding_function, weaviate_url=WEAVIATE_URL, by_text=False)

# Query the store; note that `docs` is rebound from chunks to search hits here.
query = "Where are the squirrels"
docs = db.similarity_search(query)

# Show the text of the best match.
print(docs[0].page_content)

Scene 7: The animal crew cheer and help Oliver dig, using their paws, claws, and beaks to excavate the dirt and sand. Finally, they hit something solid and metallic.

Scene 8: "It's the chest! We found it!" Oliver yowls, jumping up and down with excitement. They open the chest with a creak, and inside they find piles of gold coins, jewels, and shiny trinkets.

Scene 9: "We did it! We found the treasure!" Polly sings, twirling around in joy. "And it's all thanks to our teamwork and brainwork."

Scene 10: "Arrr, that was a tricky map, indeed," Salty pants, taking a gulp of water from his bottle. "But we cracked it like a nut!" 

Scene 11: Oliver grins, feeling proud of his animal crew. They might not be real pirates, but they are definitely brave, clever, and loyal friends. "Let's celebrate our victory with some fish and milk," he meows, and they all feast on their well-deserved reward.


In [15]:
pip install jq

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting jq
  Downloading jq-1.4.1-cp39-cp39-macosx_10_9_x86_64.whl (365 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.1/365.1 kB[0m [31m626.2 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: jq
Successfully installed jq-1.4.1
Note: you may need to restart the kernel to use updated packages.


In [59]:
# import
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Weaviate
from langchain.document_loaders import TextLoader, JSONLoader
import weaviate
from tqdm import tqdm
# Every cleaned arXiv JSON file to ingest.
ls_files = files_in_dir('./data/arxiv/clean_txt/',['.json'])

# Local sentence-transformers model used to embed the chunks.
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
WEAVIATE_URL = 'http://localhost:8080'

# Weaviate (not Chroma) is the vector store.
client = weaviate.Client(url=WEAVIATE_URL)
wv = Weaviate(client, index_name = 'rag0', text_key = 'body')

# The splitter does not depend on the file, so build it once outside the loop.
text_splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n"], chunk_size=1000, chunk_overlap=200)

all_docs = []
for path in tqdm(ls_files):
    # One Document per `.body` entry of the file's top-level JSON array.
    loader = JSONLoader(path, jq_schema= '.[].body')
    documents = loader.load()

    docs = text_splitter.split_documents(documents)
    # extend() appends in place; `all_docs + docs` copied the whole list each iteration.
    all_docs.extend(docs)

# NOTE(review): `from_documents` is a classmethod on LangChain vector stores, so
# the index_name/text_key given to `wv` presumably do not apply to `db` —
# confirm which index the chunks actually land in.
db = Weaviate.from_documents(all_docs, embedding_function, weaviate_url=WEAVIATE_URL, by_text=False)


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:02<00:00, 23.24it/s]


In [60]:
# Similarity search against the vector store held in `db`; `docs` is rebound to the search hits.
query = "Retrieval augmented generation"
docs = db.similarity_search(query)

# Print the text of the first hit (raises IndexError if the search returned nothing).
print(docs[0].page_content)

In this section, we formally deﬁne single-time retrieval-augmented generation and propose the framework of active retrieval augmented generation that decides when and what to retrieve throughout the generation.


In [32]:
import os
# Read the OpenAI key from a local file and expose it through the environment.
with open('openai.key', 'r') as f:
    # strip() removes the trailing newline/whitespace that would otherwise
    # become part of the key value.
    openai_api_key = f.read().strip()
os.environ['OPENAI_API_KEY'] = openai_api_key

In [33]:
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

In [87]:
from langchain.chains.question_answering import load_qa_chain
from langchain import PromptTemplate
# Prompt text with two placeholders: {context} and {question}.
# The third sentence tells the model to honour formatting hints that are
# embedded in the question itself.
template = """Use the following pieces of context to answer the question at the end. 
If you don't know the answer, just say that you don't know, don't try to make up an answer. 
Follow any additional instructions provided in the question section when forming the answer. For instance, "use bullet points" or "use simple language".

{context}
Question: {question}
Helpful Answer:"""

QA_CHAIN_PROMPT = PromptTemplate(template=template, input_variables=["context", "question"])

In [94]:

# Question-answering chain: retrieves from `db`, fills QA_CHAIN_PROMPT and asks
# OpenAI at temperature 0. The source documents are returned with the answer so
# later cells can inspect them.
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(temperature=0, max_tokens=1000),
    chain_type="stuff",
    retriever=db.as_retriever(),
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
    return_source_documents=True,
)

In [108]:
# Ask the QA chain for a bulleted answer; the formatting request is part of the question text.
query = 'in bullet points, provide examples of retrieval augmented generation setups'
result = qa({"query": query})


In [110]:
# print() renders the newlines in the answer instead of showing the escaped repr.
print(result['result'])

 

• Sparse-vector Retrieval
• Dense-vector Retrieval
• Task-specific Retrieval
• Retrieval Memory
• Data Augmentation
• Attention Mechanism
• Skeleton & Templates
• Information Retrieval


In [105]:
# One list entry per line of the answer; blank lines become short strings that the follow-up loop filters out.
ls_answ = result['result'].split('\n')

In [111]:

# Follow-up pass: request a one-sentence explanation for every line of the earlier answer.
for q in ls_answ:
    if len(q) <= 2:
        # Ignore fragments of two characters or fewer (e.g. blank lines).
        continue
    query = f'in a single sentence, explain retrieval augmented generation using {q}'
    r = qa({"query": query})
    print(r['result'])


 Retrieval augmented generation is a paradigm that uses sparse-vector retrieval to augment language models for downstream NLP tasks.
 Retrieval augmented generation is a framework that uses dense-vector retrieval to decide when and what to retrieve during generation.
 Retrieval augmented generation is a process of using task-specific retrieval to supplement a language model in order to generate text.
 Retrieval augmented generation is a framework that uses a retrieval memory to decide when and what to retrieve throughout the generation process.
 Retrieval Augmented Generation is a method of data augmentation that uses external information to supplement a parametric model in order to improve the performance of language models in knowledge-intensive tasks.
 Retrieval augmented generation is a paradigm that uses an attention mechanism to retrieve relevant information from external sources to improve the quality of generated text.
 Retrieval augmented generation uses skeletons and template

In [98]:
# Print the answer text of whatever `result` currently holds in the kernel.
print(result['result'])

 

• Sparse-vector Retrieval
• Dense-vector Retrieval
• Task-specific Retrieval
• Retrieval Memory
• Data Augmentation
• Attention Mechanism
• Skeleton & Templates
• Information Retrieval


In [66]:
# Run the QA chain on a free-form question; the last line shows the answer text as the cell output.
query = 'Describe retrieval augmented generation'
result = qa({"query": query})
result['result']

" Retrieval augmented generation is a process where a language model (LM) is prompted to generate retrieval queries when necessary while generating an answer using retrieval-encouraging instructions, or by directly using the LM's generation as search queries. It is a simple and generic retrieval-augmented LM that actively decides when and what to retrieve throughout the generation process, and is applicable to a variety of long-form generation tasks."

In [72]:
# Same chain, with a style hint ("in simple language") embedded in the question.
query = 'Describe text embeddings in simple language'
result = qa({"query": query})
result['result']

' Text embeddings are dense vectors that represent a sequence of text tokens. They are obtained by obtaining the hidden representation of each token from the last encoding layer of a text-to-text Transformer model and then performing mean-pooling over them to get a single dense vector.'

In [70]:
# Run the QA chain on another question; `result` is overwritten with this response.
query = 'Describe vector stores'
result = qa({"query": query})
result['result']

' Vector stores refer to the low-dimensional dense vectors that are created using BERT-based encoders and used to measure the similarity between two pieces of text. They are used in dense-vector retrieval methods to measure semantic relevance rather than lexical overlap.'

In [86]:
# Metadata of every source document attached to the current answer.
[source_doc.metadata for source_doc in result['source_documents']]

[{'seq_num': 5,
  'source': '/Users/md/Downloads/ws/md-similacra/data/arxiv/clean_txt/gpt_pp01_2305.06983v1.json'},
 {'seq_num': 12,
  'source': '/Users/md/Downloads/ws/md-similacra/data/arxiv/clean_txt/gpt_pp01_2202.01110v2.json'},
 {'seq_num': 2,
  'source': '/Users/md/Downloads/ws/md-similacra/data/arxiv/clean_txt/gpt_pp01_2202.01110v2.json'},
 {'seq_num': 3,
  'source': '/Users/md/Downloads/ws/md-similacra/data/arxiv/clean_txt/gpt_pp01_2305.06983v1.json'}]

In [103]:
# Display the full response dict of the most recent chain call.
result

{'query': 'in bullet points, provide examples of retrieval augmented generation setups',
 'result': ' \n\n• Sparse-vector Retrieval\n• Dense-vector Retrieval\n• Task-specific Retrieval\n• Retrieval Memory\n• Data Augmentation\n• Attention Mechanism\n• Skeleton & Templates\n• Information Retrieval',
 'source_documents': [Document(page_content='In this section, we formally deﬁne single-time retrieval-augmented generation and propose the framework of active retrieval augmented generation that decides when and what to retrieve throughout the generation.', metadata={'seq_num': 5, 'source': '/Users/md/Downloads/ws/md-similacra/data/arxiv/clean_txt/gpt_pp01_2305.06983v1.json'}),
  Document(page_content='In this paper, we surveyed recent approaches for retrieval-augmented text generation. We reviewed and summarized the development of different components of retrieval-augmented text generation including retrieval metrics, retrieval sources, and integration paradigms. We gave in-depth discussion

In [36]:
# Number of source documents returned alongside the answer.
len(result['source_documents'])

4

In [91]:
# Folder holding the downloaded arXiv source archives.
path_folder = './data/arxiv/tar/'

# Archive to inspect. An earlier assignment to
# './data/arxiv/tar/2009.08553v4.tax.gz' was overwritten on the very next line
# and has been removed; swap the file name here to inspect that paper instead.
# NOTE(review): the extension is `.tax.gz`, not `.tar.gz` — confirm the files on
# disk really carry this name.
path_file = './data/arxiv/tar/2104.07713v2.tax.gz'


In [94]:
import tarfile

# Start from an empty result so an archive without a top-level .tex file gives
# an empty list instead of leaving `ltx_lines` undefined (or stale from an
# earlier run).
ltx_lines = []

# The context manager closes the archive even if reading or decoding fails.
with tarfile.open(path_file, "r:gz") as tar:
    for member in tar.getmembers():
        # Only .tex files at the archive root (no '/' in the member name).
        if member.name.endswith('.tex') and ('/' not in member.name):
            print(member)
            ltx = tar.extractfile(member).readlines()
            # Decode each line and drop full-line LaTeX comments. If several
            # members match, the last one wins, as before.
            ltx_lines = [
                line
                for line in (raw.decode('utf-8') for raw in ltx)
                if not line.startswith('%')
            ]

<TarInfo 'bare_jrnl_compsoc.tex' at 0x7fe28932b4c0>


In [97]:
# Header lines: every LaTeX line that contains a \section command.
section_names = [line for line in ltx_lines if '\\section' in line]

# Titles of the sections of interest.
sel_sections = ['Introduction', 'Related Work', 'Conclusion']

# Keep the headers that mention any selected title, then display them with the
# LaTeX command openers rewritten by remove_sometext (which must already be
# defined in the kernel).
sel_section_names = [
    header for header in section_names
    if any(title in header for title in sel_sections)
]
[remove_sometext(header) for header in sel_section_names]

['[[Introduction}[sec:introduction}}\n', '[Related Work}\n', '[Conclusion}\n']

In [96]:
# Find the position of each header once, searching forward from the previous
# hit. A plain `ltx_lines.index(header)` always returns the FIRST matching line,
# so two byte-identical header lines would resolve to the same position.
section_starts = []
search_from = 0
for header in section_names:
    search_from = ltx_lines.index(header, search_from)
    section_starts.append(search_from)
    search_from += 1

# Body of a section = the lines between its header and the next header.
# NOTE(review): as before, the text after the last header is not collected —
# confirm that dropping the final section is intended.
sections = []
for sec_start, sec_end in zip(section_starts, section_starts[1:]):
    section_raw_text = ltx_lines[sec_start + 1:sec_end]
    # Skip empty lines, rewrite command openers to '[' and closing braces to ']'.
    section_text = ' '.join(
        remove_sometext(s).replace('}', ']') for s in section_raw_text if s != '\n'
    )
    sections.append(section_text)

In [80]:
import re

def remove_sometext(string):
    r"""Rewrite LaTeX command openers such as ``\section{`` to a single ``[``.

    A command opener is a backslash, one or more letters/digits/underscores,
    and an opening brace. Closing braces and commands without a brace
    (e.g. ``\ours``) are left untouched.

    Args:
        string: Text to clean.

    Returns:
        The text with every command opener replaced by ``[``.
    """
    return re.sub(r'\\[a-zA-Z0-9_]+{', '[', string)

'[sec:related_work]\n [Conventional Query Expansion]\n \\ours shares some merits with query expansion (QE) methods based on pseudo relevance feedback [rocchio1971relevance,abdul2004umass,lv2010positional] in that they both expand the queries with relevant contexts (terms) without the use of external supervision. \\ours is superior as it expands the queries with knowledge stored in the PLMs rather than the retrieved passages and its expanded terms are learned through text generation.\n [Recent Query Reformulation]\n There are recent or concurrent studies [nogueira-cho-2017-task,zaiem2019sequence,yu2020few,vakulenko2020question,lin2020query] that reformulate queries with generation models for other retrieval tasks.\n However, these studies are not easily applicable or efficient enough for OpenQA because: (1) They require external resources such as paraphrase data~[zaiem2019sequence], search sessions~[yu2020few], or conversational contexts~[lin2020query,vakulenko2020question] to form the 

In [17]:
import weaviate

In [18]:
# Re-create the Weaviate client; WEAVIATE_URL must already be defined in the kernel by an earlier cell.
client = weaviate.Client(WEAVIATE_URL)

In [33]:
# Display whatever `result` currently holds in the kernel.
result

{'query': 'Describe Captain Claw',
 'result': " Captain Claw is a notorious pirate who has sailed the seven seas and has a reputation for being ruthless and cunning. He is always looking for treasure and will do whatever it takes to get it. He is Oliver's rival and the antagonist of the story, and is constantly trying to outsmart him.",
 'source_documents': [Document(page_content="Name: Captain Claw (10), Male Role: Captain Claw is Oliver's rival and the antagonist of the story. He tries to steal Oliver's treasure and is constantly trying to outsmart him., Bio: Captain Claw is a notorious pirate who has sailed the seven seas and has a reputation for being ruthless and cunning. He is always looking for treasure and will do whatever it takes to get it.\n\n\nThis is the story so far Oliver and his animal crew set sail on a cardboard ship in search of treasure.\nOliver and his crew face a treacherous storm at sea.\n.\n\n\nThis section is titled: The Tricky Map, with the description: Oliver