In [1]:
!pip install --upgrade --quiet pip
!pip install --quiet haystack-ai
print('pip install haystack complete.')

pip install haystack complete.


Let's get the demo from here: https://docs.haystack.deepset.ai/docs/get_started working with a HuggingFace pipeline instead of OpenAI. 

Let's try to get some data from the Web as text files.

In [2]:
NAMES = ['gems.01', 'gems.2', 'gems.3', 'gems.4']
URLS = ['http://www.textfiles.com/science/{}'.format(name) for name in NAMES]
 
print('we have {} URLs'.format(len(URLS)))

we have 4 URLs


We have filenames; now let's go get the files and put them in our working directory.

In [3]:
# https://stackoverflow.com/a/56951135
import os
import requests


def download(url: str, dest_folder: str):
    if not os.path.exists(dest_folder):
        os.makedirs(dest_folder)  # create folder if it does not exist

    filename = url.split('/')[-1].replace(' ', '_')  # be careful with file names
    file_path = os.path.join(dest_folder, filename)

    r = requests.get(url, stream=True)
    if r.ok:
        print('saving to', os.path.abspath(file_path))
        with open(file=file_path, mode='wb') as f:
            for chunk in r.iter_content(chunk_size=1024 * 8):
                if chunk:
                    f.write(chunk)
                    f.flush()
                    os.fsync(f.fileno())
    else:  # HTTP status code 4XX/5XX
        print('Download failed: status code {}\n{}'.format(r.status_code, r.text))

DESTINATION = '/kaggle/working/files/'

for url in URLS:
    download(url=url, dest_folder=DESTINATION)

saving to /kaggle/working/files/gems.01
saving to /kaggle/working/files/gems.2
saving to /kaggle/working/files/gems.3
saving to /kaggle/working/files/gems.4


And let's rename our files so we can refer to them with a nice simple glob below.

In [4]:
import os
from glob import glob

for source_file in glob(DESTINATION + '*'):
    if not source_file.endswith('.txt'):
        os.rename(source_file, source_file + '.txt')

Let's build a document store and populate it with our data. Documents need to be broken up into chunks, and chunking is currently a guessing game. Let's start with one-sentence chunks, and use the NLTK sentences tokenizer.

In [5]:
import os
import arrow
from glob import glob
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack import Document
from nltk.tokenize import sent_tokenize

def read(file_name: str) -> list:
    print('reading {}'.format(os.path.basename(file_name)))
    with open(file=file_name, mode='r', encoding='utf-8', errors='replace' ) as input_fp:
        data = input_fp.read()
        lines = data.split('\n')
        lines = [line for line in lines if len(line) > 1]
        return sent_tokenize(' '.join(lines))

time_start = arrow.now()
document_store = InMemoryDocumentStore()
sentences = [read(file_name=input_file) for input_file in glob('/kaggle/working/files/*.txt')]
# we have a list of lists and we need a flat list
documents = list(set([document for sublist in sentences for document in sublist]))

document_store.write_documents([Document(document) for document in documents])

print('built/populated document store in {}'.format(arrow.now() - time_start))


reading gems.4.txt
reading gems.2.txt
reading gems.01.txt
reading gems.3.txt
built/populated document store in 0:00:00.066008


In [6]:
import arrow
from haystack import Pipeline
from haystack.components.builders.answer_builder import AnswerBuilder
from haystack.components.builders.prompt_builder import PromptBuilder
from haystack.components.generators import HuggingFaceLocalGenerator
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever

# Build a RAG pipeline
TEMPLATE = '''
Given these documents, answer the question.
Documents:
{% for doc in documents %}
    {{ doc.content }}
{% endfor %}
Question: {{question}}
Answer:
'''

time_start = arrow.now()
pipeline = Pipeline()
pipeline.add_component('retriever', InMemoryBM25Retriever(document_store=document_store))
pipeline.add_component('prompt_builder', PromptBuilder(template=TEMPLATE))
pipeline.add_component('llm', HuggingFaceLocalGenerator())
pipeline.connect('retriever', 'prompt_builder.documents')
pipeline.connect('prompt_builder', 'llm')

print('built pipeline in {}'.format(arrow.now() - time_start))

2024-03-22 19:02:39.793786: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-22 19:02:39.793921: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-22 19:02:39.948001: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


built pipeline in 0:00:00.331176


In [7]:
import json
# Ask a question
question = 'What do Teshnans do?'
results = pipeline.run(
    {
        'retriever': {'query': question},
        'prompt_builder': {'question': question},
    }
)

print(json.dumps(results, indent=4))

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Ranking by BM25...: 0 docs [00:00, ? docs/s]



{
    "llm": {
        "replies": [
            "astronomer"
        ]
    }
}
