In [1]:
from haystack import *
from haystack.document_stores import FAISSDocumentStore
from haystack.schema import Document
from haystack.nodes import DensePassageRetriever, FARMReader
from haystack.pipelines import ExtractiveQAPipeline
from haystack.utils import print_answers



In [2]:
# Record the wall-clock start so the last cell can report how long the whole
# notebook took to run.
import time
start_time = time.time()

In [3]:
def create_docs(directory):
    """Load every regular file in ``directory`` as a ``Document``.

    Parameters
    ----------
    directory : str
        Folder to read. Relative paths are resolved against the current
        working directory; absolute paths are used as given.

    Returns
    -------
    list
        One ``Document(content=<file text>)`` per regular file, ordered by
        file name. Sub-directories (e.g. ``.ipynb_checkpoints``) are skipped.

    Raises
    ------
    OSError
        If the directory cannot be listed or a file cannot be opened.
    UnicodeDecodeError
        If a file is not valid UTF-8.
    """
    import os

    docs = []

    # os.listdir returns entries in arbitrary order; sorting keeps the
    # resulting document order reproducible between runs.
    for name in sorted(os.listdir(directory)):
        path = os.path.join(directory, name)
        # Only regular files hold text; opening a directory would raise.
        if not os.path.isfile(path):
            continue
        # The context manager guarantees the handle is closed after reading.
        with open(path, encoding="utf8") as handle:
            docs.append(Document(content=handle.read()))

    return docs

In [4]:
# Create a FAISS-backed document store with a "Flat" index factory string,
# load every file under ./documents as a Document, and write them to the store.
# NOTE(review): presumably the store also keeps document metadata in a local
# SQL database by default, so state left over from a previous run may affect
# a re-run — confirm against the Haystack docs.
document_store = FAISSDocumentStore(faiss_index_factory_str="Flat")
docs = create_docs("documents")
document_store.write_documents(docs)

In [5]:
# Dense passage retriever bound to the document store. Queries and passages
# are embedded with the two separate DPR encoder checkpoints named below.
retriever = DensePassageRetriever(document_store=document_store,
                                query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
                                passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
                                )

In [6]:
# Have the store (re)compute embeddings for its documents using the retriever.
document_store.update_embeddings(retriever)

Updating Embedding:   0%|                                                                     | 0/6 [00:00<?, ? docs/s]

Create embeddings:   0%|          | 0/16 [00:00<?, ? Docs/s]

Documents Processed: 10000 docs [00:04, 2489.55 docs/s]                                                                


In [7]:
# Reader model loaded from the deepset/roberta-base-squad2 checkpoint; it is
# combined with the retriever in the QA pipeline built in the next cell.
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")

Some weights of the model checkpoint at deepset/roberta-base-squad2 were not used when initializing RobertaModel: ['qa_outputs.bias', 'qa_outputs.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at deepset/roberta-base-squad2 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to log params: Changing param values is not allowed. Param with key='pred

In [8]:
# Combine reader and retriever into one extractive QA pipeline; `pipe` is the
# notebook-level object that answer_question() runs queries against.
pipe = ExtractiveQAPipeline(reader, retriever)

In [9]:
def answer_question(question, retriever_top_k=10, reader_top_k=5):
    """Run ``question`` through the notebook's QA pipeline and print the answers.

    Relies on the notebook-level ``pipe`` object and the imported
    ``print_answers`` helper.

    Parameters
    ----------
    question : str
        Natural-language query passed to ``pipe.run``.
    retriever_top_k : int, optional
        Value sent as the ``"Retriever"`` node's ``top_k`` (default 10, the
        value that was previously hard-coded).
    reader_top_k : int, optional
        Value sent as the ``"Reader"`` node's ``top_k`` (default 5, the value
        that was previously hard-coded).

    Returns
    -------
    None
        Answers are printed with ``details="minimal"``; nothing is returned,
        so calling this as the last line of a cell shows only the printout.
    """
    prediction = pipe.run(
        query=question,
        params={
            "Retriever": {"top_k": retriever_top_k},
            "Reader": {"top_k": reader_top_k},
        },
    )
    print_answers(prediction, details="minimal")

In [10]:
# Example query: runs the full retrieve-then-read pipeline and prints the answers.
answer_question("What are some applications of machine learning?")

Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:05<00:00,  5.89s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:05<00:00,  5.94s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:06<00:00,  6.53s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:10<00:00, 10.30s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:06<00:00,  6.45s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:12<00:00, 12.29s/ Batches]

[   {   'answer': 'tracking monetary frauds online',
        'context': 'proving its potential to make cyberspace a secure place '
                   'and tracking monetary frauds online is one of its '
                   'examples. For example: Paypal is using ML fo'},
    {   'answer': 'data mining, statistics, and modeling',
        'context': 'to make predictions about the future. Techniques such as '
                   'data mining, statistics, and modeling employ machine '
                   'learning and artificial intelligence to '},
    {   'answer': 'medical diagnosis, image processing, prediction, '
                  'classification, learning association, regression',
        'context': 'd industries. For example, medical diagnosis, image '
                   'processing, prediction, classification, learning '
                   'association, regression etc.\n'
                   '\n'
                   'The intelligent sys'},
    {   'answer': 'medical diagnosis, image proc




In [11]:
# Report wall-clock seconds elapsed since start_time was set in cell In [2]
# (covers model loading, embedding and the example query above).
print(f"--- Execution took {(time.time() - start_time)} seconds ---")

--- Execution took 69.4523401260376 seconds ---
