In [3]:
from haystack import *
from haystack.document_stores import FAISSDocumentStore
from haystack.schema import Document
from haystack.nodes import DensePassageRetriever, FARMReader
from haystack.pipelines import ExtractiveQAPipeline
from haystack.utils import print_answers

In [4]:
from bs4 import BeautifulSoup
import urllib.request as urllib2
from googlesearch import search
from bs4.element import Comment
import urllib.request

In [5]:
def create_documents(urls, timeout=10.0):
    """Scrape the ``<p>`` text of each URL into ``Document`` objects.

    Every non-empty paragraph becomes one ``Document`` whose content is the
    paragraph text with runs of whitespace collapsed to single spaces.
    Scraping is best-effort: a URL that cannot be fetched or parsed is logged
    and skipped, and the remaining URLs are still processed.

    Args:
        urls: Iterable of page URLs to download.
        timeout: Seconds to wait on each request before giving up on that URL.

    Returns:
        List of ``Document`` objects, in URL order and then paragraph order.
    """
    import logging

    logger = logging.getLogger(__name__)
    # Browser-like agent; presumably set because some sites refuse the
    # default urllib agent — confirm against the target sites.
    headers = {'User-Agent': 'Mozilla/5.0'}
    documents = []

    for url in urls:
        try:
            req = urllib2.Request(url, headers=headers)
            # The context manager closes the HTTP response even when parsing
            # raises, so failed pages do not leak connections.
            with urllib2.urlopen(req, timeout=timeout) as page:
                soup = BeautifulSoup(page, "html.parser")

            for paragraph in soup.find_all("p"):
                words = paragraph.text.split()
                if words:
                    documents.append(Document(content=" ".join(words)))
        except Exception as exc:
            # Deliberately broad (network, HTTP and parser errors all mean
            # "skip this page"), but no longer silent and no longer catching
            # KeyboardInterrupt/SystemExit as the bare ``except`` did.
            logger.warning("Skipping %s: %s", url, exc)

    return documents

def create_docs(directory):
    """Load every regular file in ``directory`` as one ``Document``.

    Files are read as UTF-8 text and visited in sorted filename order so the
    result is reproducible across runs. Sub-directories and other non-file
    entries are skipped.

    Args:
        directory: Path (relative or absolute) of the folder to read.

    Returns:
        List of ``Document`` objects, one per file, whose content is the
        file's full text.

    Raises:
        OSError: If ``directory`` cannot be listed or a file cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    import os

    docs = []

    # List and open through the same path so absolute directories work too.
    for name in sorted(os.listdir(directory)):
        path = os.path.join(directory, name)
        if not os.path.isfile(path):
            continue
        # ``with`` guarantees the handle is closed after reading.
        with open(path, encoding='utf8') as handle:
            docs.append(Document(content=handle.read()))

    return docs

In [6]:
# Seed question whose search results become the scraping corpus.
query = "What is machine learning?"

# Materialise the search generator (stops after 40 results, 0.5 s pause
# between requests), then drop any URL containing "html" or "wikipedia".
urls = list(search(query, tld = "co.in", num = 50, stop = 40, pause = 0.5))
urls = [
    link
    for link in urls
    if not ("html" in link or "wikipedia" in link)
]

In [7]:
# Download every filtered search hit and turn each non-empty <p> paragraph
# into a Document (one network request per URL).
docs = create_documents(urls)

In [8]:
# FAISS-backed store for the scraped paragraphs.
# NOTE(review): vector_dim=128 presumably has to equal the embedding size of
# the retriever created in the next cell — confirm before swapping models.
document_store = FAISSDocumentStore(vector_dim=128, faiss_index_factory_str="Flat")
# Clear the store before writing, so re-running this cell does not stack a
# second copy of `docs` on top of whatever the store already holds.
document_store.delete_documents()
document_store.write_documents(docs)

In [9]:
# Import from `haystack.nodes`, the same package the notebook's first cell
# already uses, instead of the legacy `haystack.retriever.dense` path.
from haystack.nodes import EmbeddingRetriever

# Dense retriever that embeds queries and documents with the RetriBERT model.
retriever = EmbeddingRetriever(document_store=document_store,
                               embedding_model="yjernite/retribert-base-uncased",
                               model_format="retribert")

# Compute embeddings for the documents in the store with this retriever.
# This is the slow step of the notebook (see the progress output below).
document_store.update_embeddings(retriever)

Some weights of RetriBertModel were not initialized from the model checkpoint at yjernite/retribert-base-uncased and are newly initialized: ['bert_query.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Updating Embedding:   0%|                                                                  | 0/1343 [00:00<?, ? docs/s]

Creating Embeddings:   0%|          | 0/42 [00:00<?, ? Batches/s]

Documents Processed: 10000 docs [43:53,  3.80 docs/s]                                                                  


In [60]:
# Import from `haystack.nodes` / `haystack.pipelines`, the same packages the
# notebook's first cell already uses, instead of the legacy module paths.
from haystack.nodes import Seq2SeqGenerator
from haystack.pipelines import GenerativeQAPipeline

# Minimum generated-answer lengths to compare side by side.
MIN_LENGTHS = (0, 50, 100)

# One generator + pipeline per minimum length; all share the same retriever.
generators = {
    min_length: Seq2SeqGenerator(model_name_or_path="yjernite/bart_eli5", min_length=min_length)
    for min_length in MIN_LENGTHS
}
pipes = {
    min_length: GenerativeQAPipeline(generators[min_length], retriever)
    for min_length in MIN_LENGTHS
}

q = "What metrics do we use to evaluate machine learning models?"

# Same question and same top-5 retrieval for every pipeline, so only the
# generator's min_length differs between the answers.
results = {
    min_length: pipes[min_length].run(query=q, params={"Retriever": {"top_k": 5}})
    for min_length in MIN_LENGTHS
}

# Keep the original per-length variable names bound for any later cells.
generator0, generator50, generator100 = (generators[n] for n in MIN_LENGTHS)
pipe0, pipe50, pipe100 = (pipes[n] for n in MIN_LENGTHS)
ret0, ret50, ret100 = (results[n] for n in MIN_LENGTHS)

for min_length in MIN_LENGTHS:
    print(f"\n\n------------Min Length: {min_length}------------\n\n")
    print(results[min_length]["answers"][0])

# Show the documents returned alongside the min_length=0 answer.
print("\n\n------------Extracted Documents------------\n\n")
print("\n\n\n".join([c.content for c in ret0["documents"]]))



------------Min Length: 0------------


 It depends on what you mean by "machine learning model". There are a lot of different ways to evaluate a machine learning model, but the most common way is to look at how well the model performs in a real world situation. For example, if you have a model that predicts how many ice creams will be sold based on the temperature, and you give it a set of training data, it will give you a very good prediction. If you give the model a new set of data, and it gives you a much better prediction, then you can say that the model is better than the training data.


------------Min Length: 50------------


 It depends on what you mean by "machine learning model". There are a lot of different ways to evaluate a machine learning model, but the most common way is to look at how well the model performs in a real world situation. For example, if you have a model that predicts how many ice creams will be sold based on the temperature, and you give it a set of t