In [1]:
from haystack import *
from haystack.document_stores import FAISSDocumentStore
from haystack.schema import Document
from haystack.nodes import DensePassageRetriever, FARMReader
from haystack.pipelines import ExtractiveQAPipeline
from haystack.utils import print_answers



In [2]:
from bs4 import BeautifulSoup
import urllib.request as urllib2
from googlesearch import search
from bs4.element import Comment
import urllib.request

In [3]:
def create_documents(urls):
    """Download each URL and turn every non-empty ``<p>`` element into a Document.

    Args:
        urls: Iterable of URL strings to fetch.

    Returns:
        A list of ``Document`` objects, one per non-empty paragraph, in the
        order the pages and their paragraphs were visited. Whitespace runs
        inside a paragraph are collapsed to single spaces.

    A URL that cannot be fetched or processed is logged and skipped so that a
    single bad page does not abort the whole crawl; paragraphs collected from
    that page before the failure are kept. Only ``Exception`` subclasses are
    handled this way, so ``KeyboardInterrupt`` still stops the loop.
    """
    import logging

    logger = logging.getLogger(__name__)
    # Presumably sent because some sites reject urllib's default User-Agent.
    headers = {'User-Agent': 'Mozilla/5.0'}
    documents = []

    for url in urls:
        try:
            request = urllib2.Request(url, headers=headers)
            # The context manager closes the HTTP response once it is parsed.
            with urllib2.urlopen(request) as page:
                soup = BeautifulSoup(page, "html.parser")

            for paragraph in soup.find_all("p"):
                words = paragraph.text.split()
                if words:
                    documents.append(Document(content=" ".join(words)))
        except Exception as exc:
            logger.warning("Skipping %s: %r", url, exc)

    return documents

def create_docs(directory):
    """Read every regular file directly inside ``directory`` into a Document.

    Args:
        directory: Path of the folder to read. A relative path is resolved
            against the current working directory; an absolute path is used
            as given.

    Returns:
        A list of ``Document`` objects, one per file, ordered by file name.
        Entries that are not regular files (e.g. sub-directories) are skipped.

    Raises:
        OSError: If ``directory`` cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    import os

    # os.path.join(".", x) keeps an absolute x unchanged and prefixes "./" to a
    # relative one, so listing and opening always use the same location.
    root = os.path.join(".", directory)

    docs = []
    # Sorted so the result does not depend on the file system's listing order.
    for name in sorted(os.listdir(root)):
        path = os.path.join(root, name)
        if not os.path.isfile(path):
            continue
        with open(path, encoding='utf8') as handle:
            docs.append(Document(content=handle.read()))

    return docs

In [4]:
queries = [
    "What is Machine Learning?",
    "What is the History of Machine Learning?",
    "What is the Theory of Machine Learning?",
    "What are the Approaches of Machine Learning?",
    "What are the Types of Machine Learning Models?",
    "What are Real-world Applications of Machine Learning?",
    "What are the Limitations of Machine Learning?",
    "What Metrics do we use to Evaluate Machine Learning Models?",
]

# Gather the search hits of every query in one pass, dropping any URL that
# contains "html" or "wikipedia".
candidate_urls = [
    url
    for query in queries
    for url in search(query, tld = "co.in", num = 15, stop = 10, pause = 0.5)
    if "html" not in url and "wikipedia" not in url
]

# Different queries can return the same page; keep only the first occurrence
# (order preserved) so no page is scraped and indexed more than once.
urls = list(dict.fromkeys(candidate_urls))

In [5]:
# Scrape every collected URL; create_documents returns one Document per
# non-empty <p> element it finds on the fetched pages.
docs = create_documents(urls)

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


In [6]:
# Create the FAISS document store using the "Flat" index factory string.
# NOTE(review): vector_dim=128 presumably has to match the output size of the
# embedding model used by the retriever — confirm.
document_store = FAISSDocumentStore(vector_dim=128, faiss_index_factory_str="Flat")
# Presumably clears documents left over from an earlier run so that re-running
# this cell does not accumulate duplicates — confirm against the Haystack docs.
document_store.delete_documents()
# Store the scraped paragraphs.
document_store.write_documents(docs)

In [7]:
# Imported from haystack.nodes, the same package layout the first cell uses
# for DensePassageRetriever and FARMReader, instead of the legacy
# haystack.retriever.dense path.
from haystack.nodes import EmbeddingRetriever

retriever = EmbeddingRetriever(
    document_store=document_store,
    embedding_model="yjernite/retribert-base-uncased",
    model_format="retribert",
)

# Have the store (re)compute the embeddings of its documents with this
# retriever. NOTE(review): slow on CPU — the recorded run took close to an hour.
document_store.update_embeddings(retriever)

Some weights of RetriBertModel were not initialized from the model checkpoint at yjernite/retribert-base-uncased and are newly initialized: ['bert_query.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Updating Embedding:   0%|                                                                  | 0/3101 [00:00<?, ? docs/s]

Creating Embeddings:   0%|          | 0/97 [00:00<?, ? Batches/s]

Documents Processed: 10000 docs [58:30,  2.85 docs/s]                                                                  


In [12]:
import numpy as np

# Row vector times a 4x4 matrix. The vector and every matrix row sum to 1.
# NOTE(review): looks like a single step of a Markov chain and appears
# unrelated to the QA pipeline in this notebook — confirm it belongs here.
np.matmul(
    np.array([0.45, 0.16, 0.15, 0.24]),
    np.array(
        [
            [0, 1/3, 0, 2/3],
            [3/4, 0, 1/4, 0],
            [0, 2/3, 0, 1/3],
            [5/6, 0, 1/6, 0],
        ]
    ),
)

array([0.32, 0.25, 0.08, 0.35])

In [15]:
# Imported from haystack.nodes / haystack.pipelines, the same package layout
# the first cell uses, instead of the legacy haystack.generator.transformers
# and haystack.pipeline paths.
from haystack.nodes import Seq2SeqGenerator
from haystack.pipelines import GenerativeQAPipeline


def _build_generative_pipeline(min_length):
    """Return a (generator, pipeline) pair whose generator uses ``min_length``."""
    generator = Seq2SeqGenerator(model_name_or_path="yjernite/bart_eli5", min_length=min_length)
    return generator, GenerativeQAPipeline(generator, retriever)


def _ask(pipe, query):
    """Run ``query`` through ``pipe`` with the retriever's top_k set to 5."""
    # A fresh params dict per call so the runs cannot affect each other.
    return pipe.run(query=query, params={"Retriever": {"top_k": 5}})


# Three pipelines that differ only in the generator's min_length setting.
generator0, pipe0 = _build_generative_pipeline(0)
generator50, pipe50 = _build_generative_pipeline(50)
generator100, pipe100 = _build_generative_pipeline(100)


q = "What are the limitations of machine learning?"

ret0 = _ask(pipe0, q)
ret50 = _ask(pipe50, q)
ret100 = _ask(pipe100, q)

# Print the first answer of each run under a header naming its min_length.
for min_length, result in ((0, ret0), (50, ret50), (100, ret100)):
    print(f"\n\n------------Min Length: {min_length}------------\n\n")
    print(result["answers"][0])


# Show the documents returned by the first run.
print("\n\n------------Extracted Documents------------\n\n")
print("\n\n\n".join([document.content for document in ret0["documents"]]))



------------Min Length: 0------------


 The limitations of machine learning aren't limited by the limitations of human learning. They are limited by how well we can teach machines how to learn.


------------Min Length: 50------------


 The limitations of machine learning aren't limited by the limitations of human learning. They are limited by how well you can use machine learning to solve a problem. Machine learning is very good at solving problems, but it is very bad at solving *real* problems. For example, if you want to teach a computer how to recognize a dog, you can't just teach it to recognize dogs, you have to teach it how to distinguish a dog from a dog.


------------Min Length: 100------------


 The limitations of machine learning aren't limited by the limitations of human learning. They are limited by how well you can use machine learning to solve a problem. Machine learning is very good at solving problems, but it is very bad at solving *real* problems. For example, i