In [1]:
from haystack import *
from haystack.document_stores import FAISSDocumentStore
from haystack.schema import Document
from haystack.nodes import DensePassageRetriever, FARMReader
from haystack.pipelines import ExtractiveQAPipeline
from haystack.utils import print_answers



In [2]:
from bs4 import BeautifulSoup
import urllib.request as urllib2
from googlesearch import search
from bs4.element import Comment
import urllib.request

In [3]:
def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True

def text_from_html(body):
    soup = BeautifulSoup(body, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)  
    return u" ".join(t.strip() for t in visible_texts)

def create_documents(urls):
    
    documents = []

    for j, url in enumerate(urls):        
        try:
            html = urllib.request.urlopen(url).read()
            soup = BeautifulSoup(html, 'html.parser')
            texts = soup.findAll(text=True)
            visible_texts = filter(tag_visible, texts)  
            text = u" ".join(t.strip() for t in visible_texts)
            documents.append(Document(content = text, meta = {"vector_id" : j}))
        except:
            "crap"
        
    return documents

def create_docs(directory):
    import os
    
    docs = []

    document_names = os.listdir(f"./{directory}")
    for doc in document_names:
        text = open(f"{directory}/{doc}",  encoding = 'utf8').read()
        docs.append(Document(content = text))
        
    return docs

In [4]:
query = "What is machine learning?"
urls = [url for url in search(query, tld = "co.in", num = 50, stop = 40, pause = 0.5)]   
urls = [url for url in urls if "html" not in url and "wikipedia" not in url]

In [5]:
docs = create_documents(urls)

https://www.ibm.com/cloud/learn/machine-learning
https://www.ibm.com/za-en/cloud/learn/machine-learning
https://www.ibm.com/se-en/cloud/learn/machine-learning
https://www.ibm.com/hk-en/cloud/learn/machine-learning
https://www.ibm.com/cloud/blog/ai-vs-machine-learning-vs-deep-learning-vs-neural-networks
https://www.britannica.com/technology/machine-learning
https://www.expert.ai/blog/machine-learning-definition/
https://www.forbes.com/sites/ryancraig/2022/01/21/what-learning-can-learn-from-machine-learning/
https://news.mit.edu/2022/test-machine-learning-models-work-0118
https://scitechdaily.com/physics-based-engineering-and-the-machine-learning-black-box-problem/
https://mitsloan.mit.edu/ideas-made-to-matter/machine-learning-explained
https://www.coursera.org/lecture/machine-learning/what-is-machine-learning-Ujm7v
https://royalsociety.org/topics-policy/projects/machine-learning/videos-and-background-information/
https://www.geeksforgeeks.org/machine-learning/
https://www.techtarget.com

In [6]:
document_store = FAISSDocumentStore(vector_dim=128, faiss_index_factory_str="Flat")
document_store.delete_documents()
document_store.write_documents(docs)

In [7]:
from haystack.retriever.dense import EmbeddingRetriever

retriever = EmbeddingRetriever(document_store=document_store,
                               embedding_model="yjernite/retribert-base-uncased",
                               model_format="retribert")

document_store.update_embeddings(retriever)

Some weights of RetriBertModel were not initialized from the model checkpoint at yjernite/retribert-base-uncased and are newly initialized: ['bert_query.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Documents Processed: 10000 docs [00:22, 434.95 docs/s]                                                                 


In [9]:
from haystack.generator.transformers import Seq2SeqGenerator
from haystack.pipeline import GenerativeQAPipeline

generator = Seq2SeqGenerator(model_name_or_path="yjernite/bart_eli5", min_length = 100)
pipe = GenerativeQAPipeline(generator, retriever)

Object 'GenerativeQAPipeline' is imported through a deprecated path. Please check out the docs for the new import path.


In [14]:
ret = pipe.run(query = "What is the origin of machine learning?")

In [15]:
ret["answers"]

[' Machine learning is the process by which a computer can learn from experience. It\'s not the origin of machine learning, it\'s just the process of learning how to learn. There\'s no "origin" for machine learning in the sense that we don\'t know where it came from. Machine learning has been around for a long time, we just haven\'t figured out how to use it to improve our lives. The origin of the term "machine learning" is a bit of a misnomer. It doesn\'t mean that a computer is learning to do something, it just means that it is learning what to do.']