In [3]:
from haystack import *
from haystack.document_stores import FAISSDocumentStore
from haystack.schema import Document
from haystack.nodes import DensePassageRetriever, FARMReader
from haystack.pipelines import ExtractiveQAPipeline
from haystack.utils import print_answers

from bs4 import BeautifulSoup
import urllib.request as urllib2
from googlesearch import search

In [4]:
def create_docs(urls):
    """Download each URL and wrap its paragraph text in a ``Document``.

    For every page, only ``<p>`` elements are considered. A paragraph is kept
    when its whitespace-normalised text is non-empty and its last character is
    one of ``. ! , : ;``; kept paragraphs are joined with newlines to form the
    page text.

    Args:
        urls: Iterable of page URLs to fetch.

    Returns:
        A ``(documents, full_text)`` tuple. ``documents`` holds one
        ``Document`` per URL, in input order. ``full_text`` is the page texts
        concatenated with no separator between pages.

    Network and HTTP errors raised while fetching are not caught here and
    propagate to the caller.
    """
    # Browser-like user agent sent with every request
    # (presumably because some sites reject urllib's default one).
    headers = {'User-Agent': 'Mozilla/5.0'}

    documents = []
    page_texts = []

    for url in urls:
        request = urllib2.Request(url, headers=headers)
        # The constructor parses the whole response, so the connection can be
        # closed as soon as the soup has been built.
        with urllib2.urlopen(request) as page:
            soup = BeautifulSoup(page, "html.parser")

        paragraphs = []
        for paragraph in soup.find_all("p"):
            words = paragraph.text.split()
            # Skip empty paragraphs and ones that do not end in punctuation.
            if not words or words[-1][-1] not in ".!,:;":
                continue
            paragraphs.append(" ".join(words))

        text = "\n".join(paragraphs)
        page_texts.append(text)
        documents.append(Document(content=text))

    return documents, "".join(page_texts)

In [19]:
# Question to answer; the same string is used as the web-search query.
query = "What are metrics of machine learning?"
# Materialise the search results into a list of URLs.
urls = list(search(query, tld="co.in", num=5, stop=5, pause=2))

In [22]:
# Create a FAISS-backed document store using the "Flat" index factory string.
document_store = FAISSDocumentStore(faiss_index_factory_str="Flat")
# Scrape the search results: `docs` is one Document per URL, `text` is the
# concatenated text of all pages.
docs, text = create_docs(urls)
# Store the scraped documents so they can be embedded and retrieved later.
document_store.write_documents(docs)

In [23]:
# Dense Passage Retriever over the document store, using separate
# checkpoints for encoding questions and passages.
retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
)

In [24]:
# Ask the document store to update its embeddings using the retriever above.
document_store.update_embeddings(retriever)

Updating Embedding:   0%|                                                                    | 0/23 [00:00<?, ? docs/s]

Create embeddings:   0%|          | 0/32 [00:00<?, ? Docs/s]

Documents Processed: 10000 docs [00:15, 664.11 docs/s]                                                                 


In [25]:
# Reader loaded from the "deepset/roberta-base-squad2" checkpoint.
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")

Some weights of the model checkpoint at deepset/roberta-base-squad2 were not used when initializing RobertaModel: ['qa_outputs.bias', 'qa_outputs.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at deepset/roberta-base-squad2 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
ML Logging is turned off. No parameters, metrics or artifacts will be logged to 

In [26]:
# Combine the reader and the retriever into one extractive QA pipeline.
pipe = ExtractiveQAPipeline(reader, retriever)

In [27]:
# Run the pipeline on the query: top_k is 10 for the retriever node and
# 5 for the reader node.
prediction = pipe.run(
    query=query,
    params={
        "Retriever": {"top_k": 10},
        "Reader": {"top_k": 5},
    },
)

Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.23s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.79s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:11<00:00, 11.37s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.70s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:06<00:00,  6.08s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:20<00:00, 20.67s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:06<00:00,  6.04s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:05<00:00,  5.30s/ Batches]
Inferencing Samples: 100%|██████████████

In [28]:
# Collect the context attached to each predicted answer.
contexts = [ans.context for ans in prediction["answers"]]
contexts

['rning is a domain of computer science with its base in computational mathematics and statistics. The machine is shown a ton of data and it learns the ',
 'l may give you satisfying results when evaluated using a metric say accuracy_score but may give poor results when evaluated against other metrics such',
 'ing problems, with the most talked about algorithms being k-means and hierarchical clustering, though other algorithms like Hidden Markov models, Self',
 'revious successful applicants.[111][112] Responsible collection of data and documentation of algorithmic rules used by a system thus is a critical par',
 'scenario to play the game Mario.\nNow that we’ve discussed the three different categories of machine learning, it’s important to note that a lot of tim']

In [29]:
# Collect the answer string of each prediction.
answers = [ans.answer for ans in prediction["answers"]]
answers

['computational mathematics and statistics',
 'accuracy_score',
 'k-means and hierarchical clustering',
 'Responsible collection of data and documentation of algorithmic rules',
 'three different categories']

In [149]:
# Pretty-print the predictions at the "minimal" detail level.
print_answers(prediction, details="minimal")

[   {   'answer': 'tracking monetary frauds online',
        'context': 'proving its potential to make cyberspace a secure place '
                   'and tracking monetary frauds online is one of its '
                   'examples. For example: Paypal is using ML fo'},
    {   'answer': 'data mining, statistics, and modeling',
        'context': 'to make predictions about the future. Techniques such as '
                   'data mining, statistics, and modeling employ machine '
                   'learning and artificial intelligence to '},
    {   'answer': 'medical diagnosis, image processing, prediction, '
                  'classification, learning association, regression',
        'context': 'd industries. For example, medical diagnosis, image '
                   'processing, prediction, classification, learning '
                   'association, regression etc.\n'
                   '\n'
                   'The intelligent sys'},
    {   'answer': 'tracking when you last walked

In [30]:
import re
# Regex fragments used by split_into_sentences. They are raw strings so that
# sequences such as `\s` reach the regex engine unchanged instead of being
# treated as (invalid) Python string escapes.
alphabets = r"([A-Za-z])"
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = r"[.](com|net|org|io|gov)"


def split_into_sentences(text):
    """Split ``text`` into a list of sentences using punctuation heuristics.

    Periods that do not end a sentence (titles such as "Dr.", website
    suffixes such as ".com", initials, acronyms, company suffixes such as
    "Inc.") are temporarily replaced by a ``<prd>`` marker. Every remaining
    ``.``, ``?`` and ``!`` is then treated as a sentence end.

    Args:
        text: The string to split. Newlines are treated as spaces, and every
            occurrence of "etc." loses its period.

    Returns:
        A list of stripped sentences in their original order. Trailing text
        that has no closing ``.``/``?``/``!`` is returned as a final sentence
        instead of being discarded. Empty or whitespace-only input gives ``[]``.
    """
    text = " " + text + "  "
    text = text.replace("\n", " ")
    text = text.replace("etc.", "etc")

    # Protect periods that are part of a token rather than a sentence end.
    text = re.sub(prefixes, r"\1<prd>", text)
    text = re.sub(websites, r"<prd>\1", text)
    text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + alphabets + "[.] ", r" \1<prd> ", text)
    # An acronym followed by a typical sentence starter does end a sentence.
    text = re.sub(acronyms + " " + starters, r"\1<stop> \2", text)
    text = re.sub(
        alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]",
        r"\1<prd>\2<prd>\3<prd>",
        text,
    )
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>", text)
    text = re.sub(" " + suffixes + "[.] " + starters, r" \1<stop> \2", text)
    text = re.sub(" " + suffixes + "[.]", r" \1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", r" \1<prd>", text)

    # Move terminal punctuation outside closing quotes so the quote stays
    # attached to the sentence it closes.
    text = text.replace(".”", "”.")
    text = text.replace(".\"", "\".")
    text = text.replace("!\"", "\"!")
    text = text.replace("?\"", "\"?")

    # Mark the real sentence ends, then restore the protected periods.
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")

    sentences = [s.strip() for s in text.split("<stop>")]
    # The last piece is only padding when the text ended with punctuation;
    # drop it in that case, but keep a real unterminated trailing sentence.
    if sentences and not sentences[-1]:
        sentences.pop()

    return sentences

In [50]:
# Expand every answer context to the full sentence(s) surrounding it and
# concatenate the results, space-separated, into `synth_text`.
synth_text = ""
text = "".join([p.content for p in prediction["documents"]])

for context in contexts:
    target = text.find(context)
    print(target)
    if target == -1:
        # The context does not occur in the joined documents; skip it rather
        # than scanning and slicing from index -1.
        continue

    start, end = target, target

    # Walk left to the nearest "." or newline before the context.
    while start >= 0:
        if text[start] == "." or text[start] == "\n":
            break
        start -= 1
    # One past the boundary that was found, or 0 when the scan ran off the
    # front of the text (start == -1).
    start += 1

    # Walk right to the first "." or newline that lies beyond the context,
    # and include that character in the slice.
    while end < len(text):
        if (text[end] == "." or text[end] == "\n") and (end - target) > len(context):
            end += 1
            break
        end += 1

    synth_text += text[start:end] + " "

20791
59100
23363
52119
77590


In [51]:
import html

from IPython.display import display, HTML

# The answers and the synthesized text come from scraped web pages, so every
# interpolated value is HTML-escaped before being rendered; otherwise markup
# in the scraped text would be interpreted by the browser.
answer_items = "".join(f"<li>{html.escape(str(answer))}</li>" for answer in answers)
display(HTML(
    f"<h3>{html.escape(query)}</h3>\n"
    f"<ul>{answer_items}</ul>\n"
    f"<p>{html.escape(synth_text)}</p>"
))