In [1]:
%%capture

import html
import urllib.request as urllib2

from bs4 import BeautifulSoup
from googlesearch import search

from haystack import *
from haystack.document_stores import FAISSDocumentStore
from haystack.schema import Document
from haystack.nodes import DensePassageRetriever, FARMReader
from haystack.pipelines import ExtractiveQAPipeline
from haystack.utils import print_answers

from IPython.core.display import display, HTML

In [2]:
# Subject of the generated article.
topic = "Computer Security"

# Section headings to research; one question is derived from each.
headings = [
    "Applications",
    "Types",
    "History",
    "Evaluation Metrics",
]

# Natural-language question per heading, in the same order as `headings`.
questions = list(
    f"What are the {heading} of {topic}?"
    for heading in headings
)

In [3]:
def create_documents(urls):
    """Download each URL and wrap its paragraph text in a Haystack ``Document``.

    For every page, the text of each ``<p>`` element is whitespace-normalised.
    A paragraph is kept only if it is non-empty and its last character is one
    of ``. ! , : ;``; everything else is dropped. The kept paragraphs of one
    page are joined with newlines.

    Args:
        urls: Iterable of page URLs to fetch.

    Returns:
        A list with one ``Document`` per URL, in input order. A page without
        any qualifying paragraph yields a ``Document`` with empty content.

    Raises:
        Any exception raised while opening or reading a URL is propagated
        unchanged; no URL is skipped silently.
    """
    # The same browser-like User-Agent header is sent with every request.
    request_headers = {'User-Agent': 'Mozilla/5.0'}

    documents = []

    for url in urls:

        req = urllib2.Request(url, headers = request_headers)

        # The context manager closes the HTTP response even when parsing
        # raises; BeautifulSoup reads the whole stream inside its constructor.
        with urllib2.urlopen(req) as page:
            soup = BeautifulSoup(page, "html.parser")

        ps = []

        for paragraph in soup.find_all("p"):
            words = paragraph.text.split()
            # Drop empty paragraphs and ones that do not end in punctuation.
            if not words or words[-1][-1] not in ".!,:;":
                continue
            ps.append(" ".join(words))

        documents.append(Document(content = "\n".join(ps)))

    return documents

In [4]:
def answer_query(query, docs, retriever_top_k = 10, reader_top_k = 5):
    """Run extractive question answering for ``query`` over ``docs``.

    A new FAISS document store is built for the call, the documents are
    embedded with a Dense Passage Retriever, and an ``ExtractiveQAPipeline``
    (retriever followed by a RoBERTa SQuAD2 reader) is executed.

    Args:
        query: The question to answer.
        docs: Documents to index and search.
        retriever_top_k: Number of documents the retriever passes on.
        reader_top_k: Number of answers the reader returns.

    Returns:
        The result of ``ExtractiveQAPipeline.run``. Callers in this notebook
        read its ``"answers"`` and ``"documents"`` entries.
    """
    # Keep the SQL side of the store in memory. The default sql_url is a
    # SQLite file on disk, which survives across calls and kernel restarts,
    # so a fresh FAISS index would be paired with documents written by
    # earlier queries and every call would re-embed that growing backlog.
    document_store = FAISSDocumentStore(sql_url = "sqlite://",
                                        faiss_index_factory_str = "Flat")
    document_store.write_documents(docs)

    retriever = DensePassageRetriever(document_store = document_store,
                                query_embedding_model = "facebook/dpr-question_encoder-single-nq-base",
                                passage_embedding_model = "facebook/dpr-ctx_encoder-single-nq-base",
                                )

    # Embeddings must exist in the index before the retriever can be queried.
    document_store.update_embeddings(retriever)

    reader = FARMReader(model_name_or_path = "deepset/roberta-base-squad2")

    pipe = ExtractiveQAPipeline(reader, retriever)

    run_params = {
        "Retriever": {"top_k": retriever_top_k},
        "Reader": {"top_k": reader_top_k},
    }
    prediction = pipe.run(query = query, params = run_params)

    return prediction

In [5]:
def synthesize_text(contexts, text):
    """Expand each context snippet to sentence boundaries and concatenate them.

    For every context that occurs in ``text``, the span is widened backwards
    to just after the nearest ``"."`` or newline at or before the start of
    the context (or to the start of ``text`` if there is none), and forwards
    up to and including the first ``"."`` or newline at or after the end of
    the context (or to the end of ``text`` if there is none).

    Args:
        contexts: Iterable of snippets expected to be substrings of ``text``.
        text: The full text the snippets were taken from.

    Returns:
        The expanded spans, each followed by a single space, concatenated in
        the order of ``contexts``. Empty contexts and contexts that do not
        occur in ``text`` contribute nothing.
    """
    boundaries = ".\n"
    pieces = []

    for context in contexts:
        target = text.find(context)
        # find() returns -1 when the snippet is missing; indexing with that
        # would silently wrap around to the end of the text.
        if not context or target < 0:
            continue

        # Walk left to the previous sentence boundary.
        start = target
        while start >= 0 and text[start] not in boundaries:
            start -= 1
        # Step past the boundary; when none was found start is -1 and this
        # clamps it to the beginning of the text.
        start += 1

        # Walk right from the first character after the context.
        end = target + len(context)
        while end < len(text) and text[end] not in boundaries:
            end += 1
        if end < len(text):
            end += 1  # keep the closing "." or newline in the span

        pieces.append(text[start:end] + " ")

    return "".join(pieces)

In [6]:
# Per-heading results, index-aligned with `headings` / `questions`.
heading_texts = []
heading_answers = []

# Ask exactly the questions that are later rendered as section titles, so the
# displayed question and the searched/answered question cannot drift apart.
for query in questions:

    # Top five search results for the question.
    urls = list(search(query, tld = "co.in", num = 5, stop = 5, pause = 2))
    docs = create_documents(urls)

    prediction = answer_query(query, docs)

    contexts = [answer.context for answer in prediction["answers"]]
    answers = [answer.answer for answer in prediction["answers"]]

    # Join with a newline so the end of one document is a sentence boundary
    # and its last sentence is not fused with the next document's first one.
    text = "\n".join(doc.content for doc in prediction["documents"])

    synth_text = synthesize_text(contexts, text)

    heading_texts.append(synth_text)
    heading_answers.append(answers)

Updating Embedding:   0%|                                                                    | 0/44 [00:00<?, ? docs/s]

Create embeddings:   0%|          | 0/48 [00:00<?, ? Docs/s]

Documents Processed: 10000 docs [00:32, 305.76 docs/s]                                                                 
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:08<00:00,  8.21s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:27<00:00, 27.99s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:05<00:00,  5.10s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.54s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:22<00:00, 22.43s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:05<00:00,  5.16s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:06<00:00,  6.63s/ Batches]
Inferencing Samples: 100%|██████████████

Create embeddings:   0%|          | 0/48 [00:00<?, ? Docs/s]

Documents Processed: 10000 docs [00:35, 283.92 docs/s]                                                                 
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:08<00:00,  8.76s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:07<00:00,  7.06s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:29<00:00, 29.50s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.12s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:05<00:00,  5.42s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.73s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:05<00:00,  5.26s/ Batches]
Inferencing Samples: 100%|██████████████

Create embeddings:   0%|          | 0/64 [00:00<?, ? Docs/s]

Documents Processed: 10000 docs [00:38, 256.73 docs/s]                                                                 
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:29<00:00, 29.70s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:14<00:00, 14.72s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:06<00:00,  6.76s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:17<00:00, 17.09s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:08<00:00,  8.27s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:22<00:00, 23.00s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.03s/ Batches]
Inferencing Samples: 100%|██████████████

Create embeddings:   0%|          | 0/64 [00:00<?, ? Docs/s]

Documents Processed: 10000 docs [00:40, 247.36 docs/s]                                                                 
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:07<00:00,  7.20s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:08<00:00,  8.07s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:06<00:00,  6.64s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:05<00:00,  5.03s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.98s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.45s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:07<00:00,  7.83s/ Batches]
Inferencing Samples: 100%|██████████████

In [7]:
def _render_section(question, answers, text):
    """Return the HTML for one section: title, answer bullets, and body text.

    Answers and body text come from scraped web pages, so every
    interpolated value is HTML-escaped before it is placed in the markup.
    """
    items = "".join(f"<li>{html.escape(answer.lower())}</li>" for answer in answers)
    return f"<h3>{html.escape(question)}</h3>\n<ul>{items}</ul>\n<p>{html.escape(text)}</p>"


# One rendered section per heading, in heading order.
wiki_sections = [
    _render_section(questions[i], heading_answers[i], heading_texts[i])
    for i in range(len(headings))
]
body_text = "<br/>".join(wiki_sections)
wiki_text = f"<u><h1>{html.escape(topic)}</h1></u><br />{body_text}"

display(HTML(wiki_text))