In [91]:
from pprint import pprint
from tqdm import tqdm
from haystack.nodes import QuestionGenerator, ElasticsearchRetriever, FARMReader
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.pipelines import (
    QuestionGenerationPipeline,
    RetrieverQuestionGenerationPipeline,
    QuestionAnswerGenerationPipeline,
)
from haystack.utils import launch_es
from haystack.document_stores import FAISSDocumentStore
from haystack.schema import Document
import nltk
from rouge import Rouge 
import numpy as np
from bs4 import BeautifulSoup
import urllib.request as urllib2
import wikipediaapi

In [92]:
# Subject of this notebook; the same string is the Wikipedia page title fetched below.
topic = "Machine Learning"

In [93]:
# English Wikipedia client that returns section bodies as HTML, so paragraphs
# can later be split on <p> tags.
wiki_wiki = wikipediaapi.Wikipedia('en', extract_format=wikipediaapi.ExtractFormat.HTML)
# Look the article up through `topic` rather than repeating the literal title,
# so changing the topic in one place changes the page that is fetched.
page = wiki_wiki.page(topic)

In [94]:
# Collect (title, html) pairs: the lead summary first, then each top-level
# section in page order, stopping at (and excluding) "See also".
content = [("Summary", page.summary)]
for section in page.sections:
    if section.title != "See also":
        content.append((section.title, section.text))
    else:
        break

In [112]:
# Build one record per paragraph: (section title, full paragraph text,
# first two sentences joined by a space). Paragraphs with fewer than two
# sentences are skipped.
docs_data = []
for title, text in content:
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and emits GuessedAtParserWarning, so the
    # parse could differ between environments.
    soup = BeautifulSoup(text, "html.parser")
    for p in soup.find_all("p"):
        sentences = nltk.sent_tokenize(p.text)
        if len(sentences) < 2:
            continue
        docs_data.append((title, p.text, " ".join(sentences[:2])))

In [113]:
# Create the Haystack question generator; no arguments are passed, so it uses its defaults.
question_generator = QuestionGenerator()

In [114]:
# Wrap the generator in a QuestionGenerationPipeline.
question_generation_pipeline = QuestionGenerationPipeline(question_generator)

In [115]:
# Pair the question generator with an extractive reader so that each generated
# question also gets answer candidates.
reader = FARMReader("deepset/roberta-base-squad2")
qag_pipeline = QuestionAnswerGenerationPipeline(question_generator, reader)

# Run on the two-sentence excerpt of the first collected paragraph.
first_excerpt = docs_data[0][2]
result = qag_pipeline.run(documents=[Document(content=first_excerpt)])

Some weights of the model checkpoint at deepset/roberta-base-squad2 were not used when initializing RobertaModel: ['qa_outputs.weight', 'qa_outputs.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at deepset/roberta-base-squad2 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
ML Logging is turned off. No parameters, metrics or artifacts will be logged to 

In [116]:
# Split the pipeline output into parallel lists: the generated question text
# and, for each question, the text of its first answer.
qa_results = result["results"]
questions = [item["query"] for item in qa_results]
answers = [item["answers"][0].answer for item in qa_results]

In [117]:
# Display the generated questions.
questions

[' What is the study of computer algorithms that can improve automatically through experience and by the use of data?',
 ' Machine learning is seen as what?']

In [118]:
# Display the first answer taken for each generated question.
answers

['Machine learning (ML)', 'artificial intelligence']

In [119]:
# Bundle one example as [topic, source text, questions, answers].
# This cell previously referenced `doc1`, which is not defined anywhere in the
# visible notebook and so raises NameError on a fresh kernel. docs_data[0][2]
# is the excerpt that was passed to qag_pipeline.run above.
# NOTE(review): confirm this is the text `doc1` was meant to hold.
data_point = [topic, docs_data[0][2], questions, answers]

In [120]:
# Display every (section title, paragraph text, first-two-sentences excerpt) record collected above.
docs_data

[('Summary',
  'Machine learning (ML) is the study of computer algorithms that can improve automatically through experience and by the use of data. It is seen as a part of artificial intelligence. Machine learning algorithms build a model based on sample data, known as training data, in order to make predictions or decisions without being explicitly programmed to do so. Machine learning algorithms are used in a wide variety of applications, such as in medicine, email filtering, speech recognition, and computer vision, where it is difficult or unfeasible to develop conventional algorithms to perform the needed tasks.',
  'Machine learning (ML) is the study of computer algorithms that can improve automatically through experience and by the use of data. It is seen as a part of artificial intelligence.'),
 ('Summary',
  'A subset of machine learning is closely related to computational statistics, which focuses on making predictions using computers; but not all machine learning is statistic