In [22]:
from pprint import pprint
from tqdm import tqdm
from haystack.nodes import QuestionGenerator, ElasticsearchRetriever, FARMReader
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.pipelines import (
    QuestionGenerationPipeline,
    RetrieverQuestionGenerationPipeline,
    QuestionAnswerGenerationPipeline,
)
from haystack.utils import launch_es
from haystack.document_stores import FAISSDocumentStore
from haystack.schema import Document
import nltk
from rouge import Rouge 
import numpy as np
from bs4 import BeautifulSoup
import urllib.request as urllib2
import wikipediaapi
import time
import pandas as pd

In [23]:
# Read the headerless keyword CSV and keep the first 300 entries of its
# first column as a plain Python list.
keywords = (
    pd.read_csv("Keywords-Springer-83K-20210405.csv", header=None)
    .iloc[:300, 0]
    .tolist()
)

In [24]:
# English Wikipedia client; extracts are requested as HTML, which the
# extraction loop later parses with BeautifulSoup to find <p> paragraphs.
wiki_wiki = wikipediaapi.Wikipedia('en', extract_format=wikipediaapi.ExtractFormat.HTML)
# Question generator with the library's default settings.
question_generator = QuestionGenerator()
# Question-only pipeline; not referenced by the other cells shown in this notebook.
question_generation_pipeline = QuestionGenerationPipeline(question_generator)
# Extractive reader loaded from the "deepset/roberta-base-squad2" checkpoint.
reader = FARMReader("deepset/roberta-base-squad2")
# Pipeline combining the question generator with the reader; the extraction
# loop calls qag_pipeline.run(...) once per paragraph.
qag_pipeline = QuestionAnswerGenerationPipeline(question_generator, reader)

Some weights of the model checkpoint at deepset/roberta-base-squad2 were not used when initializing RobertaModel: ['qa_outputs.bias', 'qa_outputs.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at deepset/roberta-base-squad2 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
ML Logging is turned off. No parameters, metrics or artifacts will be logged to 

In [25]:
# Result rows collected by the extraction loop; each row has one value per
# entry in `headings`, in the same order.
data = list()
# Topics whose processing raised an exception.
failed_topics = list()
# Column names used as the header row when the results are written to CSV.
headings = [
    "title",
    "section",
    "full_text",
    "summ_text",
    "questions",
    "answers",
]

In [27]:
%%capture
# Build the question/answer dataset: for every keyword, fetch its Wikipedia
# page, split the HTML into paragraphs, and run the question-answer
# generation pipeline on the first two sentences of each paragraph.
start_time = time.time()
# Maps each failed topic to the repr of the exception that stopped it, so the
# cause is still inspectable after the run (this cell's output is captured).
failure_reasons = {}
for topic in keywords:
    try:
        page = wiki_wiki.page(topic)

        # (section title, section text) pairs: the summary first, then the
        # page's sections up to, but not including, "See also".
        content = [("Summary", page.summary)]
        for section in page.sections:
            if section.title == "See also":
                break
            content.append((section.title, section.text))

        docs_data = []
        for title, text in content:
            # Name the parser explicitly: without it BeautifulSoup picks
            # whichever parser happens to be installed (and warns), so the
            # extracted paragraphs could differ between environments.
            soup = BeautifulSoup(text, "html.parser")
            for p in soup.find_all("p"):
                sentences = nltk.sent_tokenize(p.text)
                # Paragraphs with fewer than two sentences are skipped.
                if len(sentences) < 2:
                    continue
                docs_data.append((title, p.text, " ".join(sentences[:2])))

        # Rows are staged per topic and committed only if the whole topic
        # succeeds, so a topic never ends up both in `failed_topics` and
        # (partially) in `data`.
        topic_rows = []
        for section_title, full_text, summ_text in docs_data:
            result = qag_pipeline.run(documents=[Document(content=summ_text)])

            questions = [ret["query"] for ret in result["results"]]
            # A question with an empty answer list yields None instead of
            # raising IndexError and failing the entire topic.
            answers = [
                ret["answers"][0].answer if ret["answers"] else None
                for ret in result["results"]
            ]

            topic_rows.append(
                [topic, section_title, full_text, summ_text, questions, answers]
            )
    except Exception as exc:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt
        # and SystemExit still stop this long-running loop.
        failed_topics.append(topic)
        failure_reasons[topic] = repr(exc)
    else:
        data.extend(topic_rows)

In [28]:
# Report the wall-clock time elapsed since `start_time` was recorded at the
# top of the extraction loop cell.
elapsed_seconds = time.time() - start_time
print("--- %s seconds ---" % elapsed_seconds)

--- 30970.05046772957 seconds ---


In [29]:
# Write the collected rows to CSV with `headings` as the column names.
# The columns are named when the frame is built rather than via
# `to_csv(header=...)`: with an empty `data` the frame would otherwise have
# zero columns and the header aliases would not match, raising ValueError.
# The default integer index is still written as the first, unnamed column.
# Note: the `questions` and `answers` cells are Python lists and are
# serialised using their string representation.
pd.DataFrame(data, columns=headings).to_csv("top300.csv")

In [33]:
# Display the topics whose processing raised an exception in the extraction loop.
failed_topics

['feature selection',
 'independent component analysis',
 'image segmentation',
 'multi-objective optimization',
 'multiobjective optimization',
 'linear discriminant analysis',
 'homomorphic encryption']