In [1]:
from pprint import pprint
from tqdm import tqdm
from haystack.nodes import QuestionGenerator, ElasticsearchRetriever, FARMReader
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.pipelines import (
    QuestionGenerationPipeline,
    RetrieverQuestionGenerationPipeline,
    QuestionAnswerGenerationPipeline,
)
from haystack.utils import launch_es
from haystack.document_stores import FAISSDocumentStore
from haystack.schema import Document
import nltk
from rouge import Rouge 
import numpy as np
from bs4 import BeautifulSoup
import urllib.request as urllib2
import wikipediaapi
import time
import pandas as pd



In [2]:
# Load the keyword CSV (it has no header row), take column 0, and keep the
# first 500 entries as a plain Python list.
keywords = (
    pd.read_csv("Keywords-Springer-83K-20210405.csv", header=None)[0]
    .to_numpy()
    .tolist()[:500]
)

In [3]:
# Wikipedia client; page text is requested in HTML extract format.
wiki_wiki = wikipediaapi.Wikipedia(
    "en",
    extract_format=wikipediaapi.ExtractFormat.HTML,
)

# A single question generator instance is passed to both pipelines below.
question_generator = QuestionGenerator()
question_generation_pipeline = QuestionGenerationPipeline(question_generator)

# Reader loaded from the deepset/roberta-base-squad2 checkpoint, combined with
# the question generator into the question/answer generation pipeline.
reader = FARMReader("deepset/roberta-base-squad2")
qag_pipeline = QuestionAnswerGenerationPipeline(question_generator, reader)

Some weights of the model checkpoint at deepset/roberta-base-squad2 were not used when initializing RobertaModel: ['qa_outputs.weight', 'qa_outputs.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at deepset/roberta-base-squad2 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
ML Logging is turned off. No parameters, metrics or artifacts will be logged to 

In [49]:
# Accumulators filled by the processing loop: one row per processed paragraph,
# plus the topics whose processing raised an error.
data, failed_topics = [], []

# Column names for the exported table, in the same order as each row in `data`.
headings = [
    "topic",
    "section",
    "full_text",
    "prefix",
    "suffix",
    "questions",
    "answers",
    "scores",
]

In [50]:
%%capture
start_time = time.time()
for topic in keywords[:5]:
    try:
        page = wiki_wiki.page(topic)

        content = [("Summary", page.summary)]
        for section in page.sections:
            if section.title == "See also":
                break
            content.append((section.title, section.text))

        docs_data = []
        for title, text in content:
            soup = BeautifulSoup(text)
            for p in soup.find_all("p"):
                sentences = nltk.sent_tokenize(p.text)
                if len(sentences) <= 2:
                    continue
                # heading, full_text, prefix, suffix
                docs_data.append((title, p.text, " ".join(sentences[:2]), " ".join(sentences[2:])))

        for i, doc in enumerate(docs_data):
            print(i)
            result = qag_pipeline.run(documents = [Document(content = doc[3])])

            if len(result["results"][0]["answers"]) == 0:
                continue

            questions = [ret["query"] for ret in result["results"]]
            answers = [ret["answers"][0].answer for ret in result["results"]]
            scores = [ret["answers"][0].score for ret in result["results"]]

            # topic, section, full_text, prefix, suffix, questions, answers, scores
            data.append([topic, doc[0], doc[1], doc[2], doc[3], questions, answers, scores])

    except:
        failed_topics.append(topic)

In [51]:
# Report the number of seconds elapsed since `start_time` was recorded.
elapsed_seconds = time.time() - start_time
print(elapsed_seconds)

594.190242767334


In [52]:
# Write the collected rows to CSV. The column names are attached to the
# DataFrame itself rather than passed as `header=` aliases: the file is the
# same for non-empty `data`, but an empty `data` now yields a header-only
# file instead of raising a column-count mismatch error.
pd.DataFrame(data, columns=headings).to_csv("sentence_split_top5.csv")