In [9]:
from pprint import pprint
from tqdm import tqdm
from haystack.nodes import QuestionGenerator, ElasticsearchRetriever, FARMReader
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.pipelines import (
    QuestionGenerationPipeline,
    RetrieverQuestionGenerationPipeline,
    QuestionAnswerGenerationPipeline,
)
from haystack.utils import launch_es
from haystack.document_stores import FAISSDocumentStore
from haystack.schema import Document
import nltk
from rouge import Rouge 
import numpy as np
from bs4 import BeautifulSoup
import urllib.request as urllib2
import wikipediaapi
import time
import pandas as pd

In [3]:
# Wikipedia client for English pages, configured to return extracts as HTML.
wiki_wiki = wikipediaapi.Wikipedia(
    'en',
    extract_format=wikipediaapi.ExtractFormat.HTML,
)

# A single question generator instance is shared by both pipelines below.
question_generator = QuestionGenerator()
question_generation_pipeline = QuestionGenerationPipeline(question_generator)

# Reader loaded from the SQuAD2-tuned RoBERTa checkpoint; paired with the
# question generator in the question/answer generation pipeline.
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")
qag_pipeline = QuestionAnswerGenerationPipeline(question_generator, reader)

Some weights of the model checkpoint at deepset/roberta-base-squad2 were not used when initializing RobertaModel: ['qa_outputs.weight', 'qa_outputs.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at deepset/roberta-base-squad2 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
ML Logging is turned off. No parameters, metrics or artifacts will be logged to 

In [4]:
# Wikipedia article titles to generate question/answer pairs for.
topics = [
    "Machine Learning",
    "Data Mining",
]

In [28]:
# Row accumulator for the generated dataset; starts out empty.
data = list()

# Column names for the rows collected in `data`, in row order.
headings = [
    "title",
    "section",
    "full_text",
    "summ_text",
    "questions",
    "answers",
]

In [7]:
# Build the question/answer dataset: for every topic, fetch its Wikipedia page,
# split it into paragraphs, and run the QA-generation pipeline on the first two
# sentences of each paragraph.
#
# NOTE: rows are appended to the existing `data` list, so re-running this cell
# without first re-running the cell that resets `data` duplicates rows.
start_time = time.time()

for topic in topics:
    # Fetch the page for the *current* topic. This used to be hard-coded to
    # 'Machine Learning', so every topic re-scraped that one page and its rows
    # were stored under the wrong title.
    page = wiki_wiki.page(topic)

    # (section title, text) pairs: the page summary first, then each entry of
    # page.sections up to, but not including, "See also".
    content = [("Summary", page.summary)]
    for section in page.sections:
        if section.title == "See also":
            break
        content.append((section.title, section.text))

    # One (section title, full paragraph text, first two sentences) tuple per
    # usable paragraph.
    docs_data = []
    for title, text in content:
        # Name the parser explicitly so the parse does not depend on which
        # optional parsers happen to be installed (and bs4 emits no warning).
        soup = BeautifulSoup(text, "html.parser")
        for p in soup.find_all("p"):
            sentences = nltk.sent_tokenize(p.text)
            if len(sentences) < 2:
                continue  # skip paragraphs with fewer than two sentences
            docs_data.append((title, p.text, " ".join(sentences[:2])))

    for section_title, full_text, summ_text in docs_data:
        result = qag_pipeline.run(documents=[Document(content=summ_text)])

        questions = [ret["query"] for ret in result["results"]]
        # Keep the first answer of each question. If a question comes back
        # with no answers, store "" so `answers` stays aligned with
        # `questions` instead of raising IndexError.
        answers = [
            ret["answers"][0].answer if ret["answers"] else ""
            for ret in result["results"]
        ]

        data.append([topic, section_title, full_text, summ_text, questions, answers])

print("--- %s seconds ---" % (time.time() - start_time))

Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.03 Batches/s]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.02 Batches/s]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.05 Batches/s]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.10s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.05 Batches/s]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.04 Batches/s]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.12 Batches/s]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.09 Batches/s]
Inferencing Samples: 100%|██████████████

Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.06s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.04s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.06s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.05s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.02s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.00s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.00 Batches/s]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.04s/ Batches]
Inferencing Samples: 100%|██████████████

--- 450.7380871772766 seconds ---





In [29]:
# Write the collected rows to CSV (the index column is kept, as before).
# The column names are set on the frame itself rather than passed to to_csv
# as header aliases: the file is the same when `data` has rows, and an empty
# `data` now yields a header-only file instead of a column-count ValueError.
pd.DataFrame(data, columns=headings).to_csv("test1.csv")