In [1]:

import collections
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize
pre_loaded = {}

from compute_answer import get_answer

def model_loader(kind, name, load):
    """Return a cached resource, building it with ``load`` on first request.

    Resources are memoised in the module-level ``pre_loaded`` dict under the
    key ``kind + ":" + name``, so re-running the cell does not call ``load``
    again for a key that is already present.

    Args:
        kind: Category prefix of the cache key (e.g. 'model', 'tokenizer').
        name: Identifier of the resource within that category.
        load: Zero-argument callable invoked only when the key is missing.

    Returns:
        The object stored in ``pre_loaded`` for this key.
    """
    cache_key = kind + ":" + name
    # Membership test (not truthiness) so that a cached falsy value is reused.
    if cache_key in pre_loaded:
        return pre_loaded[cache_key]
    resource = load()
    pre_loaded[cache_key] = resource
    return resource
    #
    
from transformers import AutoTokenizer, AutoModelWithLMHead

# Checkpoint identifier passed to from_pretrained for both the
# question-generation model and its tokenizer.
checkpoint = "mrm8488/t5-base-finetuned-question-generation-ap"

# Load the model through model_loader so it is created once and then reused
# from pre_loaded under the key "model:<checkpoint>".
# NOTE(review): AutoModelWithLMHead is deprecated in newer transformers
# releases (AutoModelForSeq2SeqLM is the suggested class for T5) — confirm
# against the installed version before switching.
model_question_generation = model_loader(
  'model',
  checkpoint,
  lambda : AutoModelWithLMHead.from_pretrained(checkpoint)
)
# Matching tokenizer, cached under "tokenizer:<checkpoint>".
tokenizer_question_generation = model_loader(
  'tokenizer',
  checkpoint,
  lambda : AutoTokenizer.from_pretrained(checkpoint)
)

# Run nltk.download('punkt') through the same cache (key "nltk:punkt") so the
# download call is not repeated when this cell is re-run; the return value is
# not bound to a name.
model_loader(
    'nltk',
    "punkt",
    lambda : nltk.download('punkt')
)
# global variables:
# NER_LABELS = ('GPE', 'WORK_OF_ART', 'PERSON', 'NORP', 'EVENT', 'LOC', 'ORG','PRODUCT','LANGUAGE','QUANTITY')
# Entity labels accepted by generate_mcq_one; entities with any other label are skipped.
NER_LABELS = ("NP",)
# generate_mcq keeps only sentences whose 'tokens' value lies in
# [MIN_TOKENS, MAX_TOKENS] (both bounds inclusive).
MIN_TOKENS = 10 
MAX_TOKENS = 45

# ------------------------ MAIN ------------------------ #

def generate_mcq(sections, num_questions=10, sparse_mode=True, use_cache=False):
    """Generate multiple-choice questions from the best-ranked sentences.

    Sentences are collected from every section, filtered by token count,
    sorted by descending "rank" and handed one by one to generate_mcq_one
    until enough questions have been produced.

    Args:
        sections: Mapping whose values are dicts holding a "sentences" list.
            Each sentence is a dict; this function reads its 'tokens' and
            "rank" entries and passes the dict on to generate_mcq_one.
        num_questions: Maximum number of questions to return. A falsy value
            (0 or None) means "no limit".
        sparse_mode: When true, request at most one question per sentence;
            otherwise a single sentence may supply all remaining questions.
        use_cache: Accepted for interface compatibility; not read here.

    Returns:
        dict: ``{'questions': [...]}`` with the question dicts returned by
        generate_mcq_one, in the order they were generated.
    """
    # Keep only sentences that carry a token count inside the allowed window.
    candidates = [
        sentence
        for section in sections.values()
        for sentence in section["sentences"]
        if sentence.get('tokens') and MIN_TOKENS <= sentence['tokens'] <= MAX_TOKENS
    ]
    candidates.sort(key=lambda item: item["rank"], reverse=True)

    cache = []   # phrases already used; threaded through generate_mcq_one
    output = []
    for sentence in candidates:
        if sparse_mode:
            limit = 1
        elif num_questions:
            limit = num_questions - len(output)
        else:
            # No overall limit was requested. generate_mcq_one decrements its
            # counter before comparing it with 0, so 0 never stops it early.
            # (Previously None - len(output) raised TypeError here.)
            limit = 0
        batch, cache = generate_mcq_one(sentence, cache, limit)
        output += batch
        if num_questions and len(output) >= num_questions:
            break
    return {'questions': output}
    #

def generate_mcq_one(sent, cache, num_questions):
    """Try to build MCQ entries from the entities of a single sentence.

    Args:
        sent: Sentence dict. Reads "resolved" (preferred) or 'original' for
            the text and 'ents' as an iterable of (phrase, label, rank)
            triples. The chosen text is written back to ``sent['text']``.
        cache: List of phrases already turned into questions; phrases found
            in it are skipped and newly used phrases are appended in place.
        num_questions: Stop after this many questions. The counter is
            decremented before being compared with 0, so 0 or a negative
            value never stops the loop early.

    Returns:
        tuple: ``(questions, cache)`` where ``questions`` is a list of dicts
        with the keys "context_missing", "question" and "answer".
    """
    # Only fall back to 'original' when "resolved" is absent; the previous
    # sent.get("resolved", sent['original']) evaluated the fallback eagerly
    # and raised KeyError for sentences that only carry "resolved".
    sent['text'] = sent["resolved"] if "resolved" in sent else sent['original']
    text = sent['text']

    # Phrases of all ranked entities; computed once because it does not
    # depend on the candidate phrase being processed.
    ranked_phrases = [other for other, _, other_rank in sent['ents'] if other_rank != 0]

    _output = []
    for phrase, label, rank in sent['ents']:
        if rank == 0 or label not in NER_LABELS or phrase in cache:
            continue
        if in_parentheses_or_brackets(phrase, text):
            continue

        # try to generate a question:
        question = generate_question(phrase, text, max_length=64)

        # filter bad output: must look like a question and must not leak the answer
        if not question.endswith('?') or phrase.lower() in question.lower():
            continue
        # experimental: require the question to mention some ranked entity of the sentence
        if not any(other in question for other in ranked_phrases):
            continue

        # NOTE(review): get_answer comes from compute_answer; presumably it
        # returns the extracted answer string or a falsy value — confirm.
        answer = get_answer(question, text, nbest=10, null_threshold=-3.76, for_mcq=True)
        if not answer:
            continue
        # The extracted answer has to overlap with the phrase the question was built for.
        if answer not in phrase and phrase not in answer:
            continue

        # Keep the longer of the two; drop answers shorter than three words
        # (hyphens count as word separators).
        final_answer = phrase if len(phrase) >= len(answer) else answer
        if len(final_answer.replace("-", " ").split()) < 3:
            continue

        # append MCQ:
        _output.append({
            "context_missing": text.replace(phrase, "___???___"),
            "question": question,
            "answer": final_answer
        })
        cache.append(phrase)  # optional
        num_questions -= 1
        if num_questions == 0:
            break
    return _output, cache
    #

import re

# ------------------------ HELPER ------------------------ #

def in_parentheses_or_brackets(span, sentence):
    """Tell whether ``span`` occurs inside a (...) or [...] group of ``sentence``.

    Groups are found with non-greedy matches, so each group ends at the first
    closing character after its opening one. The two kinds of group are
    searched separately so that overlapping groups are both considered.

    Args:
        span: Substring to look for.
        sentence: Text to scan.

    Returns:
        bool: True if any matched group (delimiters included) contains ``span``.
    """
    # Raw strings: '\(' and '\[' in a normal literal are invalid escape
    # sequences and trigger a warning on current Python versions.
    enclosed = re.findall(r'\(.*?\)', sentence) + re.findall(r'\[.*?\]', sentence)
    return any(span in group for group in enclosed)


def generate_question(answer, context, max_length=64):
    """Generate a question whose answer is ``answer`` within ``context``.

    Builds the "answer: ... context: ..." prompt, encodes it with the
    module-level question-generation tokenizer, runs the module-level model's
    ``generate`` and decodes the first returned sequence.

    Args:
        answer: Answer text the question should target.
        context: Text that contains the answer.
        max_length: Passed to ``generate`` as ``max_length``.

    Returns:
        str: Decoded output with "question:" removed and outer whitespace stripped.
    """
    prompt = "answer: %s  context: %s </s>" % (answer, context)
    encoded = tokenizer_question_generation([prompt], return_tensors='pt')

    generated = model_question_generation.generate(
        input_ids=encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        max_length=max_length,
    )

    decoded = tokenizer_question_generation.decode(generated[0], skip_special_tokens=True)
    # The decoded text may carry a "question:" prefix; drop it.
    return decoded.replace("question:", "").strip()

  from .autonotebook import tqdm as notebook_tqdm
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can saf

In [None]:
# Demo input: one raw text. Triple quotes are required because the text
# contains a line break, which a single-quoted literal cannot span.
sections = [""" The Internet has become a forum for terrorist groups and individual terrorists to spread their messages of hate and violence. Cyberterrorism is certainly on the terrorists’ agenda and is likely to become their new mode of operation. The various uses of the online platforms by terrorists can be grouped into communicative uses and instrumental uses. Terrorists use the cyberspace for instrumental purposes that include the teaching and training of terrorists online, and establishing ‘virtual training camps” for future assailants online, among other uses. The use of cyberterrorism has become known as cyberterrorism or cyberwarfare. Terrorism has often been conceptualized as a form of psychological warfare. Terrorism has certainly sought to wage such a campaign through the Internet. Terrorists can use the Internet to spread threats intended to distill fear and helplessness. Al-Qaeda has consistently claimed on its websites that the 9/11 destruction of the World Trade Center has inflicted psychological damage, as well as concrete damage, on the U.S. economy. The Internet is particularly well suited to allowing even a small group to amplify its message and exaggerate its importance and threat it poses. One of the primary uses of online communication by terrorists is for the dissemination of propaganda. This generally takes the form of multimedia communications providing ideological, political or religious explanations, justifications, or promotion of terrorist activities. These may include online messages, streaming videos of preaching, social media messages, and even video games developed by terrorist organizations. Terrorist propaganda distributed via the Internet covers a range of objectives and audiences. The fact that many terrorists now have direct control over the content of their message offers further opportunities to shape how they are perceived by different target audiences. 
Most terrorist online propaganda does not celebrate their violent activities. Terrorist messages emphasize restrictions placed on freedom of expression and plight of comrades who are now political prisoners. These issues resonate powerfully with their own supporters and are also calculated to elicit sympathy from Western audiences. Enemy publics may be targets for these complaints, by emphasizing the antidemocratic nature of the steps taken against them, try to create feelings of unease and shame among their foes."""]
import spacy
nlp = spacy.load("en_core_web_lg")
doc = nlp(sections[0])

# generate_mcq calls sections.values() and reads per-sentence dicts with the
# keys 'tokens', "rank", 'original' and 'ents' (triples of phrase, label,
# rank), so a bare spaCy Doc cannot be passed directly. Adapt it here.
# NOTE(review): no ranking is computed in this notebook. Every sentence and
# every noun chunk gets the placeholder rank 1 (non-zero, so nothing is
# skipped); replace with real scores if the upstream pipeline provides them.
mcq_sections = {
    "demo": {
        "sentences": [
            {
                "original": span.text,
                "tokens": len(span),
                "rank": 1,
                "ents": [(chunk.text, NER_LABELS[0], 1) for chunk in span.noun_chunks],
            }
            for span in doc.sents
        ]
    }
}
generate_mcq(mcq_sections)


In [9]:
# Scratch check: import the protobuf package under the alias fff and list
# the attribute names it exposes.
import google.protobuf  as fff
dir(fff)

['__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '__version__',
 'descriptor',
 'descriptor_database',
 'descriptor_pool',
 'internal',
 'message',
 'message_factory',
 'pyext',
 'reflection',
 'symbol_database',
 'text_encoding']

In [10]:
# Display the version string of the protobuf package (bound to the alias fff
# by the import in the preceding cell).
fff.__version__

'4.21.12'