In [None]:
import random
import torch
from transformers import BertTokenizer, BertForMaskedLM
from nltk.corpus import wordnet as wn
import nltk

In [None]:
# Fetch the WordNet corpus; the `wn` corpus reader imported above reads from it.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
class QuestionGenerator:
    """Generate fill-in-the-blank and multiple-choice questions from sentences.

    A question is made by tokenizing a sentence with ``self.tokenizer``,
    replacing one randomly chosen whole-word token with ``[MASK]`` and
    rendering the tokens back to a string with a visible blank.

    Attributes:
        tokenizer: Tokenizer loaded from ``model_name``.
        model: Masked-LM model loaded from ``model_name``. It is kept as an
            attribute for callers; question generation itself does not run it.
        passage: The passage handed to the constructor, stored unchanged.
    """

    def __init__(self, passage, model_name='bert-base-uncased'):
        """Load the tokenizer and masked-LM model and remember ``passage``.

        Args:
            passage: Source text; stored on the instance as-is.
            model_name: Name passed to both ``from_pretrained`` calls.
        """
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertForMaskedLM.from_pretrained(model_name)
        self.passage = passage

    def _shuffled_unique(self, sentences):
        """Return the distinct sentences in random order.

        Walking this finite list replaces the former "pick at random and
        retry" loop, which never terminated once every sentence was used.
        """
        unique = list(dict.fromkeys(sentences))
        random.shuffle(unique)
        return unique

    def _mask_random_word(self, sentence, exclude=()):
        """Tokenize ``sentence`` and mask one randomly chosen whole word.

        Args:
            sentence: Sentence text; a trailing '.' is appended if missing.
            exclude: Container of tokens that must not be chosen.

        Returns:
            ``(masked_word, tokens)`` where ``tokens`` has ``[MASK]`` in place
            of the chosen token, or ``None`` when no token is eligible.
        """
        if not sentence.endswith('.'):
            sentence += '.'
        tokens = self.tokenizer.tokenize(sentence)
        last = len(tokens) - 1
        candidates = [
            i for i, token in enumerate(tokens)
            if token.isalpha()
            and token not in exclude
            # A token followed by a '##' continuation piece is only the first
            # fragment of a longer word; masking it would blank half a word.
            and not (i < last and tokens[i + 1].startswith('##'))
        ]
        if not candidates:
            return None
        masked_idx = random.choice(candidates)
        masked_word = tokens[masked_idx]
        tokens[masked_idx] = '[MASK]'
        return masked_word, tokens

    def generate_fill_in_the_blanks(self, sentences, num_questions):
        """Build up to ``num_questions`` fill-in-the-blank questions.

        Args:
            sentences: Sequence of hashable sentence strings.
            num_questions: Maximum number of questions to produce.

        Returns:
            Dict mapping each masked word to its sentence with the word
            replaced by ``***``. It holds fewer than ``num_questions`` entries
            when the sentences run out before enough questions are found.
        """
        fill_in_the_blanks_dict = {}
        for sentence in self._shuffled_unique(sentences):
            if len(fill_in_the_blanks_dict) >= num_questions:
                break
            # Answers are the dict keys, so exclude words already used to keep
            # a later sentence from overwriting an earlier question.
            masked = self._mask_random_word(sentence, exclude=fill_in_the_blanks_dict)
            if masked is None:
                continue
            masked_word, tokens = masked
            sentence_with_blank = self.tokenizer.convert_tokens_to_string(tokens).replace('[MASK]', '***')
            fill_in_the_blanks_dict[masked_word] = sentence_with_blank
        return fill_in_the_blanks_dict

    def get_random_word(self):
        """Return the first lemma name of a randomly chosen WordNet synset."""
        # Enumerating every synset is expensive, so do it once per instance.
        # getattr keeps this working on instances built without __init__.
        synsets = getattr(self, '_synsets', None)
        if synsets is None:
            synsets = self._synsets = list(wn.all_synsets())
        random_synset = random.choice(synsets)
        return random_synset.lemmas()[0].name()

    def generate_mcqs(self, sentences, num_questions):
        """Build up to ``num_questions`` multiple-choice questions.

        Args:
            sentences: Sequence of hashable sentence strings.
            num_questions: Maximum number of questions to produce.

        Returns:
            List of ``[correct_index, sentence_with_blank, options]`` entries.
            ``options`` holds four shuffled words and ``correct_index`` is the
            position of the masked word in it. The list is shorter than
            ``num_questions`` when the sentences run out first.
        """
        mcq_questions_list = []
        for sentence in self._shuffled_unique(sentences):
            if len(mcq_questions_list) >= num_questions:
                break
            masked = self._mask_random_word(sentence)
            if masked is None:
                continue
            masked_word, tokens = masked
            options = [masked_word]
            # Compare case-insensitively so a distractor cannot be a mere
            # capitalisation variant of the answer or of another distractor.
            seen = {masked_word.lower()}
            while len(options) < 4:
                random_word = self.get_random_word()
                if random_word.isalpha() and random_word.lower() not in seen:
                    seen.add(random_word.lower())
                    options.append(random_word)
            random.shuffle(options)
            correct_index = options.index(masked_word)
            sentence_with_blank = self.tokenizer.convert_tokens_to_string(tokens).replace('[MASK]', '_______')
            mcq_questions_list.append([correct_index, sentence_with_blank, options])
        return mcq_questions_list

In [None]:
# Sample passage for the demo: main() splits it on '. ' into sentences and also
# passes the full text to QuestionGenerator.
passage = '''The old library, with its towering shelves and dim lighting, was a sanctuary for those who sought knowledge. Dusty books, filled with forgotten lore, lined every wall, their spines cracked and titles faded. In the heart of this literary maze, a young scholar named Anna spent her days immersed in study. She had a particular interest in ancient civilizations, pouring over texts that described lost cities and enigmatic cultures. One day, she stumbled upon a tome unlike any she had seen before. Its cover was adorned with intricate designs, and its pages were made of a material that felt almost otherworldly. As she delved into the book, she discovered it contained the secrets of a long-forgotten society, one that had mastered technologies far beyond the reach of modern science. Excited by her find, Anna decided to decipher the text, hoping to unlock the mysteries of the past. Days turned into weeks, and weeks into months, as she painstakingly translated the ancient script. The deeper she went, the more she realized the magnitude of her discovery. This was no ordinary society; it was a civilization that had thrived in harmony with nature, using sustainable practices that could revolutionize the present world. Her excitement grew with each revelation, and she knew she had to share her findings with the world. With her notes in hand, Anna prepared to publish her work, eager to enlighten others about the incredible wisdom of the ancients.'''
def main():
    """Split the module-level passage into sentences and print both question sets."""
    # Break the passage on '. ', skip empty fragments, and trim whitespace.
    sentences = []
    for fragment in passage.split('. '):
        if fragment:
            sentences.append(fragment.strip())

    generator = QuestionGenerator(passage)

    # Fill-in-the-blank questions are returned as a dict.
    blanks = generator.generate_fill_in_the_blanks(sentences, num_questions=5)
    print(type(blanks))
    print("Fill-in-the-Blank Questions Dictionary:")
    print(blanks)

    # Multiple-choice questions are returned as a list of lists.
    mcqs = generator.generate_mcqs(sentences, num_questions=5)
    print(type(mcqs))
    print("\nMultiple Choice Questions List of Lists:")
    print(mcqs)

In [None]:
# Run the demo only when this code executes as the main module.
if __name__ == "__main__":
    main()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<class 'dict'>
Fill-in-the-Blank Questions Dictionary:
{'the': 'excited by her find , anna decided to decipher the text , hoping to unlock the mysteries of *** past .', 'that': 'as she delved into the book , she discovered it contained the secrets of a long - forgotten society , one *** had mastered technologies far beyond the reach of modern science .', 'for': 'the old library , with its towering shelves and dim lighting , was a sanctuary *** those who sought knowledge .', 'lore': 'dusty books , filled with forgotten *** , lined every wall , their spines cracked and titles faded .', 'in': 'with her notes *** hand , anna prepared to publish her work , eager to enlighten others about the incredible wisdom of the ancients .'}
<class 'list'>

Multiple Choice Questions List of Lists:
[[3, 'days turned into weeks , and weeks into months , as she painstakingly translated the _______ script .', ['Diplopoda', 'placentation', 'heavily', 'ancient']], [2, 'this was no ordinary society ; it was a 