# **QUIZZABLE TRY**

# **Dependency installation**

In [3]:
!pip install spacy
!python -m spacy download en_core_web_sm
!pip install pdfplumber

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m46.9 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting pdfplumber
  Downloading pdfplumber-0.11.1-py3-none-any.whl (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.9/57.9 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K     [90m━━━━━━━━

# **Library Import**

In [6]:
import spacy
import random
import pdfplumber
from collections import Counter

# **Data Extraction**

In [7]:
with pdfplumber.open('eng.pdf') as pdf:
  text = ""
  for page in pdf.pages:
    text+= page.extract_text()

print(text)

Universal Declaration of Human Rights
Preamble
Whereas recognition of the inherent dignity and of the equal and inalienable
rights of all members of the human family is the foundation of freedom, justice
and peace in the world,
Whereas disregard and contempt for human rights have resulted in barbarous
acts which have outraged the conscience of mankind, and the advent of a world
in which human beings shall enjoy freedom of speech and belief and freedom
from fear and want has been proclaimed as the highest aspiration of the common
people,
Whereas it is essential, if man is not to be compelled to have recourse, as a last
resort, to rebellion against tyranny and oppression, that human rights should be
protected by the rule of law,
Whereas it is essential to promote the development of friendly relations between
nations,
Whereas the peoples of the United Nations have in the Charter reaffirmed their
faith in fundamental human rights, in the dignity and worth of the human person
and in the equ

**LOADING SPACY NLP**

In [8]:
nlp = spacy.load('en_core_web_sm')
nlp(text)

Universal Declaration of Human Rights
Preamble
Whereas recognition of the inherent dignity and of the equal and inalienable
rights of all members of the human family is the foundation of freedom, justice
and peace in the world,
Whereas disregard and contempt for human rights have resulted in barbarous
acts which have outraged the conscience of mankind, and the advent of a world
in which human beings shall enjoy freedom of speech and belief and freedom
from fear and want has been proclaimed as the highest aspiration of the common
people,
Whereas it is essential, if man is not to be compelled to have recourse, as a last
resort, to rebellion against tyranny and oppression, that human rights should be
protected by the rule of law,
Whereas it is essential to promote the development of friendly relations between
nations,
Whereas the peoples of the United Nations have in the Charter reaffirmed their
faith in fundamental human rights, in the dignity and worth of the human person
and in the equ

**PROCESSING TEXT WITH SPACY**

In [10]:
doc = nlp(text)
num_questions = 10

sentences = [sent.text for sent in doc.sents]
sentences

# Randomly select sentences to form questions
selected_sentences=random.sample(sentences,min(num_questions,len(sentences)))
selected_sentences
# min(num_questions,len(sentences))

['2.',
 '2. Motherhood and childhood are entitled to special care and assistance.',
 'Article 6\nEveryone has the right to recognition everywhere as a person before the law.\n',
 'Everyone has the right to freedom of movement and residence within the\nborders of each State.\n',
 'This right may not be invoked in the case of prosecutions genuinely\narising from non-political crimes or from acts contrary to the purposes and\nprinciples of the United Nations.\n',
 'The will of the people shall be the basis of the authority of government;\nthis will shall be expressed in periodic and genuine elections which shall\nbe by universal and equal suffrage and shall be held by secret vote or by\nequivalent free voting procedures.\n',
 'Article 5\nNo one shall be subjected to torture or to cruel, inhuman or degrading treatment\nor punishment.',
 'Article 23\n1.',
 'Article 20\n1.',
 '3. Parents have a prior right to choose the kind of education that shall be\ngiven to their children.\n']

# **QUESTION PROCESSING**

In [11]:
mcqs = []
# Generate MCQs for each selected sentence
for sentence in selected_sentences:
  sentence = sentence.lower() #yo garena bhane capital wala noun dekhaudaina
  # process with spacy (sentence)
  sent_doc = nlp(sentence)
  # Extract entities(nouns) from sentence
  nouns = [token.text for token in sent_doc if token.pos_ == "NOUN"] #pos = parts of speech

  # to generate proper noun
  if len(nouns) < 2:
    continue

  # Count the occurrence of each noun
  noun_counts = Counter(nouns)
  # print(noun_counts)


  # extract subject from the noun list
  if noun_counts:
    subject = noun_counts.most_common(1)[0][0]
    # print(subject)

    answer_choices = [subject]

    # Generate question in a question way
    question_stem = sentence.replace(subject,"_________")
    # print(question_stem)

    for _ in range(3):
      distractor = random.choice(list(set(nouns) -  set([subject]))) #for unique we used set
      answer_choices.append(distractor)
    # print(answer_choices)

    random.shuffle(answer_choices) #so that it wont be obvious that 1st one is the answer

    correct_answer = chr(64 + answer_choices.index(subject) + 1) # to conver index to letter, we can use this formula

    mcqs.append((question_stem,answer_choices,correct_answer))

# **GENERATION OF MCQ FULL**

In [12]:

def generate_mcqs(text, num_questions=20):
    # text = clean_text(text)
    if text is None:
        return []

    # Process the text with spaCy
    doc = nlp(text)

    # Extract sentences from the text
    sentences = [sent.text for sent in doc.sents]

    # Randomly select sentences to form questions
    selected_sentences = random.sample(sentences, min(num_questions, len(sentences)))

    # Initialize list to store generated MCQs
    mcqs = []

    # Generate MCQs for each selected sentence
    for sentence in selected_sentences:
        # Process the sentence with spaCy
        sent_doc = nlp(sentence)

        # Extract entities (nouns) from the sentence
        nouns = [token.text for token in sent_doc if token.pos_ == "NOUN"]

        # Ensure there are enough nouns to generate MCQs
        if len(nouns) < 2:
            continue

        # Count the occurrence of each noun
        noun_counts = Counter(nouns)

        # Select the most common noun as the subject of the question
        if noun_counts:
            subject = noun_counts.most_common(1)[0][0]

            # Generate the question stem
            question_stem = sentence.replace(subject, "_______")

            # Generate answer choices
            answer_choices = [subject]

            # Add some random words from the text as distractors

            distractors = list(set(nouns) - set([subject]))

            # Ensure there are at least three distractors
            while len(distractors) < 3:
                distractors.append("[Distractor]") # Placeholder for missing distractors

            random.shuffle(distractors)
            for distractor in distractors[:3]:
                answer_choices.append(distractor)

            # Shuffle the answer choices
            random.shuffle(answer_choices)

            # Append the generated MCQ to the list
            correct_answer = chr(64 + answer_choices.index(subject) + 1)  # Convert index to letter
            mcqs.append((question_stem, answer_choices, correct_answer))

    return mcqs




In [None]:
print(mcqs)

[('The use of _______ maps can be a promising opportunity to address the\nissue.', ['interactive', 'from', 'bring', 'this'], 'A'), ('Physical maps may lack detail and fail to\nprovide recent changes in campus _______ and infrastructure.', ['buildings', 'students', 'shift', 'help'], 'A'), ('Asking for directions from staff\nand students may lead to confusion, as not everyone is familiar with all the areas within the\n_______.', ['cities', 'nobody', 'officers', 'campus'], 'D'), ('The feasibility and potential benefits of integrating Interactive map _______ in the\ncampus to improve wayfinding for TUP Manila students.\n', ['pension', 'maps', 'expand', 'technology'], 'D'), ('Traditional methods of wayfinding, _______ as physical maps and verbal directions,\nare proven to be unreliable.\n', ['such', 'directions', 'from', 'note-taking'], 'A'), ('The effectiveness of existing digital navigation _______, if any, in addressing the navigation\nchallenges faced by TUP Manila students.\n', ['obsta

In [13]:
mcqs = generate_mcqs(text, num_questions=20)  # Pass the selected number of questions
# Ensure each MCQ is formatted correctly as (question_stem, answer_choices, correct_answer)
mcqs_with_index = [(i + 1, mcq) for i, mcq in enumerate(mcqs)]

for question in mcqs_with_index:
    print("Question", question[0], ":", question[1][0])
    print("Options:")
    options = question[1][1]
    for i, option in enumerate(options):
        print(f"{chr(97 + i)}) {option}")
    print("Correct Answer:", question[1][2])
    print("\n")

Question 1 : Article 19
Everyone has the _______ to freedom of opinion and expression; this _______ includes
freedom to hold opinions without interference and to seek, receive and impart
information and ideas through any media and regardless of frontiers.

Options:
a) media
b) interference
c) freedom
d) right
Correct Answer: D


Question 2 : No one shall be held guilty of any penal _______ on account of any act or
omission which did not constitute a penal _______, under national or
international law, at the time when it was committed.
Options:
a) offence
b) act
c) law
d) account
Correct Answer: A


Question 3 : Article 7
All are equal before the _______ and are entitled without any discrimination to equal
protection of the _______.
Options:
a) Article
b) protection
c) discrimination
d) law
Correct Answer: D


Question 4 : All are entitled to equal protection against any
_______ in violation of this Declaration and against any incitement to such
_______.

Options:
a) discrimination
b) p

In [1]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [5]:
import random
import spacy
import pdfplumber
from nltk.corpus import wordnet
import gensim.downloader as api

nlp = spacy.load("en_core_web_sm")
word_vectors = api.load("glove-wiki-gigaword-100")

def extract_text_from_pdf(pdf_file):
    text = ""
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            text += page.extract_text()
    return text

def get_synonyms(word):
    try:
        similar_words = word_vectors.most_similar(word, topn=10)
        synonyms = [word] + [w[0] for w in similar_words if w[0] != word]
        return synonyms
    except KeyError:
        return [word]

def is_significant_word(word):
    # Add your criteria for significant words here
    return len(word) > 3 and word.lower() not in ["the", "and", "of"]  # Example criteria

def generate_mcqs(text, num_questions=20, difficulty_levels=None):
    if text is None:
        return []

    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    selected_sentences = random.sample(sentences, min(num_questions, len(sentences)))
    mcqs = []

    for sentence in selected_sentences:
        sent_doc = nlp(sentence)

        # Extract named entities (NEs)
        named_entities = [ent.text for ent in sent_doc.ents if ent.label_ in ["PERSON", "ORG", "GPE"]]
        named_entities = [ne for ne in named_entities if is_significant_word(ne)]

        # Extract nouns, verbs, and adjectives from the sentence
        nouns = [token.text for token in sent_doc if token.pos_ == "NOUN" and is_significant_word(token.text)]
        verbs = [token.text for token in sent_doc if token.pos_ == "VERB" and is_significant_word(token.text)]
        adjectives = [token.text for token in sent_doc if token.pos_ == "ADJ" and is_significant_word(token.text)]

        # Combine all significant words including NEs
        significant_words = named_entities + nouns + verbs + adjectives

        # Choose a random word from significant words for question generation
        if len(significant_words) < 1:
            continue
        subject = random.choice(significant_words)
        question_stem = sentence.replace(subject, "_______")

        # Check if the question length is too long or it contains multiple blanks
        if len(question_stem.split("_______")) > 2 or len(question_stem) > 150:
            continue

        # Generate answer choices
        answer_choices = [subject]

        # Enhance distractor selection
        distractors = []
        for token in sent_doc:
            if token.text.lower() != subject.lower() and token.text.lower() not in answer_choices:
                synonyms = get_synonyms(token.text)
                synonyms = [syn for syn in synonyms if is_significant_word(syn)]
                distractors.extend(synonyms)

        distractors = list(set(distractors))  # Remove duplicates
        distractors = [d for d in distractors if d.lower() not in answer_choices]  # Remove answer choices
        random.shuffle(distractors)
        for distractor in distractors[:3]:
            answer_choices.append(distractor)

        # Shuffle the answer choices
        random.shuffle(answer_choices)

        # Append the generated MCQ to the list
        correct_answer = chr(65 + answer_choices.index(subject))  # Convert index to letter
        mcqs.append((question_stem, answer_choices, correct_answer))

    return mcqs

# Example usage:
pdf_file = "eng.pdf"  # Replace with your PDF file path
text = extract_text_from_pdf(pdf_file)
mcqs = generate_mcqs(text, num_questions=20)
for i, mcq in enumerate(mcqs):
    print(f"Question {i+1}: {mcq[0]}")
    print("Options:")
    for j, option in enumerate(mcq[1]):
        print(f"{chr(65 + j)}: {option}")
    print("Correct Answer:", mcq[2])
    print()


Question 1: No one may be compelled to _______ to an association.

Options:
A: belong
B: likely
C: ignore
D: organization
Correct Answer: A

Question 2: They are
_______ with reason and conscience and should act towards one another in a
spirit of brotherhood.

Options:
A: endowed
B: reaching
C: fact
D: towards
Correct Answer: A

Question 3: Everyone, without any discrimination, has the right to equal pay for equal
_______.

Options:
A: already
B: take
C: receive
D: work
Correct Answer: D

Question 4: Everyone has the right of equal access to public _______ in his country.

Options:
A: well
B: since
C: service
D: provide
Correct Answer: C

Question 5: Everyone has
the right to the protection of the law against such _______ or attacks.

Options:
A: attack
B: interference
C: those
D: would
Correct Answer: B

Question 6: Article 6
Everyone has the right to recognition everywhere as a _______ before the law.

Options:
A: after
B: back
C: person
D: respect
Correct Answer: C

Question 7: Ever

# **OPTIMIZED**

In [1]:
!pip install spacy
!python -m spacy download en_core_web_sm
!pip install pdfplumber
!pip install PyDictionary

Collecting en-core-web-sm==3.7.1
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
[31mERROR: Could not find a version that satisfies the requirement collections (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for collections[0m[31m


**Library Import**

In [7]:
#library import
import spacy
import random
import pdfplumber
import nltk
from nltk.corpus import wordnet

**PDF Extract**

In [None]:

# Load the text from the PDF file
with pdfplumber.open('eng.pdf') as pdf:
    text = ""
    for page in pdf.pages:
        text += page.extract_text()

# Load spaCy model
nlp = spacy.load('en_core_web_sm')

# Download WordNet data if not already downloaded
nltk.download('wordnet')

**Get Synonymns for Answers**

In [None]:
def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonym = lemma.name().replace('_', ' ')
            if synonym != word:
                synonyms.add(synonym)
    return list(synonyms)

**Question Generate**

In [6]:

def generate_mcqs(text, num_questions=20):
    if text is None:
        return []

    # Process the text with spaCy
    doc = nlp(text)

    # Extract sentences from the text and filter out unwanted sentences
    sentences = [sent.text.strip() for sent in doc.sents if len(sent.text.strip()) > 10 and not any(char.isdigit() for char in sent.text.strip())]

    # Initialize set to store generated questions and answers
    generated_questions = set()

    # Initialize list to store generated MCQs
    mcqs = []

    # Generate MCQs until we reach the desired number
    while len(mcqs) < num_questions:
        # Randomly select a sentence to form a question
        sentence = random.choice(sentences)

        # Skip sentences that are too long
        if len(sentence) > 200:
            continue

        # Process the sentence with spaCy
        sent_doc = nlp(sentence)

        # Extract entities (nouns) from the sentence
        nouns = [token.text for token in sent_doc if token.pos_ == "NOUN"]

        # Ensure there are enough nouns to generate MCQs
        if len(nouns) < 1:
            continue

        # Select a random noun as the subject of the question
        subject = random.choice(nouns)

        # Generate the question stem
        question_stem = sentence.replace(subject, "_______", 1)

        # Check if the question has already been generated
        if (question_stem, subject) in generated_questions:
            continue

        # Generate answer choices
        answer_choices = [subject]

        # Get synonyms of the correct answer
        synonyms = get_synonyms(subject)

        # If there are not enough synonyms, add random nouns from the text
        if len(synonyms) < 3:
            all_nouns = [token.text for token in nlp(text) if token.pos_ == "NOUN" and token.text != subject]
            random_distractors = random.sample(all_nouns, 3 - len(synonyms))
            distractors = synonyms + random_distractors
        else:
            distractors = synonyms[:3]

        answer_choices.extend(distractors)

        # Shuffle the answer choices
        random.shuffle(answer_choices)

        # Remove duplicate answer choices
        answer_choices = list(set(answer_choices))

        # Ensure there are at least 4 unique answer choices
        while len(answer_choices) < 4:
            new_choice = random.choice([token.text for token in nlp(text) if token.pos_ == "NOUN" and token.text != subject])
            if new_choice not in answer_choices:
                answer_choices.append(new_choice)

        # Shuffle the answer choices again
        random.shuffle(answer_choices)

        # Append the generated MCQ to the list
        correct_answer = chr(64 + answer_choices.index(subject) + 1)  # Convert index to letter
        mcqs.append((question_stem, answer_choices, correct_answer))

        # Add the generated question to the set
        generated_questions.add((question_stem, subject))

    return mcqs

mcqs = generate_mcqs(text, num_questions=50)  # Pass the selected number of questions
# Ensure each MCQ is formatted correctly as (question_stem, answer_choices, correct_answer)
mcqs_with_index = [(i + 1, mcq) for i, mcq in enumerate(mcqs)]

for question in mcqs_with_index:
    print("Question", question[0], ":", question[1][0])
    print("Options:")
    options = question[1][1]
    for i, option in enumerate(options):
        print(f"{chr(97 + i)}) {option}")
    print("Correct Answer:", question[1][2])
    print("\n")


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Question 1 : No one shall be arbitrarily deprived of his _______ nor denied the right to
change his nationality.
Options:
a) freedom
b) nationality
c) media
d) material
Correct Answer: B


Question 2 : They are entitled
to equal rights as to _______, during marriage and at its dissolution.
Options:
a) union
b) married couple
c) marriage
d) marriage ceremony
Correct Answer: C


Question 3 : Education shall be free, at least in the
elementary and fundamental _______.
Options:
a) degree
b) stages
c) microscope stage
d) snitch
Correct Answer: B


Question 4 : Everyone has the right to the _______ of the moral and material interests
resulting from any scientific, literary or artistic production of which he is the
author.
Options:
a) protective covering
b) security
c) aegis
d) protection
Correct Answer: D


Question 5 : No one may be compelled to belong to an _______.
Options:
a) connection
b) tie
c) association
d) connexion
Correct Answer: C


Question 6 : Everyone has the right to work, to

# **FULLY OPTIMIZED**


In [13]:
# Load the text from the PDF file
with pdfplumber.open('eng.pdf') as pdf:
    text = ""
    for page in pdf.pages:
        text += page.extract_text()

# Load spaCy model
nlp = spacy.load('en_core_web_sm')

# Download WordNet data if not already downloaded
nltk.download('wordnet')

def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonym = lemma.name().replace('_', ' ')
            if synonym != word:
                synonyms.add(synonym)
    return list(synonyms)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [14]:
def generate_true_false_question(entities):
    # Select a random entity for the statement
    statement_entity = random.choice(entities)
    # Generate the statement
    statement = f"{statement_entity} is a {random.choice(['person', 'organization', 'location'])}."
    # Set the correct answer randomly as True or False
    correct_answer = random.choice(['True', 'False'])
    return statement, ['True', 'False'], correct_answer


In [None]:
def generate_multiple_correct_answers_question(nouns):
    # Select a random noun as the subject of the question
    subject = random.choice(nouns)
    # Generate the question stem
    question_stem = f"What are related to {subject}?"
    # Get synonyms of the correct answer and similar words
    synonyms = get_synonyms(subject)
    similar_words = [token.text for token in nlp.vocab if token.is_alpha and token.has_vector and token.is_lower and token.similarity(nlp(subject)) > 0.5][:3]
    # Combine synonyms and similar words for distractors
    distractors = list(set(synonyms + similar_words))
    # Remove the correct answer from distractors
    distractors = [d for d in distractors if d != subject]
    # Ensure there are at least 3 distractors
    while len(distractors) < 3:
        new_distractor = random.choice([token.text for token in nlp(text) if token.pos_ == "NOUN" and token.text != subject])
        if new_distractor not in distractors:
            distractors.append(new_distractor)
    # Add distractors to answer choices
    answer_choices = [subject] + random.sample(distractors, 3)
    # Set the correct answer(s) randomly
    correct_answers = random.sample(answer_choices, random.randint(1, 3))
    return question_stem, answer_choices, correct_answers


In [10]:
def generate_fill_in_the_blank_question(entities):
    # Select a random entity for the blank
    blank_entity = random.choice(entities)
    # Generate the question stem with a blank
    question_stem = f"Fill in the blank: {blank_entity} is located in __________."
    # Generate the correct answer based on the entity
    correct_answer = random.choice([ent.text for ent in doc.ents if ent.label_ == 'GPE'])
    return question_stem, [correct_answer], correct_answer


In [15]:
def generate_mcqs(text, num_questions=20):
    if text is None:
        return []

    # Process the text with spaCy
    doc = nlp(text)

    # Extract entities (nouns and named entities) from the text
    entities = [ent.text for ent in doc.ents if ent.label_ in ['PERSON', 'ORG', 'GPE', 'LOC']]
    nouns = [token.text for token in doc if token.pos_ == "NOUN"]

    # Initialize list to store generated MCQs
    mcqs = []

    # Generate MCQs until we reach the desired number
    while len(mcqs) < num_questions:
        # Randomly select a type of question (True/False, Multiple Correct Answers, Fill-in-the-Blank)
        question_type = random.choice(['True/False', 'Multiple Correct Answers', 'Fill-in-the-Blank'])

        if question_type == 'True/False':
            mcqs.append(generate_true_false_question(entities))

        elif question_type == 'Multiple Correct Answers':
            mcqs.append(generate_multiple_correct_answers_question(nouns))

        elif question_type == 'Fill-in-the-Blank':
            mcqs.append(generate_fill_in_the_blank_question(entities))

    return mcqs


In [16]:
mcqs = generate_mcqs(text, num_questions=20)  # Pass the selected number of questions
# Ensure each MCQ is formatted correctly as (question_stem, answer_choices, correct_answer)
mcqs_with_index = [(i + 1, mcq) for i, mcq in enumerate(mcqs)]

for question in mcqs_with_index:
    print("Question", question[0], ":", question[1][0])
    print("Options:")
    options = question[1][1]
    for i, option in enumerate(options):
        print(f"{chr(97 + i)}) {option}")
    print("Correct Answer(s):", question[1][2])
    print("\n")


NameError: name 'doc' is not defined

In [9]:
import spacy
import random
import pdfplumber
import nltk
from nltk.corpus import wordnet

# Load the text from the PDF file
with pdfplumber.open('eng.pdf') as pdf:
    text = ""
    for page in pdf.pages:
        text += page.extract_text()

# Load spaCy model
nlp = spacy.load('en_core_web_sm')

# Download WordNet data if not already downloaded
nltk.download('wordnet')

def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonym = lemma.name().replace('_', ' ')
            if synonym != word:
                synonyms.add(synonym)
    return list(synonyms)

def generate_mcqs(text, num_questions=20):
    if text is None:
        return []

    # Process the text with spaCy
    doc = nlp(text)

    # Extract entities (nouns and named entities) from the text
    entities = [ent.text for ent in doc.ents if ent.label_ in ['PERSON', 'ORG', 'GPE', 'LOC']]
    nouns = [token.text for token in doc if token.pos_ == "NOUN"]

    # Initialize set to store generated questions and answers
    generated_questions = set()

    # Initialize list to store generated MCQs
    mcqs = []

    # Generate MCQs until we reach the desired number
    while len(mcqs) < num_questions:
        # Randomly select a type of question (True/False, Multiple Correct Answers, Fill-in-the-Blank)
        question_type = random.choice(['True/False', 'Multiple Correct Answers', 'Fill-in-the-Blank'])

        if question_type == 'True/False':
            # Select a random entity for the statement
            statement_entity = random.choice(entities)
            # Generate the statement
            statement = f"{statement_entity} is a {random.choice(['person', 'organization', 'location'])}."
            # Set the correct answer randomly as True or False
            correct_answer = random.choice(['True', 'False'])
            # Add the generated MCQ to the list
            mcqs.append((statement, ['True', 'False'], correct_answer))

        elif question_type == 'Multiple Correct Answers':
            # Select a random noun as the subject of the question
            subject = random.choice(nouns)
            # Generate the question stem
            question_stem = f"What are related to {subject}?"
            # Get synonyms of the correct answer and similar words
            synonyms = get_synonyms(subject)
            similar_words = [token.text for token in nlp.vocab if token.is_alpha and token.has_vector and token.is_lower and token.similarity(nlp(subject)) > 0.5][:3]
            # Combine synonyms and similar words for distractors
            distractors = list(set(synonyms + similar_words))
            # Remove the correct answer from distractors
            distractors = [d for d in distractors if d != subject]
            # Ensure there are at least 3 distractors
            while len(distractors) < 3:
                new_distractor = random.choice([token.text for token in nlp(text) if token.pos_ == "NOUN" and token.text != subject])
                if new_distractor not in distractors:
                    distractors.append(new_distractor)
            # Add distractors to answer choices
            answer_choices = [subject] + random.sample(distractors, 3)
            # Set the correct answer(s) randomly
            correct_answers = random.sample(answer_choices, random.randint(1, 3))
            # Add the generated MCQ to the list
            mcqs.append((question_stem, answer_choices, correct_answers))

        elif question_type == 'Fill-in-the-Blank':
            # Select a random entity for the blank
            blank_entity = random.choice(entities)
            # Generate the question stem with a blank
            question_stem = f"Fill in the blank: {blank_entity} is located in __________."
            # Generate the correct answer based on the entity
            correct_answer = random.choice([ent.text for ent in doc.ents if ent.label_ == 'GPE'])
            # Add the generated MCQ to the list
            mcqs.append((question_stem, [correct_answer], correct_answer))

    return mcqs

mcqs = generate_mcqs(text, num_questions=20)  # Pass the selected number of questions
# Ensure each MCQ is formatted correctly as (question_stem, answer_choices, correct_answer)
mcqs_with_index = [(i + 1, mcq) for i, mcq in enumerate(mcqs)]

for question in mcqs_with_index:
    print("Question", question[0], ":", question[1][0])
    print("Options:")
    options = question[1][1]
    for i, option in enumerate(options):
        print(f"{chr(97 + i)}) {option}")
    print("Correct Answer(s):", question[1][2])
    print("\n")


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


NameError: name 'doc' is not defined

# **OTHER VERSION**

In [None]:
!pip install PyMuPDF
!pip install spacy
!pip install transformers


Traceback (most recent call last):
  File "/usr/local/bin/pip3", line 5, in <module>
    from pip._internal.cli.main import main
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/main.py", line 10, in <module>
    from pip._internal.cli.autocompletion import autocomplete
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/autocompletion.py", line 10, in <module>
    from pip._internal.cli.main_parser import create_main_parser
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/main_parser.py", line 9, in <module>
    from pip._internal.build_env import get_runnable_pip
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/build_env.py", line 19, in <module>
    from pip._internal.cli.spinners import open_spinner
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/spinners.py", line 9, in <module>
    from pip._internal.utils.logging import get_indentation
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/util

In [None]:
import fitz  # PyMuPDF for text extraction
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Step 1: Extract Text from PDF
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text

# Step 2: Preprocess Text
def preprocess_text(text):
    return text.replace('\n', ' ').strip()

# Step 3: Load Pre-trained Model
model_name = "valhalla/t5-base-qg-hl"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Step 4: Generate Questions with Answers
def generate_questions_with_answers(text, num_questions):
    input_text = "generate questions: " + text + " </s>"
    inputs = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(inputs, max_length=150, num_beams=5, num_return_sequences=num_questions, early_stopping=True)
    questions_with_answers = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return questions_with_answers

# Step 5: Create Quiz with Answers
def create_quiz_with_answers_from_pdf(pdf_path, num_questions):
    text = extract_text_from_pdf(pdf_path)
    clean_text = preprocess_text(text)
    questions_with_answers = generate_questions_with_answers(clean_text, num_questions)
    return questions_with_answers

# Example Usage
pdf_path = "/content/poetrywriting_Torres.pdf"
num_questions = 5
quiz_with_answers = create_quiz_with_answers_from_pdf(pdf_path, num_questions)
for idx, question_with_answer in enumerate(quiz_with_answers, 1):
    if '|' in question_with_answer:
        question, answer = question_with_answer.split('|')
        print(f"Q{idx}: {question}")
        print(f"A{idx}: {answer}")
    else:
        print(f"Q{idx}: {question_with_answer}")



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Q1: What is the name of Pyeow House's Haiku?
Q2: What is the name of the Haiku by Pyeow House?
Q3: What is Pyeow House's Haiku about?
Q4: What is Pyeow House's Haiku?
Q5: What is Pyeow House's Haiku called?
