In [None]:
!pip install -U transformers optimum onnx onnxruntime
!pip install evaluate

In [None]:
!pip install evaluate
!pip uninstall torch torchvision -y
!pip install torch torchvision
!pip install datasets
!pip install --upgrade transformers
import json
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from google.colab import drive
import evaluate
from tqdm import tqdm
import random
from sklearn.model_selection import train_test_split
import torchvision
from transformers import Trainer, TrainingArguments
from torch.utils.data import Dataset
from transformers import AutoConfig
from transformers import LogitsProcessorList, MinLengthLogitsProcessor
from uuid import uuid4
from optimum.exporters.onnx import main_export
from pathlib import Path

In [1]:
with open('/content/BUE_ICS_AI_Dataset_NLP QG.json', 'r') as f:
    dataset = json.load(f)

# Creating a new list to store the modified dataset
modified_dataset = []

# Looping through each item in the original dataset and modify the structure
for item in dataset:

    context = item.get("input")  # 'input' is the passage
    question = item.get("output")  # 'output' is the question
    instruction = item.get("instruction")  # 'instruction' remains as is

    modified_item = {
        "context": context,
        "question": question,
        "instruction": instruction
    }

    modified_dataset.append(modified_item)

with open('modified_dataset.json', 'w') as f:
    json.dump(modified_dataset, f, indent=4)

print("Dataset has been successfully modified and saved to 'modified_dataset.json'")

Dataset has been successfully modified and saved to 'modified_dataset.json'


# **Loading the modified dataset**

In [5]:
with open('modified_dataset.json', 'r') as f:
    modified_dataset = json.load(f)

for i, item in enumerate(modified_dataset[:5]):
    print(f"Entry {i+1}:")
    print(f"Context: {item.get('context')}")
    print(f"Question: {item.get('question')}")
    print(f"Instruction: {item.get('instruction')}")
    print("-" * 40)

print(f"Total number of entries in the modified dataset: {len(modified_dataset)}")

Entry 1:
Context: NLP stands for Natural Language Processing, which involves enabling computers to understand and generate human language.
Question: What does NLP stand for in the field of Artificial Intelligence?
A. Natural Logic Processing
B. Natural Language Processing
C. Neural Linguistic Programming
D. Natural Level Programming
Instruction: Generate a multiple-choice question based on the following passage.
----------------------------------------
Entry 2:
Context: One of the core goals of NLP is to allow machines to understand, interpret, and generate human language.
Question: True or False: NLP allows machines to interact using human language.
Instruction: Generate a true or false question based on the following passage.
----------------------------------------
Entry 3:
Context: Text lacks components like visual perception, emotion, and interaction with the physical world, which are vital for full intelligence.
Question: Why was text previously considered a limited source of inf

# **Preprocessing: Split the Dataset**

In [6]:
with open('modified_dataset.json', 'r') as f:
    dataset = json.load(f)

# Split the dataset into training (80%), evaluation and test (20%) datasets
train_data, temp_data = train_test_split(dataset, test_size=0.2, random_state=42)
eval_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

with open('train_data.json', 'w') as f:
    json.dump(train_data, f, indent=4)

with open('eval_data.json', 'w') as f:
    json.dump(eval_data, f, indent=4)

with open('test_data.json', 'w') as f:
    json.dump(test_data, f, indent=4)

print("Dataset split successfully into train, evaluation, and test sets.")

Dataset split successfully into train, evaluation, and test sets.


# **Load the Model and Tokenizer**

In [None]:
tokenizer = AutoTokenizer.from_pretrained('frozenwalker/SciFive_pubmedqa_question_generation')
model = AutoModelForSeq2SeqLM.from_pretrained('frozenwalker/SciFive_pubmedqa_question_generation')

print("Model and tokenizer loaded successfully.")

tokenizer_config.json:   0%|          | 0.00/1.95k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/729 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

Model and tokenizer loaded successfully.


In [None]:
config = AutoConfig.from_pretrained("frozenwalker/SciFive_pubmedqa_question_generation")

# Check the model size
print(f"Model architecture: {config.architectures}")
print(f"Model size: {config.model_type}")
print(f"Number of parameters: {sum(p.numel() for p in model.parameters())}")

Model architecture: ['T5ForConditionalGeneration']
Model size: t5
Number of parameters: 222903552


# **Preprocessing for Fine-Tuning**

In [None]:
class QuestionGenerationDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512): # max_length=512 was chosen based on the model's original configuration and the average context length in my dataset.
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        context = item["context"]
        question = item["question"]

        # Tokenizing the input and output
        inputs = self.tokenizer(context, padding='max_length', truncation=True, max_length=self.max_length, return_tensors="pt")
        labels = self.tokenizer(question, padding='max_length', truncation=True, max_length=self.max_length, return_tensors="pt")

        # Preparing the data for training
        input_ids = inputs['input_ids'].squeeze()
        attention_mask = inputs['attention_mask'].squeeze()
        labels = labels['input_ids'].squeeze()

        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}

# Creating dataset objects for training and evaluation
train_dataset = QuestionGenerationDataset(train_data, tokenizer)
eval_dataset = QuestionGenerationDataset(eval_data, tokenizer)

print("Datasets created successfully.")

Datasets created successfully.


# **Fine-Tuning the Model**

In [None]:
tokenizer = AutoTokenizer.from_pretrained('frozenwalker/SciFive_pubmedqa_question_generation')
model = AutoModelForSeq2SeqLM.from_pretrained('frozenwalker/SciFive_pubmedqa_question_generation')
accuracy = evaluate.load("accuracy")

def compute_metrics(pred):
    logits, labels = pred
    preds = torch.argmax(torch.tensor(logits), dim=-1)
    return accuracy.compute(predictions=preds, references=labels)

training_args = TrainingArguments(
    output_dir='./results',
    eval_steps=500,
    save_steps=500,
    save_total_limit=2,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    num_train_epochs=12,
    logging_dir='./logs',
    logging_steps=200,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.save_model('./fine_tuned_model')
print("Model fine-tuning complete and saved.")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmostaphaabdulaziz132[0m ([33mmostaphaabdulaziz132-bue[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
200,3.2148
400,0.1323
600,0.1068
800,0.0946
1000,0.0906
1200,0.086
1400,0.0788
1600,0.0802
1800,0.0767
2000,0.0735


Model fine-tuning complete and saved.


# **Evaluation on test set**

In [None]:
try:
    drive.mount('/content/drive', force_remount=False)
    print("Google Drive mounted successfully")
except Exception as e:
    print(f"Error mounting Google Drive: {e}")
    raise

print("Installing dependencies...")
!pip install transformers==4.38.2 evaluate datasets rouge_score --quiet

# Loading fine-tuned model and tokenizer
try:
    model = AutoModelForSeq2SeqLM.from_pretrained("/content/drive/MyDrive/SciFive_pubmedqa_Guestion_generation_finetuned")
    tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/SciFive_pubmedqa_Guestion_generation_finetuned")
    model.to("cuda")
    model.eval()
    print("Fine-tuned SciFive model and tokenizer loaded successfully")
except Exception as e:
    print(f"Error loading model/tokenizer: {e}")
    raise

# Loading test dataset
try:
    with open('/content/test_data.json', 'r') as f:
        test_data = json.load(f)
    print(f"Loaded test dataset with {len(test_data)} entries")
except Exception as e:
    print(f"Error loading test dataset: {e}")
    raise

# Loading evaluation metrics
try:
    bleu = evaluate.load("bleu")
    rouge = evaluate.load("rouge")
    print("Evaluation metrics loaded successfully")
except Exception as e:
    print(f"Error loading evaluation metrics: {e}")
    raise

# Generating questions and evaluate
results = []
bleu_scores = []
rouge_scores = []

for item in tqdm(test_data, desc="Generating questions"):
    context = item.get("context", "")
    ground_truth_question = item.get("question", "")

    # Preparing input (As SciFive expects context as input for question generation)
    input_text = context
    try:
        inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True).to("cuda")
        outputs = model.generate(
            **inputs,
            max_new_tokens=50,
            num_beams=5,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )
        generated_question = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
        if not generated_question.endswith("?"):
            generated_question += "?"

        bleu_score = bleu.compute(predictions=[generated_question], references=[[ground_truth_question]])
        rouge_score = rouge.compute(predictions=[generated_question], references=[ground_truth_question])

        results.append({
            "context": context,
            "ground_truth_question": ground_truth_question,
            "generated_question": generated_question,
            "bleu_score": bleu_score["bleu"],
            "rouge_score": rouge_score
        })
        bleu_scores.append(bleu_score["bleu"])
        rouge_scores.append(rouge_score["rougeL"])

    except Exception as e:
        print(f"Error generating question for context: {context[:50]}...: {e}")
        results.append({
            "context": context,
            "ground_truth_question": ground_truth_question,
            "generated_question": "Error",
            "bleu_score": 0.0,
            "rouge_score": {"rougeL": 0.0}
        })
        bleu_scores.append(0.0)
        rouge_scores.append(0.0)

avg_bleu = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0.0
avg_rougeL = sum(score for score in rouge_scores) / len(rouge_scores) if rouge_scores else 0.0

output_file = "/content/drive/MyDrive/test_results_scifive.json"
try:
    with open(output_file, "w") as f:
        json.dump({
            "results": results,
            "average_bleu": avg_bleu,
            "average_rougeL": avg_rougeL
        }, f, indent=4)
    print(f"Results saved to {output_file}")
except Exception as e:
    print(f"Error saving results: {e}")

print(f"Average BLEU Score: {avg_bleu:.4f}")
print(f"Average ROUGE-L Score: {avg_rougeL:.4f}")
print(f"Total questions generated: {len([r for r in results if r['generated_question'] != 'Error'])}/{len(test_data)}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive mounted successfully
Installing dependencies...
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
Fine-tuned SciFive model and tokenizer loaded successfully
Loaded test dataset with 516 entries
Evaluation metrics loaded successfully


Generating questions: 100%|██████████| 516/516 [06:55<00:00,  1.24it/s]

Results saved to /content/drive/MyDrive/test_results_scifive.json
Average BLEU Score: 0.1503
Average ROUGE-L Score: 0.4175
Total questions generated: 516/516





# **Average BLEU Score: 0.1503**

**Interpretation:**

- A score of 0.1503 is relatively low, indicating that the generated questions have limited word-for-word overlap with the ground truth questions in test_data.json.

- This suggests the model is generating questions that differ significantly in wording or structure from the expected questions, even if they might be semantically correct.

- For example, if the ground truth is "What pumps blood throughout the body?" and the model generates "What organ circulates blood?", the BLEU score would be low due to different phrasing, despite similar meaning.

- BLEU is sensitive to exact matches, so a low score doesn't necessarily mean the questions are incorrect, just differently worded.

# **Average ROUGE-L Score: 0.4175**

**Interpretation:**

- A score of 0.4175 is moderate, indicating that the generated questions share some structural and semantic similarity with the ground truth questions.

- ROUGE-L is less strict than BLEU, focusing on shared sequences rather than exact n-grams, so the higher score suggests that the generated questions are capturing key concepts or phrases, even if the wording differs.

- For example, "What organ circulates blood?" and "What pumps blood throughout the body?" share "blood" and a question structure, contributing to a decent ROUGE-L score.

- The model is generating questions that are somewhat aligned with the ground truth in meaning or structure, which is positive.

In [None]:
def evaluate_model_on_metrics(dataset, model, tokenizer, metric):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)  # Move model to the correct device

    predictions = []
    references = []

    for example in dataset:
        context = example['context']
        question = example['question']

        # Tokenize and move input tensors to the same device as the model
        inputs = tokenizer(context, question, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {key: val.to(device) for key, val in inputs.items()}

        # Generate the output
        with torch.no_grad():
            outputs = model.generate(**inputs)

        # Decode predictions
        predicted_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        predictions.append(predicted_text)

        # Wrap the reference in a list for BLEU
        references.append([question])

    # Compute the metric
    metric.add_batch(predictions=predictions, references=references)
    return metric.compute()


In [None]:
evaluation_results = evaluate_model_on_metrics(dataset, model, tokenizer, metric)

# Pretty print the output
import pprint
pprint.pprint(evaluation_results)

{'bleu': 0.17576599789323155,
 'brevity_penalty': 0.3515319957864631,
 'length_ratio': 0.4888888888888889,
 'precisions': [0.6818181818181818, 0.55, 0.4444444444444444, 0.375],
 'reference_length': 45,
 'translation_length': 22}


# **Evaluating Using METEOR Score**

In [9]:
try:
    drive.mount('/content/drive', force_remount=False)
    print("Google Drive mounted successfully")
except Exception as e:
    print(f"Error mounting Google Drive: {e}")
    raise

model_path = "/content/drive/MyDrive/SciFive_pubmedqa_Guestion_generation_finetuned"
try:
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path, local_files_only=True)
    tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
    model.to("cuda")
    model.eval()
    print("Fine-tuned SciFive model and tokenizer loaded successfully from local path")
except Exception as e:
    print(f"Error loading model or tokenizer: {e}")
    raise

try:
    with open('/content/test_data.json', 'r') as f:
        test_data = json.load(f)
    if not test_data:
        raise ValueError("Test dataset is empty")
    print(f"Loaded test dataset with {len(test_data)} entries")
except Exception as e:
    print(f"Error loading test dataset: {e}")
    raise

# Loading METEOR metric
try:
    meteor = evaluate.load("meteor")
    print("METEOR metric loaded successfully")
except Exception as e:
    print(f"Error loading METEOR metric: {e}")
    raise

# Generating questions and evaluating with METEOR
meteor_scores = []
for item in tqdm(test_data, desc="Evaluating with METEOR"):
    context = item.get("context", "")
    ground_truth_question = item.get("question", "")

    if not context or not ground_truth_question:
        print(f"Skipping item due to missing context or question: {context[:50]}")
        meteor_scores.append(0.0)
        continue

    # Generating question
    input_text = context
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True).to("cuda")
    try:
        outputs = model.generate(
            **inputs,
            max_new_tokens=50,
            num_beams=5,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )
        generated_question = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
        if not generated_question.endswith("?"):
            generated_question += "?"
    except Exception as e:
        print(f"Error generating question for context {context[:50]}...: {e}")
        generated_question = "Error"
        meteor_scores.append(0.0)
        continue

    # Computing METEOR score
    try:
        score = meteor.compute(predictions=[generated_question], references=[[ground_truth_question]])["meteor"]
        meteor_scores.append(score)
    except Exception as e:
        print(f"Error computing METEOR score for {generated_question[:50]}...: {e}")
        meteor_scores.append(0.0)

avg_meteor = sum(meteor_scores) / len(meteor_scores) if meteor_scores else 0.0
avrage_meteor = avg_meteor
print(f"Average METEOR Score: {avrage_meteor:.4f}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive mounted successfully
Fine-tuned SciFive model and tokenizer loaded successfully from local path
Loaded test dataset with 516 entries


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


METEOR metric loaded successfully


Evaluating with METEOR: 100%|██████████| 516/516 [06:06<00:00,  1.41it/s]

Average METEOR Score: 0.9054





# **Evaluation Results for Fine-Tuned SciFive Model**

# Model Performance Metrics
- **Dataset Size**: 5,158 entries, split into 80% train (4,126), 10% eval (516), and 10% test (516).
- **Evaluation Metrics**:
  - **BLEU Score**: 0.1503
    - Indicates limited word-for-word overlap with ground truth questions, likely due to varied phrasing despite semantic correctness.
  - **ROUGE-L Score**: 0.4175
    - Suggests moderate structural and semantic similarity, effectively capturing key concepts.
  - **METEOR Score**: 0.9054
    - Reflects high semantic alignment and synonymy, complementing BLEU and ROUGE-L by accounting for paraphrased yet meaningful questions.
- **Total Questions Generated**: 516/516 (100% success rate with no errors).
- **Inference Time**: Approximately 6 minutes 55 seconds for BLEU/ROUGE-L evaluation, and 6 minutes 6 seconds for METEOR evaluation, averaging 1.24–1.41 questions per second.
- **Confidence Scores**: Range from 0.955 to 0.987 across test cases, indicating strong model certainty.

### Observations
- The high METEOR score (0.9054) suggests the model generates semantically accurate questions, addressing the limitation of BLEU's focus on exact matches.
- The stable generation of all 516 test questions with high confidence underscores the model's robustness on the given dataset.

# **Saving the model to the drive**

In [None]:
# Save model and tokenizer locally
model.save_pretrained("/content/SciFive_pubmedqa_Guestion_generation_finetuned")
tokenizer.save_pretrained("/content/SciFive_pubmedqa_Guestion_generation_finetuned")

from google.colab import drive
drive.mount('/content/drive')

!cp -r /content/SciFive_pubmedqa_Guestion_generation_finetuned /content/drive/MyDrive/

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Saving as onnox**

In [None]:
# Set paths
model_path = "./SciFive_pubmedqa_Guestion_generation_finetuned"
output_path = Path("QG_pubmedaq_onnx_output/")

main_export(
    model_name_or_path=model_path,
    output=output_path,
    task="text2text-generation",
    opset=14  # This is the minimum required for T5
)

  if sequence_length != 1:
Could not find ONNX initializer for torch parameter decoder.embed_tokens.weight. decoder.embed_tokens.weight will not be checked for deduplication.
Could not find ONNX initializer for torch parameter encoder.embed_tokens.weight. encoder.embed_tokens.weight will not be checked for deduplication.
Found different candidate ONNX initializers (likely duplicate) for the tied weights:
	decoder.embed_tokens.weight: set() --> ignored (may be a parameter from a part of the model not exported)
	encoder.embed_tokens.weight: set() --> ignored (may be a parameter from a part of the model not exported)
	lm_head.weight: {'onnx::MatMul_2881'}
	shared.weight: {'shared.weight'}


# **Loading the model from drive**

In [None]:
model = AutoModelForSeq2SeqLM.from_pretrained("/content/drive/MyDrive/SciFive_pubmedqa_Guestion_generation_finetuned")
tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/SciFive_pubmedqa_Guestion_generation_finetuned")

# **Inference with the Fine-Tuned Model**

In [None]:
def generate_question(model, tokenizer, context, max_length=100):
    # Tokenizing the context
    inputs = tokenizer(context, return_tensors="pt", padding='max_length', truncation=True, max_length=512)

    # Generating the question
    output = model.generate(
        inputs['input_ids'],
        max_length=max_length,
        num_beams=4,
        no_repeat_ngram_size=2,
        early_stopping=True
    )

    # Decoding and returning the generated question
    question = tokenizer.decode(output[0], skip_special_tokens=True)
    return question

# Test inference on the dataset
for item in test_data[:5]:  # Example with the first 5 items in the test dataset
    context = item["context"]
    print("Context:", context)
    generated_question = generate_question(model, tokenizer, context)
    print("Generated Question:", generated_question)
    print("-" * 80)

Context: Symmetric encryption uses the same key for both encryption and decryption, while asymmetric encryption uses a pair of keys: a public key for encryption and a private key for decryption.
Generated Question: What is the difference between symmetric and asymmetric encryption?
--------------------------------------------------------------------------------
Context: Non-maximum suppression is used to thin the edges by suppressing pixels that are not local maxima in the gradient direction, ensuring that only the most prominent edges are retained.
Generated Question: True or False: Non-maximum suppression thins edges by suppressing pixels that are not local maxima.
--------------------------------------------------------------------------------
Context: The average of token vectors may lose important syntactic and semantic information, leading to less effective sentence embeddings.
Generated Question: True or False: The average of token vectors may lose important syntactic and semant

In [None]:
def generate_advanced_question(model, tokenizer, context, max_length=100, question_type="auto"):

    inputs = tokenizer(context, return_tensors="pt", padding='max_length', truncation=True, max_length=512)

    logits_processor = LogitsProcessorList([
        MinLengthLogitsProcessor(10, eos_token_id=tokenizer.eos_token_id)
    ])

    output = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=max_length,
        num_beams=5,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        early_stopping=True,
        return_dict_in_generate=True,
        output_scores=True,
        logits_processor=logits_processor
    )

    question = tokenizer.decode(output.sequences[0], skip_special_tokens=True)

    # Calculating confidence (mean log prob of predicted tokens)
    scores = output.scores
    if scores:
        probs = [torch.nn.functional.softmax(score, dim=-1).max().item() for score in scores]
        avg_confidence = sum(probs) / len(probs)
    else:
        avg_confidence = None

    return {
        "context": context,
        "question": question,
        "confidence_score": round(avg_confidence, 3) if avg_confidence else "N/A",
        "question_type": question_type
    }

# Testing on more samples
results = []
for i, item in enumerate(test_data[:10]):  # Test on first 10
    context = item["context"]
    result = generate_advanced_question(model, tokenizer, context)
    print(f"Sample {i+1}")
    print("Context:", context)
    print("Generated Question:", result["question"])
    print("Confidence Score:", result["confidence_score"])
    print("-" * 100)
    results.append(result)

with open("advanced_generated_questions.json", "w") as f:
    json.dump(results, f, indent=4)

print("Saved results to 'advanced_generated_questions.json'")

Sample 1
Context: Symmetric encryption uses the same key for both encryption and decryption, while asymmetric encryption uses a pair of keys: a public key for encryption and a private key for decryption.
Generated Question: What is the difference between symmetric and asymmetric encryption?
Confidence Score: 0.985
----------------------------------------------------------------------------------------------------
Sample 2
Context: Non-maximum suppression is used to thin the edges by suppressing pixels that are not local maxima in the gradient direction, ensuring that only the most prominent edges are retained.
Generated Question: True or False: Non-maximum suppression thins edges by suppressing pixels that are not local maxima.
Confidence Score: 0.971
----------------------------------------------------------------------------------------------------
Sample 3
Context: The average of token vectors may lose important syntactic and semantic information, leading to less effective sentence 

# **Generate Exam Questions Using the QG Model for the Analasis Of algorithms Module**

In [None]:
model = AutoModelForSeq2SeqLM.from_pretrained("/content/drive/MyDrive/SciFive_pubmedqa_Guestion_generation_finetuned")
tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/SciFive_pubmedqa_Guestion_generation_finetuned")

def generate_advanced_question(model, tokenizer, context, max_length=100, question_type="auto"):
    inputs = tokenizer(context, return_tensors="pt", padding='max_length', truncation=True, max_length=512)

    logits_processor = LogitsProcessorList([
        MinLengthLogitsProcessor(10, eos_token_id=tokenizer.eos_token_id)
    ])

    output = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=max_length,
        num_beams=5,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        early_stopping=True,
        return_dict_in_generate=True,
        output_scores=True,
        logits_processor=logits_processor
    )

    question = tokenizer.decode(output.sequences[0], skip_special_tokens=True)

    scores = output.scores
    if scores:
        probs = [torch.nn.functional.softmax(score, dim=-1).max().item() for score in scores]
        avg_confidence = sum(probs) / len(probs)
    else:
        avg_confidence = None

    return {
        "context": context,
        "question": question,
        "confidence_score": round(avg_confidence, 3) if avg_confidence else "N/A",
        "question_type": question_type,
        "question_id": str(uuid4())
    }

# Module-specific contexts
contexts = [
    {
        "context": "Asymptotic notations such as Big O, Big Theta, and Big Omega are used to analyze the performance of algorithms by describing their running time or space requirements as the input size grows."
    },
    {
        "context": "Recursive algorithms, such as the factorial function, can be analyzed by deriving recurrence relations and solving them using methods like Backward Substitution."
    },
    {
        "context": "The Greedy paradigm involves making locally optimal choices at each step to find a global optimum, as exemplified by Kruskal’s Algorithm for finding minimum spanning trees."
    },
    {
        "context": "Divide-and-Conquer algorithms, like Merge Sort, break a problem into smaller sub-problems, solve them recursively, and combine the solutions efficiently."
    },
    {
        "context": "Dynamic Programming improves the efficiency of algorithms by storing the results of sub-problems to avoid redundant computations, as seen in the Fibonacci sequence calculation."
    }
]

results = []
for i, item in enumerate(contexts[:5]):  # Generate 5 questions
    context = item["context"]
    result = generate_advanced_question(model, tokenizer, context, max_length=150)
    print(f"Sample {i+1}")
    print("Context:", context)
    print("Generated Question:", result["question"])
    print("Confidence Score:", result["confidence_score"])
    print("Question ID:", result["question_id"])
    print("-" * 100)
    results.append(result)

with open("advanced_generated_questions.json", "w") as f:
    json.dump(results, f, indent=4)

print("Saved results to 'advanced_generated_questions.json'")

Sample 1
Context: Asymptotic notations such as Big O, Big Theta, and Big Omega are used to analyze the performance of algorithms by describing their running time or space requirements as the input size grows.
Generated Question: What is the purpose of asymptotic notations such as Big O, Big Theta, and Big Omega?
Confidence Score: 0.984
Question ID: 42a2a83e-24b6-4be9-99e9-952e1b4b9fe8
----------------------------------------------------------------------------------------------------
Sample 2
Context: Recursive algorithms, such as the factorial function, can be analyzed by deriving recurrence relations and solving them using methods like Backward Substitution.
Generated Question: True or False: Recursive algorithms can be analyzed using recurrence relations.
Confidence Score: 0.969
Question ID: 5414f485-2331-4b33-86b9-5e544fa4ede1
----------------------------------------------------------------------------------------------------
Sample 3
Context: The Greedy paradigm involves making lo