In [135]:
!pip install -q transformers torch peft bert-score rouge-score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [136]:
import json
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from bert_score import score
from rouge_score import rouge_scorer
import warnings
warnings.filterwarnings("ignore")

# Set environment variable for CUDA debugging
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [137]:
# Load model and tokenizer
model_name = "Salm00n/gpt2-xl_SATACT_v3"
print(f"Loading tokenizer for {model_name}...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

print(f"Loading base GPT-2 XL model...")
base_model = AutoModelForCausalLM.from_pretrained("gpt2-xl")

print(f"Loading fine-tuned LoRA adapters for {model_name}...")
model = PeftModel.from_pretrained(base_model, model_name, adapter_name="default")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Model loaded successfully on {device}")

Loading tokenizer for Salm00n/gpt2-xl_SATACT_v3...


tokenizer_config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/131 [00:00<?, ?B/s]

Loading base GPT-2 XL model...
Loading fine-tuned LoRA adapters for Salm00n/gpt2-xl_SATACT_v3...


adapter_config.json:   0%|          | 0.00/754 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/82.2M [00:00<?, ?B/s]

Model loaded successfully on cuda


In [138]:
def generate_cot_response(prompt, max_new_tokens=150):
    torch.manual_seed(42)
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024).to(device)
    stop_token_ids = tokenizer.encode("[END]", add_special_tokens=False)
    
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=max_new_tokens,
        do_sample=False,
        num_beams=4,
        no_repeat_ngram_size=3,
        early_stopping=True,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=stop_token_ids
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Define evaluation function
def evaluate_response(generated, reference):
    P, R, F1 = score([generated], [reference], lang="en", model_type="bert-base-uncased")
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference, generated)
    return {
        "BERTScore_F1": F1.item(),
        "ROUGE-1_F1": scores['rouge1'].fmeasure,
        "ROUGE-2_F1": scores['rouge2'].fmeasure,
        "ROUGE-L_F1": scores['rougeL'].fmeasure
    }


# Fill in the blank type Q's

In [139]:
# Define CoT prompt with one-shot example
cot_prompt = (
    "[Example 1]\n"
    "[Context]: The spacecraft OSIRIS-REx briefly made contact with the asteroid 101955 Bennu in 2020. NASA scientist Daniella DellaGiustina reports that despite facing the unexpected obstacle of a surface mostly covered in boulders, OSIRIS-REx successfully ____ a sample of the surface, gathering pieces of it to bring back to Earth.\n"
    "[Question]: Which choice completes the text with the most logical and precise word or phrase?\n"
    "[Options]:\n"
    "A) attached\n"
    "B) collected\n"
    "C) followed\n"
    "D) replaced\n"
    "Step-by-step reasoning:\n"
    "Choice B is the best answer because it most logically completes the text’s discussion of the OSIRIS-REx spacecraft’s contact with the asteroid 101955 Bennu. In this context, “collected” means acquired and took away. The text indicates that although the boulders on the asteroid’s surface caused some unforeseen problems, OSIRIS-REx was able to gather a sample to return to Earth. This context suggests that OSIRIS-REx successfully collected a sample of 101955 Bennu.[END]\n\n"
    
    "[Current Problem]\n"
    "[Context]: A firefly uses specialized muscles to draw oxygen into its lower abdomen through narrow tubes, triggering a chemical reaction whereby the oxygen combines with chemicals in the firefly's abdomen to produce a glow. ____ when the firefly stops drawing in oxygen, the reaction—and the glow—cease.\n"
    "[Question]: Which choice completes the text with the most logical transition?\n"
    "[Options]:\n"
    "A) For instance,\n"
    "B) By contrast,\n"
    "C) Specifically,\n"
    "D) In conclusion,\n"
    "Step-by-step reasoning:\n"
)

# Diagnostic
inputs = tokenizer(cot_prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024)
print(f"Input length: {inputs['input_ids'].shape[1]} tokens")

# Generate response
print("Generating reasoning...\n")
response = generate_cot_response(cot_prompt)
print("RESPONSE:\n", response)

Input length: 364 tokens
Generating reasoning...

RESPONSE:
 [Example 1]
[Context]: The spacecraft OSIRIS-REx briefly made contact with the asteroid 101955 Bennu in 2020. NASA scientist Daniella DellaGiustina reports that despite facing the unexpected obstacle of a surface mostly covered in boulders, OSIRIS-REx successfully ____ a sample of the surface, gathering pieces of it to bring back to Earth.
[Question]: Which choice completes the text with the most logical and precise word or phrase?
[Options]:
A) attached
B) collected
C) followed
D) replaced
Step-by-step reasoning:
Choice B is the best answer because it most logically completes the text’s discussion of the OSIRIS-REx spacecraft’s contact with the asteroid 101955 Bennu. In this context, “collected” means acquired and took away. The text indicates that although the boulders on the asteroid’s surface caused some unforeseen problems, OSIRIS-REx was able to gather a sample to return to Earth. This context suggests that OSIRIS-REx s

In [140]:
# Extract generated reasoning for evaluation
generated_cot = response.split("Step-by-step reasoning:")[-1].split("[END]")[0].strip()
reference_cot = ("Choice B is the best answer. “By contrast” logically signals that the information in this sentence—that a firefly’s glow ceases when it stops drawing in oxygen contrasts with the previous sentence’s discussion of the processes that cause a firefly to begin to glow. "
)

# Evaluate response
print("\nEVALUATION METRICS:")
metrics = evaluate_response(generated_cot, reference_cot)
for metric, value in metrics.items():
    print(f"{metric}: {value:.3f}")


EVALUATION METRICS:
BERTScore_F1: 0.541
ROUGE-1_F1: 0.281
ROUGE-2_F1: 0.032
ROUGE-L_F1: 0.188


In [141]:
# Define CoT prompt with one-shot example
cot_prompt = (
    "[Example 1]\n"
    "[Context]: The spacecraft OSIRIS-REx briefly made contact with the asteroid 101955 Bennu in 2020. NASA scientist Daniella DellaGiustina reports that despite facing the unexpected obstacle of a surface mostly covered in boulders, OSIRIS-REx successfully ____ a sample of the surface, gathering pieces of it to bring back to Earth.\n"
    "[Question]: Which choice completes the text with the most logical and precise word or phrase?\n"
    "[Options]:\n"
    "A) attached\n"
    "B) collected\n"
    "C) followed\n"
    "D) replaced\n"
    "Step-by-step reasoning:\n"
    "Choice B is the best answer because it most logically completes the text’s discussion of the OSIRIS-REx spacecraft’s contact with the asteroid 101955 Bennu. In this context, “collected” means acquired and took away. The text indicates that although the boulders on the asteroid’s surface caused some unforeseen problems, OSIRIS-REx was able to gather a sample to return to Earth. This context suggests that OSIRIS-REx successfully collected a sample of 101955 Bennu.[END]\n\n"
    
    # "[Example 2]\n"
    # "[Context]: A firefly uses specialized muscles to draw oxygen into its lower abdomen through narrow tubes, triggering a chemical reaction whereby the oxygen combines with chemicals in the firefly's abdomen to produce a glow. ____ when the firefly stops drawing in oxygen, the reaction—and the glow—cease.\n"
    # "[Question]: Which choice completes the text with the most logical transition?\n"
    # "[Options]:\n"
    # "A) For instance,\n"
    # "B) By contrast,\n"
    # "C) Specifically,\n"
    # "D) In conclusion,\n"
    # "Step-by-step reasoning:\n"
    # "Choice B is the best answer. “By contrast” logically signals that the information in this sentence—that a firefly’s glow ceases when it stops drawing in oxygen contrasts with the previous sentence’s discussion of the processes that cause a firefly to begin to glow.[END]\n\n"

    "[Current Problem]\n"
    "[Context]: Research conducted by planetary scientist Katarina Miljkovic suggests that the Moon’s surface may not accurately _______ early impact events. When the Moon was still forming, its surface was softer, and asteroid or meteoroid impacts would have left less of an impression; thus, evidence of early impacts may no longer be present\n"
    "[Question]: Which choice completes the text with the most logical and precise word or phrase?\n"
    "[Options]:\n"
    "A) reflect\n"
    "B) receive \n"
    "C) evaluate\n"
    "D) mimic \n"
    "Step-by-step reasoning:\n"
)

# Diagnostic
inputs = tokenizer(cot_prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024)
print(f"Input length: {inputs['input_ids'].shape[1]} tokens")

# Generate response
print("Generating reasoning...\n")
response = generate_cot_response(cot_prompt)
print("RESPONSE:\n", response)

Input length: 370 tokens
Generating reasoning...

RESPONSE:
 [Example 1]
[Context]: The spacecraft OSIRIS-REx briefly made contact with the asteroid 101955 Bennu in 2020. NASA scientist Daniella DellaGiustina reports that despite facing the unexpected obstacle of a surface mostly covered in boulders, OSIRIS-REx successfully ____ a sample of the surface, gathering pieces of it to bring back to Earth.
[Question]: Which choice completes the text with the most logical and precise word or phrase?
[Options]:
A) attached
B) collected
C) followed
D) replaced
Step-by-step reasoning:
Choice B is the best answer because it most logically completes the text’s discussion of the OSIRIS-REx spacecraft’s contact with the asteroid 101955 Bennu. In this context, “collected” means acquired and took away. The text indicates that although the boulders on the asteroid’s surface caused some unforeseen problems, OSIRIS-REx was able to gather a sample to return to Earth. This context suggests that OSIRIS-REx s

In [142]:
# Extract generated reasoning for evaluation
generated_cot = response.split("Step-by-step reasoning:")[-1].split("[END]")[0].strip()
reference_cot = ("Choice A is the best answer because it most logically completes the text’s discussion of the Moon’s surface. In this context, “reflect” means show or make apparent. The text states that because the surface of the Moon was softer when the Moon was still forming than it is now, early asteroid and meteoroid impacts “would have left less of an impression” and, as a result, evidence of them may no longer exist. This context supports the idea that the surface of the Moon may not accurately show signs of early impact events."
)

# Evaluate response
print("\nEVALUATION METRICS:")
metrics = evaluate_response(generated_cot, reference_cot)
for metric, value in metrics.items():
    print(f"{metric}: {value:.3f}")


EVALUATION METRICS:
BERTScore_F1: 0.526
ROUGE-1_F1: 0.275
ROUGE-2_F1: 0.078
ROUGE-L_F1: 0.229


In [143]:
# Define CoT prompt with one-shot example
cot_prompt = (
    "[Example 1]\n"
    "[Context]: The spacecraft OSIRIS-REx briefly made contact with the asteroid 101955 Bennu in 2020. NASA scientist Daniella DellaGiustina reports that despite facing the unexpected obstacle of a surface mostly covered in boulders, OSIRIS-REx successfully ____ a sample of the surface, gathering pieces of it to bring back to Earth.\n"
    "[Question]: Which choice completes the text with the most logical and precise word or phrase?\n"
    "[Options]:\n"
    "A) attached\n"
    "B) collected\n"
    "C) followed\n"
    "D) replaced\n"
    "Step-by-step reasoning:\n"
    "Choice B is the best answer because it most logically completes the text’s discussion of the OSIRIS-REx spacecraft’s contact with the asteroid 101955 Bennu. In this context, “collected” means acquired and took away. The text indicates that although the boulders on the asteroid’s surface caused some unforeseen problems, OSIRIS-REx was able to gather a sample to return to Earth. This context suggests that OSIRIS-REx successfully collected a sample of 101955 Bennu.[END]\n\n"
    
    # "[Example 2]\n"
    # "[Context]: A firefly uses specialized muscles to draw oxygen into its lower abdomen through narrow tubes, triggering a chemical reaction whereby the oxygen combines with chemicals in the firefly's abdomen to produce a glow. ____ when the firefly stops drawing in oxygen, the reaction—and the glow—cease.\n"
    # "[Question]: Which choice completes the text with the most logical transition?\n"
    # "[Options]:\n"
    # "A) For instance,\n"
    # "B) By contrast,\n"
    # "C) Specifically,\n"
    # "D) In conclusion,\n"
    # "Step-by-step reasoning:\n"
    # "Choice B is the best answer. “By contrast” logically signals that the information in this sentence—that a firefly’s glow ceases when it stops drawing in oxygen contrasts with the previous sentence’s discussion of the processes that cause a firefly to begin to glow.[END]\n\n"

    "[Current Problem]\n"
    "[Context]: In 2008, two years after the death of science fiction writer Octavia Butler, the Huntington Library in   [BLANK]   received a collection of more than 8,000 items, including Butler’s private notes, research materials, manuscripts, photos, and drawings. Today, the Octavia E. Butler Collection is one of the most researched archives at the library.\n"
    "[Question]: Which choice completes the text so that it conforms to the conventions of Standard English?\n"
    "[Options]:\n"
    "A) California,\n"
    "B) California: \n"
    "C) California—\n"
    "D) California \n"
    "Step-by-step reasoning:\n"
)

# Diagnostic
inputs = tokenizer(cot_prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024)
print(f"Input length: {inputs['input_ids'].shape[1]} tokens")

# Generate response
print("Generating reasoning...\n")
response = generate_cot_response(cot_prompt)
print("RESPONSE:\n", response)

Input length: 386 tokens
Generating reasoning...

RESPONSE:
 [Example 1]
[Context]: The spacecraft OSIRIS-REx briefly made contact with the asteroid 101955 Bennu in 2020. NASA scientist Daniella DellaGiustina reports that despite facing the unexpected obstacle of a surface mostly covered in boulders, OSIRIS-REx successfully ____ a sample of the surface, gathering pieces of it to bring back to Earth.
[Question]: Which choice completes the text with the most logical and precise word or phrase?
[Options]:
A) attached
B) collected
C) followed
D) replaced
Step-by-step reasoning:
Choice B is the best answer because it most logically completes the text’s discussion of the OSIRIS-REx spacecraft’s contact with the asteroid 101955 Bennu. In this context, “collected” means acquired and took away. The text indicates that although the boulders on the asteroid’s surface caused some unforeseen problems, OSIRIS-REx was able to gather a sample to return to Earth. This context suggests that OSIRIS-REx s

In [144]:
# Extract generated reasoning for evaluation
generated_cot = response.split("Step-by-step reasoning:")[-1].split("[END]")[0].strip()
reference_cot = ("Choice D is the best answer because it completes the sentence in accordance with Standard English conventions. The phrase “the Huntington Library in California” uses “California” without additional punctuation, which is clear and grammatically correct. The prepositional phrase “in California” directly specifies the library’s location without needing commas, colons, or dashes."
)

# Evaluate response
print("\nEVALUATION METRICS:")
metrics = evaluate_response(generated_cot, reference_cot)
for metric, value in metrics.items():
    print(f"{metric}: {value:.3f}")


EVALUATION METRICS:
BERTScore_F1: 0.514
ROUGE-1_F1: 0.211
ROUGE-2_F1: 0.022
ROUGE-L_F1: 0.168


In [145]:
# Define CoT prompt with one-shot example
cot_prompt = (
    "[Example 1]\n"
    "[Context]: The spacecraft OSIRIS-REx briefly made contact with the asteroid 101955 Bennu in 2020. NASA scientist Daniella DellaGiustina reports that despite facing the unexpected obstacle of a surface mostly covered in boulders, OSIRIS-REx successfully ____ a sample of the surface, gathering pieces of it to bring back to Earth.\n"
    "[Question]: Which choice completes the text with the most logical and precise word or phrase?\n"
    "[Options]:\n"
    "A) attached\n"
    "B) collected\n"
    "C) followed\n"
    "D) replaced\n"
    "Step-by-step reasoning:\n"
    "Choice B is the best answer because it most logically completes the text’s discussion of the OSIRIS-REx spacecraft’s contact with the asteroid 101955 Bennu. In this context, “collected” means acquired and took away. The text indicates that although the boulders on the asteroid’s surface caused some unforeseen problems, OSIRIS-REx was able to gather a sample to return to Earth. This context suggests that OSIRIS-REx successfully collected a sample of 101955 Bennu.[END]\n\n"
    
    # "[Example 2]\n"
    # "[Context]: A firefly uses specialized muscles to draw oxygen into its lower abdomen through narrow tubes, triggering a chemical reaction whereby the oxygen combines with chemicals in the firefly's abdomen to produce a glow. ____ when the firefly stops drawing in oxygen, the reaction—and the glow—cease.\n"
    # "[Question]: Which choice completes the text with the most logical transition?\n"
    # "[Options]:\n"
    # "A) For instance,\n"
    # "B) By contrast,\n"
    # "C) Specifically,\n"
    # "D) In conclusion,\n"
    # "Step-by-step reasoning:\n"
    # "Choice B is the best answer. “By contrast” logically signals that the information in this sentence—that a firefly’s glow ceases when it stops drawing in oxygen contrasts with the previous sentence’s discussion of the processes that cause a firefly to begin to glow.[END]\n\n"

    "[Current Problem]\n"
    "[Context]: Slam poet Elizabeth Acevedo’s debut novel The Poet X, winner of the 2018 National Book Award for Young People’s Literature, is composed of ____ protagonist, fifteen-year-old Xiomara Batista.\n"
    "[Question]: Which choice completes the text so that it conforms to the conventions of Standard English? \n"
    "[Options]:\n"
    "A) poems putatively written by the novel’s\n"
    "B) poem’s putatively written by the novel’s \n"
    "C) poem’s putatively written by the novels’ \n"
    "D) poems putatively written by the novels’ \n"
    "Step-by-step reasoning:\n"
)

# Diagnostic
inputs = tokenizer(cot_prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024)
print(f"Input length: {inputs['input_ids'].shape[1]} tokens")

# Generate response
print("Generating reasoning...\n")
response = generate_cot_response(cot_prompt)
print("RESPONSE:\n", response)

Input length: 397 tokens
Generating reasoning...

RESPONSE:
 [Example 1]
[Context]: The spacecraft OSIRIS-REx briefly made contact with the asteroid 101955 Bennu in 2020. NASA scientist Daniella DellaGiustina reports that despite facing the unexpected obstacle of a surface mostly covered in boulders, OSIRIS-REx successfully ____ a sample of the surface, gathering pieces of it to bring back to Earth.
[Question]: Which choice completes the text with the most logical and precise word or phrase?
[Options]:
A) attached
B) collected
C) followed
D) replaced
Step-by-step reasoning:
Choice B is the best answer because it most logically completes the text’s discussion of the OSIRIS-REx spacecraft’s contact with the asteroid 101955 Bennu. In this context, “collected” means acquired and took away. The text indicates that although the boulders on the asteroid’s surface caused some unforeseen problems, OSIRIS-REx was able to gather a sample to return to Earth. This context suggests that OSIRIS-REx s

In [146]:
# Extract generated reasoning for evaluation
generated_cot = response.split("Step-by-step reasoning:")[-1].split("[END]")[0].strip()
reference_cot = ("Choice A is the best answer because it correctly completes the sentence per Standard English conventions. The phrase “poems putatively written by the novel’s” uses the plural “poems” to reflect the novel’s poetry-based structure, “putatively” to indicate attribution to the protagonist, and “novel’s” for the singular novel, The Poet X. Choices B and C wrongly use singular “poem’s,” and C and D incorrectly use “novels’,” suggesting multiple novels. Choice A is grammatically correct and contextually fitting."
)

# Evaluate response
print("\nEVALUATION METRICS:")
metrics = evaluate_response(generated_cot, reference_cot)
for metric, value in metrics.items():
    print(f"{metric}: {value:.3f}")


EVALUATION METRICS:
BERTScore_F1: 0.546
ROUGE-1_F1: 0.292
ROUGE-2_F1: 0.053
ROUGE-L_F1: 0.208


In [147]:
# Define CoT prompt with one-shot example
cot_prompt = (
    "[Example 1]\n"
    "[Context]: The spacecraft OSIRIS-REx briefly made contact with the asteroid 101955 Bennu in 2020. NASA scientist Daniella DellaGiustina reports that despite facing the unexpected obstacle of a surface mostly covered in boulders, OSIRIS-REx successfully ____ a sample of the surface, gathering pieces of it to bring back to Earth.\n"
    "[Question]: Which choice completes the text with the most logical and precise word or phrase?\n"
    "[Options]:\n"
    "A) attached\n"
    "B) collected\n"
    "C) followed\n"
    "D) replaced\n"
    "Step-by-step reasoning:\n"
    "Choice B is the best answer because it most logically completes the text’s discussion of the OSIRIS-REx spacecraft’s contact with the asteroid 101955 Bennu. In this context, “collected” means acquired and took away. The text indicates that although the boulders on the asteroid’s surface caused some unforeseen problems, OSIRIS-REx was able to gather a sample to return to Earth. This context suggests that OSIRIS-REx successfully collected a sample of 101955 Bennu.[END]\n\n"
    
    # "[Example 2]\n"
    # "[Context]: A firefly uses specialized muscles to draw oxygen into its lower abdomen through narrow tubes, triggering a chemical reaction whereby the oxygen combines with chemicals in the firefly's abdomen to produce a glow. ____ when the firefly stops drawing in oxygen, the reaction—and the glow—cease.\n"
    # "[Question]: Which choice completes the text with the most logical transition?\n"
    # "[Options]:\n"
    # "A) For instance,\n"
    # "B) By contrast,\n"
    # "C) Specifically,\n"
    # "D) In conclusion,\n"
    # "Step-by-step reasoning:\n"
    # "Choice B is the best answer. “By contrast” logically signals that the information in this sentence—that a firefly’s glow ceases when it stops drawing in oxygen contrasts with the previous sentence’s discussion of the processes that cause a firefly to begin to glow.[END]\n\n"

    "[Current Problem]\n"
    "[Context]: Eighteen letters written by Louisa May Alcott, author of the popular novel Little Women (1868), can be found at the New York Historical Society.   [BLANK]   letters demonstrate Alcott’s keen business sense in her interactions with publishers\n"
    "[Question]: Which choice completes the text so that it conforms to the conventions of Standard English? \n"
    "[Options]:\n"
    "A) One\n"
    "B) That \n"
    "C) This \n"
    "D) These \n"
    "Step-by-step reasoning:\n"
)

# Diagnostic
inputs = tokenizer(cot_prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024)
print(f"Input length: {inputs['input_ids'].shape[1]} tokens")

# Generate response
print("Generating reasoning...\n")
response = generate_cot_response(cot_prompt)
print("RESPONSE:\n", response)

Input length: 363 tokens
Generating reasoning...

RESPONSE:
 [Example 1]
[Context]: The spacecraft OSIRIS-REx briefly made contact with the asteroid 101955 Bennu in 2020. NASA scientist Daniella DellaGiustina reports that despite facing the unexpected obstacle of a surface mostly covered in boulders, OSIRIS-REx successfully ____ a sample of the surface, gathering pieces of it to bring back to Earth.
[Question]: Which choice completes the text with the most logical and precise word or phrase?
[Options]:
A) attached
B) collected
C) followed
D) replaced
Step-by-step reasoning:
Choice B is the best answer because it most logically completes the text’s discussion of the OSIRIS-REx spacecraft’s contact with the asteroid 101955 Bennu. In this context, “collected” means acquired and took away. The text indicates that although the boulders on the asteroid’s surface caused some unforeseen problems, OSIRIS-REx was able to gather a sample to return to Earth. This context suggests that OSIRIS-REx s

In [148]:
# Extract generated reasoning for evaluation
generated_cot = response.split("Step-by-step reasoning:")[-1].split("[END]")[0].strip() 
reference_cot = ("Choice D is the best answer. The convention being tested is the use of determiners in a sentence. The plural determiner “these” agrees in number with the plural noun “letters” that it modifies. This choice clearly indicates that the letters demonstrate Alcott’s business sense. "
)

# Evaluate response
print("\nEVALUATION METRICS:")
metrics = evaluate_response(generated_cot, reference_cot)
for metric, value in metrics.items():
    print(f"{metric}: {value:.3f}")


EVALUATION METRICS:
BERTScore_F1: 0.521
ROUGE-1_F1: 0.310
ROUGE-2_F1: 0.029
ROUGE-L_F1: 0.197


In [149]:
# Define CoT prompt with one-shot example
cot_prompt = (
    "[Example 1]\n"
    "[Context]: The spacecraft OSIRIS-REx briefly made contact with the asteroid 101955 Bennu in 2020. NASA scientist Daniella DellaGiustina reports that despite facing the unexpected obstacle of a surface mostly covered in boulders, OSIRIS-REx successfully ____ a sample of the surface, gathering pieces of it to bring back to Earth.\n"
    "[Question]: Which choice completes the text with the most logical and precise word or phrase?\n"
    "[Options]:\n"
    "A) attached\n"
    "B) collected\n"
    "C) followed\n"
    "D) replaced\n"
    "Step-by-step reasoning:\n"
    "Choice B is the best answer because it most logically completes the text’s discussion of the OSIRIS-REx spacecraft’s contact with the asteroid 101955 Bennu. In this context, “collected” means acquired and took away. The text indicates that although the boulders on the asteroid’s surface caused some unforeseen problems, OSIRIS-REx was able to gather a sample to return to Earth. This context suggests that OSIRIS-REx successfully collected a sample of 101955 Bennu.[END]\n\n"
    
    # "[Example 2]\n"
    # "[Context]: A firefly uses specialized muscles to draw oxygen into its lower abdomen through narrow tubes, triggering a chemical reaction whereby the oxygen combines with chemicals in the firefly's abdomen to produce a glow. ____ when the firefly stops drawing in oxygen, the reaction—and the glow—cease.\n"
    # "[Question]: Which choice completes the text with the most logical transition?\n"
    # "[Options]:\n"
    # "A) For instance,\n"
    # "B) By contrast,\n"
    # "C) Specifically,\n"
    # "D) In conclusion,\n"
    # "Step-by-step reasoning:\n"
    # "Choice B is the best answer. “By contrast” logically signals that the information in this sentence—that a firefly’s glow ceases when it stops drawing in oxygen contrasts with the previous sentence’s discussion of the processes that cause a firefly to begin to glow.[END]\n\n"

    "[Current Problem]\n"
    "[Context]: In her two major series “Memory Test” and “Autobiography,” painter Howardena Pindell explored themes   [BLANK]   healing, self-discovery, and memory by cutting and sewing back together pieces of canvas and inserting personal artifacts, such as postcards, into some of the paintings.\n"
    "[Question]: Which choice completes the text so that it conforms to the conventions of Standard English? \n"
    "[Options]:\n"
    "A) of \n"
    "B) of,\n"
    "C) of— \n"
    "D) of: \n"
    "Step-by-step reasoning:\n"
)

# Diagnostic
inputs = tokenizer(cot_prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024)
print(f"Input length: {inputs['input_ids'].shape[1]} tokens")

# Generate response
print("Generating reasoning...\n")
response = generate_cot_response(cot_prompt)
print("RESPONSE:\n", response)

Input length: 382 tokens
Generating reasoning...

RESPONSE:
 [Example 1]
[Context]: The spacecraft OSIRIS-REx briefly made contact with the asteroid 101955 Bennu in 2020. NASA scientist Daniella DellaGiustina reports that despite facing the unexpected obstacle of a surface mostly covered in boulders, OSIRIS-REx successfully ____ a sample of the surface, gathering pieces of it to bring back to Earth.
[Question]: Which choice completes the text with the most logical and precise word or phrase?
[Options]:
A) attached
B) collected
C) followed
D) replaced
Step-by-step reasoning:
Choice B is the best answer because it most logically completes the text’s discussion of the OSIRIS-REx spacecraft’s contact with the asteroid 101955 Bennu. In this context, “collected” means acquired and took away. The text indicates that although the boulders on the asteroid’s surface caused some unforeseen problems, OSIRIS-REx was able to gather a sample to return to Earth. This context suggests that OSIRIS-REx s

In [150]:
# Extract generated reasoning for evaluation
generated_cot = response.split("Step-by-step reasoning:")[-1].split("[END]")[0].strip() 
reference_cot = ("Choice A is the best answer. The convention being tested is punctuation between a preposition and its complement. No punctuation is needed between the preposition “of” and its complement, the noun phrase “healing, self-discovery, and memory."
)

# Evaluate response
print("\nEVALUATION METRICS:")
metrics = evaluate_response(generated_cot, reference_cot)
for metric, value in metrics.items():
    print(f"{metric}: {value:.3f}")


EVALUATION METRICS:
BERTScore_F1: 0.527
ROUGE-1_F1: 0.351
ROUGE-2_F1: 0.036
ROUGE-L_F1: 0.281


In [151]:
# Define CoT prompt with one-shot example
cot_prompt = (
    "[Example 1]\n"
    "[Context]: The spacecraft OSIRIS-REx briefly made contact with the asteroid 101955 Bennu in 2020. NASA scientist Daniella DellaGiustina reports that despite facing the unexpected obstacle of a surface mostly covered in boulders, OSIRIS-REx successfully ____ a sample of the surface, gathering pieces of it to bring back to Earth.\n"
    "[Question]: Which choice completes the text with the most logical and precise word or phrase?\n"
    "[Options]:\n"
    "A) attached\n"
    "B) collected\n"
    "C) followed\n"
    "D) replaced\n"
    "Step-by-step reasoning:\n"
    "Choice B is the best answer because it most logically completes the text’s discussion of the OSIRIS-REx spacecraft’s contact with the asteroid 101955 Bennu. In this context, “collected” means acquired and took away. The text indicates that although the boulders on the asteroid’s surface caused some unforeseen problems, OSIRIS-REx was able to gather a sample to return to Earth. This context suggests that OSIRIS-REx successfully collected a sample of 101955 Bennu.[END]\n\n"
    
    "[Example 2]\n"
    "[Context]: A firefly uses specialized muscles to draw oxygen into its lower abdomen through narrow tubes, triggering a chemical reaction whereby the oxygen combines with chemicals in the firefly's abdomen to produce a glow. ____ when the firefly stops drawing in oxygen, the reaction—and the glow—cease.\n"
    "[Question]: Which choice completes the text with the most logical transition?\n"
    "[Options]:\n"
    "A) For instance,\n"
    "B) By contrast,\n"
    "C) Specifically,\n"
    "D) In conclusion,\n"
    "Step-by-step reasoning:\n"
    "Choice B is the best answer. “By contrast” logically signals that the information in this sentence—that a firefly’s glow ceases when it stops drawing in oxygen contrasts with the previous sentence’s discussion of the processes that cause a firefly to begin to glow.[END]\n\n"

    "[Current Problem]\n"
    "[Context]: Barring major archaeological discoveries, we are unlikely to ever have  ____  account of ancient Egypt under the female pharaoh Hatshepsut, as much of the evidence of her reign was deliberately destroyed by her successors.\n"
    "[Question]: Which choice completes the text with the most logical and precise word or phrase? \n"
    "[Options]:\n"
    "A) an imaginative \n"
    "B) a superficial\n"
    "C) an exhaustive\n"
    "D) a questionable\n"
    "Step-by-step reasoning:\n"
)

# Diagnostic
inputs = tokenizer(cot_prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024)
print(f"Input length: {inputs['input_ids'].shape[1]} tokens")

# Generate response
print("Generating reasoning...\n")
response = generate_cot_response(cot_prompt)
print("RESPONSE:\n", response)

Input length: 534 tokens
Generating reasoning...

RESPONSE:
 [Example 1]
[Context]: The spacecraft OSIRIS-REx briefly made contact with the asteroid 101955 Bennu in 2020. NASA scientist Daniella DellaGiustina reports that despite facing the unexpected obstacle of a surface mostly covered in boulders, OSIRIS-REx successfully ____ a sample of the surface, gathering pieces of it to bring back to Earth.
[Question]: Which choice completes the text with the most logical and precise word or phrase?
[Options]:
A) attached
B) collected
C) followed
D) replaced
Step-by-step reasoning:
Choice B is the best answer because it most logically completes the text’s discussion of the OSIRIS-REx spacecraft’s contact with the asteroid 101955 Bennu. In this context, “collected” means acquired and took away. The text indicates that although the boulders on the asteroid’s surface caused some unforeseen problems, OSIRIS-REx was able to gather a sample to return to Earth. This context suggests that OSIRIS-REx s

In [152]:
# Extract generated reasoning for evaluation
generated_cot = response.split("Step-by-step reasoning:")[-1].split("[END]")[0].strip() 
reference_cot = ("Choice C is the best answer because it most logically completes the text’s discussion of historical evidence about ancient Egypt under the reign of the pharaoh Hatshepsut. In this context, “an exhaustive” account would be a thorough one. The text states that much of the evidence from her reign was purposely destroyed—in other words, there is a lack of surviving records. This context conveys that unless there are major new archaeological discoveries, an exhaustive account of Hatshepsut’s reign is unlikely."
)

# Evaluate response
print("\nEVALUATION METRICS:")
metrics = evaluate_response(generated_cot, reference_cot)
for metric, value in metrics.items():
    print(f"{metric}: {value:.3f}")


EVALUATION METRICS:
BERTScore_F1: 0.556
ROUGE-1_F1: 0.306
ROUGE-2_F1: 0.052
ROUGE-L_F1: 0.204


In [153]:

# Define CoT prompt with one-shot example
cot_prompt = (
    "[Example 1]\n"
    "[Context]: The spacecraft OSIRIS-REx briefly made contact with the asteroid 101955 Bennu in 2020. NASA scientist Daniella DellaGiustina reports that despite facing the unexpected obstacle of a surface mostly covered in boulders, OSIRIS-REx successfully ____ a sample of the surface, gathering pieces of it to bring back to Earth.\n"
    "[Question]: Which choice completes the text with the most logical and precise word or phrase?\n"
    "[Options]:\n"
    "A) attached\n"
    "B) collected\n"
    "C) followed\n"
    "D) replaced\n"
    "Step-by-step reasoning:\n"
    "Choice B is the best answer because it most logically completes the text’s discussion of the OSIRIS-REx spacecraft’s contact with the asteroid 101955 Bennu. In this context, “collected” means acquired and took away. The text indicates that although the boulders on the asteroid’s surface caused some unforeseen problems, OSIRIS-REx was able to gather a sample to return to Earth. This context suggests that OSIRIS-REx successfully collected a sample of 101955 Bennu.[END]\n\n"
    
    "[Example 2]\n"
    "[Context]: A firefly uses specialized muscles to draw oxygen into its lower abdomen through narrow tubes, triggering a chemical reaction whereby the oxygen combines with chemicals in the firefly's abdomen to produce a glow. ____ when the firefly stops drawing in oxygen, the reaction—and the glow—cease.\n"
    "[Question]: Which choice completes the text with the most logical transition?\n"
    "[Options]:\n"
    "A) For instance,\n"
    "B) By contrast,\n"
    "C) Specifically,\n"
    "D) In conclusion,\n"
    "Step-by-step reasoning:\n"
    "Choice B is the best answer. “By contrast” logically signals that the information in this sentence—that a firefly’s glow ceases when it stops drawing in oxygen contrasts with the previous sentence’s discussion of the processes that cause a firefly to begin to glow.[END]\n\n"

    "[Current Problem]\n"
    "[Context]: Painter Alma W. Thomas was fascinated by the colors and shapes found in nature. The flowers and trees in the garden at her home in Washington, DC,   [BLANK]   her work. For example, Thomas’s use of broken brushstrokes was inspired by the way that light would shine through the leaves of a tree in front of her house.\n"
    "[Question]: Which choice completes the text with the most logical and precise word or phrase?\n"
    "[Options]:\n"
    "A) restricted \n"
    "B) announced\n"
    "C) distracted\n"
    "D) influenced \n"
    "Step-by-step reasoning:\n"
)

# Diagnostic
inputs = tokenizer(cot_prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024)
print(f"Input length: {inputs['input_ids'].shape[1]} tokens")

# Generate response
print("Generating reasoning...\n")
response = generate_cot_response(cot_prompt)
print("RESPONSE:\n", response)

Input length: 561 tokens
Generating reasoning...

RESPONSE:
 [Example 1]
[Context]: The spacecraft OSIRIS-REx briefly made contact with the asteroid 101955 Bennu in 2020. NASA scientist Daniella DellaGiustina reports that despite facing the unexpected obstacle of a surface mostly covered in boulders, OSIRIS-REx successfully ____ a sample of the surface, gathering pieces of it to bring back to Earth.
[Question]: Which choice completes the text with the most logical and precise word or phrase?
[Options]:
A) attached
B) collected
C) followed
D) replaced
Step-by-step reasoning:
Choice B is the best answer because it most logically completes the text’s discussion of the OSIRIS-REx spacecraft’s contact with the asteroid 101955 Bennu. In this context, “collected” means acquired and took away. The text indicates that although the boulders on the asteroid’s surface caused some unforeseen problems, OSIRIS-REx was able to gather a sample to return to Earth. This context suggests that OSIRIS-REx s

In [154]:
# Extract generated reasoning for evaluation
generated_cot = response.split("Step-by-step reasoning:")[-1].split("[END]")[0].strip() 
reference_cot = ("Choice C is the best answer because it most logically completes the text’s discussion of historical evidence about ancient Egypt under the reign of the pharaoh Hatshepsut. In this context, “an exhaustive” account would be a thorough one. The text states that much of the evidence from her reign was purposely destroyed—in other words, there is a lack of surviving records. This context conveys that unless there are major new archaeological discoveries, an exhaustive account of Hatshepsut’s reign is unlikely."
)

# Evaluate response
print("\nEVALUATION METRICS:")
metrics = evaluate_response(generated_cot, reference_cot)
for metric, value in metrics.items():
    print(f"{metric}: {value:.3f}")


EVALUATION METRICS:
BERTScore_F1: 0.503
ROUGE-1_F1: 0.184
ROUGE-2_F1: 0.042
ROUGE-L_F1: 0.122


# Summary based Questions

In [155]:
# Define CoT prompt with one-shot example
cot_prompt = (
    "[Example 1]\n"
    "[Context]: By combining Indigenous and classical music, Cree composer and cellist Cris Derksen creates works that reflect the diverse cultural landscape of Canada. For her album Orchestral Powwow, Derksen composed new songs in the style of traditional powwow music that were accompanied by classical arrangements played by an orchestra. But where an orchestra would normally follow the directions of a conductor, the musicians on Orchestral Powwow are led by the beat of a powwow drum.\n"
    "[Question]: Which choice best states the main purpose of the text?\n"
    "[Options]:\n"
    "A) To examine how Derksen’s musical compositions blend cultures\n"
    "B) To argue that Derksen should be recognized for creating a new style of music\n"
    "C) To describe the difficulties Derksen encountered when producing her album\n"
    "D) To establish a contrast between Derksen’s classical training and her Cree heritage\n"
    "Step-by-step reasoning:\n"
    "Choice A is the best answer because it accurately captures the main purpose of the text. The text describes how Cris Derksen combines Indigenous powwow music with classical arrangements in her album Orchestral Powwow, highlighting her use of a powwow drum to lead the orchestra. This illustrates her blending of Indigenous and classical musical traditions to reflect Canada’s diverse cultural landscape.[END]\n\n"
    
    "[Current Problem]\n"
    "[Context]: The following text is adapted from Paul Laurence Dunbar’s 1902 novel The Sport of the Gods. Joe and some of his family members have recently moved to New York City. [Joe] was wild with enthusiasm and with a desire to be a part of all that the metropolis meant. In the evening he saw the young fellows passing by dressed in their spruce clothes, and he wondered with a sort of envy where they could be going. Back home there had been no place much worth going to, except church and one or two people’s houses.\n"
    "[Question]:Which choice best states the main purpose of the text?\n"
    "[Options]: \n"
    "A) It illustrates a character’s reaction to a new environment.\n"
    "B) It explains why a character has traveled to a city.\n"
    "C) It compares a character’s thoughts about an event at two different times of day.\n"
    "D) It presents a character feeling regret over leaving home. \n"
    "Step-by-step reasoning:\n"
)

# Diagnostic
inputs = tokenizer(cot_prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024)
print(f"Input length: {inputs['input_ids'].shape[1]} tokens")

# Generate response
print("Generating reasoning...\n")
response = generate_cot_response(cot_prompt)
print("RESPONSE:\n", response)

Input length: 495 tokens
Generating reasoning...

RESPONSE:
 [Example 1]
[Context]: By combining Indigenous and classical music, Cree composer and cellist Cris Derksen creates works that reflect the diverse cultural landscape of Canada. For her album Orchestral Powwow, Derksen composed new songs in the style of traditional powwow music that were accompanied by classical arrangements played by an orchestra. But where an orchestra would normally follow the directions of a conductor, the musicians on Orchestral Powwow are led by the beat of a powwow drum.
[Question]: Which choice best states the main purpose of the text?
[Options]:
A) To examine how Derksen’s musical compositions blend cultures
B) To argue that Derksen should be recognized for creating a new style of music
C) To describe the difficulties Derksen encountered when producing her album
D) To establish a contrast between Derksen’s classical training and her Cree heritage
Step-by-step reasoning:
Choice A is the best answer beca

In [156]:
# Extract generated reasoning for evaluation
generated_cot = response.split("Step-by-step reasoning:")[-1].split("[END]")[0].strip()
reference_cot = ("Choice A is the best answer because it accurately captures the text’s main purpose. The text highlights Joe’s enthusiasm and curiosity about New York City, noting his observations of well-dressed young men and contrasting this with the limited social options back home. This shows the text’s focus on illustrating Joe’s reaction to his new urban environment, not his reasons for moving, thoughts at different times, or regret about leaving home."
                )

# Evaluate response
print("\nEVALUATION METRICS:")
metrics = evaluate_response(generated_cot, reference_cot)
for metric, value in metrics.items():
    print(f"{metric}: {value:.3f}")


EVALUATION METRICS:
BERTScore_F1: 0.510
ROUGE-1_F1: 0.226
ROUGE-2_F1: 0.019
ROUGE-L_F1: 0.151


In [158]:
cot_prompt = (
    "[Example 1]\n"
    "[Context]: While researching a topic, a student has taken the following notes: Some powerful works of literature have so influenced readers that new legislation has been passed as a result. The Interesting Narrative of the Life of Olaudah Equiano (1789) is the autobiography of a man who endured slavery on both sides of the Atlantic. Equiano’s book contributed to the passage of the Slave Trade Act of 1807. The Jungle (1906) is a fictional work by Upton Sinclair that describes unsanitary conditions in US meatpacking plants. Sinclair’s book contributed to the passage of the Pure Food and Drug Act in 1906.\n"
    "[Question]: The student wants to emphasize a difference between the two books. Which choice most effectively uses relevant information from the notes to accomplish this goal?\n"
    "[Options]:\n"
    "A) Although both are powerful works of literature that contributed to new legislation, Equiano’s book is an autobiography, while Sinclair’s is fictional.\n"
    "B) They may have written about different topics, but Equiano and Sinclair both influenced readers.\n"
    "C) The 1807 Slave Trade Act resulted in part from a book by Equiano, while the 1906 Pure Food and Drug Act resulted in part from a book by Sinclair.\n"
    "D) The Interesting Narrative of the Life of Olaudah Equiano and The Jungle are two works of literature that contributed to new legislation (concerning the slave trade and food safety, respectively).\n"
    "Step-by-step reasoning:\n"
    "Choice B is the best answer because it effectively emphasizes a difference between Equiano’s The Interesting Narrative of the Life of Olaudah Equiano and Sinclair’s The Jungle by noting they “written about different topics”—slavery versus unsanitary meatpacking conditions—as per the notes. This directly highlights a key distinction in their subject matter, aligning with the student’s goal.[END]\n\n"


    "[Current Problem]\n"
    "[Context]: By combining Indigenous and classical music, Cree composer and cellist Cris Derksen creates works that reflect the diverse cultural landscape of Canada. For her album Orchestral Powwow, Derksen composed new songs in the style of traditional powwow music that were accompanied by classical arrangements played by an orchestra. But where an orchestra would normally follow the directions of a conductor, the musicians on Orchestral Powwow are led by the beat of a powwow drum.\n"
    "[Question]: Which choice best states the main purpose of the text?\n"
    "[Options]:\n"
    "A) To examine how Derksen’s musical compositions blend cultures\n"
    "B) To argue that Derksen should be recognized for creating a new style of music\n"
    "C) To describe the difficulties Derksen encountered when producing her album\n"
    "D) To establish a contrast between Derksen’s classical training and her Cree heritage\n"
    "Step-by-step reasoning:\n"
)

# Diagnostic
inputs = tokenizer(cot_prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024)
print(f"Input length: {inputs['input_ids'].shape[1]} tokens")

# Generate response
print("Generating reasoning...\n")
response = generate_cot_response(cot_prompt)
print("RESPONSE:\n", response)

Input length: 601 tokens
Generating reasoning...

RESPONSE:
 [Example 1]
[Context]: While researching a topic, a student has taken the following notes: Some powerful works of literature have so influenced readers that new legislation has been passed as a result. The Interesting Narrative of the Life of Olaudah Equiano (1789) is the autobiography of a man who endured slavery on both sides of the Atlantic. Equiano’s book contributed to the passage of the Slave Trade Act of 1807. The Jungle (1906) is a fictional work by Upton Sinclair that describes unsanitary conditions in US meatpacking plants. Sinclair’s book contributed to the passage of the Pure Food and Drug Act in 1906.
[Question]: The student wants to emphasize a difference between the two books. Which choice most effectively uses relevant information from the notes to accomplish this goal?
[Options]:
A) Although both are powerful works of literature that contributed to new legislation, Equiano’s book is an autobiography, while Si

In [159]:
# Extract generated reasoning for evaluation
generated_cot = response.split("Step-by-step reasoning:")[-1].split("[END]")[0].strip()
reference_cot = ("Choice A is the best answer because it accurately captures the main purpose of the text. The text describes how Cris Derksen combines Indigenous powwow music with classical arrangements in her album Orchestral Powwow, highlighting her use of a powwow drum to lead the orchestra. This illustrates her blending of Indigenous and classical musical traditions to reflect Canada’s diverse cultural landscape."
)

# Evaluate response
print("\nEVALUATION METRICS:")
metrics = evaluate_response(generated_cot, reference_cot)
for metric, value in metrics.items():
    print(f"{metric}: {value:.3f}")


EVALUATION METRICS:
BERTScore_F1: 0.542
ROUGE-1_F1: 0.319
ROUGE-2_F1: 0.036
ROUGE-L_F1: 0.195


In [160]:
# Define CoT prompt with one-shot example
cot_prompt = (
    "[Example 1]\n"
    "[Context]: By combining Indigenous and classical music, Cree composer and cellist Cris Derksen creates works that reflect the diverse cultural landscape of Canada. For her album Orchestral Powwow, Derksen composed new songs in the style of traditional powwow music that were accompanied by classical arrangements played by an orchestra. But where an orchestra would normally follow the directions of a conductor, the musicians on Orchestral Powwow are led by the beat of a powwow drum.\n"
    "[Question]: Which choice best states the main purpose of the text?\n"
    "[Options]:\n"
    "A) To examine how Derksen’s musical compositions blend cultures\n"
    "B) To argue that Derksen should be recognized for creating a new style of music\n"
    "C) To describe the difficulties Derksen encountered when producing her album\n"
    "D) To establish a contrast between Derksen’s classical training and her Cree heritage\n"
    "Step-by-step reasoning:\n"
    "Choice A is the best answer because it accurately captures the main purpose of the text. The text describes how Cris Derksen combines Indigenous powwow music with classical arrangements in her album Orchestral Powwow, highlighting her use of a powwow drum to lead the orchestra. This illustrates her blending of Indigenous and classical musical traditions to reflect Canada’s diverse cultural landscape.[END]\n\n"
    
    "[Current Problem]\n"
    "[Context]: While researching a topic, a student has taken the following notes: Some powerful works of literature have so influenced readers that new legislation has been passed as a result. The Interesting Narrative of the Life of Olaudah Equiano (1789) is the autobiography of a man who endured slavery on both sides of the Atlantic. Equiano’s book contributed to the passage of the Slave Trade Act of 1807. The Jungle (1906) is a fictional work by Upton Sinclair that describes unsanitary conditions in US meatpacking plants. Sinclair’s book contributed to the passage of the Pure Food and Drug Act in 1906.\n"
    "[Question]: The student wants to emphasize a difference between the two books. Which choice most effectively uses relevant information from the notes to accomplish this goal?\n"
    "[Options]:\n"
    "A) Although both are powerful works of literature that contributed to new legislation, Equiano’s book is an autobiography, while Sinclair’s is fictional.\n"
    "B) They may have written about different topics, but Equiano and Sinclair both influenced readers.\n"
    "C) The 1807 Slave Trade Act resulted in part from a book by Equiano, while the 1906 Pure Food and Drug Act resulted in part from a book by Sinclair.\n"
    "D) The Interesting Narrative of the Life of Olaudah Equiano and The Jungle are two works of literature that contributed to new legislation (concerning the slave trade and food safety, respectively).\n"
    "Step-by-step reasoning:\n"
)

# Diagnostic
inputs = tokenizer(cot_prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024)
print(f"Input length: {inputs['input_ids'].shape[1]} tokens")

# Generate response
print("Generating reasoning...\n")
response = generate_cot_response(cot_prompt)
print("RESPONSE:\n", response)

Input length: 589 tokens
Generating reasoning...

RESPONSE:
 [Example 1]
[Context]: By combining Indigenous and classical music, Cree composer and cellist Cris Derksen creates works that reflect the diverse cultural landscape of Canada. For her album Orchestral Powwow, Derksen composed new songs in the style of traditional powwow music that were accompanied by classical arrangements played by an orchestra. But where an orchestra would normally follow the directions of a conductor, the musicians on Orchestral Powwow are led by the beat of a powwow drum.
[Question]: Which choice best states the main purpose of the text?
[Options]:
A) To examine how Derksen’s musical compositions blend cultures
B) To argue that Derksen should be recognized for creating a new style of music
C) To describe the difficulties Derksen encountered when producing her album
D) To establish a contrast between Derksen’s classical training and her Cree heritage
Step-by-step reasoning:
Choice A is the best answer beca

In [161]:
# Extract generated reasoning for evaluation
generated_cot = response.split("Step-by-step reasoning:")[-1].split("[END]")[0].strip()
reference_cot = ("Choice B is the best answer because it effectively emphasizes a difference between Equiano’s The Interesting Narrative of the Life of Olaudah Equiano and Sinclair’s The Jungle by noting they “written about different topics”—slavery versus unsanitary meatpacking conditions—as per the notes. This directly highlights a key distinction in their subject matter, aligning with the student’s goal."
)

# Evaluate response
print("\nEVALUATION METRICS:")
metrics = evaluate_response(generated_cot, reference_cot)
for metric, value in metrics.items():
    print(f"{metric}: {value:.3f}")


EVALUATION METRICS:
BERTScore_F1: 0.461
ROUGE-1_F1: 0.144
ROUGE-2_F1: 0.000
ROUGE-L_F1: 0.090


In [162]:
# Define CoT prompt with one-shot example
cot_prompt = (
    "[Example 1]\n"
    "[Context]: By combining Indigenous and classical music, Cree composer and cellist Cris Derksen creates works that reflect the diverse cultural landscape of Canada. For her album Orchestral Powwow, Derksen composed new songs in the style of traditional powwow music that were accompanied by classical arrangements played by an orchestra. But where an orchestra would normally follow the directions of a conductor, the musicians on Orchestral Powwow are led by the beat of a powwow drum.\n"
    "[Question]: Which choice best states the main purpose of the text?\n"
    "[Options]:\n"
    "A) To examine how Derksen’s musical compositions blend cultures\n"
    "B) To argue that Derksen should be recognized for creating a new style of music\n"
    "C) To describe the difficulties Derksen encountered when producing her album\n"
    "D) To establish a contrast between Derksen’s classical training and her Cree heritage\n"
    "Step-by-step reasoning:\n"
    "Choice A is the best answer because it accurately captures the main purpose of the text. The text describes how Cris Derksen combines Indigenous powwow music with classical arrangements in her album Orchestral Powwow, highlighting her use of a powwow drum to lead the orchestra. This illustrates her blending of Indigenous and classical musical traditions to reflect Canada’s diverse cultural landscape.[END]\n\n"
    
    "[Current Problem]\n"
    "[Context]: The following text is adapted from Gwendolyn Bennett's 1926 poem 'Street Lamps in Early Spring.' 'Night wears a garment All velvet soft, all violet blue ... And over her face she draws a veil As shimmering fine as floating dew ... And here and there In the black of her hair The subtle hands of Night Move slowly with their gem-starred light'.\n"
    "[Question]: Which choice best describes the overall structure of the text?\n"
    "[Options]:\n"
    "A) It presents alternating descriptions of night in a rural area and in a city.\n"
    "B) It sketches an image of nightfall, then an image of sunrise.\n"
    "C) It makes an extended comparison of night to a human being.\n"
    "D) It portrays how night changes from one season of the year to the next.\n"
    "Step-by-step reasoning:\n"
)

# Diagnostic
inputs = tokenizer(cot_prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024)
print(f"Input length: {inputs['input_ids'].shape[1]} tokens")

# Generate response
print("Generating reasoning...\n")
response = generate_cot_response(cot_prompt)
print("RESPONSE:\n", response)

Input length: 458 tokens
Generating reasoning...

RESPONSE:
 [Example 1]
[Context]: By combining Indigenous and classical music, Cree composer and cellist Cris Derksen creates works that reflect the diverse cultural landscape of Canada. For her album Orchestral Powwow, Derksen composed new songs in the style of traditional powwow music that were accompanied by classical arrangements played by an orchestra. But where an orchestra would normally follow the directions of a conductor, the musicians on Orchestral Powwow are led by the beat of a powwow drum.
[Question]: Which choice best states the main purpose of the text?
[Options]:
A) To examine how Derksen’s musical compositions blend cultures
B) To argue that Derksen should be recognized for creating a new style of music
C) To describe the difficulties Derksen encountered when producing her album
D) To establish a contrast between Derksen’s classical training and her Cree heritage
Step-by-step reasoning:
Choice A is the best answer beca

In [163]:
# Extract generated reasoning for evaluation
generated_cot = response.split("Step-by-step reasoning:")[-1].split("[END]")[0].strip()
reference_cot = ('''Choice C is the best answer because it most accurately describes the overall structure of the text. Throughout the text, the speaker characterizes nighttime as if it were a person who wears clothing (“a garment” that is “velvet soft” and “violet blue”) and a veil “over her face” and who moves her hands “slowly with their gem-starred light” through her dark hair. Thus, the text is structured as an extended comparison of night to a human being.'''
)

# Evaluate response
print("\nEVALUATION METRICS:")
metrics = evaluate_response(generated_cot, reference_cot)
for metric, value in metrics.items():
    print(f"{metric}: {value:.3f}")


EVALUATION METRICS:
BERTScore_F1: 0.521
ROUGE-1_F1: 0.311
ROUGE-2_F1: 0.040
ROUGE-L_F1: 0.194


# Replace portion of the text type of Qs

In [164]:
# Define CoT prompt with one-shot example
cot_prompt = (
    "[Example 1]\n"
    "[Context]: The following text is adapted from Cynthia Kadohata's 2004 novel Kira-Kira. [Uncle Katsuhisa] was as loud as my father was quiet. Even when he wasn't talking, he made a lot of noise, clearing his throat and sniffing and tapping his fingers.\n"
    '''[Question]: Which choice best describes the function of this "[Uncle Katsuhisa] was as loud as my father was quiet. " portion of the sentence?\n'''
    "[Options]:\n"
    "A) It lists the kinds of topics Uncle Katsuhisa enjoys discussing.\n"
    "B) It suggests that Uncle Katsuhisa dislikes meeting new people.\n"
    "C) It contrasts Uncle Katsuhisa with the narrator's father.\n"
    "D) It describes a conversation between the narrator and the narrator's father.\n"
    "Step-by-step reasoning:\n"
    "Choice C is the best answer because it most accurately describes how the underlined sentence functions in the text as a whole. The underlined sentence establishes a difference between Uncle Katsuhisa and the narrator’s father by describing Uncle Katsuhisa as “loud” and the narrator’s father as “quiet.” The text then elaborates on that contrast, describing some ways Uncle Katsuhisa is very noisy even when he isn’t speaking.[END]\n\n"
    
    "[Current Problem]\n"
    "[Context]: Researchers have found a nearly 164,000-year-old molar from a member of the archaic human species known as Denisovans in a cave in Laos, suggesting that Denisovans lived in a wider range of environments than indicated by earlier evidence. Before the discovery, Denisovans were thought to have lived only at high altitudes in relatively cold climates in what are now Russia and China, but the discovery of the tooth in Laos suggests that they may have lived at low altitudes in relatively warm climates in Southeast Asia as well.\n"
    "[Question]: Which choice best states the function of the portion in the text, “Before the discovery, Denisovans were thought to have lived only at high altitudes in relatively cold climates in what are now Russia and China” as a whole??\n"
    "[Options]:\n"
    "A) It dismisses as untrue the research presented in the previous sentence..\n"
    "B) It defines a term used in the description that follows in the rest of the sentence.\n"
    "C) It emphasizes the main goal of the research introduced in the previous sentence.\n"
    "D) It provides context that clarifies the significance of the information that follows in the rest of the sentence.\n"
    "Step-by-step reasoning:\n"
)

# Diagnostic
inputs = tokenizer(cot_prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024)
print(f"Input length: {inputs['input_ids'].shape[1]} tokens")

# Generate response
print("Generating reasoning...\n")
response = generate_cot_response(cot_prompt)
print("RESPONSE:\n", response)

Input length: 537 tokens
Generating reasoning...

RESPONSE:
 [Example 1]
[Context]: The following text is adapted from Cynthia Kadohata's 2004 novel Kira-Kira. [Uncle Katsuhisa] was as loud as my father was quiet. Even when he wasn't talking, he made a lot of noise, clearing his throat and sniffing and tapping his fingers.
[Question]: Which choice best describes the function of this "[Uncle Katsuhisa] was as loud as my father was quiet. " portion of the sentence?
[Options]:
A) It lists the kinds of topics Uncle Katsuhisa enjoys discussing.
B) It suggests that Uncle Katsuhisa dislikes meeting new people.
C) It contrasts Uncle Katsuhisa with the narrator's father.
D) It describes a conversation between the narrator and the narrator's father.
Step-by-step reasoning:
Choice C is the best answer because it most accurately describes how the underlined sentence functions in the text as a whole. The underlined sentence establishes a difference between Uncle Katsuhisa and the narrator’s father 

In [165]:
# Extract generated reasoning for evaluation
generated_cot = response.split("Step-by-step reasoning:")[-1].split("[END]")[0].strip()
reference_cot = ('''Choice D is the best answer because it accurately captures the function of the underlined portion. The text, “Before the discovery, Denisovans were thought to have lived only at high altitudes in relatively cold climates in what are now Russia and China,” provides background on prior beliefs about Denisovans’ habitats. This context clarifies the significance of the Laos discovery, which suggests they also lived in warmer, low-altitude areas, expanding their known range, rather than dismissing research, defining terms, or stating the research’s goal.'''
)

# Evaluate response
print("\nEVALUATION METRICS:")
metrics = evaluate_response(generated_cot, reference_cot)
for metric, value in metrics.items():
    print(f"{metric}: {value:.3f}")


EVALUATION METRICS:
BERTScore_F1: 0.457
ROUGE-1_F1: 0.232
ROUGE-2_F1: 0.015
ROUGE-L_F1: 0.145


In [166]:
# Define CoT prompt with one-shot example
cot_prompt = (
    "[Example 1]\n"
    "[Context]: The following text is adapted from Cynthia Kadohata's 2004 novel Kira-Kira. [Uncle Katsuhisa] was as loud as my father was quiet. Even when he wasn't talking, he made a lot of noise, clearing his throat and sniffing and tapping his fingers.\n"
    '''[Question]: Which choice best describes the function of this "[Uncle Katsuhisa] was as loud as my father was quiet. " portion of the sentence?\n'''
    "[Options]:\n"
    "A) It lists the kinds of topics Uncle Katsuhisa enjoys discussing.\n"
    "B) It suggests that Uncle Katsuhisa dislikes meeting new people.\n"
    "C) It contrasts Uncle Katsuhisa with the narrator's father.\n"
    "D) It describes a conversation between the narrator and the narrator's father.\n"
    "Step-by-step reasoning:\n"
    "Choice C is the best answer because it most accurately describes how the underlined sentence functions in the text as a whole. The underlined sentence establishes a difference between Uncle Katsuhisa and the narrator’s father by describing Uncle Katsuhisa as “loud” and the narrator’s father as “quiet.” The text then elaborates on that contrast, describing some ways Uncle Katsuhisa is very noisy even when he isn’t speaking.[END]\n\n"
    
    "[Current Problem]\n"
    "[Context]: The following text is from Joan Didion’s memoir The Year of Magical Thinking. In the text, the author discusses her home life. [I]n California we heated our houses by buildingfires. We built fires even on summer evenings, because the fog came in. Fires said we were home, we had drawn the circle, we were safe through the night.\n"
    "[Question]: Which choice best describes the function of this 'Fires said we were home, we had drawn the circle, we were safe through the night.' portion in the text as a whole?\n"
    "[Options]:\n"
    "A) It illustrates that a fire provides comfort beyondphysical warmth.\n"
    "B) It summarizes the information that came before it in the text.\n"
    "C) It explains that the house remains cold even insummer.\n"
    "D) It suggests that the author feels comfortable inher home with or without a fire.\n"
    "Step-by-step reasoning:\n"
)

# Diagnostic
inputs = tokenizer(cot_prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024)
print(f"Input length: {inputs['input_ids'].shape[1]} tokens")

# Generate response
print("Generating reasoning...\n")
response = generate_cot_response(cot_prompt)
print("RESPONSE:\n", response)

Input length: 477 tokens
Generating reasoning...

RESPONSE:
 [Example 1]
[Context]: The following text is adapted from Cynthia Kadohata's 2004 novel Kira-Kira. [Uncle Katsuhisa] was as loud as my father was quiet. Even when he wasn't talking, he made a lot of noise, clearing his throat and sniffing and tapping his fingers.
[Question]: Which choice best describes the function of this "[Uncle Katsuhisa] was as loud as my father was quiet. " portion of the sentence?
[Options]:
A) It lists the kinds of topics Uncle Katsuhisa enjoys discussing.
B) It suggests that Uncle Katsuhisa dislikes meeting new people.
C) It contrasts Uncle Katsuhisa with the narrator's father.
D) It describes a conversation between the narrator and the narrator's father.
Step-by-step reasoning:
Choice C is the best answer because it most accurately describes how the underlined sentence functions in the text as a whole. The underlined sentence establishes a difference between Uncle Katsuhisa and the narrator’s father 

In [167]:
# Extract generated reasoning for evaluation
generated_cot = response.split("Step-by-step reasoning:")[-1].split("[END]")[0].strip()
reference_cot = ('''Choice A is the best answer because it most accurately describes how the underlined portion functions in the text as a whole. The first two sentences of the text establish that in California, houses were heated by building fires year-round. The underlined portion then indicates that the fires didn’t merely provide physical warmth: they also represented being “home” and feeling protected. The underlined portion thus illustrates that a fire provides comfort beyond physical warmth. '''
)

# Evaluate response
print("\nEVALUATION METRICS:")
metrics = evaluate_response(generated_cot, reference_cot)
for metric, value in metrics.items():
    print(f"{metric}: {value:.3f}")


EVALUATION METRICS:
BERTScore_F1: 0.565
ROUGE-1_F1: 0.298
ROUGE-2_F1: 0.065
ROUGE-L_F1: 0.213


In [168]:
# Define CoT prompt with one-shot example
cot_prompt = (
    "[Example 1]\n"
    "[Context]: The following text is adapted from Cynthia Kadohata's 2004 novel Kira-Kira. [Uncle Katsuhisa] was as loud as my father was quiet. Even when he wasn't talking, he made a lot of noise, clearing his throat and sniffing and tapping his fingers.\n"
    '''[Question]: Which choice best describes the function of this "[Uncle Katsuhisa] was as loud as my father was quiet. " portion of the sentence?\n'''
    "[Options]:\n"
    "A) It lists the kinds of topics Uncle Katsuhisa enjoys discussing.\n"
    "B) It suggests that Uncle Katsuhisa dislikes meeting new people.\n"
    "C) It contrasts Uncle Katsuhisa with the narrator's father.\n"
    "D) It describes a conversation between the narrator and the narrator's father.\n"
    "Step-by-step reasoning:\n"
    "Choice C is the best answer because it most accurately describes how the underlined sentence functions in the text as a whole. The underlined sentence establishes a difference between Uncle Katsuhisa and the narrator’s father by describing Uncle Katsuhisa as “loud” and the narrator’s father as “quiet.” The text then elaborates on that contrast, describing some ways Uncle Katsuhisa is very noisy even when he isn’t speaking.[END]\n\n"
    
    "[Current Problem]\n"
    "[Context]: The following text is from Charlotte Forten Grimké’s 1888 poem “At Newport.” Oh, deep delight to watch the gladsome waves Exultant leap upon the rugged rocks; Ever repulsed, yet ever rushing on—Filled with a life that will not know defeat; To see the glorious hues of sky and sea. The distant snowy sails, glide spirit like, Into an unknown world, to feel the sweet Enchantment of the sea thrill all the soul, Clearing the clouded brain, making the heart Leap joyous as it own bright, singing waves!\n"
    "[Question]: Which choice best describes the function of the portion, “Ever repulsed, yet ever rushing on—Filled with a life that will not know defeat;” in the text as a whole?\n"
    "[Options]:\n"
    "A) It portrays the surroundings as an imposing and intimidating scene.\n"
    "B) It characterizes the sea’s waves as a relentless and enduring force.\n"
    "C) It conveys the speaker’s ambivalence about the natural world.\n"
    "D) It draws a contrast between the sea’s waves and the speaker’s thoughts.\n"
    "Step-by-step reasoning:\n"
)

# Diagnostic
inputs = tokenizer(cot_prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024)
print(f"Input length: {inputs['input_ids'].shape[1]} tokens")

# Generate response
print("Generating reasoning...\n")
response = generate_cot_response(cot_prompt)
print("RESPONSE:\n", response)

Input length: 544 tokens
Generating reasoning...

RESPONSE:
 [Example 1]
[Context]: The following text is adapted from Cynthia Kadohata's 2004 novel Kira-Kira. [Uncle Katsuhisa] was as loud as my father was quiet. Even when he wasn't talking, he made a lot of noise, clearing his throat and sniffing and tapping his fingers.
[Question]: Which choice best describes the function of this "[Uncle Katsuhisa] was as loud as my father was quiet. " portion of the sentence?
[Options]:
A) It lists the kinds of topics Uncle Katsuhisa enjoys discussing.
B) It suggests that Uncle Katsuhisa dislikes meeting new people.
C) It contrasts Uncle Katsuhisa with the narrator's father.
D) It describes a conversation between the narrator and the narrator's father.
Step-by-step reasoning:
Choice C is the best answer because it most accurately describes how the underlined sentence functions in the text as a whole. The underlined sentence establishes a difference between Uncle Katsuhisa and the narrator’s father 

In [169]:
# Extract generated reasoning for evaluation
generated_cot = response.split("Step-by-step reasoning:")[-1].split("[END]")[0].strip()
reference_cot = ('''Choice B is the best answer because it most accurately describes how the underlined portion functions in the text as a whole. The text presents the speaker’s experience of viewing the sea. In the underlined portion, the speaker focuses on the idea that the waves hitting rocks on the shore are a relentless and enduring force: they are constantly pushed back (“ever repulsed”) but always return (“ever rushing on”), as though they have an energy that can’t be overcome (“a life that will not know defeat”).'''
)

# Evaluate response
print("\nEVALUATION METRICS:")
metrics = evaluate_response(generated_cot, reference_cot)
for metric, value in metrics.items():
    print(f"{metric}: {value:.3f}")


EVALUATION METRICS:
BERTScore_F1: 0.516
ROUGE-1_F1: 0.224
ROUGE-2_F1: 0.033
ROUGE-L_F1: 0.128


Scoring Scheme:
{0}: Irrelevant answer = Nothing from the model is relevant to the context 
{1}: One or two context related words, 
{2}: Few related words, 
{3}: Some reasoning present, 
{4}: Some reasoning and transition words used, 
{5}: Complete and faithful reasoning.

# RACHE-H 15 Qs

# Summary type Q's

In [170]:
def generate_cot_response(prompt, max_new_tokens=150):
    torch.manual_seed(42)
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024).to(device)
    stop_token_ids = tokenizer.encode("[END]", add_special_tokens=False)
    
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=max_new_tokens,
        do_sample=False,
        num_beams=5,
        no_repeat_ngram_size=3,
        early_stopping=True,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=stop_token_ids
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

def evaluate_response(generated, reference):
    P, R, F1 = score([generated], [reference], lang="en", model_type="bert-base-uncased")
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference, generated)
    return {
        "BERTScore_F1": F1.item(),
        "ROUGE-1_F1": scores['rouge1'].fmeasure,
        "ROUGE-2_F1": scores['rouge2'].fmeasure,
        "ROUGE-L_F1": scores['rougeL'].fmeasure
    }

In [171]:
cot_prompt = (
    """[Example 1]
[Context]: Jenny went to visit her friends in New York last weekend.Her friends met her at the airport on Friday afternoon and drove her to the hotel.They had dinner at a Chinese restaurant and went to see a film after that. Jenny and her friends set out early on Saturday morning for a farm and stayed there until Sunday morning.During their stay, they went fishing and swimming in the small river on the farm.They played football in the field and enjoyed a big meal around a camp fire , singing and dancing till late into the night. Nobody could get up early on Sunday morning.So when they got back to New York City, it was about three o'clock in the afternoon.They drove right to the airport because Jenny didn't want to miss her plane back home.Jenny only stayed in New York for two nights but she had a great time with her friends.
[Question]: When did Jenny go back home?
[Options]:
A) On Saturday afternoon.
B) On Sunday morning.
C) On Saturday evening.
D) On Sunday afternoon.
Step-by-step reasoning:
Choice D is the correct answer because it accurately identifies when Jenny went back home, as described in the passage. The passage states that Jenny and her friends returned to New York City on Sunday around three o’clock in the afternoon and drove directly to the airport because Jenny didn’t want to miss her plane. This indicates she went home on Sunday afternoon, making Choice D, “On Sunday afternoon,” the correct answer.

[Current Problem]
[Context]: George Bernard Shaw and Winston Churchill disliked each other. It is said that the playwright  once sent Churchill two tickets for the opening night of one of his plays, together with a card, which said, "Bring a friend (if you have one)." Churchill, however, returned the tickets with a note, which said, "I shall be busy that evening. Please send me two tickets for the second night (if there is one)." There is no record of whether Shaw ever sent the tickets.
[Question]: Why didn't Churchill want the tickets for the first night?
[Options]:
A) He didn't want to take Shaw's insult.
B) The theatre would not be as crowded the second night.
C) He was busy on the first night of the show.
D) He couldn't find a friend to go with him the first night.
Step-by-step reasoning:
"""
)

# Rest of your code...

# Diagnostic
inputs = tokenizer(cot_prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024)
print(f"Input length: {inputs['input_ids'].shape[1]} tokens")

# Generate response
print("Generating reasoning...\n")
response = generate_cot_response(cot_prompt)
print("RESPONSE:\n", response)

Input length: 521 tokens
Generating reasoning...

RESPONSE:
 [Example 1]
[Context]: Jenny went to visit her friends in New York last weekend.Her friends met her at the airport on Friday afternoon and drove her to the hotel.They had dinner at a Chinese restaurant and went to see a film after that. Jenny and her friends set out early on Saturday morning for a farm and stayed there until Sunday morning.During their stay, they went fishing and swimming in the small river on the farm.They played football in the field and enjoyed a big meal around a camp fire , singing and dancing till late into the night. Nobody could get up early on Sunday morning.So when they got back to New York City, it was about three o'clock in the afternoon.They drove right to the airport because Jenny didn't want to miss her plane back home.Jenny only stayed in New York for two nights but she had a great time with her friends.
[Question]: When did Jenny go back home?
[Options]:
A) On Saturday afternoon.
B) On Sunday

In [172]:
# Extract generated reasoning for evaluation
generated_cot = response.split("Step-by-step reasoning:")[-1].split("[END]")[0].strip()
reference_cot = ('''Choice A is the best answer because it correctly explains why Churchill rejected the first-night tickets. Shaw’s insulting note, “Bring a friend (if you have one),” implies Churchill lacks friends, and Churchill’s sharp reply suggests he returned the tickets to avoid accepting the insult, not due to being busy, crowd concerns, or lacking a friend, as supported by their mutual dislike in the text.'''
)

# Evaluate response
print("\nEVALUATION METRICS:")
metrics = evaluate_response(generated_cot, reference_cot)
for metric, value in metrics.items():
    print(f"{metric}: {value:.3f}")


EVALUATION METRICS:
BERTScore_F1: 0.540
ROUGE-1_F1: 0.397
ROUGE-2_F1: 0.065
ROUGE-L_F1: 0.167


In [173]:
cot_prompt = (
    """[Example 1]
[Context]: Jenny went to visit her friends in New York last weekend.Her friends met her at the airport on Friday afternoon and drove her to the hotel.They had dinner at a Chinese restaurant and went to see a film after that. Jenny and her friends set out early on Saturday morning for a farm and stayed there until Sunday morning.During their stay, they went fishing and swimming in the small river on the farm.They played football in the field and enjoyed a big meal around a camp fire , singing and dancing till late into the night. Nobody could get up early on Sunday morning.So when they got back to New York City, it was about three o'clock in the afternoon.They drove right to the airport because Jenny didn't want to miss her plane back home.Jenny only stayed in New York for two nights but she had a great time with her friends.
[Question]: When did Jenny go back home?
[Options]:
A) On Saturday afternoon.
B) On Sunday morning.
C) On Saturday evening.
D) On Sunday afternoon.
Step-by-step reasoning:
Choice D is the correct answer because it accurately identifies when Jenny went back home, as described in the passage. The passage states that Jenny and her friends returned to New York City on Sunday around three o’clock in the afternoon and drove directly to the airport because Jenny didn’t want to miss her plane. This indicates she went home on Sunday afternoon, making Choice D, “On Sunday afternoon,” the correct answer.

[Current Problem]
[Context]: When George was thirty-five, he bought a small plane and learned to fly it.  He soon became very good and made his plane do all kinds of tricks  . George had a friend, whose name was Mark. One day George offered to take Mark up in his plane. Mark thought, " I've traveled in a big plane several times, but I've never been in a small one, so I'll go. "They went up, and George flew around for half an hour and did all kinds of tricks in the air. When they came down again,  Mark was glad to be back safely, and he said to his friend in a shaking voice, "Well , George,  thank you very much for those two trips in your plane."George was very surprised and said, " Two trips? "" Yes, my first and my last. " answered Mark.
[Question]: Which of the following statements is false?
[Options]:
A) George learned to fly a plane very quickly .
B) It took George a short time to learn to fly a plane.
C) George had some difficulty learning to fly a plane.
D) Mark decided to fly in George's small plane.
Step-by-step reasoning:
"""
)

# Diagnostic
inputs = tokenizer(cot_prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024)
print(f"Input length: {inputs['input_ids'].shape[1]} tokens")

# Generate response
print("Generating reasoning...\n")
response = generate_cot_response(cot_prompt)
print("RESPONSE:\n", response)

Input length: 590 tokens
Generating reasoning...

RESPONSE:
 [Example 1]
[Context]: Jenny went to visit her friends in New York last weekend.Her friends met her at the airport on Friday afternoon and drove her to the hotel.They had dinner at a Chinese restaurant and went to see a film after that. Jenny and her friends set out early on Saturday morning for a farm and stayed there until Sunday morning.During their stay, they went fishing and swimming in the small river on the farm.They played football in the field and enjoyed a big meal around a camp fire , singing and dancing till late into the night. Nobody could get up early on Sunday morning.So when they got back to New York City, it was about three o'clock in the afternoon.They drove right to the airport because Jenny didn't want to miss her plane back home.Jenny only stayed in New York for two nights but she had a great time with her friends.
[Question]: When did Jenny go back home?
[Options]:
A) On Saturday afternoon.
B) On Sunday

In [174]:
# Extract generated reasoning for evaluation
generated_cot = response.split("Step-by-step reasoning:")[-1].split("[END]")[0].strip()
reference_cot = ('''Choice C is the best answer because it is the false statement about the passage. The passage states that George, at thirty-five, bought a small plane, learned to fly it, and “soon became very good,” indicating he learned quickly and proficiently. However, Choice C is false, as the passage provides no evidence of difficulty, only that he learned quickly and excelled.''')

# Evaluate response
print("\nEVALUATION METRICS:")
metrics = evaluate_response(generated_cot, reference_cot)
for metric, value in metrics.items():
    print(f"{metric}: {value:.3f}")


EVALUATION METRICS:
BERTScore_F1: 0.584
ROUGE-1_F1: 0.327
ROUGE-2_F1: 0.039
ROUGE-L_F1: 0.154


In [175]:
cot_prompt = (
    """[Example 1]
[Context]: Jenny went to visit her friends in New York last weekend.Her friends met her at the airport on Friday afternoon and drove her to the hotel.They had dinner at a Chinese restaurant and went to see a film after that. Jenny and her friends set out early on Saturday morning for a farm and stayed there until Sunday morning.During their stay, they went fishing and swimming in the small river on the farm.They played football in the field and enjoyed a big meal around a camp fire , singing and dancing till late into the night. Nobody could get up early on Sunday morning.So when they got back to New York City, it was about three o'clock in the afternoon.They drove right to the airport because Jenny didn't want to miss her plane back home.Jenny only stayed in New York for two nights but she had a great time with her friends.
[Question]: When did Jenny go back home?
[Options]:
A) On Saturday afternoon.
B) On Sunday morning.
C) On Saturday evening.
D) On Sunday afternoon.
Step-by-step reasoning:
Choice D is the correct answer because it accurately identifies when Jenny went back home, as described in the passage. The passage states that Jenny and her friends returned to New York City on Sunday around three o’clock in the afternoon and drove directly to the airport because Jenny didn’t want to miss her plane. This indicates she went home on Sunday afternoon, making Choice D, “On Sunday afternoon,” the correct answer.

[Current Problem]
[Context]: On the morning of November 18, 1755, an earthquake shock Boston. John Winthrop, a professor at Harvard College, felt the quake and awoke. "I rose", Winthrop wrote, "and lighted a candle, looked at my watch, and found it to be 15minutes after four." John Winthrop hurried downstairs to the grandfather clock. It had stopped three minutes before, at 4:12. Except for stopping the clock, the quake had only thrown a key from the mantel  to the floor. The clock had stopped because Winthrop had put some long glass tubes he was using for an experiment into the case for safekeeping. The quake had knocked the tubes over and blocked the pendulum .Winthrop, therefore, had the key on the floor. The quake had thrown _ forward in the direction of the quake' s motion by a shock coming from the northwest, perhaps in Canada.
[Question]: John Winthrop put some tubes into this clock case because  _  .
[Options]:
A) he wanted to do an experiment
B) he thought an earthquake was probably to happen soon
C) he thought it safe for them to be put there
D) he wanted to record the exam time of the earthquake
Step-by-step reasoning:
"""
)

# Diagnostic
inputs = tokenizer(cot_prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024)
print(f"Input length: {inputs['input_ids'].shape[1]} tokens")

# Generate response
print("Generating reasoning...\n")
response = generate_cot_response(cot_prompt)
print("RESPONSE:\n", response)

Input length: 598 tokens
Generating reasoning...

RESPONSE:
 [Example 1]
[Context]: Jenny went to visit her friends in New York last weekend.Her friends met her at the airport on Friday afternoon and drove her to the hotel.They had dinner at a Chinese restaurant and went to see a film after that. Jenny and her friends set out early on Saturday morning for a farm and stayed there until Sunday morning.During their stay, they went fishing and swimming in the small river on the farm.They played football in the field and enjoyed a big meal around a camp fire , singing and dancing till late into the night. Nobody could get up early on Sunday morning.So when they got back to New York City, it was about three o'clock in the afternoon.They drove right to the airport because Jenny didn't want to miss her plane back home.Jenny only stayed in New York for two nights but she had a great time with her friends.
[Question]: When did Jenny go back home?
[Options]:
A) On Saturday afternoon.
B) On Sunday

In [176]:
# Extract generated reasoning for evaluation
generated_cot = response.split("Step-by-step reasoning:")[-1].split("[END]")[0].strip()
reference_cot = ('''Choice C is the best answer because it accurately explains why John Winthrop put the glass tubes into the clock case, as described in the passage. The passage states that Winthrop placed the tubes in the case “for safekeeping,” indicating he believed it was a safe place to store them.''')

# Evaluate response
print("\nEVALUATION METRICS:")
metrics = evaluate_response(generated_cot, reference_cot)
for metric, value in metrics.items():
    print(f"{metric}: {value:.3f}")


EVALUATION METRICS:
BERTScore_F1: 0.496
ROUGE-1_F1: 0.283
ROUGE-2_F1: 0.041
ROUGE-L_F1: 0.222


In [177]:
cot_prompt = (
    """[Example 1]
[Context]: On the morning of November 18, 1755, an earthquake shock Boston. John Winthrop, a professor at Harvard College, felt the quake and awoke. "I rose", Winthrop wrote, "and lighted a candle, looked at my watch, and found it to be 15minutes after four." John Winthrop hurried downstairs to the grandfather clock. It had stopped three minutes before, at 4:12. Except for stopping the clock, the quake had only thrown a key from the mantel  to the floor. The clock had stopped because Winthrop had put some long glass tubes he was using for an experiment into the case for safekeeping. The quake had knocked the tubes over and blocked the pendulum .Winthrop, therefore, had the key on the floor. The quake had thrown _ forward in the direction of the quake' s motion by a shock coming from the northwest, perhaps in Canada.
[Question]: John Winthrop put some tubes into this clock case because  _  .
[Options]:
A) he wanted to do an experiment
B) he thought an earthquake was probably to happen soon
C) he thought it safe for them to be put there
D) he wanted to record the exam time of the earthquake
Step-by-step reasoning:
Choice C is the best answer because it accurately explains why John Winthrop put the glass tubes into the clock case, as described in the passage. The passage states that Winthrop placed the tubes in the case “for safekeeping,” indicating he believed it was a safe place to store them.

[Current Problem]
[Context]: Are you hard-working? Do you like to meet people? If your answer is "Yes", then we have a job for you as a waiter. Call AL Hotel at 556779! (2) SUMMER JOB Do you like to talk with people? Do you like to write stories? If you want to work for our magazine as a reporter, please call Karen at 558366. (3) HELP WANTED Do you like babies? Can you look after one baby for two days? If you are sure to take food care of it, call us at 766588. $80 or more. Today! Hurry! (4)CLEANER WANTED Can you make a large house clean and tidy? If you hope to get the job paid at $20 once, call us this evening 18:00-20:00.   Tel:633800
[Question]: The above job ads  are probably from a  _.
[Options]:
A) newspaper
B) story book
C) TV show
D) radio program
Step-by-step reasoning:
"""
)

# Diagnostic
inputs = tokenizer(cot_prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024)
print(f"Input length: {inputs['input_ids'].shape[1]} tokens")

# Generate response
print("Generating reasoning...\n")
response = generate_cot_response(cot_prompt)
print("RESPONSE:\n", response)

Input length: 571 tokens
Generating reasoning...

RESPONSE:
 [Example 1]
[Context]: On the morning of November 18, 1755, an earthquake shock Boston. John Winthrop, a professor at Harvard College, felt the quake and awoke. "I rose", Winthrop wrote, "and lighted a candle, looked at my watch, and found it to be 15minutes after four." John Winthrop hurried downstairs to the grandfather clock. It had stopped three minutes before, at 4:12. Except for stopping the clock, the quake had only thrown a key from the mantel  to the floor. The clock had stopped because Winthrop had put some long glass tubes he was using for an experiment into the case for safekeeping. The quake had knocked the tubes over and blocked the pendulum .Winthrop, therefore, had the key on the floor. The quake had thrown _ forward in the direction of the quake' s motion by a shock coming from the northwest, perhaps in Canada.
[Question]: John Winthrop put some tubes into this clock case because  _  .
[Options]:
A) he wanted

In [178]:
# Extract generated reasoning for evaluation
generated_cot = response.split("Step-by-step reasoning:")[-1].split("[END]")[0].strip()
reference_cot = ('''Choice A is the best answer because it correctly identifies the source of the job ads as a newspaper. The passage presents four job advertisements with brief descriptions, contact numbers, and specific details (e.g., pay, hours), formatted in a concise, text-based style typical of classified sections in newspapers.''')

# Evaluate response
print("\nEVALUATION METRICS:")
metrics = evaluate_response(generated_cot, reference_cot)
for metric, value in metrics.items():
    print(f"{metric}: {value:.3f}")


EVALUATION METRICS:
BERTScore_F1: 0.484
ROUGE-1_F1: 0.222
ROUGE-2_F1: 0.034
ROUGE-L_F1: 0.133


In [179]:
cot_prompt = (
    """[Example 1]
[Context]: On the morning of November 18, 1755, an earthquake shock Boston. John Winthrop, a professor at Harvard College, felt the quake and awoke. "I rose", Winthrop wrote, "and lighted a candle, looked at my watch, and found it to be 15minutes after four." John Winthrop hurried downstairs to the grandfather clock. It had stopped three minutes before, at 4:12. Except for stopping the clock, the quake had only thrown a key from the mantel  to the floor. The clock had stopped because Winthrop had put some long glass tubes he was using for an experiment into the case for safekeeping. The quake had knocked the tubes over and blocked the pendulum .Winthrop, therefore, had the key on the floor. The quake had thrown _ forward in the direction of the quake' s motion by a shock coming from the northwest, perhaps in Canada.
[Question]: John Winthrop put some tubes into this clock case because  _  .
[Options]:
A) he wanted to do an experiment
B) he thought an earthquake was probably to happen soon
C) he thought it safe for them to be put there
D) he wanted to record the exam time of the earthquake
Step-by-step reasoning:
Choice C is the best answer because it accurately explains why John Winthrop put the glass tubes into the clock case, as described in the passage. The passage states that Winthrop placed the tubes in the case “for safekeeping,” indicating he believed it was a safe place to store them.

[Current Problem]
[Context]: My father, at the death of his father, was six years old, and he grew up without education. He moved from Kentucky to Indiana when I was seven. We reached our new home about the time the state came into the Union. It was a wild area, with many bears and other wild animals still in the woods. I grew up there. There were some so-called schools, but what was required of a teacher never went beyond "reading, writing, and adding." If a stranger supposed to understand Latin happened to live for a time in the area, he was looked on as wizard  . There was simply nothing to excite a desire for education. Of course, when I grew up, I did not know much. Still, somehow, I could read, write, and add, but that was all. The advance I have now made is on this store of education, which I have picked up under the pressure of necessity.
[Question]: How did the writer look at his early education?
[Options]:
A) He believed he met the school requirements.
B) He thought he was well-educated.
C) He thought it was not satisfactory.
D) He believed he was poorly educated.
Step-by-step reasoning:
"""
)

# Diagnostic
inputs = tokenizer(cot_prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024)
print(f"Input length: {inputs['input_ids'].shape[1]} tokens")

# Generate response
print("Generating reasoning...\n")
response = generate_cot_response(cot_prompt)
print("RESPONSE:\n", response)

Input length: 602 tokens
Generating reasoning...

RESPONSE:
 [Example 1]
[Context]: On the morning of November 18, 1755, an earthquake shock Boston. John Winthrop, a professor at Harvard College, felt the quake and awoke. "I rose", Winthrop wrote, "and lighted a candle, looked at my watch, and found it to be 15minutes after four." John Winthrop hurried downstairs to the grandfather clock. It had stopped three minutes before, at 4:12. Except for stopping the clock, the quake had only thrown a key from the mantel  to the floor. The clock had stopped because Winthrop had put some long glass tubes he was using for an experiment into the case for safekeeping. The quake had knocked the tubes over and blocked the pendulum .Winthrop, therefore, had the key on the floor. The quake had thrown _ forward in the direction of the quake' s motion by a shock coming from the northwest, perhaps in Canada.
[Question]: John Winthrop put some tubes into this clock case because  _  .
[Options]:
A) he wanted

In [180]:
# Extract generated reasoning for evaluation
generated_cot = response.split("Step-by-step reasoning:")[-1].split("[END]")[0].strip()
reference_cot = ('''Choice C is the best answer because it accurately reflects how the writer viewed his early education. The passage describes the writer’s childhood in a “wild area” with limited schools where teachers only needed basic skills (“reading, writing, and adding”). The writer states, “Of course, when I grew up, I did not know much,” and notes that his education was minimal, only covering basic literacy and arithmetic, which he later built upon “under the pressure of necessity.” This indicates he found his early education unsatisfactory.''')

# Evaluate response
print("\nEVALUATION METRICS:")
metrics = evaluate_response(generated_cot, reference_cot)
for metric, value in metrics.items():
    print(f"{metric}: {value:.3f}")


EVALUATION METRICS:
BERTScore_F1: 0.492
ROUGE-1_F1: 0.231
ROUGE-2_F1: 0.028
ROUGE-L_F1: 0.136


In [181]:
cot_prompt = (
    """[Example 1]
[Context]: On the morning of November 18, 1755, an earthquake shock Boston. John Winthrop, a professor at Harvard College, felt the quake and awoke. "I rose", Winthrop wrote, "and lighted a candle, looked at my watch, and found it to be 15minutes after four." John Winthrop hurried downstairs to the grandfather clock. It had stopped three minutes before, at 4:12. Except for stopping the clock, the quake had only thrown a key from the mantel  to the floor. The clock had stopped because Winthrop had put some long glass tubes he was using for an experiment into the case for safekeeping. The quake had knocked the tubes over and blocked the pendulum .Winthrop, therefore, had the key on the floor. The quake had thrown _ forward in the direction of the quake' s motion by a shock coming from the northwest, perhaps in Canada.
[Question]: John Winthrop put some tubes into this clock case because  _  .
[Options]:
A) he wanted to do an experiment
B) he thought an earthquake was probably to happen soon
C) he thought it safe for them to be put there
D) he wanted to record the exam time of the earthquake
Step-by-step reasoning:
Choice C is the best answer because it accurately explains why John Winthrop put the glass tubes into the clock case, as described in the passage. The passage states that Winthrop placed the tubes in the case “for safekeeping,” indicating he believed it was a safe place to store them.

[Current Problem]
[Context]: Christmas Day, the birthday of Jesus Christ, is the most important festival in prefix = st1 /Britainand some other countries. On Christmas Eve, people usually tell their children to put their stockings at the end of their beds before they go to sleep. Children believe Santa Claus, with the other name of Father Christmas, will come during the night and fill their stockings with Christmas presents. Actually , Father Christmas is children's father. He dresses up in a red coat and waits until children fall asleep. Then he goes into children's bedrooms, and puts small presents in their stockings. When children are no longer young, they know who Father Christmas really is.Not only children but also their parents enjoy Christmas stockings. They also have stockings. Early on the morning of Christmas Day, children wake their parents up and say"Merry Christmas". Then they help their parents open their stockings. Everybody likes presents. But it is better to give than to receive.
[Question]: What do all the British children do on Christmas Eve in the story?
[Options]:
A) They talk all the night.
B) They sing and dance.
C) They put their stockings at the end of their beds.
D) They won't sleep until Father Christmas comes.
Step-by-step reasoning:
"""
)

# Diagnostic
inputs = tokenizer(cot_prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024)
print(f"Input length: {inputs['input_ids'].shape[1]} tokens")

# Generate response
print("Generating reasoning...\n")
response = generate_cot_response(cot_prompt)
print("RESPONSE:\n", response)

Input length: 614 tokens
Generating reasoning...

RESPONSE:
 [Example 1]
[Context]: On the morning of November 18, 1755, an earthquake shock Boston. John Winthrop, a professor at Harvard College, felt the quake and awoke. "I rose", Winthrop wrote, "and lighted a candle, looked at my watch, and found it to be 15minutes after four." John Winthrop hurried downstairs to the grandfather clock. It had stopped three minutes before, at 4:12. Except for stopping the clock, the quake had only thrown a key from the mantel  to the floor. The clock had stopped because Winthrop had put some long glass tubes he was using for an experiment into the case for safekeeping. The quake had knocked the tubes over and blocked the pendulum .Winthrop, therefore, had the key on the floor. The quake had thrown _ forward in the direction of the quake' s motion by a shock coming from the northwest, perhaps in Canada.
[Question]: John Winthrop put some tubes into this clock case because  _  .
[Options]:
A) he wanted

In [182]:
# Extract generated reasoning for evaluation
generated_cot = response.split("Step-by-step reasoning:")[-1].split("[END]")[0].strip()
reference_cot = ('''Choice C is the best answer because it accurately describes what all British children do on Christmas Eve according to the passage. The text states, “On Christmas Eve, people usually tell their children to put their stockings at the end of their beds before they go to sleep,” indicating this is a common practice for children.''')

# Evaluate response
print("\nEVALUATION METRICS:")
metrics = evaluate_response(generated_cot, reference_cot)
for metric, value in metrics.items():
    print(f"{metric}: {value:.3f}")


EVALUATION METRICS:
BERTScore_F1: 0.456
ROUGE-1_F1: 0.163
ROUGE-2_F1: 0.011
ROUGE-L_F1: 0.109


In [183]:
cot_prompt = (
    """[Example 1]
[Context]: On the morning of November 18, 1755, an earthquake shock Boston. John Winthrop, a professor at Harvard College, felt the quake and awoke. "I rose", Winthrop wrote, "and lighted a candle, looked at my watch, and found it to be 15minutes after four." John Winthrop hurried downstairs to the grandfather clock. It had stopped three minutes before, at 4:12. Except for stopping the clock, the quake had only thrown a key from the mantel  to the floor. The clock had stopped because Winthrop had put some long glass tubes he was using for an experiment into the case for safekeeping. The quake had knocked the tubes over and blocked the pendulum .Winthrop, therefore, had the key on the floor. The quake had thrown _ forward in the direction of the quake' s motion by a shock coming from the northwest, perhaps in Canada.
[Question]: John Winthrop put some tubes into this clock case because  _  .
[Options]:
A) he wanted to do an experiment
B) he thought an earthquake was probably to happen soon
C) he thought it safe for them to be put there
D) he wanted to record the exam time of the earthquake
Step-by-step reasoning:
Choice C is the best answer because it accurately explains why John Winthrop put the glass tubes into the clock case, as described in the passage. The passage states that Winthrop placed the tubes in the case “for safekeeping,” indicating he believed it was a safe place to store them.

[Current Problem]
[Context]: Mars is the planet most like Earth.Flying to Mars is a difficult task for humans. There is an experiment with the name "Mars 500" in Russia.Six volunteers take part in the experiment.They come from China, Russia, France and Italy.They won't go to outer space.They will stay in small modules   on the ground for 520 days.It's just like a real journey In the modules, they will experience a lot: flying to Mars, flying back and working on Mars.During the long "journey", the volunteers can only eat space food.They can shower every 10 days.They can call their friends or families.Anyone of them can _ if he doesn't feel well.This experiment helps scientists know how well humans will be on a long journey to Mars. Wangyue, 26, from China, is the youngest of the volunteers.He is a teacher at the China Astronaut Research and Training Center   in Beijing.
[Question]: Which of the statement is Not True?
[Options]:
A) The volunteers are not going to move to outer space.
B) The volunteers will stay on the Mars for more than 17 months.
C) The volunteers can make video calls to their friends and families
D) The name of the experiment is "Mars 500".
Step-by-step reasoning:
"""
)

# Diagnostic
inputs = tokenizer(cot_prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024)
print(f"Input length: {inputs['input_ids'].shape[1]} tokens")

# Generate response
print("Generating reasoning...\n")
response = generate_cot_response(cot_prompt)
print("RESPONSE:\n", response)

Input length: 620 tokens
Generating reasoning...

RESPONSE:
 [Example 1]
[Context]: On the morning of November 18, 1755, an earthquake shock Boston. John Winthrop, a professor at Harvard College, felt the quake and awoke. "I rose", Winthrop wrote, "and lighted a candle, looked at my watch, and found it to be 15minutes after four." John Winthrop hurried downstairs to the grandfather clock. It had stopped three minutes before, at 4:12. Except for stopping the clock, the quake had only thrown a key from the mantel  to the floor. The clock had stopped because Winthrop had put some long glass tubes he was using for an experiment into the case for safekeeping. The quake had knocked the tubes over and blocked the pendulum .Winthrop, therefore, had the key on the floor. The quake had thrown _ forward in the direction of the quake' s motion by a shock coming from the northwest, perhaps in Canada.
[Question]: John Winthrop put some tubes into this clock case because  _  .
[Options]:
A) he wanted

In [184]:
# Extract generated reasoning for evaluation
generated_cot = response.split("Step-by-step reasoning:")[-1].split("[END]")[0].strip()
reference_cot = ('''Choice C is the correct answer because it identifies a false statement about the “Mars 500” experiment. The passage states that volunteers “can call” friends or families, but does not mention video calls. Thus, the claim in Choice C that volunteers can make video calls is not supported by the text.''')

# Evaluate response
print("\nEVALUATION METRICS:")
metrics = evaluate_response(generated_cot, reference_cot)
for metric, value in metrics.items():
    print(f"{metric}: {value:.3f}")


EVALUATION METRICS:
BERTScore_F1: 0.535
ROUGE-1_F1: 0.264
ROUGE-2_F1: 0.067
ROUGE-L_F1: 0.154


# Fill in the blank type Q's

In [185]:
cot_prompt = (
    """[Example 1]
[Context]: Mrs. Allen's husband died ten years ago when her son and daughter were still in high school. Mr. Allen had left some money, and since Mrs. Allen had managed a bookstore before she was married, she took the money and bought a shop in town. Later she moved the shop out to the shopping center. Laura Barnes, Mrs. Allen's friend and assistant, was also a widow. She had some free time and the need for a little extra-money, and so she took the job in the bookstore. She was too clever and friendly, and the two women were well-known in the neighbourhood as "Mrs. A"and "Mrs. B".
[Question]: How many people in all were there in the two women's families?   _  .
[Options]:
A) At least 8
B) At least 7
C) At least 6
D) At least 4
Step-by-step reasoning:
Choice D is the correct answer because it accurately reflects the minimum number of people in the two women’s families based on the passage. The passage mentions Mrs. Allen, her son, and her daughter (three people), and Laura Barnes, who is a widow but has no children or other family members explicitly noted (one person). Thus, the total is at least four people across both families, making Choice D, “At least 4,” the correct answer.

[Current Problem]
[Context]: A new law helps people with disabilities. The law says that people with disabilities must be able to get into and out of all public buildings. It also says that business must offer special services to people who have special needs. Companies can not refuse to hire disabled workers. Many businesses may have to change their buildings and services. --Ramps must be built so people can get into buildings. --Movie theatres must have space for people in wheelchairs and seats for their friends to sit near them. --Elevators must have floor number in  _ . This law will help millions of people. One woman who has been in a wheelchair for many years said, "It is like a dream."
[Question]: Ramps can help people _.
[Options]:
A) with hearing problems
B) who have difficulty in using their hands
C) who have difficulty in using their legs or feet
D) who don't like stairs
Step-by-step reasoning:
"""
)

# Diagnostic
inputs = tokenizer(cot_prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024)
print(f"Input length: {inputs['input_ids'].shape[1]} tokens")

# Generate response
print("Generating reasoning...\n")
response = generate_cot_response(cot_prompt)
print("RESPONSE:\n", response)

Input length: 508 tokens
Generating reasoning...

RESPONSE:
 [Example 1]
[Context]: Mrs. Allen's husband died ten years ago when her son and daughter were still in high school. Mr. Allen had left some money, and since Mrs. Allen had managed a bookstore before she was married, she took the money and bought a shop in town. Later she moved the shop out to the shopping center. Laura Barnes, Mrs. Allen's friend and assistant, was also a widow. She had some free time and the need for a little extra-money, and so she took the job in the bookstore. She was too clever and friendly, and the two women were well-known in the neighbourhood as "Mrs. A"and "Mrs. B".
[Question]: How many people in all were there in the two women's families?   _  .
[Options]:
A) At least 8
B) At least 7
C) At least 6
D) At least 4
Step-by-step reasoning:
Choice D is the correct answer because it accurately reflects the minimum number of people in the two women’s families based on the passage. The passage mentions Mrs. 

In [186]:
# Extract generated reasoning for evaluation
generated_cot = response.split("Step-by-step reasoning:")[-1].split("[END]")[0].strip()
reference_cot = ('''Choice C is the correct answer because it accurately identifies who ramps help, as described in the passage. The passage states, “Ramps must be built so people can get into buildings,” in the context of a law ensuring accessibility for people with disabilities, particularly those using wheelchairs. Ramps specifically aid individuals with difficulty using their legs or feet, such as wheelchair users, by providing an alternative to stairs, making Choice C the best fit.''')

# Evaluate response
print("\nEVALUATION METRICS:")
metrics = evaluate_response(generated_cot, reference_cot)
for metric, value in metrics.items():
    print(f"{metric}: {value:.3f}")


EVALUATION METRICS:
BERTScore_F1: 0.588
ROUGE-1_F1: 0.376
ROUGE-2_F1: 0.057
ROUGE-L_F1: 0.216


In [187]:
cot_prompt = (
    """[Example 1]
[Context]: Mrs. Allen's husband died ten years ago when her son and daughter were still in high school. Mr. Allen had left some money, and since Mrs. Allen had managed a bookstore before she was married, she took the money and bought a shop in town. Later she moved the shop out to the shopping center. Laura Barnes, Mrs. Allen's friend and assistant, was also a widow. She had some free time and the need for a little extra-money, and so she took the job in the bookstore. She was too clever and friendly, and the two women were well-known in the neighbourhood as "Mrs. A"and "Mrs. B".
[Question]: How many people in all were there in the two women's families?   _  .
[Options]:
A) At least 8
B) At least 7
C) At least 6
D) At least 4
Step-by-step reasoning:
Choice D is the correct answer because it accurately reflects the minimum number of people in the two women’s families based on the passage. The passage mentions Mrs. Allen, her son, and her daughter (three people), and Laura Barnes, who is a widow but has no children or other family members explicitly noted (one person). Thus, the total is at least four people across both families, making Choice D, “At least 4,” the correct answer.

[Current Problem]
[Context]: Even if  you are a good high-jumper, you can jump only about seven feet off the ground. You cannot jump any higher because the earth pulls you hard. The pull of the earth is called gravity. You can easily find out the pull of the earth. If you weigh yourself, you will know how much gravity is pulling you. Since there is gravity, water runs down hill. When you throw a ball into the air, it falls back down. Because of gravity, you do not fall off the earth as it whirls   around. Then, can we get away from the earth and go far out into space? Now you can do it, because spaceships have been invented. Then spaceship will go so fast that it can escape   the earth's gravity and carry you into space."
[Question]: In this passage, the word "gravity" means  _  .
[Options]:
A) the pull of everything.
B) the force of attraction  among objects.
C) the force which attracts objects towards the centre of the earth
D) the force which attracts the earth towards the sun.
Step-by-step reasoning:
"""
)

# Diagnostic
inputs = tokenizer(cot_prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024)
print(f"Input length: {inputs['input_ids'].shape[1]} tokens")

# Generate response
print("Generating reasoning...\n")
response = generate_cot_response(cot_prompt)
print("RESPONSE:\n", response)

Input length: 551 tokens
Generating reasoning...

RESPONSE:
 [Example 1]
[Context]: Mrs. Allen's husband died ten years ago when her son and daughter were still in high school. Mr. Allen had left some money, and since Mrs. Allen had managed a bookstore before she was married, she took the money and bought a shop in town. Later she moved the shop out to the shopping center. Laura Barnes, Mrs. Allen's friend and assistant, was also a widow. She had some free time and the need for a little extra-money, and so she took the job in the bookstore. She was too clever and friendly, and the two women were well-known in the neighbourhood as "Mrs. A"and "Mrs. B".
[Question]: How many people in all were there in the two women's families?   _  .
[Options]:
A) At least 8
B) At least 7
C) At least 6
D) At least 4
Step-by-step reasoning:
Choice D is the correct answer because it accurately reflects the minimum number of people in the two women’s families based on the passage. The passage mentions Mrs. 

In [188]:
# Extract generated reasoning for evaluation
generated_cot = response.split("Step-by-step reasoning:")[-1].split("[END]")[0].strip()
reference_cot = ('''Choice C is the correct answer because it accurately defines “gravity” as used in the passage. The passage explains gravity as the force that pulls objects toward the Earth, stating, “The pull of the earth is called gravity,” and illustrates this with examples like water running downhill, a ball falling back to the ground, and people not falling off the Earth. Choice C, “the force which attracts objects towards the centre of the earth,” precisely matches this description, capturing the Earth-centered attractive force described throughout the text.''')

# Evaluate response
print("\nEVALUATION METRICS:")
metrics = evaluate_response(generated_cot, reference_cot)
for metric, value in metrics.items():
    print(f"{metric}: {value:.3f}")


EVALUATION METRICS:
BERTScore_F1: 0.554
ROUGE-1_F1: 0.398
ROUGE-2_F1: 0.111
ROUGE-L_F1: 0.209


In [189]:
cot_prompt = (
    """[Example 1]
[Context]: Mrs. Allen's husband died ten years ago when her son and daughter were still in high school. Mr. Allen had left some money, and since Mrs. Allen had managed a bookstore before she was married, she took the money and bought a shop in town. Later she moved the shop out to the shopping center. Laura Barnes, Mrs. Allen's friend and assistant, was also a widow. She had some free time and the need for a little extra-money, and so she took the job in the bookstore. She was too clever and friendly, and the two women were well-known in the neighbourhood as "Mrs. A"and "Mrs. B".
[Question]: How many people in all were there in the two women's families?   _  .
[Options]:
A) At least 8
B) At least 7
C) At least 6
D) At least 4
Step-by-step reasoning:
Choice D is the correct answer because it accurately reflects the minimum number of people in the two women’s families based on the passage. The passage mentions Mrs. Allen, her son, and her daughter (three people), and Laura Barnes, who is a widow but has no children or other family members explicitly noted (one person). Thus, the total is at least four people across both families, making Choice D, “At least 4,” the correct answer.

[Current Problem]
[Context]: One day Mr. Brown sees a young woman in the street with children. He is very surprised because all the children are wearing the same clothes. White caps, blue coats and yellow trousers. "Are all these children yours?" he asks the woman. "Yes, they are." she answers. "Do you always dress them in the same clothes ?" asks Mr. Brown."Yes," answers the mother. "When we have four children, we dress them in the same clothes because we don't want to lose any of them. It is easy to see our children among other children because they are all wearing the same clothes. And now we have ten, we dress them like this because we don't want to take other children home by mistake. When there are other children among ours, it is easy to see them because their clothes are different.
[Question]: How many people does Mr. Brown see in the street one day? He sees   _  in all.
[Options]:
A) twelve
B) eleven
C) four
D) ten children
Step-by-step reasoning:
"""
)

# Diagnostic
inputs = tokenizer(cot_prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024)
print(f"Input length: {inputs['input_ids'].shape[1]} tokens")

# Generate response
print("Generating reasoning...\n")
response = generate_cot_response(cot_prompt)
print("RESPONSE:\n", response)

Input length: 536 tokens
Generating reasoning...

RESPONSE:
 [Example 1]
[Context]: Mrs. Allen's husband died ten years ago when her son and daughter were still in high school. Mr. Allen had left some money, and since Mrs. Allen had managed a bookstore before she was married, she took the money and bought a shop in town. Later she moved the shop out to the shopping center. Laura Barnes, Mrs. Allen's friend and assistant, was also a widow. She had some free time and the need for a little extra-money, and so she took the job in the bookstore. She was too clever and friendly, and the two women were well-known in the neighbourhood as "Mrs. A"and "Mrs. B".
[Question]: How many people in all were there in the two women's families?   _  .
[Options]:
A) At least 8
B) At least 7
C) At least 6
D) At least 4
Step-by-step reasoning:
Choice D is the correct answer because it accurately reflects the minimum number of people in the two women’s families based on the passage. The passage mentions Mrs. 

In [190]:
# Extract generated reasoning for evaluation
generated_cot = response.split("Step-by-step reasoning:")[-1].split("[END]")[0].strip()
reference_cot = ('''Choice D is the correct answer because it accurately states the number of people Mr. Brown sees in the street, as described in the passage. The passage mentions Mr. Brown seeing a young woman with children, and the woman later says, “And now we have ten,” referring to her ten children. Thus, Mr. Brown sees the woman plus her ten children, totaling eleven people, but the question’s phrasing and options suggest it seeks the number of children, making “ten children” (Choice D) the correct fit based on the mother’s statement.''')

# Evaluate response
print("\nEVALUATION METRICS:")
metrics = evaluate_response(generated_cot, reference_cot)
for metric, value in metrics.items():
    print(f"{metric}: {value:.3f}")


EVALUATION METRICS:
BERTScore_F1: 0.504
ROUGE-1_F1: 0.276
ROUGE-2_F1: 0.040
ROUGE-L_F1: 0.211


In [191]:
cot_prompt = (
    """[Example 1]
[Context]: A new law helps people with disabilities. The law says that people with disabilities must be able to get into and out of all public buildings. It also says that business must offer special services to people who have special needs. Companies can not refuse to hire disabled workers. Many businesses may have to change their buildings and services. --Ramps must be built so people can get into buildings. --Movie theatres must have space for people in wheelchairs and seats for their friends to sit near them. --Elevators must have floor number in  _ . This law will help millions of people. One woman who has been in a wheelchair for many years said, "It is like a dream."
[Question]: Ramps can help people_.
[Options]:
A) with hearing problems
B) who have difficulty in using their hands
C) who have difficulty in using their legs or feet
D) who don't like stairs
Step-by-step reasoning:
Choice C is the correct answer because it accurately identifies who ramps help, as described in the passage. The passage states, “Ramps must be built so people can get into buildings,” in the context of a law ensuring accessibility for people with disabilities, particularly those using wheelchairs. Ramps specifically aid individuals with difficulty using their legs or feet, such as wheelchair users, by providing an alternative to stairs, making Choice C the best fit.

[Current Problem]
[Context]: Mrs. Allen's husband died ten years ago when her son and daughter were still in high school. Mr. Allen had left some money, and since Mrs. Allen had managed a bookstore before she was married, she took the money and bought a shop in town. Later she moved the shop out to the shopping center. She was a pleasant woman; she worked hard and was well-informed about books, and so the bookstore prospered and she was able to hire a friend to help her. Laura Barnes, Mrs. Allen's friend and assistant, was also a widow. She had some free time and the need for a little extra-money, and so she took the job in the bookstore. She was too clever and friendly, and the two women were well-known in the neighbourhood as "Mrs. A"and "Mrs. B".
[Question]: The bookstore prospered   _  .
[Options]:
A) after Mrs. Allen lost control of it
B) before Mrs. A got to know Mrs. B
C) because the two women had little housework to do
D) because Mrs. A became expert at management and threw herself into it
Step-by-step reasoning:
"""
)

# Diagnostic
inputs = tokenizer(cot_prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024)
print(f"Input length: {inputs['input_ids'].shape[1]} tokens")

# Generate response
print("Generating reasoning...\n")
response = generate_cot_response(cot_prompt)
print("RESPONSE:\n", response)

Input length: 558 tokens
Generating reasoning...

RESPONSE:
 [Example 1]
[Context]: A new law helps people with disabilities. The law says that people with disabilities must be able to get into and out of all public buildings. It also says that business must offer special services to people who have special needs. Companies can not refuse to hire disabled workers. Many businesses may have to change their buildings and services. --Ramps must be built so people can get into buildings. --Movie theatres must have space for people in wheelchairs and seats for their friends to sit near them. --Elevators must have floor number in  _ . This law will help millions of people. One woman who has been in a wheelchair for many years said, "It is like a dream."
[Question]: Ramps can help people_.
[Options]:
A) with hearing problems
B) who have difficulty in using their hands
C) who have difficulty in using their legs or feet
D) who don't like stairs
Step-by-step reasoning:
Choice C is the correct ans

In [192]:
# Extract generated reasoning for evaluation
generated_cot = response.split("Step-by-step reasoning:")[-1].split("[END]")[0].strip()
reference_cot = ('''Choice D is the correct answer because it accurately explains why the bookstore prospered, as described in the passage. The passage states that Mrs. Allen was a pleasant, hard-working, and well-informed woman who managed the bookstore, implying her expertise and dedication were key to its success. Choice D, “because Mrs. A became expert at management and threw herself into it,” aligns with these qualities, highlighting her management skills and commitment as the reasons for the bookstore’s prosperity.''')

# Evaluate response
print("\nEVALUATION METRICS:")
metrics = evaluate_response(generated_cot, reference_cot)
for metric, value in metrics.items():
    print(f"{metric}: {value:.3f}")


EVALUATION METRICS:
BERTScore_F1: 0.619
ROUGE-1_F1: 0.362
ROUGE-2_F1: 0.144
ROUGE-L_F1: 0.257


In [193]:
cot_prompt = (
    """[Example 1]
[Context]: A new law helps people with disabilities. The law says that people with disabilities must be able to get into and out of all public buildings. It also says that business must offer special services to people who have special needs. Companies can not refuse to hire disabled workers. Many businesses may have to change their buildings and services. --Ramps must be built so people can get into buildings. --Movie theatres must have space for people in wheelchairs and seats for their friends to sit near them. --Elevators must have floor number in  _ . This law will help millions of people. One woman who has been in a wheelchair for many years said, "It is like a dream."
[Question]: Ramps can help people_.
[Options]:
A) with hearing problems
B) who have difficulty in using their hands
C) who have difficulty in using their legs or feet
D) who don't like stairs
Step-by-step reasoning:
Choice C is the correct answer because it accurately identifies who ramps help, as described in the passage. The passage states, “Ramps must be built so people can get into buildings,” in the context of a law ensuring accessibility for people with disabilities, particularly those using wheelchairs. Ramps specifically aid individuals with difficulty using their legs or feet, such as wheelchair users, by providing an alternative to stairs, making Choice C the best fit.

[Current Problem]
[Context]: Jenny went to visit her friends in New York last weekend.Her friends met her at the airport on Friday afternoon and drove her to the hotel.They had dinner at a Chinese restaurant and went to see a film after that. Jenny and her friends set out early on Saturday morning for a farm and stayed there until Sunday morning.During their stay, they went fishing and swimming in the small river on the farm.They played football in the field and enjoyed a big meal around a camp fire , singing and dancing till late into the night. Nobody could get up early on Sunday morning.So when they got back to New York City, it was about three o'clock in the afternoon.They drove right to the airport because Jenny didn't want to miss her plane back home.Jenny only stayed in New York for two nights but she had a great time with her friends.
[Question]: The bookstore prospered   _  .
[Options]:
A) Jenny went to New York  _  .
B) to do some shopping
C) to see her friends
D) to spend her summer holiday
Step-by-step reasoning:
"""
)

# Diagnostic
inputs = tokenizer(cot_prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024)
print(f"Input length: {inputs['input_ids'].shape[1]} tokens")

# Generate response
print("Generating reasoning...\n")
response = generate_cot_response(cot_prompt)
print("RESPONSE:\n", response)

Input length: 543 tokens
Generating reasoning...

RESPONSE:
 [Example 1]
[Context]: A new law helps people with disabilities. The law says that people with disabilities must be able to get into and out of all public buildings. It also says that business must offer special services to people who have special needs. Companies can not refuse to hire disabled workers. Many businesses may have to change their buildings and services. --Ramps must be built so people can get into buildings. --Movie theatres must have space for people in wheelchairs and seats for their friends to sit near them. --Elevators must have floor number in  _ . This law will help millions of people. One woman who has been in a wheelchair for many years said, "It is like a dream."
[Question]: Ramps can help people_.
[Options]:
A) with hearing problems
B) who have difficulty in using their hands
C) who have difficulty in using their legs or feet
D) who don't like stairs
Step-by-step reasoning:
Choice C is the correct ans

In [194]:
# Extract generated reasoning for evaluation
generated_cot = response.split("Step-by-step reasoning:")[-1].split("[END]")[0].strip()
reference_cot = ('''Choice B is the correct answer because it accurately states Jenny’s reason for going to New York, as described in the passage. The passage begins by saying, “Jenny went to visit her friends in New York last weekend,” clearly indicating that her purpose was to see her friends. Choice B, “to see her friends,” directly aligns with this statement, reflecting her primary motivation for the trip.''')

# Evaluate response
print("\nEVALUATION METRICS:")
metrics = evaluate_response(generated_cot, reference_cot)
for metric, value in metrics.items():
    print(f"{metric}: {value:.3f}")


EVALUATION METRICS:
BERTScore_F1: 0.439
ROUGE-1_F1: 0.187
ROUGE-2_F1: 0.032
ROUGE-L_F1: 0.104


In [195]:
cot_prompt = (
    """[Example 1]
[Context]: A new law helps people with disabilities. The law says that people with disabilities must be able to get into and out of all public buildings. It also says that business must offer special services to people who have special needs. Companies can not refuse to hire disabled workers. Many businesses may have to change their buildings and services. --Ramps must be built so people can get into buildings. --Movie theatres must have space for people in wheelchairs and seats for their friends to sit near them. --Elevators must have floor number in  _ . This law will help millions of people. One woman who has been in a wheelchair for many years said, "It is like a dream."
[Question]: Ramps can help people_.
[Options]:
A) with hearing problems
B) who have difficulty in using their hands
C) who have difficulty in using their legs or feet
D) who don't like stairs
Step-by-step reasoning:
Choice C is the correct answer because it accurately identifies who ramps help, as described in the passage. The passage states, “Ramps must be built so people can get into buildings,” in the context of a law ensuring accessibility for people with disabilities, particularly those using wheelchairs. Ramps specifically aid individuals with difficulty using their legs or feet, such as wheelchair users, by providing an alternative to stairs, making Choice C the best fit.

[Current Problem]
[Context]: A young man went to a town and worked there. He did not have a wife and a servant did the work in his house. The young man liked laughing a lot. He nailed the servant's shoes to the floor on Monday, and then laughed, because he put his feet in them and fell down. The servant was not angry, but smiled. Then the young man put brushes in his bed on Tuesday. The servant got into bed and hit the brushes with his feet. He was afraid. The young man laughed loudly again. Again the servant was not angry, but smiled. Then on Wednesday the young man said to his servant, "You're a nice, kind man. I am not going to be unkind to you again." The servant smiled and said, "And I'm not going to put any more mud from the street in your coffee."
[Question]: TThe young man went to a town   _  .
[Options]:
A) to study
B) to work
C) to see his relative
D) to spend his holiday
Step-by-step reasoning:
"""
)

# Diagnostic
inputs = tokenizer(cot_prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024)
print(f"Input length: {inputs['input_ids'].shape[1]} tokens")

# Generate response
print("Generating reasoning...\n")
response = generate_cot_response(cot_prompt)
print("RESPONSE:\n", response)

Input length: 534 tokens
Generating reasoning...

RESPONSE:
 [Example 1]
[Context]: A new law helps people with disabilities. The law says that people with disabilities must be able to get into and out of all public buildings. It also says that business must offer special services to people who have special needs. Companies can not refuse to hire disabled workers. Many businesses may have to change their buildings and services. --Ramps must be built so people can get into buildings. --Movie theatres must have space for people in wheelchairs and seats for their friends to sit near them. --Elevators must have floor number in  _ . This law will help millions of people. One woman who has been in a wheelchair for many years said, "It is like a dream."
[Question]: Ramps can help people_.
[Options]:
A) with hearing problems
B) who have difficulty in using their hands
C) who have difficulty in using their legs or feet
D) who don't like stairs
Step-by-step reasoning:
Choice C is the correct ans

In [196]:
# Extract generated reasoning for evaluation
generated_cot = response.split("Step-by-step reasoning:")[-1].split("[END]")[0].strip()
reference_cot = ('''Choice B is the correct answer because it accurately identifies the young man’s reason for going to the town, as stated in the passage. The passage begins, “A young man went to a town and worked there,” directly indicating that his purpose was to work. Choice B, “to work,” aligns with this explicit statement, reflecting his primary motivation for moving to the town.''')

# Evaluate response
print("\nEVALUATION METRICS:")
metrics = evaluate_response(generated_cot, reference_cot)
for metric, value in metrics.items():
    print(f"{metric}: {value:.3f}")


EVALUATION METRICS:
BERTScore_F1: 0.468
ROUGE-1_F1: 0.249
ROUGE-2_F1: 0.010
ROUGE-L_F1: 0.135


In [197]:
cot_prompt = (
    """[Example 1]
[Context]: A new law helps people with disabilities. The law says that people with disabilities must be able to get into and out of all public buildings. It also says that business must offer special services to people who have special needs. Companies can not refuse to hire disabled workers. Many businesses may have to change their buildings and services. --Ramps must be built so people can get into buildings. --Movie theatres must have space for people in wheelchairs and seats for their friends to sit near them. --Elevators must have floor number in  _ . This law will help millions of people. One woman who has been in a wheelchair for many years said, "It is like a dream."
[Question]: Ramps can help people_.
[Options]:
A) with hearing problems
B) who have difficulty in using their hands
C) who have difficulty in using their legs or feet
D) who don't like stairs
Step-by-step reasoning:
Choice C is the correct answer because it accurately identifies who ramps help, as described in the passage. The passage states, “Ramps must be built so people can get into buildings,” in the context of a law ensuring accessibility for people with disabilities, particularly those using wheelchairs. Ramps specifically aid individuals with difficulty using their legs or feet, such as wheelchair users, by providing an alternative to stairs, making Choice C the best fit.

[Current Problem]
[Context]: Christmas Day, the birthday of Jesus Christ, is the most important festival in prefix = st1 /Britainand some other countries. On Christmas Eve, people usually tell their children to put their stockings at the end of their beds before they go to sleep. Children believe Santa Claus, with the other name of Father Christmas, will come during the night and fill their stockings with Christmas presents. Actually , Father Christmas is children's father. He dresses up in a red coat and waits until children fall asleep. Then he goes into children's bedrooms, and puts small presents in their stockings. When children are no longer young, they know who Father Christmas really is. Not only children but also their parents enjoy Christmas stockings. They also have stockings. Early on the morning of Christmas Day, children wake their parents up and say"Merry Christmas". Then they help their parents open their stockings. Everybody likes presents. But it is better to give than to receive.
[Question]: Christmas Day is  _  .
[Options]:
A) the birthday of Jesus Christ
B) the only day for giving presents
C) the only day for receiving presents
D) the day for playing games
Step-by-step reasoning:
"""
)

# Diagnostic
inputs = tokenizer(cot_prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024)
print(f"Input length: {inputs['input_ids'].shape[1]} tokens")

# Generate response
print("Generating reasoning...\n")
response = generate_cot_response(cot_prompt)
print("RESPONSE:\n", response)

Input length: 561 tokens
Generating reasoning...

RESPONSE:
 [Example 1]
[Context]: A new law helps people with disabilities. The law says that people with disabilities must be able to get into and out of all public buildings. It also says that business must offer special services to people who have special needs. Companies can not refuse to hire disabled workers. Many businesses may have to change their buildings and services. --Ramps must be built so people can get into buildings. --Movie theatres must have space for people in wheelchairs and seats for their friends to sit near them. --Elevators must have floor number in  _ . This law will help millions of people. One woman who has been in a wheelchair for many years said, "It is like a dream."
[Question]: Ramps can help people_.
[Options]:
A) with hearing problems
B) who have difficulty in using their hands
C) who have difficulty in using their legs or feet
D) who don't like stairs
Step-by-step reasoning:
Choice C is the correct ans

In [198]:
# Extract generated reasoning for evaluation
generated_cot = response.split("Step-by-step reasoning:")[-1].split("[END]")[0].strip()
reference_cot = ('''Choice B is the correct answer because it accurately identifies the young man’s reason for going to the town, as stated in the passage. The passage begins, “A young man went to a town and worked there,” directly indicating that his purpose was to work. Choice B, “to work,” aligns with this explicit statement, reflecting his primary motivation for moving to the town.''')

# Evaluate response
print("\nEVALUATION METRICS:")
metrics = evaluate_response(generated_cot, reference_cot)
for metric, value in metrics.items():
    print(f"{metric}: {value:.3f}")


EVALUATION METRICS:
BERTScore_F1: 0.458
ROUGE-1_F1: 0.136
ROUGE-2_F1: 0.000
ROUGE-L_F1: 0.119


In [199]:
cot_prompt = (
    """[Example 1]
[Context]: A new law helps people with disabilities. The law says that people with disabilities must be able to get into and out of all public buildings. It also says that business must offer special services to people who have special needs. Companies can not refuse to hire disabled workers. Many businesses may have to change their buildings and services. --Ramps must be built so people can get into buildings. --Movie theatres must have space for people in wheelchairs and seats for their friends to sit near them. --Elevators must have floor number in  _ . This law will help millions of people. One woman who has been in a wheelchair for many years said, "It is like a dream."
[Question]: Ramps can help people_.
[Options]:
A) with hearing problems
B) who have difficulty in using their hands
C) who have difficulty in using their legs or feet
D) who don't like stairs
Step-by-step reasoning:
Choice C is the correct answer because it accurately identifies who ramps help, as described in the passage. The passage states, “Ramps must be built so people can get into buildings,” in the context of a law ensuring accessibility for people with disabilities, particularly those using wheelchairs. Ramps specifically aid individuals with difficulty using their legs or feet, such as wheelchair users, by providing an alternative to stairs, making Choice C the best fit.

[Current Problem]
[Context]: One day a student went to see his teacher. He had been given an important position and now was coming to say goodbye to his teacher. The old man asked him how he would live among high officials. The student answered, " I will be all right. I have prepared a hundred high hats, one for each official I will meet. I am sure I will succeed." The teacher got angry on hearing this.  "What?" he cried. "Is this what ten years of my teaching has made of you? Nothing but flatterer ?" "Excuse me, honored  master," the student  rose to his feet and apologized hurriedly. "But you have always been absorbed in  your studies and don't know how vulgar  the world has come to be. There are few men in the world who are above flatterers like you." "There is something true in what you said," the teacher nodded, smiling with one of the student's "high hats" on.
[Question]: The teacher  _  .
[Options]:
A) liked to be flattered as well
B) really knew nothing about the vulgar world
C) was in need of a high hat
D) was satisfied with the new hat
Step-by-step reasoning:
"""
)

# Diagnostic
inputs = tokenizer(cot_prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024)
print(f"Input length: {inputs['input_ids'].shape[1]} tokens")

# Generate response
print("Generating reasoning...\n")
response = generate_cot_response(cot_prompt)
print("RESPONSE:\n", response)

Input length: 567 tokens
Generating reasoning...

RESPONSE:
 [Example 1]
[Context]: A new law helps people with disabilities. The law says that people with disabilities must be able to get into and out of all public buildings. It also says that business must offer special services to people who have special needs. Companies can not refuse to hire disabled workers. Many businesses may have to change their buildings and services. --Ramps must be built so people can get into buildings. --Movie theatres must have space for people in wheelchairs and seats for their friends to sit near them. --Elevators must have floor number in  _ . This law will help millions of people. One woman who has been in a wheelchair for many years said, "It is like a dream."
[Question]: Ramps can help people_.
[Options]:
A) with hearing problems
B) who have difficulty in using their hands
C) who have difficulty in using their legs or feet
D) who don't like stairs
Step-by-step reasoning:
Choice C is the correct ans

In [200]:
# Extract generated reasoning for evaluation
generated_cot = response.split("Step-by-step reasoning:")[-1].split("[END]")[0].strip()
reference_cot = ('''Choice A is the correct answer because it accurately reflects the teacher’s reaction, as described in the passage. The passage shows the teacher initially angry at the student’s plan to flatter officials with “high hats,” but when the student flatters him by suggesting he is above flattery, the teacher smiles and nods, “with one of the student’s ‘high hats’ on,” indicating he enjoys the flattery. Choice A, “liked to be flattered as well,” aligns with this, showing the teacher’s acceptance of the student’s flattery despite his initial objection.''')

# Evaluate response
print("\nEVALUATION METRICS:")
metrics = evaluate_response(generated_cot, reference_cot)
for metric, value in metrics.items():
    print(f"{metric}: {value:.3f}")


EVALUATION METRICS:
BERTScore_F1: 0.516
ROUGE-1_F1: 0.260
ROUGE-2_F1: 0.083
ROUGE-L_F1: 0.211
