In [2]:
import pandas as pd
import json
import textstat
from IPython.display import display

## 1. Data Preparations

In [12]:
# open file
test_7_m1 = pd.read_json('new-tests/test_7_m1.json')
test_7_m2 = pd.read_json('new-tests/test_7_m2_hard.json')
test_8 = pd.read_json('new-tests/test_8_hard.json')
test_9 = pd.read_json('new-tests/test_9_hard.json')
test_10 = pd.read_json('new-tests/test_10_hard.json')

# concatenate all the dataframes
df = pd.concat([test_7_m1, test_7_m2, test_8, test_9, test_10], ignore_index=True)

# rename columns to match the rest of the code
df = df.rename(columns={'Stimulus': 'stimulus', 'Stem': 'stem', 'correct_answers': 'correct_answer'})
# convert every \n to a space in columns 'stimulus' and 'stem'
df['stimulus'] = df['stimulus'].str.replace('\n', ' ')
df['stem'] = df['stem'].str.replace('\n', ' ')

# also convert strings in each of the answer options
df['answerOptions'] = df['answerOptions'].apply(lambda x: [opt.replace('\n', ' ') for opt in x])

# print first 5 records
display(df.head())

Unnamed: 0,stimulus,stem,answerOptions,correct_answer
0,Botanist Al Kovaleski has pointed out that map...,Which choice completes the text with the most ...,"[relocate from, refer to, originate from, adap...",D
1,The following text is from John Muir’s 1913 au...,"As used in the text, what does the word ""clear...","[Simple, Understandable, Obvious, Transparent]",D
2,The recently observed gamma ray burst GRB 2303...,Which choice completes the text with the most ...,"[a coincidence, areprieve, an incident, an odd...",D
3,"In 1776, the United States sent Benjamin Frank...",Which choice completes the text with the most ...,"[thoughtfulness, esteem., controversy, sincerity]",B
4,"In the 1950s, scientists didn’t know much abou...",Which choice best describes the function of th...,[It identifies a scientific belief that Tharp ...,A


## 2. Measuring Text Complexity
### Readability

In [13]:
# Calculate the Flesch Reading Ease score
df["flesch_reading_ease"] = df["stimulus"].apply(textstat.flesch_reading_ease)

# Estimated grade level required to understand the text
df["grade_level"] = df["stimulus"].apply(lambda x: textstat.text_standard(x, float_output=True))

# Calculate the McAlpine EFLAW readability score, which is a measure of the readability of a text for non-native English speakers.
# It is recommended to aim for a score equal to or lower than 25.
df["mcalpine_efl"] = df["stimulus"].apply(textstat.mcalpine_eflaw)

# Calculate the reading time in seconds. This is based on the average reading speed of 238 WPM.
"""
Marc Brysbaert,
How many words do we read per minute? A review and meta-analysis of reading rate,
Journal of Memory and Language,
Volume 109,
2019,
104047,
ISSN 0749-596X,
https://doi.org/10.1016/j.jml.2019.104047.
"""
df["reading_time_passage_only"] = df["stimulus"].apply(lambda x: textstat.reading_time(x, ms_per_char=52.5210084))

# Calculate the reading time of the whole question, including the stem and answer options
df["reading_time_whole_question"] = df.apply(lambda x: textstat.reading_time(x["stimulus"] + x["stem"] + ' '.join(x["answerOptions"]), ms_per_char=52.5210084), axis=1)

# print first 5 records
display(df.head())

Unnamed: 0,stimulus,stem,answerOptions,correct_answer,flesch_reading_ease,grade_level,mcalpine_efl,reading_time_passage_only,reading_time_whole_question
0,Botanist Al Kovaleski has pointed out that map...,Which choice completes the text with the most ...,"[relocate from, refer to, originate from, adap...",D,54.56,12.0,34.0,13.08,18.7
1,The following text is from John Muir’s 1913 au...,"As used in the text, what does the word ""clear...","[Simple, Understandable, Obvious, Transparent]",D,60.65,10.0,27.3,14.13,18.91
2,The recently observed gamma ray burst GRB 2303...,Which choice completes the text with the most ...,"[a coincidence, areprieve, an incident, an odd...",D,55.24,10.0,21.0,8.67,14.29
3,"In 1776, the United States sent Benjamin Frank...",Which choice completes the text with the most ...,"[thoughtfulness, esteem., controversy, sincerity]",B,81.63,8.0,20.7,11.76,17.49
4,"In the 1950s, scientists didn’t know much abou...",Which choice best describes the function of th...,[It identifies a scientific belief that Tharp ...,A,81.12,6.0,19.5,16.65,31.72


## 3. Cognitive Load of the Questions
### Information Retrieval
Count how many pieces of information a student must retrieve from the text to answer the question.

In [8]:
import spacy
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import numpy as np
from sentence_transformers import SentenceTransformer   
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
nlp = spacy.load("en_core_web_sm")
spacy.prefer_gpu()
model = SentenceTransformer('all-mpnet-base-v2')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\NITRO\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:

def calculate_reasoning_steps_sat(question, text, answer_choices, correct_answer_index, sim_threshold=0.1):
    """
    Estimates the number of reasoning steps required to answer an SAT-style multiple-choice question.
    
    - Identifies key sentences supporting the correct answer.
    - Builds a reasoning chain to measure how much reasoning is needed.
    - Compares distractors to measure overall question difficulty.

    Args:
        question (str): The question text.
        text (str): The passage containing relevant information.
        answer_choices (list of str): A list of possible answer choices.
        correct_answer_index (int): Index of the correct answer in answer_choices.
        sim_threshold (float): Similarity threshold for considering a sentence relevant.

    Returns:
        dict: {
            "reasoning_steps": int,  # Steps needed to justify the correct answer
            "key_sentences": list,   # Sentences used for reasoning
            "distractor_complexity": float,  # How misleading the wrong answers are
        }
    """

    # 1. Sentence Tokenization
    sentences = sent_tokenize(text)

    # 2. Named Entity Recognition (NER) to Identify Key Sentences
    key_sentences = []
    for sent in sentences:
        doc = nlp(sent)
        if any(ent.label_ in ["PERSON", "ORG", "GPE", "DATE"] for ent in doc.ents):  
            key_sentences.append(sent)

    # 3. Compute Sentence Embeddings
    question_embedding = model.encode(question)
    sentence_embeddings = model.encode(sentences)
    answer_embeddings = model.encode(answer_choices)
    correct_answer_embedding = answer_embeddings[correct_answer_index]

    # 3.1. Identify the Most Relevant Starting Sentence
    # Find the sentence in the passage that is most similar to the question
    question_sim_scores = cosine_similarity([question_embedding], sentence_embeddings)[0]
    most_relevant_starting_index = np.argmax(question_sim_scores)  # Start from the most relevant sentence

    # 4. Build Similarity Graph
    sim_scores = cosine_similarity(sentence_embeddings)
    G = nx.Graph()
    for i in range(len(sentences)):
        for j in range(i+1, len(sentences)):
            if sim_scores[i][j] > sim_threshold:
                G.add_edge(i, j, weight=sim_scores[i][j])

    # 4.1 Ensure the Starting Sentence is in the Graph
    if most_relevant_starting_index not in G:
        # Find the closest related sentence and manually link it
        closest_related_index = np.argmax(sim_scores[most_relevant_starting_index])
        G.add_edge(most_relevant_starting_index, closest_related_index, weight=sim_scores[most_relevant_starting_index][closest_related_index])

    # 5. Find the Shortest Reasoning Chain for the Correct Answer
    relevant_nodes = []
    for i, sent in enumerate(sentences):
        if cosine_similarity([correct_answer_embedding], [sentence_embeddings[i]])[0][0] > sim_threshold and i in G.nodes():
            relevant_nodes.append(i)

    reasoning_steps = []
    for node in relevant_nodes:
        try:
            path_length = nx.shortest_path_length(G, source=most_relevant_starting_index, target=node)
            reasoning_steps.append(path_length)
        except nx.NetworkXNoPath:
            pass

    # 6. Compute Distractor Complexity (How Misleading the Wrong Answers Are)
    distractor_complexity = []
    for i, ans_embedding in enumerate(answer_embeddings):
        if i == correct_answer_index:
            continue  # Ignore the correct answer
        ans_sim_scores = cosine_similarity([ans_embedding], sentence_embeddings)[0]
        avg_distractor_similarity = np.mean([score for score in ans_sim_scores if score > 0.2])
        distractor_complexity.append(avg_distractor_similarity)

    # Final Estimation
    result = {
        "reasoning_steps": max(reasoning_steps) if reasoning_steps else 0,
        "key_sentences": key_sentences,
        "distractor_complexity": np.mean(distractor_complexity) if distractor_complexity else 0.0,
    }
    return result

In [15]:
# Example usage
choices = ["A", "B", "C", "D"]
def row_operation(row):
    try:
        question = row['stem']
        text = row['stimulus']
        answer_choices = row['answerOptions']
        correct_answer_index = choices.index(row['correct_answer'])
        res = calculate_reasoning_steps_sat(question, text, answer_choices, correct_answer_index)
        row["reasoning_steps"] = res['reasoning_steps']
        row["key_sentences"] = res['key_sentences']
        row["distractor_complexity"] = res['distractor_complexity']
        return row
    except:
        row["reasoning_steps"] = -1
        row["key_sentences"] = []
        row["distractor_complexity"] = -1
        return row

df = df.apply(row_operation, axis=1)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=

In [16]:
display(df.head())
df.to_csv('processed_new_test_questions.csv', header=True, index=True, encoding='utf-8')

Unnamed: 0,stimulus,stem,answerOptions,correct_answer,flesch_reading_ease,grade_level,mcalpine_efl,reading_time_passage_only,reading_time_whole_question,reasoning_steps,key_sentences,distractor_complexity
0,Botanist Al Kovaleski has pointed out that map...,Which choice completes the text with the most ...,"[relocate from, refer to, originate from, adap...",D,54.56,12.0,34.0,13.08,18.7,1,[Botanist Al Kovaleski has pointed out that ma...,
1,The following text is from John Muir’s 1913 au...,"As used in the text, what does the word ""clear...","[Simple, Understandable, Obvious, Transparent]",D,60.65,10.0,27.3,14.13,18.91,1,[The following text is from John Muir’s 1913 a...,
2,The recently observed gamma ray burst GRB 2303...,Which choice completes the text with the most ...,"[a coincidence, areprieve, an incident, an odd...",D,55.24,10.0,21.0,8.67,14.29,0,[The recently observed gamma ray burst GRB 230...,
3,"In 1776, the United States sent Benjamin Frank...",Which choice completes the text with the most ...,"[thoughtfulness, esteem., controversy, sincerity]",B,81.63,8.0,20.7,11.76,17.49,0,"[In 1776, the United States sent Benjamin Fran...",
4,"In the 1950s, scientists didn’t know much abou...",Which choice best describes the function of th...,[It identifies a scientific belief that Tharp ...,A,81.12,6.0,19.5,16.65,31.72,1,"[In the 1950s, scientists didn’t know much abo...",0.42012


## 4. Discourse Complexity
### Cohesion
Evaluate how easy it is to track ideas in the passage.
TO BE IMPLEMENTED 

In [None]:
from neuralcoref import Coref