In [2]:
import pandas as pd
import json
import textstat
from IPython.display import display

## 1. Data Preparations

In [3]:
# open file
with open('processed-reading-questions.json', 'r', encoding="utf8")as file:
    # load as dataframe using json_normalize method
    df = pd.json_normalize(json.load(file))

# set the index to question  id
df.fillna(0, inplace=True)
df.set_index(['questionId'], inplace=True)
df.drop(['vaultid', 'keys', 'origin', 'externalid', 'templateclusterid', 
         'parenttemplatename', 'parenttemplateid', 'templateclustername',
         'updateDate', 'pPcc', 'skill_cd', 'createDate', 'program', 'ibn',
         'external_id', 'primary_class_cd', 'type', 'position', 'uId'
         ], axis=1, inplace=True)


def get_answer_options(answerOptions):
    """
    This function takes a list of dictionaries and returns a list of the values
    """
    options = []
    for i in answerOptions:
        options += [i['content']]
    return options

# Clean up some data
df['answerOptions'] = df['answerOptions'].apply(get_answer_options)
df["correct_answer"] = df["correct_answer"].apply(lambda x: x[0])

# Reorder columns to have a more logical sequence
df = df[['stimulus', 'stem', 'rationale', 'answerOptions', 'correct_answer', 
         'difficulty', 'score_band_range_cd', 'skill_desc', 
         'primary_class_cd_desc']]

# print first 5 records
display(df.head())

Unnamed: 0_level_0,stimulus,stem,rationale,answerOptions,correct_answer,difficulty,score_band_range_cd,skill_desc,primary_class_cd_desc
questionId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
f1bfbed3,Marta Coll and colleagues’ 2010 Mediterranean ...,Which choice most logically completes the text?,Choice B is the best answer because it present...,[Coll and colleagues reported a much higher nu...,B,H,7,Inferences,Information and Ideas
29f5c8c2,Some transgenic fish have genes from jellyfish...,Which quotation from a researcher would best s...,Choice D is the best answer because this quota...,[“In one site in the wild where transgenic fis...,D,M,4,Command of Evidence,Information and Ideas
1ba5ad7a,Many literary theorists distinguish between fa...,Which choice best states the main idea of the ...,Choice A is the best answer because it most ac...,[Literary theorist Mikhail Bakhtin argued that...,A,H,6,Central Ideas and Details,Information and Ideas
75e07a4d,To make sure they got the nutrition they neede...,Which choice most effectively uses data from t...,Choice C is the best answer because it most ef...,"[shrimp cocktail for meal B., hot cocoa for me...",C,E,2,Command of Evidence,Information and Ideas
ca5a3fb4,The practice of logging (cutting down trees fo...,Which choice most logically completes the text?,Choice A is the best answer because it most lo...,[logging may be useful for maintaining healthy...,A,M,4,Inferences,Information and Ideas


## 2. Measuring Text Complexity
### Readability

In [4]:
# Calculate the Flesch Reading Ease score
df["flesch_reading_ease"] = df["stimulus"].apply(textstat.flesch_reading_ease)

# Estimated grade level required to understand the text
df["grade_level"] = df["stimulus"].apply(lambda x: textstat.text_standard(x, float_output=True))

# Calculate the McAlpine EFLAW readability score, which is a measure of the readability of a text for non-native English speakers.
# It is recommended to aim for a score equal to or lower than 25.
df["mcalpine_efl"] = df["stimulus"].apply(textstat.mcalpine_eflaw)

# Calculate the reading time in seconds. This is based on the average reading speed of 238 WPM.
"""
Marc Brysbaert,
How many words do we read per minute? A review and meta-analysis of reading rate,
Journal of Memory and Language,
Volume 109,
2019,
104047,
ISSN 0749-596X,
https://doi.org/10.1016/j.jml.2019.104047.
"""
df["reading_time_passage_only"] = df["stimulus"].apply(lambda x: textstat.reading_time(x, ms_per_char=52.5210084))

# Calculate the reading time of the whole question, including the stem and answer options
df["reading_time_whole_question"] = df.apply(lambda x: textstat.reading_time(x["stimulus"] + x["stem"] + ' '.join(x["answerOptions"]), ms_per_char=52.5210084), axis=1)

# print first 5 records
display(df.head())

Unnamed: 0_level_0,stimulus,stem,rationale,answerOptions,correct_answer,difficulty,score_band_range_cd,skill_desc,primary_class_cd_desc,flesch_reading_ease,grade_level,mcalpine_efl,reading_time_passage_only,reading_time_whole_question
questionId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
f1bfbed3,Marta Coll and colleagues’ 2010 Mediterranean ...,Which choice most logically completes the text?,Choice B is the best answer because it present...,[Coll and colleagues reported a much higher nu...,B,H,7,Inferences,Information and Ideas,21.74,19.0,31.0,32.56,68.01
29f5c8c2,Some transgenic fish have genes from jellyfish...,Which quotation from a researcher would best s...,Choice D is the best answer because this quota...,[“In one site in the wild where transgenic fis...,D,M,4,Command of Evidence,Information and Ideas,60.69,10.0,36.3,20.64,54.2
1ba5ad7a,Many literary theorists distinguish between fa...,Which choice best states the main idea of the ...,Choice A is the best answer because it most ac...,[Literary theorist Mikhail Bakhtin argued that...,A,H,6,Central Ideas and Details,Information and Ideas,22.08,21.0,45.3,28.26,63.03
75e07a4d,To make sure they got the nutrition they neede...,Which choice most effectively uses data from t...,Choice C is the best answer because it most ef...,"[shrimp cocktail for meal B., hot cocoa for me...",C,E,2,Command of Evidence,Information and Ideas,67.93,11.0,40.0,13.39,21.95
ca5a3fb4,The practice of logging (cutting down trees fo...,Which choice most logically completes the text?,Choice A is the best answer because it most lo...,[logging may be useful for maintaining healthy...,A,M,4,Inferences,Information and Ideas,56.59,13.0,29.8,25.53,42.28


## 3. Cognitive Load of the Questions
### Information Retrieval
Count how many pieces of information a student must retrieve from the text to answer the question.

In [5]:
import spacy
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import numpy as np
from sentence_transformers import SentenceTransformer   
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
nlp = spacy.load("en_core_web_sm")
spacy.prefer_gpu()
model = SentenceTransformer('all-mpnet-base-v2')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\NITRO\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:

def calculate_reasoning_steps_sat(question, text, answer_choices, correct_answer_index, sim_threshold=0.1):
    """
    Estimates the number of reasoning steps required to answer an SAT-style multiple-choice question.
    
    - Identifies key sentences supporting the correct answer.
    - Builds a reasoning chain to measure how much reasoning is needed.
    - Compares distractors to measure overall question difficulty.

    Args:
        question (str): The question text.
        text (str): The passage containing relevant information.
        answer_choices (list of str): A list of possible answer choices.
        correct_answer_index (int): Index of the correct answer in answer_choices.
        sim_threshold (float): Similarity threshold for considering a sentence relevant.

    Returns:
        dict: {
            "reasoning_steps": int,  # Steps needed to justify the correct answer
            "key_sentences": list,   # Sentences used for reasoning
            "distractor_complexity": float,  # How misleading the wrong answers are
        }
    """

    # 1. Sentence Tokenization
    sentences = sent_tokenize(text)

    # 2. Named Entity Recognition (NER) to Identify Key Sentences
    key_sentences = []
    for sent in sentences:
        doc = nlp(sent)
        if any(ent.label_ in ["PERSON", "ORG", "GPE", "DATE"] for ent in doc.ents):  
            key_sentences.append(sent)

    # 3. Compute Sentence Embeddings
    question_embedding = model.encode(question)
    sentence_embeddings = model.encode(sentences)
    answer_embeddings = model.encode(answer_choices)
    correct_answer_embedding = answer_embeddings[correct_answer_index]

    # 3.1. Identify the Most Relevant Starting Sentence
    # Find the sentence in the passage that is most similar to the question
    question_sim_scores = cosine_similarity([question_embedding], sentence_embeddings)[0]
    most_relevant_starting_index = np.argmax(question_sim_scores)  # Start from the most relevant sentence

    # 4. Build Similarity Graph
    sim_scores = cosine_similarity(sentence_embeddings)
    G = nx.Graph()
    for i in range(len(sentences)):
        for j in range(i+1, len(sentences)):
            if sim_scores[i][j] > sim_threshold:
                G.add_edge(i, j, weight=sim_scores[i][j])

    # 4.1 Ensure the Starting Sentence is in the Graph
    if most_relevant_starting_index not in G:
        # Find the closest related sentence and manually link it
        closest_related_index = np.argmax(sim_scores[most_relevant_starting_index])
        G.add_edge(most_relevant_starting_index, closest_related_index, weight=sim_scores[most_relevant_starting_index][closest_related_index])

    # 5. Find the Shortest Reasoning Chain for the Correct Answer
    relevant_nodes = []
    for i, sent in enumerate(sentences):
        if cosine_similarity([correct_answer_embedding], [sentence_embeddings[i]])[0][0] > sim_threshold and i in G.nodes():
            relevant_nodes.append(i)

    reasoning_steps = []
    for node in relevant_nodes:
        try:
            path_length = nx.shortest_path_length(G, source=most_relevant_starting_index, target=node)
            reasoning_steps.append(path_length)
        except nx.NetworkXNoPath:
            pass

    # 6. Compute Distractor Complexity (How Misleading the Wrong Answers Are)
    distractor_complexity = []
    for i, ans_embedding in enumerate(answer_embeddings):
        if i == correct_answer_index:
            continue  # Ignore the correct answer
        ans_sim_scores = cosine_similarity([ans_embedding], sentence_embeddings)[0]
        avg_distractor_similarity = np.mean([score for score in ans_sim_scores if score > 0.2])
        distractor_complexity.append(avg_distractor_similarity)

    # Final Estimation
    result = {
        "reasoning_steps": max(reasoning_steps) if reasoning_steps else 0,
        "key_sentences": key_sentences,
        "distractor_complexity": np.mean(distractor_complexity) if distractor_complexity else 0.0,
    }
    return result

In [15]:
# Example usage
choices = ["A", "B", "C", "D"]
def row_operation(row):
    try:
        question = row['stem']
        text = row['stimulus']
        answer_choices = row['answerOptions']
        correct_answer_index = choices.index(row['correct_answer'])
        res = calculate_reasoning_steps_sat(question, text, answer_choices, correct_answer_index)
        row["reasoning_steps"] = res['reasoning_steps']
        row["key_sentences"] = res['key_sentences']
        row["distractor_complexity"] = res['distractor_complexity']
        return row
    except:
        row["reasoning_steps"] = -1
        row["key_sentences"] = []
        row["distractor_complexity"] = -1
        return row

df = df.apply(row_operation, axis=1)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=

In [18]:
display(df.head())
df.to_csv('processed-reading-questions-with-readability.csv', header=True, index=True, encoding='utf-8')

Unnamed: 0_level_0,stimulus,stem,rationale,answerOptions,correct_answer,difficulty,score_band_range_cd,skill_desc,primary_class_cd_desc,flesch_reading_ease,grade_level,mcalpine_efl,reading_time_passage_only,reading_time_whole_question,reasoning_steps,key_sentences,distractor_complexity
questionId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
f1bfbed3,Marta Coll and colleagues’ 2010 Mediterranean ...,Which choice most logically completes the text?,Choice B is the best answer because it present...,[Coll and colleagues reported a much higher nu...,B,H,7,Inferences,Information and Ideas,21.74,19.0,31.0,32.56,68.01,1,[Marta Coll and colleagues’ 2010 Mediterranean...,0.506046
29f5c8c2,Some transgenic fish have genes from jellyfish...,Which quotation from a researcher would best s...,Choice D is the best answer because this quota...,[“In one site in the wild where transgenic fis...,D,M,4,Command of Evidence,Information and Ideas,60.69,10.0,36.3,20.64,54.2,1,[Although these fish were initially engineered...,0.539293
1ba5ad7a,Many literary theorists distinguish between fa...,Which choice best states the main idea of the ...,Choice A is the best answer because it most ac...,[Literary theorist Mikhail Bakhtin argued that...,A,H,6,Central Ideas and Details,Information and Ideas,22.08,21.0,45.3,28.26,63.03,1,"[In the film The Godfather Part II, the fabula...",0.506688
75e07a4d,To make sure they got the nutrition they neede...,Which choice most effectively uses data from t...,Choice C is the best answer because it most ef...,"[shrimp cocktail for meal B., hot cocoa for me...",C,E,2,Command of Evidence,Information and Ideas,67.93,11.0,40.0,13.39,21.95,1,[To make sure they got the nutrition they need...,0.415527
ca5a3fb4,The practice of logging (cutting down trees fo...,Which choice most logically completes the text?,Choice A is the best answer because it most lo...,[logging may be useful for maintaining healthy...,A,M,4,Inferences,Information and Ideas,56.59,13.0,29.8,25.53,42.28,1,"[However, a massive study in forest management...",0.389475


## 4. Discourse Complexity
### Cohesion
Evaluate how easy it is to track ideas in the passage.
TO BE IMPLEMENTED 

In [None]:
from neuralcoref import Coref