In [1]:
# Import necessary libraries
import pickle as pkl
import random
from copy import deepcopy
import time
import json
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

# Import project modules
import importlib
import lm_proposer
import D5

importlib.reload(lm_proposer)
importlib.reload(D5)
from lm_proposer import GPT3_Proposer
from D5 import D5
from validator import DummyValidator, Validator
from get_representative import return_extreme_values
from textDiff.data_utils import SimSEDataLoader

# Set up configuration
# NOTE(review): no cell visible in this notebook reads USE_DUMMY_VERIFIER or FIND_REPRESENTATIVE;
# the validator is selected from VERIFIER_NAME below. Confirm whether these two flags are still needed.
USE_DUMMY_VERIFIER = False  # Intended meaning: False = use a real validator rather than DummyValidator
FIND_REPRESENTATIVE = False  # Set to True to find representative samples (takes longer)
SUBSAMPLE_SIZE = 1000  # Upper bound on the research samples kept per group by subsample()
# Validator selection: 'dummy' -> DummyValidator, 'gpt' -> GPTValidator; any other value is passed
# to Validator as its first argument (e.g. 'ruiqi-zhong/d5_t5_validator' or
# 'ruiqi-zhong/d5_t5_validator_3B').
# VERIFIER_NAME = 'dummy'
# VERIFIER_NAME = 'ruiqi-zhong/d5_t5_validator_3B'
VERIFIER_NAME = 'gpt'

# Subsample for faster execution
def subsample(samples, n=100):
    """
    Pick at most ``n`` elements of ``samples`` uniformly at random.

    The chosen elements are returned in their original relative order. When
    ``n`` is at least ``len(samples)`` the result is a copy of the whole list.
    Randomness comes from the global ``random`` module state, so seed it
    beforehand if the selection must be reproducible.

    Args:
        samples: Indexable sequence to draw from.
        n: Maximum number of elements to keep.

    Returns:
        list: The selected elements, ordered as they appear in ``samples``.
    """
    positions = list(range(len(samples)))
    random.shuffle(positions)
    # Keep the first n shuffled positions, then restore the original ordering.
    kept_positions = sorted(positions[:n])
    return [samples[pos] for pos in kept_positions]

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the educational dataset
# Later cells index split_datasets by 'train', 'dev' and 'test' and read the 'text' and
# 'condition' fields of each record. full_datasets is not referenced by any cell visible here.
# NOTE(review): the "Missing ... in the rating file" lines in the recorded output appear while
# SimSEDataLoader() is constructed -- presumably emitted by the loader; confirm in data_utils.
print("Loading dataset...")
dataloader = SimSEDataLoader()
full_datasets = dataloader.full_datasets
split_datasets = dataloader.split_datasets

Loading dataset...
Missing 02_F22_003_030_P4 in the rating file
Missing 02_F22_001_041_P4 in the rating file
Missing 02_F22_001_041_P6 in the rating file
Missing 02_F22_003_047_P3 in the rating file
Missing 02_F22_003_030_P5 in the rating file
Missing 02_F22_001_041_P5 in the rating file
Missing 02_S23_006_062_P2 in the rating file
Missing 03_S23_002_102_P1 in the rating file
Missing 03_S23_002_102_P2 in the rating file
Missing 03_S23_002_102_P3 in the rating file
Missing 03_S23_004_124_P3 in the rating file
Missing 01_S22_001_007_P3 in the rating file
Missing 01_S22_001_007_P2 in the rating file
Missing 01_S22_001_020_P3 in the rating file
Missing 01_S22_001_007_P1 in the rating file
Missing 01_S22_001_020_P2 in the rating file
Missing 01_S22_001_020_P1 in the rating file
train 545
dev 181
test 181


In [3]:
# Create the problem definition
def _texts_with_condition(records, condition):
    """Return the 'text' field of every record whose 'condition' equals ``condition``."""
    return [record['text'] for record in records if record['condition'] == condition]


# Research (hypothesis discovery) uses the train split; validation pools dev and test.
_research_records = split_datasets['train']
_validation_records = split_datasets['dev'] + split_datasets['test']

# NOTE(review): group B is 'Non-Experimental' in the research split but 'Control' in the
# validation split. Confirm this asymmetry is intended -- downstream completeness code fits a
# classifier on the research split and evaluates it on the validation split.
problem = {
    # 'generation': 'teaching samples from the treatment group and control group',
    'generation': 'teaching samples from the treatment group and control group, where the treatment group is '
                  'when the teachers are coached to use a metacognitive modeling strategy and metacognitive '
                  'modeling is defined as thinking aloud about thinking in order to make a strategy, task, or '
                  'process more accessible to students',
    'dataset_description': 'classroom transcripts for teaching math word problems',
    'target': 'what teaching strategy is more frequent in the treatment group than the control group',
    'user': 'an education researcher',
    'A_desc': 'teaching samples in the treatment group',
    'B_desc': 'teaching samples in the control group',
    'example_hypotheses': [],
    'split': {
        'research': {
            'A_samples': _texts_with_condition(_research_records, 'Treatment'),
            'B_samples': _texts_with_condition(_research_records, 'Non-Experimental'),
        },
        'validation': {
            'A_samples': _texts_with_condition(_validation_records, 'Treatment'),
            'B_samples': _texts_with_condition(_validation_records, 'Control'),
        },
    },
}

In [4]:
# Cap each research group at SUBSAMPLE_SIZE samples and report the sizes before and after.
# The research split is updated in place inside `problem`; the validation split is only reported.
research_split = problem['split']['research']
validation_split = problem['split']['validation']

print(f"Original sample sizes: A={len(research_split['A_samples'])}, B={len(research_split['B_samples'])}")
for group_key in ('A_samples', 'B_samples'):
    research_split[group_key] = subsample(research_split[group_key], SUBSAMPLE_SIZE)
print(f"Subsampled sizes: A={len(research_split['A_samples'])}, B={len(research_split['B_samples'])}")

print(f"Validation sample sizes: A={len(validation_split['A_samples'])}, B={len(validation_split['B_samples'])}")

Original sample sizes: A=200, B=152
Subsampled sizes: A=200, B=152
Validation sample sizes: A=146, B=113


In [5]:
# Create the validator selected by VERIFIER_NAME
if VERIFIER_NAME == "gpt":
    # GPTValidator is imported only when this branch is taken.
    from validator import GPTValidator
    verifier = GPTValidator()
elif VERIFIER_NAME == "dummy":
    print("\nUsing dummy validator (returns random scores)")
    verifier = DummyValidator()
else:
    # Any other name is handed to Validator as its first argument.
    print(f"\nUsing T5 validator: {VERIFIER_NAME}")
    verifier = Validator(VERIFIER_NAME, batch_size=32)

In [6]:
  # Create the hypothesis proposer from the problem definition (the verifier is built in its own cell)
proposer = GPT3_Proposer(problem)

In [7]:
# Create and run the D5 algorithm
# Inputs are the (subsampled) research-split texts of group A and group B plus the verifier and
# proposer built above. The recorded output of this cell shows the run taking about 37 minutes.
print("\nRunning D5 algorithm...")
d5 = D5(
    problem['split']['research']['A_samples'],
    problem['split']['research']['B_samples'],
    verifier,
    proposer,
    total_hypotheses_count=20,  # presumably the number of hypotheses to request -- confirm in D5
    early_stop=True  # NOTE(review): semantics defined in D5, not visible here
)

# Run the algorithm
# The result is a dict keyed by hypothesis text; later cells read its 'hypothesis',
# 'diff_w_significance', 'sample2score' and 'sample2corpus' entries.
h2h_dicts = d5.run()


Running D5 algorithm...


Token indices sequence length is longer than the specified maximum sequence length for this model (20795 > 1024). Running this sequence through the model will result in indexing errors


Using 25 in-context samples, prompt length: 20795
Group A: Hi, today we will be working with this word problem, CeCe is making cookies with white and brown sugar. She puts two eighths cups of white sugar in the cookie dough, then she puts in the brown sugar. She has seven eighths cups of sugar and all how much brown sugar is in the recipe. So here we know that we have to eight cups of white sugar. The unknown is the amount of brown sugar. But we do know how much sugar is in all. So we would set up this word problem as seven eight cups of sugar minus two eight cups of white sugar equals the unknown amount of brown sugar. So we know that our denominators which is the bottom eight is the same so we are able to subtract easily. So here I've drawn one circle equals 1878 equals seven circles, and then two eighths equals two circles. So following our subtraction problem here, I would take away two of the circles from the seven eighths and that will leave us with five circles since one circle 

Num hypotheses: 20: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 352/352 [37:02<00:00,  6.31s/it]


In [10]:
# Rank hypotheses by their V' value (diff_w_significance['mu']), largest first.
h_sorted = sorted(h2h_dicts, key=lambda h: h2h_dicts[h]['diff_w_significance']['mu'], reverse=True)

# Persist the raw D5 output. The context manager guarantees the file is flushed and closed
# even if pickling fails part-way (the bare open() used previously leaked the handle).
with open("results/simse_Treatment_non_experimental_v3.pkl", 'wb') as pkl_file:
    pkl.dump(h2h_dicts, pkl_file)

# Collect one row per hypothesis, in ranked order, for the CSV summary.
results = {'hypothesis': [], 'V': [], 'p_value': []}
for h in h_sorted:
    h_dict = h2h_dicts[h]
    # print out the example hypothesis along with their V' score
    print(h_dict['hypothesis'], 'V\'', h_dict['diff_w_significance']['mu'], 'p-value', h_dict['diff_w_significance']['p_value'])
    results['hypothesis'].append(h_dict['hypothesis'])
    results['V'].append(h_dict['diff_w_significance']['mu'])
    results['p_value'].append(h_dict['diff_w_significance']['p_value'])

df = pd.DataFrame(results)
df.to_csv("results/simse_Treatment_non_experimental_v3.csv", index=False)

explicitly articulate their thought processes while solving the problem V' 0.14894750206584373 p-value 8.10365612399197e-06
engage in verbalizing mistakes and the subsequent correction process to model error detection and resolution V' 0.12657862015953897 p-value 0.00887862731389424
provide step-by-step explanations when approaching word problems V' 0.09947393930381565 p-value 0.0014072176496624168
illustrate and model problem-solving steps using detailed breakdowns and explanations V' 0.07789475616708819 p-value 0.011266216497281404
emphasize checking the work and verifying the solution V' 0.06868432468663471 p-value 0.05227144041951565
provide step-by-step explanations of mathematical reasoning, making the cognitive process visible V' 0.06499961412135391 p-value 0.016551898683617325
- use a metacognitive modeling strategy, where the instructor verbalizes their thought process V' 0.056841811310487156 p-value 0.09568914773604253
employ explicit think-aloud practices to demonstrate prob

In [12]:
# Quick inspection of the D5 output structure:
#   1. every hypothesis string used as a key,
#   2. the fields stored for the top-ranked hypothesis,
#   3. the score the top-ranked hypothesis assigns to the second group-A research sample.
# NOTE: the first print emits every hypothesis string and produces a very long output line.
print(h2h_dicts.keys())
print(h2h_dicts[h_sorted[0]].keys())
print(h2h_dicts[h_sorted[0]]['sample2score'][problem['split']['research']['A_samples'][1]])

def create_hypothesis_sample_dataframe(h2h_dicts):
    """
    Flatten per-hypothesis sample scores into one long-format table.

    One row is produced for every (hypothesis, sample) pair found in a
    hypothesis entry's 'sample2score' mapping. The corpus label is looked up
    unconditionally in that entry's 'sample2corpus' mapping, so a sample that
    is missing there raises KeyError.

    Args:
        h2h_dicts: Mapping from hypothesis text to a dict holding at least
            'sample2score' (sample -> score) and 'sample2corpus'
            (sample -> corpus label).

    Returns:
        DataFrame with columns hypothesis, hypothesis_short, sample,
        sample_short, score, corpus and is_true, where is_true is
        ``score > 0.5``. The short columns truncate the text to 50 / 100
        characters and append "..." when it was longer.
    """
    def _truncate(text, limit):
        # Shorten long text for display; text within the limit is returned unchanged.
        return text[:limit] + "..." if len(text) > limit else text

    rows = [
        {
            'hypothesis': hypothesis,
            'hypothesis_short': _truncate(hypothesis, 50),
            'sample': text,
            'sample_short': _truncate(text, 100),
            'score': value,
            'corpus': entry['sample2corpus'][text],
            'is_true': value > 0.5,  # 0.5 is the decision threshold
        }
        for hypothesis, entry in h2h_dicts.items()
        for text, value in entry['sample2score'].items()
    ]

    return pd.DataFrame(rows)


# Flatten every (hypothesis, sample) score into one table and write it to CSV.
# NOTE(review): this file goes to "results_test/" while the other outputs in this notebook are
# written under "results/" -- confirm the directory is intentional and exists.
score_df = create_hypothesis_sample_dataframe(h2h_dicts)
score_df.to_csv("results_test/scores_simse_Treatment_non_experimental_v3.csv", index=False)


dict_keys(['- use metacognitive modeling by thinking aloud about the steps to solve a problem', 'explicitly articulate their thought processes while solving the problem', 'provide step-by-step explanations when approaching word problems', 'engage students in breaking down the problem into manageable parts', 'emphasize checking the work and verifying the solution', 'incorporate the strategy of drawing visual representations (e.g., number lines, diagrams) to aid understanding', 'emphasize identifying important information within the word problem', 'highlight the importance of understanding the context and the components of the problem before solving', 'encourage students to rephrase the question to ensure comprehension', 'suggest estimating and rounding as strategies for problem-solving', 'focus on the relationship between parts and the whole in fraction problems', 'use questioning to guide students through the reasoning process', 'emphasize making connections between mathematical concep

In [22]:
import completeness

# Reload so that edits made to completeness.py are picked up without restarting the kernel.
importlib.reload(completeness)
from completeness import calculate_completeness

# Completeness with an LLM as the non-parametric benchmark. The call returns the metrics dict
# and a second value that is pickled below and passed as the first argument to the later
# RoBERTa-based calculate_completeness call.
llm_completeness_results, completed_h2h_dicts = calculate_completeness(
        h2h_dicts,
        problem,
        nonparam_method="llm",
        verifier_name="gpt",
        verbose=True
    )

# Persist the completed dictionaries. The context manager guarantees the file is flushed and
# closed even if pickling fails (the bare open() used previously leaked the handle).
with open("results/completed_results_simse_Treatment_non_experimental_v3.pkl", 'wb') as pkl_file:
    pkl.dump(completed_h2h_dicts, pkl_file)

print("\nCompleteness Results (LLM with improved prompt):")
print(f"Completeness: {llm_completeness_results['completeness']:.4f}")
print(f"Trivial Predictor Accuracy: {llm_completeness_results['trivial_accuracy']:.4f}")
print(f"Theme-based Predictor Accuracy: {llm_completeness_results['theme_accuracy']:.4f}")
print(f"Non-parametric Benchmark Accuracy: {llm_completeness_results['nonparam_accuracy']:.4f}")

# Save completeness results to CSV
llm_completeness_df = pd.DataFrame({
    'metric': ['completeness', 'trivial_accuracy', 'theme_accuracy', 'nonparam_accuracy'],
    'value': [
        llm_completeness_results['completeness'],
        llm_completeness_results['trivial_accuracy'],
        llm_completeness_results['theme_accuracy'],
        llm_completeness_results['nonparam_accuracy']
    ]
})
llm_completeness_df.to_csv("results/completeness_llm_improved_prompt.csv", index=False)

Pre-validating validation samples for all themes...
Checking which samples need validation...
Validating 5180 missing sample-theme pairs...
Processing batch 1 of 518
Processing batch 2 of 518
Processing batch 3 of 518
Processing batch 4 of 518
Processing batch 5 of 518
Processing batch 6 of 518
Processing batch 7 of 518
Processing batch 8 of 518
Processing batch 9 of 518
Processing batch 10 of 518
Processing batch 11 of 518
Processing batch 12 of 518
Processing batch 13 of 518
Processing batch 14 of 518
Processing batch 15 of 518
Processing batch 16 of 518
Processing batch 17 of 518
Processing batch 18 of 518
Processing batch 19 of 518
Processing batch 20 of 518
Processing batch 21 of 518
Processing batch 22 of 518
Processing batch 23 of 518
Processing batch 24 of 518
Processing batch 25 of 518
Processing batch 26 of 518
Processing batch 27 of 518
Processing batch 28 of 518
Processing batch 29 of 518
Processing batch 30 of 518
Processing batch 31 of 518
Processing batch 32 of 518
Proce

In [26]:
# Add Roberta-based completeness calculation here
importlib.reload(completeness)
from completeness import calculate_completeness, GPTPredictor, load_prediction_results

# Load pre-computed RoBERTa-based prediction results from file
# This assumes you've already run get_representative.py's simse_run function
# which saves prediction results to "results/simse_validation_predictions.pkl"
print("\nLoading pre-computed RoBERTa-based prediction results...")
try:
    # Only the load is guarded: the handlers below describe loading problems, so failures in the
    # completeness computation or the CSV write must not be reported as "Error loading ...".
    roberta_scores = load_prediction_results("results/simse_validation_predictions.pkl")
except FileNotFoundError:
    print("RoBERTa prediction file not found. Run get_representative.py's simse_run function first.")
    print("Example: from get_representative import simse_run; simse_run(problem=problem, output_file='results/simse_validation_predictions.pkl')")
except Exception as e:
    print(f"Error loading RoBERTa predictions: {e}")
else:
    print(f"Loaded {len(roberta_scores)} prediction scores")

    print("\nCalculating completeness using pre-computed RoBERTa prediction scores...")
    completeness_results_roberta, _ = calculate_completeness(
        completed_h2h_dicts,
        problem,
        nonparam_method="roberta",  # This is still needed but won't be used for prediction
        verifier_name="gpt",        # This is still needed but won't be used for prediction
        verbose=True,
        nonparam_scores=roberta_scores  # Pass the pre-computed scores
    )

    print("\nCompleteness Results (using RoBERTa-based scores):")
    print(f"Completeness: {completeness_results_roberta['completeness']:.4f}")
    print(f"Trivial Predictor Accuracy: {completeness_results_roberta['trivial_accuracy']:.4f}")
    print(f"Theme-based Predictor Accuracy: {completeness_results_roberta['theme_accuracy']:.4f}")
    print(f"Non-parametric Benchmark Accuracy: {completeness_results_roberta['nonparam_accuracy']:.4f}")

    # Save completeness results to CSV
    roberta_completeness_df = pd.DataFrame({
        'metric': ['completeness', 'trivial_accuracy', 'theme_accuracy', 'nonparam_accuracy'],
        'value': [
            completeness_results_roberta['completeness'],
            completeness_results_roberta['trivial_accuracy'],
            completeness_results_roberta['theme_accuracy'],
            completeness_results_roberta['nonparam_accuracy']
        ]
    })
    roberta_completeness_df.to_csv("results/completeness_roberta_scores.csv", index=False)



Loading pre-computed RoBERTa-based prediction results...
Loaded 259 prediction scores

Calculating completeness using pre-computed RoBERTa prediction scores...
Pre-validating validation samples for all themes...
Checking which samples need validation...
All samples already have scores for all themes.
Logistic Regression Coefficients:
Theme: explicitly articulate their thought processes while solving the problem, Coefficient: 1.444779770091233
Theme: engage in verbalizing mistakes and the subsequent correction process to model error detection and resolution, Coefficient: 0.5147821944941557
Theme: provide step-by-step explanations when approaching word problems, Coefficient: 0.7202083269038593
Theme: illustrate and model problem-solving steps using detailed breakdowns and explanations, Coefficient: 0.16406911237739472
Theme: emphasize checking the work and verifying the solution, Coefficient: 0.2394045533464628
Theme: provide step-by-step explanations of mathematical reasoning, making t

In [14]:
# Temp implementation

# Calculate completeness of themes in describing group differences
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

def _theme_feature_matrix(h2h_dicts, themes, samples, default_score=0.5):
    """Build one feature row per sample holding its stored score under each theme.

    Returns:
        tuple: (matrix, n_missing) where matrix is a list of per-sample score lists and
        n_missing counts the (sample, theme) pairs that had no stored score and were filled
        with ``default_score``.
    """
    matrix = []
    n_missing = 0
    for sample in samples:
        row = []
        for theme in themes:
            sample2score = h2h_dicts[theme]['sample2score']
            if sample in sample2score:
                row.append(sample2score[sample])
            else:
                row.append(default_score)
                n_missing += 1
        matrix.append(row)
    return matrix, n_missing


def calculate_completeness(h2h_dicts, validation_data, problem, nonparam_scores=None):
    """
    Calculate the completeness of themes in describing group differences.

    Three predictors of group membership (A = 1, B = 0) are scored on the validation split:
      1. a trivial predictor that always outputs the validation majority class,
      2. a logistic regression over the theme scores, fitted on the research split,
      3. a non-parametric benchmark that thresholds validator scores for a direct
         "is this from the treatment group" hypothesis at 0.5.
    Completeness is ``(trivial_loss - theme_loss) / (trivial_loss - nonparam_loss)``, or 0 when
    the benchmark is no better or worse than the trivial predictor.

    NOTE: this definition shares its name with ``completeness.calculate_completeness``, which
    other cells of this notebook import; whichever was executed most recently is the one in use.

    Args:
        h2h_dicts: Dictionary from D5 containing hypothesis evaluation results
        validation_data: Not used; validation samples are read from
            ``problem['split']['validation']``. Kept so existing calls keep working.
        problem: Problem definition dictionary
        nonparam_scores: Optional pre-computed benchmark scores, one per validation sample in
            A-then-B order. When given, the validator is not queried, which avoids repeating
            an identical and expensive benchmark across calls.

    Returns:
        dict: 'completeness' (an unclamped ratio that can fall outside [0, 1]), the three
        losses ('trivial_loss', 'theme_loss', 'nonparam_loss'), the three matching accuracies,
        and 'nonparam_scores' (the benchmark scores used, for reuse in later calls).

    Raises:
        ValueError: If ``nonparam_scores`` is given but its length differs from the number of
            validation samples.
    """
    import warnings

    # Extract validation samples
    A_validation = problem['split']['validation']['A_samples']
    B_validation = problem['split']['validation']['B_samples']

    # Combine validation samples and create labels
    validation_samples = A_validation + B_validation
    validation_labels = [1] * len(A_validation) + [0] * len(B_validation)

    # 1. Trivial predictor (constant prediction)
    # Calculate the majority class
    majority_class = 1 if sum(validation_labels) > len(validation_labels) / 2 else 0
    trivial_predictions = [majority_class] * len(validation_labels)
    trivial_loss = 1 - accuracy_score(validation_labels, trivial_predictions)

    # 2. Theme-based predictor (logistic regression on theme scores)
    # Get the top hypotheses (themes)
    h_sorted = sorted(h2h_dicts, key=lambda h: h2h_dicts[h]['diff_w_significance']['mu'], reverse=True)
    top_themes = h_sorted[:20]  # Use top 20 themes

    # Get research samples
    A_research = problem['split']['research']['A_samples']
    B_research = problem['split']['research']['B_samples']
    research_samples = A_research + B_research
    research_labels = [1] * len(A_research) + [0] * len(B_research)

    # Feature matrices: validation rows are predicted on, research rows are trained on.
    X_theme, missing_validation = _theme_feature_matrix(h2h_dicts, top_themes, validation_samples)
    X_train, missing_research = _theme_feature_matrix(h2h_dicts, top_themes, research_samples)
    y_train = list(research_labels)

    # A sample without a stored score gets the neutral 0.5 and so carries no signal. Say so
    # explicitly instead of silently producing an uninformative theme predictor.
    for split_name, n_missing, n_samples in (
        ("validation", missing_validation, len(validation_samples)),
        ("research", missing_research, len(research_samples)),
    ):
        if n_missing:
            warnings.warn(
                f"{n_missing} of {n_samples * len(top_themes)} {split_name} (sample, theme) pairs "
                "have no stored score and were filled with the neutral value 0.5",
                stacklevel=2,
            )

    # Train logistic regression
    lr = LogisticRegression(random_state=42)
    lr.fit(X_train, y_train)

    # Predict on validation data
    theme_predictions = lr.predict(X_theme)
    theme_loss = 1 - accuracy_score(validation_labels, theme_predictions)

    # 3. Non-parametric benchmark (using the LLM)
    # The validator scores each sample against a hypothesis that asks directly about group
    # membership; a threshold of 0.5 turns the scores into predictions.
    if nonparam_scores is None:
        direct_hypothesis = "This is a teaching sample from the treatment group."

        # Create input dictionaries for the validator
        validator_dicts = [
            {'hypothesis': direct_hypothesis, 'text': sample}
            for sample in validation_samples
        ]

        # Get scores from the validator
        if VERIFIER_NAME == "dummy":
            nonparam_verifier = DummyValidator()
        elif VERIFIER_NAME == "gpt":
            # Imported here so the function does not depend on another cell's conditional import.
            from validator import GPTValidator
            nonparam_verifier = GPTValidator()
        else:
            nonparam_verifier = Validator(VERIFIER_NAME, batch_size=32)

        nonparam_scores = list(nonparam_verifier.validate_w_scores(validator_dicts))
    else:
        nonparam_scores = list(nonparam_scores)
        if len(nonparam_scores) != len(validation_samples):
            raise ValueError(
                f"nonparam_scores has {len(nonparam_scores)} entries but there are "
                f"{len(validation_samples)} validation samples"
            )

    # Convert scores to predictions
    nonparam_predictions = [1 if score > 0.5 else 0 for score in nonparam_scores]
    nonparam_loss = 1 - accuracy_score(validation_labels, nonparam_predictions)

    # Calculate completeness
    if trivial_loss == nonparam_loss:
        # If there's no predictive information in the text, return 0
        completeness = 0
    else:
        completeness = (trivial_loss - theme_loss) / (trivial_loss - nonparam_loss)

    return {
        'completeness': completeness,
        'trivial_loss': trivial_loss,
        'theme_loss': theme_loss,
        'nonparam_loss': nonparam_loss,
        'theme_accuracy': 1 - theme_loss,
        'nonparam_accuracy': 1 - nonparam_loss,
        'trivial_accuracy': 1 - trivial_loss,
        'nonparam_scores': nonparam_scores
    }

# Calculate completeness
completeness_results = calculate_completeness(h2h_dicts, problem['split']['validation'], problem)
print("\nCompleteness Results:")
print(f"Completeness: {completeness_results['completeness']:.4f}")
print(f"Trivial Predictor Accuracy: {completeness_results['trivial_accuracy']:.4f}")
print(f"Theme-based Predictor Accuracy: {completeness_results['theme_accuracy']:.4f}")
print(f"Non-parametric Benchmark Accuracy: {completeness_results['nonparam_accuracy']:.4f}")

# Save completeness results to CSV
completeness_df = pd.DataFrame({
    'metric': ['completeness', 'trivial_accuracy', 'theme_accuracy', 'nonparam_accuracy'],
    'value': [
        completeness_results['completeness'],
        completeness_results['trivial_accuracy'],
        completeness_results['theme_accuracy'],
        completeness_results['nonparam_accuracy']
    ]
})
completeness_df.to_csv("results/completeness_simse_Treatment_non_experimental_v3.csv", index=False)

# Also save detailed results for each theme
theme_completeness = []
h_sorted = sorted(h2h_dicts, key=lambda h: h2h_dicts[h]['diff_w_significance']['mu'], reverse=True)
top_themes = h_sorted[:10]  # Analyze top 10 themes individually

# The benchmark does not depend on which themes are used, so the scores from the full run are
# reused rather than re-querying the validator for every single theme.
shared_nonparam_scores = completeness_results['nonparam_scores']

for i, theme in enumerate(top_themes):
    # Calculate completeness using just this theme
    single_theme_results = calculate_completeness(
        {theme: h2h_dicts[theme]},
        problem['split']['validation'],
        problem,
        nonparam_scores=shared_nonparam_scores
    )

    theme_completeness.append({
        'theme_rank': i + 1,
        'theme': h2h_dicts[theme]['hypothesis'],
        'completeness': single_theme_results['completeness'],
        'theme_accuracy': single_theme_results['theme_accuracy'],
        'V_score': h2h_dicts[theme]['diff_w_significance']['mu']
    })

theme_completeness_df = pd.DataFrame(theme_completeness)
theme_completeness_df.to_csv("results/theme_completeness_simse_Treatment_non_experimental_v3.csv", index=False)


KeyboardInterrupt: 