In [2]:
# Desired evaluation column names.
# The flat list is assembled from sub-lists purely for readability; the
# grouping is inferred from the names themselves (a short heading followed by
# its "The teacher ..." / "Students ..." items) and the resulting order is the
# order the rest of the notebook sees.
component_names = (
    # Snapshot observations
    [
        "Teacher provides learning activity - 1st Snapshot",
        "Students are on task - 1st Snapshot",
        "Teacher provides learning activity - 2nd Snapshot",
        "Students are on task - 2nd Snapshot",
        "Teacher provides learning activity - 3rd Snapshot",
        "Students are on task - 3rd Snapshot",
    ]
    # Supportive Learning Environment
    + [
        "Supportive Learning Environment",
        "The teacher treats all students respectfully",
        "The teacher uses positive language",
        "The teacher responds to students needs",
        "The teacher does not exhibit gender bias",
    ]
    # Positive Behavioral Expectations
    + [
        "Positive Behavioral Expectations",
        "The teacher sets clear behavioral expectations",
        "The teacher acknowledges positive student behavior",
        "The teacher redirects misbehavior",
    ]
    # Lesson Facilitation
    + [
        "Lesson Facilitation",
        "The teacher explicitly articulates learning objectives",
        "The teacher's explanation of content is clear",
        "The teacher makes connections in the lesson",
        "The teacher models by enacting or thinking aloud",
    ]
    # Checks for understanding
    + [
        "Checks for understanding",
        "The teacher uses questions",
        "The teacher uses prompts",
        "The teacher monitors most students",
        "The teacher adjusts teaching to the level of students",
    ]
    # Feedback
    + [
        "Feedback",
        "The teacher provides specific comments for misunderstandings",
        "The teacher provides specific comments for successes",
    ]
    # Critical Thinking
    + [
        "Critical Thinking",
        "The teacher asks open-ended questions",
        "The teacher provides thinking tasks",
        "Students ask open-ended questions or perform thinking tasks",
    ]
    # Autonomy
    + [
        "Autonomy",
        "The teacher provides students with choices",
        "The teacher provides students with opportunities to take meaningful roles",
        "Students volunteer to participate in the classroom",
    ]
    # Perseverance
    + [
        "Perseverance",
        "The teacher acknowledges students' efforts",
        "The teacher has a positive attitude towards students' challenges",
        "The teacher encourages goal-setting",
    ]
    # Social & Collaborative Skills
    + [
        "Social & Collaborative Skills",
        "The teacher promotes students' collaboration",
        "The teacher promotes students' interpersonal skills",
    ]
)

In [1]:
# --- Paths ---
# Both CSVs live in one directory. It defaults to the original hard-coded
# location but can be redirected without editing the notebook by setting the
# MONTESA_DATA_DIR environment variable before the kernel starts.
import os

DATA_DIR = os.environ.get(
    'MONTESA_DATA_DIR',
    '/Users/mkrasnow/Desktop/montesa/new/formattedData',
)

# CSV that already carries a 'split' column with 'train' / 'test' values
INPUT_CSV = f'{DATA_DIR}/peru_cleaned_transcripts.csv'
# Copy of the input in which part of 'train' has been relabelled 'val'
OUTPUT_CSV = f'{DATA_DIR}/val_peru_cleaned_transcripts.csv'


In [None]:
#!/usr/bin/env python3
"""
This script reads a CSV with existing 'train' and 'test' splits,
carves out 20% of the training set as a validation fold (stratified),
labels it 'val', and writes a new CSV.
"""
!pip install -q iterative-stratification

import pandas as pd

from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
from sklearn.preprocessing import MultiLabelBinarizer


# --- Main logic ---
def _row_label_sets(frame, eval_cols):
    """Build one list of "column=value" labels per row of `frame`.

    A cell contributes no label when its score is missing. Missing means
    either NaN (which is what `pd.read_csv` produces for an 'N/A' cell under
    its default NA handling) or the literal string 'N/A' (which survives only
    if the file was read with default NA parsing disabled).
    """
    label_sets = []
    for _, row in frame[eval_cols].iterrows():
        label_sets.append([
            f"{col}={row[col]}"
            for col in eval_cols
            if pd.notna(row[col]) and row[col] != 'N/A'
        ])
    return label_sets


def main(input_csv=None, output_csv=None, val_fraction=0.2, random_state=42):
    """Carve a stratified validation fold out of the existing 'train' rows.

    Reads the CSV, relabels `val_fraction` of the rows whose 'split' is
    'train' as 'val' (multilabel-stratified on the evaluation columns) and
    writes the whole table to a new CSV. Rows with any other split value are
    left untouched.

    Note: missing scores are no longer turned into a "column=nan" label, so
    the rows selected for 'val' can differ from files produced by the earlier
    version of this function.

    Args:
        input_csv: CSV to read. Defaults to the global INPUT_CSV.
        output_csv: CSV to write. Defaults to the global OUTPUT_CSV.
        val_fraction: Share of the train rows to relabel as 'val'.
        random_state: Seed handed to the splitter.

    Raises:
        ValueError: if none of `component_names` is a column of the CSV, or
            if no row has split == 'train'.
    """
    # Resolve the defaults at call time so the globals only need to exist
    # when main() actually runs, not when this cell is defined.
    if input_csv is None:
        input_csv = INPUT_CSV
    if output_csv is None:
        output_csv = OUTPUT_CSV

    # Load data
    df = pd.read_csv(input_csv)

    # Identify evaluation columns (must match the list used originally)
    # Assumes `component_names` is already defined in this environment
    eval_cols = [col for col in component_names if col in df.columns]
    if not eval_cols:
        raise ValueError(
            f"None of the expected evaluation columns were found in {input_csv}"
        )

    # Build multilabel targets array Y (one indicator column per distinct
    # "column=value" label seen anywhere in the file)
    mlb = MultiLabelBinarizer()
    Y = mlb.fit_transform(_row_label_sets(df, eval_cols))

    # Filter to the existing train set. A boolean mask selects rows of Y by
    # position, so this does not rely on index labels equalling positions.
    train_mask = (df['split'] == 'train').to_numpy()
    if not train_mask.any():
        raise ValueError(f"No rows with split == 'train' in {input_csv}")
    train_df = df[train_mask]
    train_idx = train_df.index
    Y_train = Y[train_mask]

    # Stratified split: reserve `val_fraction` of train as validation
    splitter = MultilabelStratifiedShuffleSplit(
        n_splits=1, test_size=val_fraction, random_state=random_state
    )
    # The splitter yields positions relative to train_df; map them back to
    # the index labels of the full frame before assigning.
    _, val_rel_idx = next(splitter.split(train_df, Y_train))
    val_idx = train_idx[val_rel_idx]

    # Assign the 'val' label alongside existing 'train'/'test'
    df.loc[val_idx, 'split'] = 'val'

    # Save to new CSV
    df.to_csv(output_csv, index=False)
    print(f"✅ Saved new file with validation split to: {output_csv}")

if __name__ == '__main__':
    main()


✅ Saved new file with validation split to: /Users/mkrasnow/Desktop/montesa/new/formattedData/val_peru_cleaned_transcripts.csv


In [5]:
import pandas as pd 

output_file = '/Users/mkrasnow/Desktop/montesa/new/formattedData/test_only_peru_cleaned_transcripts.csv'

# Reload the table that the validation-split step wrote to OUTPUT_CSV
df = pd.read_csv(OUTPUT_CSV)

# Keep only the rows whose split is 'test'; their split value is left as-is.
# (`df_val` is the original variable name and is kept so any later reference
# to it still resolves, even though it holds 'test' rows.)
df_val = df.loc[df['split'].eq('test')].copy()

# Write the test-only subset to its own CSV
df_val.to_csv(output_file, index=False)
print(f"Filtered and updated rows saved to {output_file}")

Filtered and updated rows saved to /Users/mkrasnow/Desktop/montesa/new/formattedData/test_only_peru_cleaned_transcripts.csv
