In [None]:
import pandas as pd
import json
from tqdm import tqdm

In [None]:
def load_dataset(file_path):
    """
    Load a tabular dataset from a CSV or Excel file.

    The file format is selected from the file extension, which is compared
    case-insensitively so that names such as ``DATA.CSV`` or ``report.XLSX``
    are accepted.

    Args:
        file_path (str or os.PathLike): Path to a ``.csv``, ``.xlsx`` or
            ``.xls`` file.

    Returns:
        pd.DataFrame: The file contents as parsed by pandas.

    Raises:
        ValueError: If the extension is not one of the supported formats.
    """
    # Normalize once: str() lets pathlib.Path inputs through, lower() makes
    # the extension check case-insensitive.
    normalized = str(file_path).lower()

    if normalized.endswith('.csv'):
        return pd.read_csv(file_path)
    if normalized.endswith(('.xlsx', '.xls')):
        return pd.read_excel(file_path)
    raise ValueError("Unsupported file format. Use CSV or Excel.")

In [None]:
import pandas as pd
import json
from tqdm import tqdm
import os

def label_dataset_with_checkpoints(df, checkpoint_path="label_checkpoint.csv", text_column="text"):
    """
    Interactively label a dataset row-by-row with resume capability using checkpoints.

    For each unlabeled row the selected text is printed and also written to
    ``temp.json`` in the current working directory for inspection. The user
    is then prompted for ``0``/``1`` (the label) or ``2`` (stop early). The
    checkpoint CSV is rewritten after every accepted label, so at most the
    row currently being labeled is lost if the session dies.

    Resuming is positional: the number of rows already in the checkpoint is
    used as the index of the next row to label. This assumes the checkpoint
    was produced from the same ``df`` in the same row order.

    Args:
        df (pd.DataFrame): Input DataFrame to label.
        checkpoint_path (str): Path to the checkpoint file (CSV).
        text_column (str): Column to display for labeling. If ``df`` has no
            such column, the ``'candidate_string'`` column is used instead
            (when present) so existing callers that relied on that column
            being shown keep working.

    Returns:
        pd.DataFrame: All rows labeled so far (checkpointed rows plus rows
        labeled in this session), with a 'label' column.
    """
    missing_marker = '[Column not found]'

    # Load checkpoint if it exists
    if os.path.exists(checkpoint_path):
        checkpoint_df = pd.read_csv(checkpoint_path)
        print(f"✅ Loaded checkpoint with {len(checkpoint_df)} labeled rows.")
    else:
        checkpoint_df = pd.DataFrame(columns=list(df.columns) + ['label'])

    # Resolve the column to show once, so the console output and temp.json
    # always agree. Honor text_column first; fall back to the legacy column.
    if text_column in df.columns:
        display_column = text_column
    else:
        display_column = 'candidate_string'

    # Determine where to resume
    start_idx = len(checkpoint_df)
    print(f"Starting from row {start_idx} of {len(df)}.\n")

    for i in tqdm(range(start_idx, len(df)), desc="Labeling Rows"):
        row = df.iloc[i].copy()
        text_value = row.to_dict().get(display_column, missing_marker)

        # Display selected text column
        print(f"\nRow {i}: {text_value}\n")

        # Save current row text to JSON for inspection. default=str keeps
        # NumPy scalars (e.g. int64) from raising TypeError; ensure_ascii=False
        # keeps non-ASCII text readable in the file.
        with open("temp.json", "w", encoding="utf-8") as f:
            json.dump(text_value, f, indent=2, ensure_ascii=False, default=str)

        while True:
            label = input("Enter label (0/1) or 2 to save and exit: ").strip()
            if label in ('0', '1'):
                row['label'] = int(label)
                new_row = pd.DataFrame([row])
                if checkpoint_df.empty:
                    # Concatenating onto an empty frame triggers a dtype
                    # deprecation warning in newer pandas; start from the
                    # new row directly instead.
                    checkpoint_df = new_row.reset_index(drop=True)
                else:
                    checkpoint_df = pd.concat([checkpoint_df, new_row], ignore_index=True)
                checkpoint_df.to_csv(checkpoint_path, index=False)
                break
            elif label == '2':
                # Write explicitly so the message below is true even when no
                # row was labeled and no checkpoint file existed yet.
                checkpoint_df.to_csv(checkpoint_path, index=False)
                print(f"Progress saved to '{checkpoint_path}'. Exiting early.")
                return checkpoint_df
            else:
                print("Invalid input. Please enter 0, 1, or 2.")

    print(f"\n✅ All rows labeled. Final dataset saved to '{checkpoint_path}'.")
    return checkpoint_df

        

In [None]:
# ---- USAGE ----
# Load the dataset to label, then run the interactive labeling loop on it.
file_path = "studio-ousia_luke-base_486_flagged.csv"  # <-- Change this to your file path
df = load_dataset(file_path)
# `df` is rebound to the labeling function's return value. checkpoint_path
# and text_column are not passed, so the function's defaults apply.
df = label_dataset_with_checkpoints(df)