In [None]:
import json
import pathlib
from typing import List, Tuple, Dict

import pandas as pd

# Where the cleaned transcripts are read from, and where the tagged copy goes
# (same directory, file name prefixed with "TIMESTAMPED_").
DATA_PATH = pathlib.Path(
    "/Users/mkrasnow/Desktop/montesa/new/formattedData/peru_cleaned_transcripts.csv"
)
OUT_PATH = DATA_PATH.parent / "TIMESTAMPED_peru_cleaned_transcripts.csv"

# Snapshot windows keyed by snapshot id. Each value is a (start, end) pair in
# **seconds**, treated as start ≤ t < end; the bounds are derived from the
# minute marks 4–5, 9–10 and 14–15.
SNAPSHOT_WINDOWS: Dict[int, Tuple[float, float]] = {
    snap_id: (first_minute * 60, last_minute * 60)
    for snap_id, (first_minute, last_minute) in enumerate(
        [(4, 5), (9, 10), (14, 15)], start=1
    )
}

# %% [markdown]
# ## 1  Utility Functions

# %%
def _to_seconds(t: str | float | int) -> float:
    """
    Convert Google‐style time strings or numeric values to seconds.
    Examples
    --------
    "243.300s" → 243.3  
    243.3      → 243.3
    """
    if isinstance(t, (int, float)):
        return float(t)
    t = t.strip().lower().rstrip("s")
    return float(t)


def inject_snapshot_tags(words: List[Dict], windows: Dict[int, Tuple[float, float]]) -> str:
    """
    Build a transcript from word‐level dicts, wrapping the words that start
    inside each snapshot window in ``<SNAPSHOT n>`` … ``</SNAPSHOT n>`` tags.

    A word belongs to a window when ``t_start <= startTime < t_end``. The
    closing tag is emitted immediately before the first word that starts at
    or after ``t_end`` (or at the end of the transcript), so no word from
    outside the window ends up between the tags.

    Parameters
    ----------
    words    : list of {"word": str, "startTime": …, "endTime": …}
               Expected in chronological order. A missing "startTime" counts
               as 0 and a missing "word" as an empty token; "endTime" is not
               consulted.
    windows  : {snap_id: (t_start, t_end), …} with bounds in seconds.

    Returns
    -------
    Annotated transcript string; tokens and tags are separated by single
    spaces.
    """
    # Track which window we are currently inside
    open_windows = {snap_id: False for snap_id in windows}
    pieces: List[str] = []

    for w in words:
        start = _to_seconds(w.get("startTime", 0))
        token = w.get("word", "")

        # Close finished windows *before* emitting this word: a word starting
        # at or after t_end lies outside the window and must not be wrapped.
        # The previous word (with its attached punctuation) stays inside.
        for snap_id, (_, t1) in windows.items():
            if open_windows[snap_id] and start >= t1:
                pieces.append(f"</SNAPSHOT {snap_id}>")
                open_windows[snap_id] = False

        # Open every window this word is the first member of
        for snap_id, (t0, t1) in windows.items():
            if not open_windows[snap_id] and t0 <= start < t1:
                pieces.append(f"<SNAPSHOT {snap_id}>")
                open_windows[snap_id] = True

        pieces.append(token)

    # Close any window still open when the transcript ends mid-window
    pieces.extend(
        f"</SNAPSHOT {snap_id}>"
        for snap_id, is_open in open_windows.items()
        if is_open
    )

    # Join with spaces and collapse every run of spaces to one; a single
    # replace("  ", " ") pass would leave doubles behind after empty tokens.
    joined = " ".join(pieces)
    transcript = " ".join(part for part in joined.split(" ") if part)
    return transcript.strip()


def process_json_column(col: pd.Series) -> List[str]:
    """
    Parse a Series of JSON strings and return a list of annotated transcripts.

    Parameters
    ----------
    col : Series whose cells are JSON documents (as strings) or missing values.

    Returns
    -------
    One entry per cell, in order:
    * the snapshot-tagged transcript when the JSON object has a non-empty
      "words" list (tagged with the module-level SNAPSHOT_WINDOWS);
    * the object's "text" value ("" if absent) when it has no usable "words";
    * the original cell, unchanged, when it is missing, cannot be parsed as
      JSON, or parses to something other than a JSON object.
    """
    annotated_texts = []
    for raw in col:
        if pd.isna(raw):
            annotated_texts.append(raw)
            continue
        try:
            blob = json.loads(raw)
        except (json.JSONDecodeError, TypeError):
            annotated_texts.append(raw)
            continue

        # json.loads also accepts arrays, numbers, strings and null; those have
        # no .get(), so pass them through like any other unusable cell.
        if not isinstance(blob, dict):
            annotated_texts.append(raw)
            continue

        # Prefer 'words'; fall back to the untagged full text (rare)
        words = blob.get("words")
        if words:
            annotated = inject_snapshot_tags(words, SNAPSHOT_WINDOWS)
        else:
            annotated = blob.get("text", "")
        annotated_texts.append(annotated)
    return annotated_texts

# %% [markdown]
# ## 2  Load Data

# %%
# Read the cleaned transcripts into a DataFrame
print(f"Reading: {DATA_PATH}")
df = pd.read_csv(DATA_PATH)

# Columns the annotation step reads from / writes to
expected_cols = {
    "First Audio Transcript_JSON",
    "Last Audio Transcript_JSON",
    "First Audio Transcript Text",
    "Last Audio Transcript Text",
}

# Abort early when the CSV lacks any of them
if missing := expected_cols.difference(df.columns):
    raise KeyError(f"Missing expected columns: {missing}")

print(f"Loaded {df.shape[0]:,} rows.")

# %% [markdown]
# ## 3  Annotate Transcripts

# %%
# Overwrite the plain-text columns with the snapshot-tagged transcripts built
# from the matching word-level JSON columns.
df["First Audio Transcript Text"] = process_json_column(df["First Audio Transcript_JSON"])
df["Last Audio Transcript Text"] = process_json_column(df["Last Audio Transcript_JSON"])

# Optional sanity check: show the first row
display_cols = [
    "First Audio Transcript Text",
    "Last Audio Transcript Text",
]
print("\n--- Preview of annotated text (row 0) ---")
if df.empty:
    # Nothing to show; avoid raising on a frame without a first row
    print("(no rows to preview)")
else:
    # Print each cell directly: Series.to_string() shortens every value to
    # pandas' display column width, so slicing its output to 800 characters
    # never took effect and the preview stopped after a few words.
    for preview_col in display_cols:
        preview_text = str(df[preview_col].iloc[0])
        preview_tail = "…" if len(preview_text) > 800 else ""
        print(f"{preview_col}: {preview_text[:800]}{preview_tail}")

# %% [markdown]
# ## 4  Save Result

# %%
# Write the annotated frame to OUT_PATH without the index column, then report where
df.to_csv(path_or_buf=OUT_PATH, index=False)
print("\nSaved → " + str(OUT_PATH))

Reading: /Users/mkrasnow/Desktop/montesa/new/formattedData/peru_cleaned_transcripts.csv
Loaded 203 rows.

--- Preview of annotated text (row 0) ---
¿El texto? Descriptivo, muy bien. Muy bien (pau...
Unos barcos a nadar ahí y, y, y todos se van a ...…

Saved → /Users/mkrasnow/Desktop/montesa/new/formattedData/TIMESTAMPED_peru_cleaned_transcripts.csv
