# Combining onset information and LLM segmentation
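#### Steps
1. Load the pickled LLM event segmentations of the Pieman story (one file per iteration).
2. Load the word-level alignment table and drop words that have no start/end time.
3. Build a table of TR onsets and assign each word to the TR in which it starts.
4. Label every word with the index of the segment it belongs to, once per segmentation iteration.
5. Save the combined table to `pieman_segments.csv`.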

#### Works Cited
Michelmann, S., Kumar, M., Norman, K.A. et al. Large language models can segment narrative events similarly to humans. *Behav Res* 57, 39 (2025). https://doi.org/10.3758/s13428-024-02569-z

In [1]:
#!pip install openai

In [2]:
import pickle
# Collect the pickled event segmentation of each iteration (0-5), in order.
# NOTE: pickle.load can execute arbitrary code - only point this at trusted files.
all_iter = []
for iteration in range(6):
    events_path = f'../GPT_event_share/Pieman/outputs/Pieman_iter_{iteration}_version__Events.pkl'
    with open(events_path, 'rb') as handle:
        all_iter.append(pickle.load(handle))

In [3]:
import pandas as pd

# Read the word alignment table; the file has no header row, so pandas
# numbers the columns until they are named explicitly.
alignment_csv = '../gentle/pieman/align.csv'
align_df = pd.read_csv(alignment_csv, header=None)

In [4]:
# Timing parameters
n_trs = 300       # how many TRs to tabulate
tr_length = 1.5   # length of one TR, in seconds

# One row per TR: the 1-based TR number next to the time (in seconds) at which
# that TR begins, i.e. TR 1 -> 0.0, TR 2 -> 1.5, ...
tr_numbers = list(range(1, n_trs + 1))
tr_onsets = [(tr_number - 1) * tr_length for tr_number in tr_numbers]
trs_df = pd.DataFrame({'TR': tr_numbers, 'Time_seconds': tr_onsets})

In [5]:
# Give the four alignment columns readable names, then discard incomplete rows.
# Note that dropna() without a subset removes a row if ANY of its fields is
# missing, and the surviving rows keep their original index labels.
alignment_columns = ['word_raw', 'word_clean', 'start', 'end']
align_df = align_df.set_axis(alignment_columns, axis=1).dropna()

In [6]:
# Tag every word with the TR whose onset is the latest one at or before the
# word's start time. 'start' is used as-is here - no lag/offset is applied.
# merge_asof expects both frames to be sorted on their join keys.
# The TR onset column only serves as the join key, so it is removed again.
word_with_tr = pd.merge_asof(
    left=align_df,
    right=trs_df,
    left_on='start',
    right_on='Time_seconds',
    direction='backward',
).drop(columns='Time_seconds')

In [7]:
import warnings

# Positions (in the flattened token stream) of words whose start and end times
# are not defined. Those rows were dropped from align_df, so the matching tokens
# have to be skipped here as well to keep tokens and rows in step.
# NOTE(review): these positions are hard-coded for the current alignment file -
# re-check them if the alignment or the dropna() step changes.
dropped_indices = [420, 498, 880]
dropped_positions = set(dropped_indices)  # set -> constant-time membership test
n_words = len(word_with_tr)

# One column per loaded segmentation run (taken from all_iter itself rather than
# a hard-coded count, so this cannot drift out of sync with the loader).
for seg_version, segments in enumerate(all_iter):
    # Label each whitespace-separated token with the index of its segment
    segment_ids = [
        idx
        for idx, segment in enumerate(segments)
        for _ in segment.split()
    ]

    # Skip the tokens whose alignment rows were dropped
    segment_ids_cleaned = [
        sid for pos, sid in enumerate(segment_ids) if pos not in dropped_positions
    ]

    # Tokens and aligned words are matched purely by position, so a length
    # mismatch means the two sources are out of step.
    if len(segment_ids_cleaned) < n_words:
        raise ValueError(
            f"Segmentation {seg_version} yields {len(segment_ids_cleaned)} tokens "
            f"but the alignment has {n_words} words; cannot assign segment ids."
        )
    if len(segment_ids_cleaned) > n_words:
        warnings.warn(
            f"Segmentation {seg_version} yields {len(segment_ids_cleaned)} tokens "
            f"but the alignment has {n_words} words; extra trailing tokens are "
            f"ignored and the segment labels may be misaligned."
        )

    # Add to main dataframe (surplus trailing labels are cut off, as before)
    colname = f'segment_idx_{seg_version}'
    word_with_tr[colname] = segment_ids_cleaned[:n_words]


In [8]:
# Save the word table (TR + segment columns) without the row index.
# The TR column reflects real (stimulus) time - no lag has been added.
output_csv = "pieman_segments.csv"
word_with_tr.to_csv(output_csv, index=False)