In [1]:
#!pip install openai

In [2]:
import pickle


def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# One entry per iteration file (iterations 0 through 5), in iteration order.
all_iter = [
    _read_pickle(f'../GPT_event_share/Pieman/outputs/Pieman_iter_{i}_version__Events.pkl')
    for i in range(6)
]

In [3]:
# Sanity check: display the second element of the second loaded iteration
# (a text passage, per the output below).
all_iter[1][1]

"And one day I'm walking toward the campus center. And out comes the elusive Dean McGowan, architect of a policy to replace Fordham's traditionally working to middle class students with wealthier, more prestigious ones. So I whip out my notebook. And I go up to him and I say: Dean McGowan, is it true that Fordham University plans to raise tuition substantially above the inflation rate? And if so, wouldn't that be a betrayal of its mission?"

In [4]:
import pandas as pd

# Word-level alignment table. The CSV has no header row, so columns are
# numbered 0-3 on load.
align_csv_path = '../gentle/pieman/align.csv'
align_df = pd.read_csv(align_csv_path, header=None)

print(align_df.head())

             0       1          2          3
0            I       i  15.089999  15.169999
1        began   began  15.170000  15.510000
2           my      my  15.509999  15.699999
3  illustrious   <unk>  15.710000  16.310000
4       career  career  16.330000  16.940000


In [5]:
# Scan parameters: number of TRs and the duration of one TR in seconds
n_trs = 282
tr_length = 1.5

# One row per TR. TR numbers are 1-based; each TR's time is its onset,
# so TR 1 starts at 0.0 s and TR k starts at (k - 1) * tr_length.
trs_df = pd.DataFrame({'TR': range(1, n_trs + 1)})
trs_df['Time_seconds'] = (trs_df['TR'] - 1) * tr_length

print(trs_df)

      TR  Time_seconds
0      1           0.0
1      2           1.5
2      3           3.0
3      4           4.5
4      5           6.0
..   ...           ...
277  278         415.5
278  279         417.0
279  280         418.5
280  281         420.0
281  282         421.5

[282 rows x 2 columns]


In [6]:
# Set the lag in seconds (e.g., 4.5s) that is added to every word onset
hemodynamic_lag = 4.5

# Descriptive names for the four alignment columns, in file order
alignment_columns = ['word_raw', 'word_clean', 'start', 'end']

# Build the cleaned table as one chain of non-mutating steps so the cell can be
# re-run safely:
#   * rename maps the first four columns to the names above (a no-op when they
#     are already named), instead of overwriting `.columns`, which fails with a
#     length mismatch once 'start_lagged' has been added by a previous run;
#   * dropna removes rows with any missing value;
#   * assign adds the lag-adjusted start time on a fresh frame, avoiding the
#     SettingWithCopyWarning that assigning into a dropna() result can trigger.
# The original row labels are kept, so gaps in align_df.index show which rows
# were dropped.
align_df = (
    align_df
    .rename(columns=dict(zip(align_df.columns, alignment_columns)))
    .dropna()
    .assign(start_lagged=lambda frame: frame['start'] + hemodynamic_lag)
)

# Sort both DataFrames by time (merge_asof requires sorted keys)
align_df_sorted = align_df.sort_values('start_lagged')
trs_df_sorted = trs_df.sort_values('Time_seconds')

In [7]:
# Give every word the TR whose onset is the latest one at or before the word's
# lag-adjusted start time (a backward as-of join), then discard the TR onset
# column, which is only needed for the matching itself.
asof_keys = {'left_on': 'start_lagged', 'right_on': 'Time_seconds'}
word_with_tr = pd.merge_asof(
    align_df_sorted, trs_df_sorted, direction='backward', **asof_keys
).drop(columns='Time_seconds')

# Preview the first rows
print(word_with_tr.head())

      word_raw word_clean      start        end  start_lagged  TR
0            I          i  15.089999  15.169999     19.589999  14
1        began      began  15.170000  15.510000     19.670000  14
2           my         my  15.509999  15.699999     20.009999  14
3  illustrious      <unk>  15.710000  16.310000     20.210000  14
4       career     career  16.330000  16.940000     20.830000  14


In [8]:
import warnings

# Positions in the flattened token stream that are skipped so the remaining
# tokens line up one-to-one with the rows of word_with_tr.
# NOTE(review): presumably these are the words removed from the alignment table
# by dropna(); confirm rather than relying on the hard-coded values.
dropped_indices = [420, 498, 880]
dropped_lookup = set(dropped_indices)  # constant-time membership test
n_rows = len(word_with_tr)

# Add one 'segment_idx_<k>' column per loaded segmentation variant
for seg_version, segments in enumerate(all_iter):
    segment_ids = []
    flat_tokens = []

    # Flatten the segments into whitespace-separated tokens, remembering which
    # segment each token came from.
    for idx, segment in enumerate(segments):
        words = segment.split()
        flat_tokens.extend(words)
        segment_ids.extend([idx] * len(words))

    # Drop the same bad tokens from both parallel lists
    flat_tokens_cleaned = [tok for i, tok in enumerate(flat_tokens) if i not in dropped_lookup]
    segment_ids_cleaned = [sid for i, sid in enumerate(segment_ids) if i not in dropped_lookup]

    # Too few ids cannot fill the column; fail with a message that names the
    # segmentation instead of pandas' generic length-mismatch error.
    if len(segment_ids_cleaned) < n_rows:
        raise ValueError(
            f'Segmentation {seg_version} yields {len(segment_ids_cleaned)} tokens '
            f'after dropping, but word_with_tr has {n_rows} rows.'
        )

    # Surplus ids are still truncated as before, but no longer silently: a
    # surplus means tokens and aligned words may be out of step.
    if len(segment_ids_cleaned) > n_rows:
        warnings.warn(
            f'Segmentation {seg_version} yields {len(segment_ids_cleaned)} tokens '
            f'after dropping, but word_with_tr has {n_rows} rows; '
            f'extra trailing tokens are ignored.'
        )

    # Add to main dataframe
    colname = f'segment_idx_{seg_version}'
    word_with_tr[colname] = segment_ids_cleaned[:n_rows]


In [16]:
# Note: the 'TR' column is the lagged TR. It was matched on 'start_lagged'
# (word start plus the hemodynamic lag), not on the raw word start time.
output_csv = "pieman_segments.csv"
word_with_tr.to_csv(output_csv, index=False)