In [None]:
import pandas as pd
import h5py
import os

# ---------------------- Parameters ---------------------- #
# Root data folder and the folder that is scanned for story ".txt" files.
data_dir = "data"
text_dir = data_dir + "/text"

# Per-subject fMRI file templates. The "{}" that survives in each value is
# later filled in with a subject number via str.format().
_fmri_pattern = data_dir + "/subject{{}}_{task}_fmri_data_{split}.hdf"
fmri_reading_trn = _fmri_pattern.format(task="reading", split="trn")
fmri_reading_val = _fmri_pattern.format(task="reading", split="val")
fmri_listening_trn = _fmri_pattern.format(task="listening", split="trn")
fmri_listening_val = _fmri_pattern.format(task="listening", split="val")

# Subject numbers to process.
subjects = [1, 2, 3, 5, 7, 8]

# ---------------------- fMRI Data Shapes (From Previous Code) ---------------------- #
# NOTE: the `data_shapes` dictionary must already be defined before this cell runs; it is populated by the earlier code in initial_notebook.

# ---------------------- Process Text Data ---------------------- #
# Build a mapping from story name (the file name without its ".txt" suffix)
# to the full contents of that file, for every ".txt" file in `text_dir`.
text_data = {}
# os.listdir() returns entries in arbitrary order; sorting keeps the order of
# `text_data` (and of everything derived from it) reproducible between runs.
for filename in sorted(os.listdir(text_dir)):
    if not filename.endswith(".txt"):
        continue
    story_name = filename[:-4]  # Remove '.txt' extension
    # Explicit encoding so the result does not depend on the platform default.
    # Assumes the story files are UTF-8 encoded -- TODO confirm.
    with open(os.path.join(text_dir, filename), 'r', encoding='utf-8') as f:
        text_data[story_name] = f.read()

# ---------------------- Create DataFrame ---------------------- #
# One row is produced per (subject, story, task, split) combination for which
# the story has an entry in `data_shapes` and the subject's fMRI file exists.
# NOTE: `data_shapes` is not defined in this cell; it must already exist in
# the session (see the note in the section above).

# Explicit (task, split) -> file-name template table. This replaces an eval()
# on a constructed variable name: same lookup, but no dynamic code execution
# and a typo shows up as a plain KeyError/NameError here instead of inside
# eval. Insertion order matches the original task/split iteration order.
fmri_templates = {
    ('reading', 'trn'): fmri_reading_trn,
    ('reading', 'val'): fmri_reading_val,
    ('listening', 'trn'): fmri_listening_trn,
    ('listening', 'val'): fmri_listening_val,
}

# Fixed column list so that the frame (and the file written from it) keeps
# its header even when no row qualifies.
df_columns = ['subject', 'story_name', 'text', 'task', 'split', 'fmri_shape']

all_data = []
for subject in subjects:
    for story_name, story_text in text_data.items():
        # The shape lookup only depends on the story, so test it once here
        # instead of once per task/split.
        if story_name not in data_shapes:
            continue
        fmri_shape = data_shapes[story_name]

        for (task, split), fmri_filename in fmri_templates.items():
            fmri_file = fmri_filename.format(subject)

            # Check if the corresponding fMRI data exists.
            # NOTE(review): this only checks that the subject's file is on
            # disk, not that this particular story is stored inside it.
            if os.path.isfile(fmri_file):
                all_data.append({
                    'subject': subject,
                    'story_name': story_name,
                    'text': story_text,
                    'task': task,
                    'split': split,
                    'fmri_shape': fmri_shape,
                })

# Create the final DataFrame
df = pd.DataFrame(all_data, columns=df_columns)

# ---------------------- Save DataFrame ---------------------- #
# NOTE: despite the ".hdf" suffix this is a plain CSV file (written with
# to_csv). The name is kept unchanged because later code reads this exact
# path with read_csv. Being CSV, `fmri_shape` is stored in its string form.
df.to_csv("data/df_text.hdf", index=False)


In [None]:
import pandas as pd
import os
import aeneas  # Make sure you have Aeneas installed: pip install aeneas 
import wave
from tqdm import tqdm  # Optional, for progress bar

# ---------------------- Parameters ---------------------- #
# Folder layout used by this cell.
data_dir = "data"
audio_dir = data_dir + "/audio"                  # input recordings
transcripts_dir = "stimuli"                      # story transcripts (.txt)
aligned_audio_dir = data_dir + "/aligned_audio"  # output recordings

# Per-subject fMRI file templates ("{}" is the subject-number placeholder).
# Just keeping these for reference.
_fmri_pattern = data_dir + "/subject{{}}_{task}_fmri_data_{split}.hdf"
fmri_reading_trn = _fmri_pattern.format(task="reading", split="trn")
fmri_reading_val = _fmri_pattern.format(task="reading", split="val")
fmri_listening_trn = _fmri_pattern.format(task="listening", split="trn")
fmri_listening_val = _fmri_pattern.format(task="listening", split="val")

# Subject numbers.
subjects = [1, 2, 3, 5, 7, 8]

# ---------------------- Load DataFrame ---------------------- #
# NOTE: despite the ".hdf" suffix, this path is read (and re-written below)
# as plain CSV.
df = pd.read_csv("data/df_text.hdf")


# ---------------------- Alignment helpers ---------------------- #
def _align_transcript(audio_path, transcript_path):
    """Force-align a transcript with a recording and return fragment times.

    NOTE(review): the previous code called aeneas.TextFileParser,
    aeneas.AudioFileParser and aeneas.SentenceAligner, which are not part of
    the documented aeneas API. This uses the documented Task / ExecuteTask
    entry points instead -- verify against the installed aeneas version.

    Args:
        audio_path: Path of the audio file to align.
        transcript_path: Path of the plain-text transcript.

    Returns:
        A list of (begin_seconds, end_seconds) float pairs, one per fragment
        of the computed sync map, in sync-map order.
    """
    # Imported here because a bare `import aeneas` does not load these
    # submodules.
    from aeneas.executetask import ExecuteTask
    from aeneas.task import Task

    # Assumes English stimuli and a "plain" transcript (one fragment per
    # line) -- TODO confirm both against the files in `transcripts_dir`.
    job = Task(
        config_string="task_language=eng|is_text_type=plain|os_task_file_format=json"
    )
    job.audio_file_path_absolute = os.path.abspath(audio_path)
    job.text_file_path_absolute = os.path.abspath(transcript_path)
    ExecuteTask(job).execute()

    return [
        (float(fragment.begin), float(fragment.end))
        for fragment in job.sync_map_leaves()
    ]


def _write_intervals(source_path, target_path, intervals):
    """Copy the given time intervals of one WAV file into a new WAV file.

    Args:
        source_path: Path of the WAV file to read from.
        target_path: Path of the WAV file to create.
        intervals: Iterable of (begin_seconds, end_seconds) pairs; the
            matching frames are written back-to-back in the given order.
    """
    with wave.open(source_path, 'rb') as src, wave.open(target_path, 'wb') as dst:
        dst.setparams(src.getparams())  # Copy params of original audio
        rate = src.getframerate()
        total_frames = src.getnframes()
        for begin, end in intervals:
            # Clamp to the file so rounding at the edges cannot make
            # setpos() raise or produce a negative frame count.
            first = min(max(int(round(begin * rate)), 0), total_frames)
            last = min(max(int(round(end * rate)), first), total_frames)
            src.setpos(first)
            dst.writeframes(src.readframes(last - first))


# ---------------------- Process Audio Data ---------------------- #
# Start from an empty (object-typed) column so that rows without an aligned
# file are left blank and string paths can be assigned row by row.
df['aligned_audio_file'] = None

# Output files already produced in this run. The frame has one row per split
# for the same (story, task, subject) recording, so without this the same
# recording would be aligned once per split.
already_aligned = set()

for index, row in tqdm(df.iterrows(), desc="Aligning audio", total=df.shape[0]):
    subject = row['subject']
    story_name = row['story_name']
    task = row['task']  # Assuming that you want to align only listening tasks
    if task != 'listening':
        continue

    audio_filename = os.path.join(audio_dir, f"{story_name}_{task}_{subject}.wav")
    transcript_filename = os.path.join(transcripts_dir, f"{story_name}.txt")
    aligned_audio_filename = os.path.join(aligned_audio_dir, f"{story_name}_{task}_{subject}.wav")

    if not (os.path.isfile(audio_filename) and os.path.isfile(transcript_filename)):
        continue

    if aligned_audio_filename not in already_aligned:
        # Create necessary directories if they don't exist
        os.makedirs(aligned_audio_dir, exist_ok=True)

        # Align audio and text, then save the new audio file made of the
        # aligned fragments.
        intervals = _align_transcript(audio_filename, transcript_filename)
        _write_intervals(audio_filename, aligned_audio_filename, intervals)
        already_aligned.add(aligned_audio_filename)

    # ---------------------- Update DataFrame ---------------------- #
    # Recorded inside the loop, and only for rows that actually have an
    # aligned file. Previously this ran once after the loop, so only the last
    # row was updated (and it failed on an empty frame).
    df.loc[index, 'aligned_audio_file'] = aligned_audio_filename

df.to_csv("data/df_text.hdf", index=False)  # Update the DataFrame
