In [1]:
import os
import pandas as pd
import numpy as np

In [6]:
def get_participant_model_column(full_folder_path, participant_id, model):
    """Collect one model's transcripts for a participant into a DataFrame.

    Scans ``full_folder_path`` for files that end in ``.txt``, contain
    ``model`` in their name and do not contain ``'whisper'``. For each match
    the file name is split on ``'_'``: fields 7 and 8 (1-based) joined by
    ``'_'`` form the sentence ID and field 9 is the iteration. The file's
    contents, stripped of surrounding whitespace, are the transcript.

    Parameters
    ----------
    full_folder_path : str
        Directory containing the transcript ``.txt`` files.
    participant_id : str
        Value written to the ``participant_id`` column of every row.
    model : str
        Substring that selects the files to read; also used as the name of
        the transcript column.

    Returns
    -------
    pandas.DataFrame
        Columns ``participant_id``, ``file_name`` (kept for debugging),
        ``Sentence ID``, ``Iteration`` and ``<model>``; one row per matching
        file, ordered by file name. Empty (with the same columns) when no
        file matches.

    Raises
    ------
    ValueError
        If a matching file name has fewer than nine ``'_'``-separated fields,
        so the sentence ID / iteration cannot be extracted.
    """
    rows = []

    # os.listdir returns entries in arbitrary order; sort so the output row
    # order is reproducible between runs and machines.
    for file in sorted(os.listdir(full_folder_path)):
        if 'whisper' in file or not file.endswith('.txt') or model not in file:
            continue

        fields = file.split('_')
        if len(fields) < 9:
            raise ValueError(
                f"cannot parse sentence ID and iteration from file name {file!r}: "
                f"expected at least 9 '_'-separated fields, got {len(fields)}")

        sentence_id = fields[6] + '_' + fields[7]
        iteration = fields[8]

        # Explicit encoding so non-ASCII transcripts do not depend on the
        # platform's locale default.
        with open(os.path.join(full_folder_path, file), 'r', encoding='utf-8') as f:
            transcript = f.read().strip()

        rows.append((participant_id, file, sentence_id, iteration, transcript))

    return pd.DataFrame(
        rows,
        columns=['participant_id', 'file_name', 'Sentence ID', 'Iteration', model])

In [22]:
# function test (disabled: the call is wrapped in a string literal, so running this cell only echoes the text)
'''get_participant_model_column('/Users/cogsci-lasrlab1/Desktop/MTAA_recording/t2/t3_sauce', 
                             't3',
                            'large')'''

"get_participant_model_column('/Users/cogsci-lasrlab1/Desktop/MTAA_recording/t2/t3_sauce', \n                             't3',\n                            'large')"

In [23]:
def merging_models_per_participant(full_folder_path, participant_id):
    """Build one wide table with a transcript column per model.

    Starts from the 'large' model's table (which keeps its ``file_name``
    column) and inner-joins the 'base', 'medium', 'small' and 'tiny'
    transcript columns onto it, matching rows on participant, sentence ID
    and iteration. Because the joins are inner joins, only rows present for
    every model remain in the result.
    """
    join_keys = ['participant_id', 'Sentence ID', 'Iteration']

    combined = get_participant_model_column(full_folder_path, participant_id, 'large')

    for model_name in ('base', 'medium', 'small', 'tiny'):
        per_model = get_participant_model_column(
            full_folder_path, participant_id, model_name)

        # Bring over only the keys and this model's transcript column.
        combined = combined.merge(
            per_model[join_keys + [model_name]],
            on=join_keys,
            how='inner')

    return combined
    

In [24]:
# function test (disabled: the call is wrapped in a string literal, so running this cell only echoes the text)
'''participant_df = merging_models_per_participant('/Users/cogsci-lasrlab1/Desktop/MTAA_recording/t2/t2_sauce', 
                             't2')'''

"participant_df = merging_models_per_participant('/Users/cogsci-lasrlab1/Desktop/MTAA_recording/t2/t2_sauce', \n                             't2')"

In [30]:
def merge_with_targets(participant_df, target_csv_path='target_sentences.csv'):
    """Attach the target-sentence information to a participant's transcripts.

    Parameters
    ----------
    participant_df : pandas.DataFrame
        Table with at least the columns ``participant_id``, ``file_name``,
        ``Sentence ID``, ``Iteration``, ``large``, ``base``, ``medium``,
        ``small`` and ``tiny``.
    target_csv_path : str, optional
        CSV file read with ``pandas.read_csv``; it must provide the columns
        ``Sentence ID``, ``probability`` and ``Expected Transcription``.
        Defaults to ``'target_sentences.csv'`` in the current working
        directory.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``participant_df`` (left join on ``Sentence ID``),
        restricted to and ordered as: participant_id, file_name, Sentence ID,
        Iteration, probability, Expected Transcription, large, base, medium,
        small, tiny. Rows whose sentence ID is not in the target file get
        missing values in the target columns.

    Raises
    ------
    KeyError
        If any of the output columns is missing after the merge.
    """
    target_sentence_df = pd.read_csv(target_csv_path)

    merged_df = participant_df.merge(
        target_sentence_df, on='Sentence ID', how='left')

    # important log: the file_name column was added on 9/1; CSVs produced
    # before that date have not all been regenerated to include it yet.
    output_columns = ['participant_id', 'file_name', 'Sentence ID',
                      'Iteration', 'probability', 'Expected Transcription',
                      'large', 'base', 'medium',
                      'small', 'tiny']

    # The previous trailing ``.reindex()`` (no arguments) was a no-op; the
    # merge already yields a fresh 0..n-1 index, so plain selection suffices.
    return merged_df[output_columns]
    

In [26]:
# function test (disabled: the call is wrapped in a string literal, so running this cell only echoes the text)
'''merge_with_targets(participant_df)'''

'merge_with_targets(participant_df)'

In [27]:
def full_txt_to_df(full_sauce_folder_path, participant_id, destination_path):
    """Run the whole txt-to-table pipeline for one participant.

    Merges the per-model transcripts found in ``full_sauce_folder_path``,
    joins the target-sentence information onto them, writes the result to
    ``destination_path`` as CSV and returns the same DataFrame.
    """
    table = merge_with_targets(
        merging_models_per_participant(full_sauce_folder_path, participant_id))

    # Written with pandas' default to_csv settings (the index is included).
    table.to_csv(destination_path)
    return table

In [28]:
# function test (disabled: the call is wrapped in a string literal, so running this cell only echoes the text)
'''
full_txt_to_df('/Users/cogsci-lasrlab1/Desktop/MTAA_recording/t2/t2_sauce', 
               't2', 
               '/Users/cogsci-lasrlab1/Desktop/MTAA_recording/t2/t2_sauce/t2_sauce_test.csv')
'''

"\nfull_txt_to_df('/Users/cogsci-lasrlab1/Desktop/MTAA_recording/t2/t2_sauce', \n               't2', \n               '/Users/cogsci-lasrlab1/Desktop/MTAA_recording/t2/t2_sauce/t2_sauce_test.csv')\n"

In [29]:
# interactive pipeline for cleaning
# Walks through participants t74..t77 one at a time, asking for confirmation
# before each; anything other than "c" ends the loop immediately.
for i in range(74, 78):
    x = input("press c to continue, any other key to stop")
    if x != "c":
        print('stopped!')
        break

    # Each participant's files live in <root>/<tN>/<tN>_sauce and the output
    # CSV is written into that same folder.
    folder_name = f't{i}'
    input_path = f'/Users/cogsci-lasrlab1/Desktop/MTAA_recording/{folder_name}/{folder_name}_sauce'
    output_path = f'/Users/cogsci-lasrlab1/Desktop/MTAA_recording/{folder_name}/{folder_name}_sauce/{folder_name}_sauce.csv'
    print(f'{input_path} \n {folder_name} \n {output_path}')

    result = full_txt_to_df(input_path, folder_name, output_path)

    # Quick sanity check of what was just written.
    print(result.shape)
    print(result.head(2))

press c to continue, any other key to stop l


stopped!
