# jiwer error counting pipeline

In [3]:
import jiwer
from jiwer import wer
import pandas as pd
import os

In [2]:
# standardize string inputs to make more accurate wer counts
def pre_jiwer_standardize(any_string):
    """Normalize a sentence before it is scored with jiwer.

    The string is passed through a jiwer pipeline (ToLowerCase,
    ExpandCommonEnglishContractions, RemoveKaldiNonWords, RemoveWhiteSpace
    with replace_by_space=True, RemoveMultipleSpaces, Strip). Afterwards the
    British spelling "neighbourhood" is rewritten to "neighborhood" so that
    the spelling variant is not counted as a word error.

    Args:
        any_string: The raw sentence (prompt or transcript) as a ``str``.

    Returns:
        The standardized string, e.g. "I'm good neighbourhood" becomes
        'i am good neighborhood'.
    """
    wer_standardize = jiwer.Compose(
        [
            jiwer.ToLowerCase(),
            jiwer.ExpandCommonEnglishContractions(),
            jiwer.RemoveKaldiNonWords(),
            jiwer.RemoveWhiteSpace(replace_by_space=True),
            jiwer.RemoveMultipleSpaces(),
            jiwer.Strip(),
        ]
    )
    standardized = wer_standardize(any_string)

    # The spelling fix runs AFTER lower-casing: doing it on the raw string
    # missed capitalised variants such as "Neighbourhood".
    return standardized.replace("neighbourhood", "neighborhood")

In [3]:
# add a standardized version of a text column (prompt or transcript)
def transform_col_text(df, col_name, new_col_name):
    """Return a copy of ``df`` with a standardized copy of one text column.

    Args:
        df: Input DataFrame; it is not modified.
        col_name: Name of the column holding the raw strings.
        new_col_name: Name of the column that receives the result of
            ``pre_jiwer_standardize`` applied to every value of ``col_name``.

    Returns:
        A new DataFrame containing all original columns plus ``new_col_name``.
    """
    result = df.copy()
    standardized = result[col_name].apply(pre_jiwer_standardize)
    result.loc[:, new_col_name] = standardized
    return result

In [4]:
# add a column with the row-wise jiwer word error rate between the strings in col1 and col2
def wer_transformed(df, wer_col, col1, col2):
    """Return a copy of ``df`` with a per-row jiwer WER column.

    Args:
        df: Input DataFrame; it is not modified.
        wer_col: Name of the column that receives the error rates.
        col1: Column passed as the first argument to ``wer`` for each row
            (the prompt text in this notebook).
        col2: Column passed as the second argument to ``wer`` for each row
            (the transcript in this notebook).

    Returns:
        A new DataFrame containing all original columns plus ``wer_col``.
    """
    def row_wer(row):
        return wer(row[col1], row[col2])

    scored = df.copy()
    scored.loc[:, wer_col] = df.apply(row_wer, axis=1)
    return scored

## testing functions

In [5]:
# test: the result should be lower-cased, have the contraction expanded,
# and use the "neighborhood" spelling
pre_jiwer_standardize("I'm good neighbourhood")

'i am good neighborhood'

In [6]:
# test: load the Whisper test CSV and add a standardized copy of the prompt column
df_orig = pd.read_csv('test_data/oishani_test_67d9_whisper.csv')
df_transformed = transform_col_text(df_orig, 'prompt_text', 'transformed_prompt_text')
df_transformed.head(2)

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,session_id,participant_id,stimuli_presented,response_type,response_name,response_value,lang_bg_q3_resp,lang_bg_q3_english_only,prompt_text,whisper_transcript,whisper_jiwer_simple,prompt_type,transformed_prompt_text
0,0,181,181,67d94e1a42f12a9364065df2,9e4c7d9ba7377ea14d0e228c38df883deaf62f3f4d1837...,ExpPrompt10,audio-response,ExpPromptResp10,53c7dbc41f259d6f7157d364-12-ExpPromptResp10.mp4,['Other (you may specify in the next slide)'],False,The beige hue on the waters of the loch impres...,It reached Hugh on the waters of the loch impr...,0.192308,Exp,the beige hue on the waters of the loch impres...
1,1,182,182,67d94e1a42f12a9364065df2,9e4c7d9ba7377ea14d0e228c38df883deaf62f3f4d1837...,DialPrompt3,audio-response,DialPromptResp3,53c7dbc41f259d6f7157d364-13-DialPromptResp3.mp4,['Other (you may specify in the next slide)'],False,She gives it a one of ten when often it deserv...,She gives it a one of ten when often it deserv...,0.0,Dial,she gives it a one of ten when often it deserv...


In [7]:
# test: add a standardized copy of the Whisper transcript column as well,
# on top of the already-transformed frame
df_transformed = transform_col_text(df_transformed, 'whisper_transcript', 'transformed_whisper_transcript')
df_transformed.head(2)

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,session_id,participant_id,stimuli_presented,response_type,response_name,response_value,lang_bg_q3_resp,lang_bg_q3_english_only,prompt_text,whisper_transcript,whisper_jiwer_simple,prompt_type,transformed_prompt_text,transformed_whisper_transcript
0,0,181,181,67d94e1a42f12a9364065df2,9e4c7d9ba7377ea14d0e228c38df883deaf62f3f4d1837...,ExpPrompt10,audio-response,ExpPromptResp10,53c7dbc41f259d6f7157d364-12-ExpPromptResp10.mp4,['Other (you may specify in the next slide)'],False,The beige hue on the waters of the loch impres...,It reached Hugh on the waters of the loch impr...,0.192308,Exp,the beige hue on the waters of the loch impres...,it reached hugh on the waters of the loch impr...
1,1,182,182,67d94e1a42f12a9364065df2,9e4c7d9ba7377ea14d0e228c38df883deaf62f3f4d1837...,DialPrompt3,audio-response,DialPromptResp3,53c7dbc41f259d6f7157d364-13-DialPromptResp3.mp4,['Other (you may specify in the next slide)'],False,She gives it a one of ten when often it deserv...,She gives it a one of ten when often it deserv...,0.0,Dial,she gives it a one of ten when often it deserv...,she gives it a one of ten when often it deserv...


In [8]:
# test: jiwer WER between the two standardized columns
# (prompt as first argument, transcript as second)
df_wer = wer_transformed(df_transformed, 'whisper_jiwer_transformed', 'transformed_prompt_text', 'transformed_whisper_transcript')
df_wer

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,session_id,participant_id,stimuli_presented,response_type,response_name,response_value,lang_bg_q3_resp,lang_bg_q3_english_only,prompt_text,whisper_transcript,whisper_jiwer_simple,prompt_type,transformed_prompt_text,transformed_whisper_transcript,whisper_jiwer_transformed
0,0,181,181,67d94e1a42f12a9364065df2,9e4c7d9ba7377ea14d0e228c38df883deaf62f3f4d1837...,ExpPrompt10,audio-response,ExpPromptResp10,53c7dbc41f259d6f7157d364-12-ExpPromptResp10.mp4,['Other (you may specify in the next slide)'],False,The beige hue on the waters of the loch impres...,It reached Hugh on the waters of the loch impr...,0.192308,Exp,the beige hue on the waters of the loch impres...,it reached hugh on the waters of the loch impr...,0.192308
1,1,182,182,67d94e1a42f12a9364065df2,9e4c7d9ba7377ea14d0e228c38df883deaf62f3f4d1837...,DialPrompt3,audio-response,DialPromptResp3,53c7dbc41f259d6f7157d364-13-DialPromptResp3.mp4,['Other (you may specify in the next slide)'],False,She gives it a one of ten when often it deserv...,She gives it a one of ten when often it deserv...,0.0,Dial,she gives it a one of ten when often it deserv...,she gives it a one of ten when often it deserv...,0.0
2,2,183,183,67d94e1a42f12a9364065df2,9e4c7d9ba7377ea14d0e228c38df883deaf62f3f4d1837...,LocPrompt18,audio-response,LocPromptResp18,53c7dbc41f259d6f7157d364-14-LocPromptResp18.mp4,['Other (you may specify in the next slide)'],False,"What da, why you keep laughing?",Why you keep laughing?,0.5,Loc,"what da, why you keep laughing?",why you keep laughing?,0.333333
3,3,184,184,67d94e1a42f12a9364065df2,9e4c7d9ba7377ea14d0e228c38df883deaf62f3f4d1837...,ExpPrompt5,audio-response,ExpPromptResp5,53c7dbc41f259d6f7157d364-15-ExpPromptResp5.mp4,['Other (you may specify in the next slide)'],False,You haven't even been to the In-n-Out in the O...,You haven't even been to the In-N-Out in the O...,0.166667,Exp,you have not even been to the in-n-out in the ...,you have not even been to the in-n-out in the ...,0.0
4,4,185,185,67d94e1a42f12a9364065df2,9e4c7d9ba7377ea14d0e228c38df883deaf62f3f4d1837...,HardPrompt7,audio-response,HardPromptResp7,53c7dbc41f259d6f7157d364-16-HardPromptResp7.mp4,['Other (you may specify in the next slide)'],False,"Though it's a bastion of wonder, the bougie bu...","Though it's a bastion of wonder, the Boogie Bu...",0.25,Hard,"though it is a bastion of wonder, the bougie b...","though it is a bastion of wonder, the boogie b...",0.153846
5,5,186,186,67d94e1a42f12a9364065df2,9e4c7d9ba7377ea14d0e228c38df883deaf62f3f4d1837...,LocPrompt31,audio-response,LocPromptResp31,53c7dbc41f259d6f7157d364-17-LocPromptResp31.mp4,['Other (you may specify in the next slide)'],False,She is leaving for Bangalore tomorrow.,She is leaving for Bangalore tomorrow.,0.0,Loc,she is leaving for bangalore tomorrow.,she is leaving for bangalore tomorrow.,0.0


In [9]:
# inspect the LocPrompt18 row, whose WER changed after standardization
# (whisper_jiwer_simple 0.5 vs whisper_jiwer_transformed 0.333333)
df_wer.loc[df_wer['stimuli_presented'] == 'LocPrompt18']['session_id']

2    67d94e1a42f12a9364065df2
Name: session_id, dtype: object

In [10]:
df_wer.loc[df_wer['stimuli_presented'] == 'LocPrompt18']['response_value']

2    53c7dbc41f259d6f7157d364-14-LocPromptResp18.mp4
Name: response_value, dtype: object

In [11]:
df_wer.loc[df_wer['stimuli_presented'] == 'LocPrompt18']['transformed_whisper_transcript']

2    why you keep laughing?
Name: transformed_whisper_transcript, dtype: object

In [12]:
df_wer.loc[df_wer['stimuli_presented'] == 'LocPrompt18']['transformed_prompt_text']

2    what da, why you keep laughing?
Name: transformed_prompt_text, dtype: object

In [13]:
df_wer.loc[df_wer['stimuli_presented'] == 'LocPrompt18']['participant_id']

2    9e4c7d9ba7377ea14d0e228c38df883deaf62f3f4d1837...
Name: participant_id, dtype: object

## non-jiwer old error counter (my own script)

This script might be more reliable for non-words, but it should give results very similar to jiwer's.

In [14]:
#error finder
def to_list(sentence):
    """Split ``sentence`` on runs of whitespace and return the tokens as a list."""
    tokens = sentence.split()
    return tokens

def depunctuate(word_list):
    """Strip punctuation from every word and lower-case it.

    Args:
        word_list: Iterable of word strings.

    Returns:
        A new list, in the same order, where each word has every character
        from the punctuation set below removed and is lower-cased. A word
        made only of punctuation becomes an empty string.
    """
    punc = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    # Deletion table: every character in ``punc`` maps to "removed".
    strip_table = str.maketrans("", "", punc)
    return [token.translate(strip_table).lower() for token in word_list]

def equalize_list_lengths(targ_list, err_list):
    """Pad the shorter list with empty strings so both have the same length.

    Both lists are modified in place; the very same list objects are returned.

    Args:
        targ_list: Words of the target sentence.
        err_list: Words of the error (transcribed) sentence.

    Returns:
        ``[targ_list, err_list]`` after padding.
    """
    gap = len(err_list) - len(targ_list)
    if gap > 0:
        targ_list.extend([""] * gap)
    elif gap < 0:
        err_list.extend([""] * -gap)
    return [targ_list, err_list]

def get_error_words(ready_targ_list, ready_err_list):
    """Cancel out the words two word lists share and return what is left.

    Each target word is matched against at most one occurrence of the same
    word in the error list (order is ignored). Empty-string padding entries
    are dropped from both results.

    Neither input list is modified: the matching is done on a private copy
    of ``ready_err_list``.

    Args:
        ready_targ_list: Words of the target sentence (may contain "").
        ready_err_list: Words of the error sentence (may contain "").

    Returns:
        ``[target_leftovers, error_leftovers]``: the target words with no
        counterpart in the error list, and the error words with no
        counterpart in the target list.
    """
    unmatched_err = list(ready_err_list)  # copy, so the caller's list stays intact
    targ_leftovers = []
    for word in ready_targ_list:
        try:
            unmatched_err.remove(word)  # consume one matching occurrence
        except ValueError:
            targ_leftovers.append(word)

    final_targ = [word for word in targ_leftovers if word]
    final_err = [word for word in unmatched_err if word]

    return [final_targ, final_err]

def counter(final_targ_list, final_err_list):
    """Return the word-level error count: the length of the longer leftover list."""
    n_missing = len(final_targ_list)
    n_extra = len(final_err_list)
    return n_missing if n_missing >= n_extra else n_extra

def err_rate(final_targ_list, final_err_list, target_sentence, error_sentence):
    """Return the word-level error rate relative to the target sentence.

    Args:
        final_targ_list: Target words left unmatched.
        final_err_list: Error-sentence words left unmatched.
        target_sentence: The original target sentence; its whitespace-split
            word count is the denominator.
        error_sentence: Not used in the computation.

    Returns:
        ``counter(final_targ_list, final_err_list)`` divided by the number of
        words in ``target_sentence``.

    Raises:
        ZeroDivisionError: If ``target_sentence`` contains no words.
    """
    n_errors = counter(final_targ_list, final_err_list)
    n_target_words = len(to_list(target_sentence))
    return n_errors / n_target_words

def error_finder(target_sentence, error_sentence):
    """Compare two sentences word by word, ignoring word order.

    Args:
        target_sentence: The sentence that should have been produced (prompt).
        error_sentence: The sentence that was actually produced (transcript).

    Returns:
        A four-item list:
        ``[words missing from the transcription, words in the transcription
        not present in the prompt, total word-level error count,
        word-level error rate]``.
    """
    # Tokenize, then strip punctuation and lower-case every token
    target_words = depunctuate(to_list(target_sentence))
    error_words = depunctuate(to_list(error_sentence))

    # Pad to equal length, then cancel out the words both sentences share
    padded_target, padded_error = equalize_list_lengths(target_words, error_words)
    final_targ_list, final_err_list = get_error_words(padded_target, padded_error)
    # useful for analysis and debugging:
    # print("List of words missing from prompt in transcription: ", final_targ_list)
    # print("List of words in transcription not present in prompt: ", final_err_list)

    # Total word-level errors and the rate relative to the target length
    final_err_count = counter(final_targ_list, final_err_list)
    final_err_rate = err_rate(final_targ_list, final_err_list, target_sentence, error_sentence)

    return [final_targ_list, final_err_list, final_err_count, final_err_rate]

In [15]:
# sentence test: "what" and "da" are missing from the transcript,
# so we expect 2 errors over the 6 prompt words
targ = "what da, why you keep laughing?"
err = "why you keep laughing?"
error_finder(targ, err)

[['what', 'da'], [], 2, 0.3333333333333333]

In [16]:
# add a column with the row-wise custom (non-jiwer) error rate between the strings in col1 and col2
def custom_wer(df, wer_col, col1, col2):
    """Return a copy of ``df`` with a per-row custom error-rate column.

    Args:
        df: Input DataFrame; it is not modified.
        wer_col: Name of the column that receives the error rates.
        col1: Column holding the target sentences.
        col2: Column holding the error (transcribed) sentences.

    Returns:
        A new DataFrame containing all original columns plus ``wer_col``,
        filled with the error rate (item 3 of ``error_finder``'s result).
    """
    def row_error_rate(row):
        return error_finder(row[col1], row[col2])[3]

    scored = df.copy()
    scored.loc[:, wer_col] = df.apply(row_error_rate, axis=1)
    return scored

In [17]:
# test: add the custom error rate next to the jiwer columns so they can be compared
df_wer_custom = custom_wer(df_wer, 'whisper_custom_wer', 'transformed_prompt_text', 'transformed_whisper_transcript')
df_wer_custom

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,session_id,participant_id,stimuli_presented,response_type,response_name,response_value,lang_bg_q3_resp,lang_bg_q3_english_only,prompt_text,whisper_transcript,whisper_jiwer_simple,prompt_type,transformed_prompt_text,transformed_whisper_transcript,whisper_jiwer_transformed,whisper_custom_wer
0,0,181,181,67d94e1a42f12a9364065df2,9e4c7d9ba7377ea14d0e228c38df883deaf62f3f4d1837...,ExpPrompt10,audio-response,ExpPromptResp10,53c7dbc41f259d6f7157d364-12-ExpPromptResp10.mp4,['Other (you may specify in the next slide)'],False,The beige hue on the waters of the loch impres...,It reached Hugh on the waters of the loch impr...,0.192308,Exp,the beige hue on the waters of the loch impres...,it reached hugh on the waters of the loch impr...,0.192308,0.115385
1,1,182,182,67d94e1a42f12a9364065df2,9e4c7d9ba7377ea14d0e228c38df883deaf62f3f4d1837...,DialPrompt3,audio-response,DialPromptResp3,53c7dbc41f259d6f7157d364-13-DialPromptResp3.mp4,['Other (you may specify in the next slide)'],False,She gives it a one of ten when often it deserv...,She gives it a one of ten when often it deserv...,0.0,Dial,she gives it a one of ten when often it deserv...,she gives it a one of ten when often it deserv...,0.0,0.0
2,2,183,183,67d94e1a42f12a9364065df2,9e4c7d9ba7377ea14d0e228c38df883deaf62f3f4d1837...,LocPrompt18,audio-response,LocPromptResp18,53c7dbc41f259d6f7157d364-14-LocPromptResp18.mp4,['Other (you may specify in the next slide)'],False,"What da, why you keep laughing?",Why you keep laughing?,0.5,Loc,"what da, why you keep laughing?",why you keep laughing?,0.333333,0.333333
3,3,184,184,67d94e1a42f12a9364065df2,9e4c7d9ba7377ea14d0e228c38df883deaf62f3f4d1837...,ExpPrompt5,audio-response,ExpPromptResp5,53c7dbc41f259d6f7157d364-15-ExpPromptResp5.mp4,['Other (you may specify in the next slide)'],False,You haven't even been to the In-n-Out in the O...,You haven't even been to the In-N-Out in the O...,0.166667,Exp,you have not even been to the in-n-out in the ...,you have not even been to the in-n-out in the ...,0.0,0.0
4,4,185,185,67d94e1a42f12a9364065df2,9e4c7d9ba7377ea14d0e228c38df883deaf62f3f4d1837...,HardPrompt7,audio-response,HardPromptResp7,53c7dbc41f259d6f7157d364-16-HardPromptResp7.mp4,['Other (you may specify in the next slide)'],False,"Though it's a bastion of wonder, the bougie bu...","Though it's a bastion of wonder, the Boogie Bu...",0.25,Hard,"though it is a bastion of wonder, the bougie b...","though it is a bastion of wonder, the boogie b...",0.153846,0.153846
5,5,186,186,67d94e1a42f12a9364065df2,9e4c7d9ba7377ea14d0e228c38df883deaf62f3f4d1837...,LocPrompt31,audio-response,LocPromptResp31,53c7dbc41f259d6f7157d364-17-LocPromptResp31.mp4,['Other (you may specify in the next slide)'],False,She is leaving for Bangalore tomorrow.,She is leaving for Bangalore tomorrow.,0.0,Loc,she is leaving for bangalore tomorrow.,she is leaving for bangalore tomorrow.,0.0,0.0


In [18]:
# show full cell contents (no column-width truncation) so the sentences can be compared by eye
pd.set_option('display.max_colwidth', None)
df_wer.loc[df_wer['stimuli_presented'] == 'ExpPrompt10']['transformed_prompt_text']

0    the beige hue on the waters of the loch impressed all, including the french queen, before she heard that symphony again, just as young arthur wanted
Name: transformed_prompt_text, dtype: object

In [19]:
df_wer.loc[df_wer['stimuli_presented'] == 'ExpPrompt10']['transformed_whisper_transcript']

0    it reached hugh on the waters of the loch impressed all, including the french queen before she heard that symphony again, just as young arthur wanted.
Name: transformed_whisper_transcript, dtype: object

In [20]:
df_wer.loc[df_wer['stimuli_presented'] == 'ExpPrompt5']['transformed_whisper_transcript']

3    you have not even been to the in-n-out in the outback steakhouse neighborhood?
Name: transformed_whisper_transcript, dtype: object

In [21]:
df_wer.loc[df_wer['stimuli_presented'] == 'ExpPrompt5']['transformed_prompt_text']

3    you have not even been to the in-n-out in the outback steakhouse neighborhood?
Name: transformed_prompt_text, dtype: object

In [22]:
df_wer.loc[df_wer['stimuli_presented'] == 'ExpPrompt5']['prompt_text']

3    You haven't even been to the In-n-Out in the Outback Steakhouse neighbourhood? 
Name: prompt_text, dtype: object

In [23]:
df_wer.loc[df_wer['stimuli_presented'] == 'ExpPrompt5']['whisper_transcript']

3    You haven't even been to the In-N-Out in the Outback Steakhouse neighborhood?
Name: whisper_transcript, dtype: object

# LASR Test Files

#### Get sentences (bad numbering) from participant folder into a dataframe

Here, the sentences are just numbered 1 through to the end, not aligned to the actual target sentence codes. After their alignment is fixed, the rest can be filled with NA values to make sure the target sentence and error sentences are correctly aligned.

In [24]:
def create_participant_csv(folder_path): # example: 'SAUCE_targetSentence/t34'
    """Collect a participant's numbered sentence files into a DataFrame and a CSV.

    Only files named ``<number>.txt`` are read. Everything else in the folder
    (``timestamps_log.txt``, a combined ``<folder>.txt``, a previously written
    CSV, sub-folders, files without an extension, ...) is ignored instead of
    raising an error.

    Args:
        folder_path: Path of the participant folder, eg 'SAUCE_targetSentence/t34'.

    Returns:
        A DataFrame with the columns ``file_num`` (the number in the file
        name, ascending) and ``sentence`` (the full content of that file).
        The same frame is also written to ``<folder_path>/df_<folder>.csv``.
    """
    folder_name = os.path.split(folder_path)[1] # get folder name, eg: t34

    # (number, actual file name) for every numbered .txt file; keeping the
    # real name means zero-padded names such as 01.txt can still be opened
    numbered_files = []
    for entry in os.listdir(folder_path):
        stem, ext = os.path.splitext(entry)
        if ext == '.txt' and stem.isdecimal():
            numbered_files.append((int(stem), entry))
    numbered_files.sort() # numeric order: 1, 2, ..., 10 (not 1, 10, 2)

    file_nums = []
    sentences = []
    for file_num, file_name in numbered_files:
        with open(os.path.join(folder_path, file_name)) as f:
            sentences.append(f.read())
        file_nums.append(file_num)

    df = pd.DataFrame({'file_num': file_nums, 'sentence': sentences})

    df.to_csv(os.path.join(folder_path, f'df_{folder_name}.csv'))

    return df

In [25]:
# test: build the sentence table (and its CSV) for one participant folder
df = create_participant_csv('test_data/t34')
df

Unnamed: 0,file_num,sentence
0,1,The monkey ate the shark.
1,2,The runky ate the shark.
2,3,Mom likes to use the elevator instead of the stairs.
3,4,Pom likes to use the elevator instead of the dragon.
4,5,Mom likes to use the elevator instead of the dragon.
...,...,...
204,205,"On Christmas, Mommy left cookies for Santa."
205,206,"On Christmas, Mommy left cookies for us."
206,207,There is one bathroom for the boys and one for aunts.
207,208,There is one hearth room for the boys and one for the aunts.


In [26]:
# NOTE(review): df['file_num'][1:6] holds the VALUES 2..6, and .loc treats them as
# index labels, so this displays the rows labelled 2-6 (file_num 3-7), not the rows
# at positions 1-5. Use df.iloc[1:6] if a positional slice was intended.
df.loc[df['file_num'][1:6]]

Unnamed: 0,file_num,sentence
2,3,Mom likes to use the elevator instead of the stairs.
3,4,Pom likes to use the elevator instead of the dragon.
4,5,Mom likes to use the elevator instead of the dragon.
5,6,The king wore a shiny gold crown.
6,7,The king wore a shiny gold box.


#### Creating single txt files for each folder

This goes through the numbered txt files in each participant's folder and appends them, in numeric order, to a single txt file in that same folder. This makes it easy to copy and paste the sentences into an Excel sheet for manual alignment, by dragging and dropping them next to the correct target sentences.

In [27]:
def get_one_txt_in_order(participant_folder): # eg, SAUCE_targetSentence/t01
    """Combine a participant's numbered .txt files into ``<folder>/<folder>.txt``.

    The content of every file named ``<number>.txt`` is written to the
    combined file in ascending numeric order, each followed by a newline.
    Other entries (``timestamps_log.txt``, notes, CSVs, ...) are ignored
    rather than raising an error. If the combined file already exists,
    nothing is done.

    All input files are read before the output file is created, so a failed
    read cannot leave a partial combined file behind that would block a
    later re-run.

    Args:
        participant_folder: Path of the participant folder,
            eg 'SAUCE_targetSentence/t01'.

    Returns:
        None.
    """
    sauce_less_folder = os.path.split(participant_folder)[1] # t01
    participant_txt = f'{sauce_less_folder}.txt' # t01.txt
    final_txt_path = os.path.join(participant_folder, participant_txt) # SAUCE_targetSentence/t01/t01.txt

    if os.path.exists(final_txt_path): # combined file already created: leave it alone
        return

    # (number, actual file name) pairs, so the files are written in numeric
    # order; the numeric check also leaves out timestamps_log.txt
    numbered_files = []
    for file in os.listdir(participant_folder):
        stem, ext = os.path.splitext(file)
        if ext == '.txt' and file != participant_txt and stem.isdecimal():
            numbered_files.append((int(stem), file))
    numbered_files.sort()

    lines = []
    for _, file in numbered_files: # eg, 1.txt, 2.txt, 3.txt, ...
        file_path = os.path.join(participant_folder, file) # eg, SAUCE_targetSentence/t01/1.txt
        with open(file_path, 'r') as input_file:
            lines.append(input_file.read() + "\n")

    with open(final_txt_path, 'w') as output_file:
        output_file.writelines(lines)
    

In [28]:
get_one_txt_in_order('SAUCE_targetSentence/t09')

In [29]:
def participant_txt_loop(folder): #eg, SAUCE_targetSentence
    """Run ``get_one_txt_in_order`` on every participant folder inside ``folder``.

    Entries named ``.DS_Store`` or ``.ipynb_checkpoints`` are skipped, and so
    is anything that is not a directory: ``get_one_txt_in_order`` lists the
    content of the path it is given, which fails for a regular file.

    Args:
        folder: Path of the folder holding one sub-folder per participant
            (t01, t02, etc.).

    Returns:
        None.
    """
    skipped_names = (".DS_Store", ".ipynb_checkpoints")

    for par_folder in os.listdir(folder): #t01, t02, etc.
        par_path = os.path.join(folder, par_folder)
        if par_folder in skipped_names or not os.path.isdir(par_path):
            continue
        get_one_txt_in_order(par_path)

In [30]:
participant_txt_loop('SAUCE_targetSentence')

In [31]:
def without_newline_get_one_txt(participant_folder): # eg, SAUCE_targetSentence/t01
    """Combine numbered .txt files into ``<folder>/<folder>.txt`` without adding newlines.

    The content of every file named ``<number>.txt`` is written to the
    combined file in ascending numeric order, exactly as read (no separator
    is inserted between files). Other entries (``timestamps_log.txt``, notes,
    CSVs, ...) are ignored rather than raising an error. If the combined
    file already exists, nothing is done.

    All input files are read before the output file is created, so a failed
    read cannot leave a partial combined file behind that would block a
    later re-run.

    Args:
        participant_folder: Path of the participant folder,
            eg 'SAUCE_targetSentence/t01'.

    Returns:
        None.
    """
    sauce_less_folder = os.path.split(participant_folder)[1] # t01
    participant_txt = f'{sauce_less_folder}.txt' # t01.txt
    final_txt_path = os.path.join(participant_folder, participant_txt) # SAUCE_targetSentence/t01/t01.txt

    if os.path.exists(final_txt_path): # combined file already created: leave it alone
        return

    # (number, actual file name) pairs, so the files are written in numeric
    # order; the numeric check also leaves out timestamps_log.txt
    numbered_files = []
    for file in os.listdir(participant_folder):
        stem, ext = os.path.splitext(file)
        if ext == '.txt' and file != participant_txt and stem.isdecimal():
            numbered_files.append((int(stem), file))
    numbered_files.sort()

    chunks = []
    for _, file in numbered_files: # eg, 1.txt, 2.txt, 3.txt, ...
        file_path = os.path.join(participant_folder, file) # eg, SAUCE_targetSentence/t01/1.txt
        with open(file_path, 'r') as input_file:
            chunks.append(input_file.read())

    with open(final_txt_path, 'w') as output_file:
        output_file.writelines(chunks)

In [32]:
without_newline_get_one_txt('SAUCE_targetSentence/t14')

In [33]:
without_newline_get_one_txt('SAUCE_targetSentence/t28')

In [34]:
without_newline_get_one_txt('SAUCE_targetSentence/t29')

### Get a column of all the text files in a directory with a substring in their name

The input to this function is the substring to look for in the file names as an id (e.g., sauce_1_1); the output is a list of strings, one per matching text file, each string being the sentence contained in that file.


In [None]:
def col_from_folder(participant_folder, id_substring, model): 
    # eg, SAUCE_targetSentence/t1, sauce_1_1
    """Write the participant's numbered .txt files into one combined .txt file.

    NOTE(review): in the code visible here, ``id_substring`` and ``model``
    are never used and nothing is returned. The body performs the same steps
    as ``get_one_txt_in_order``. Selecting files by ``id_substring`` and
    returning their sentences as a list is not implemented in this portion
    (TODO: confirm whether the function is still work in progress).

    Args:
        participant_folder: Path of the participant folder,
            eg 'SAUCE_targetSentence/t1'.
        id_substring: Substring to look for in file names, eg 'sauce_1_1'
            (currently unused).
        model: Currently unused; its meaning cannot be told from this code.
    """

    timest_file = 'timestamps_log.txt'

    sauce_less_folder = os.path.split(participant_folder)[1] # t1
    #print(sauce_less_folder)
    participant_txt = f'{sauce_less_folder}.txt' # t01.txt
    final_txt_path = os.path.join(participant_folder, participant_txt) # SAUCE_targetSentence/t01/t01.txt
    #print(final_txt_path)

    int_txts = [] # to make sure we keep them in order when we write them in
    for file in os.listdir(participant_folder):
        # every other .txt file is assumed to be named <number>.txt; int() raises ValueError otherwise
        if file.endswith('.txt') and file != participant_txt and file != timest_file:
            int_txts.append(int(file.split('.')[0])) # add the int number of txt file
    int_txts.sort() # get int list of files numbered in order
    
    file_list = []
    for file_int in int_txts:
        file_list.append(f'{file_int}.txt') # recreate numbered file names: 1.txt, 2.txt, etc    

    if not os.path.exists(final_txt_path): # if SAUCE_targetSentence/t01/t01.txt not already created
        with open(final_txt_path, 'a') as output_file: # open SAUCE_targetSentence/t01/t01.txt
            for file in file_list: # eg, 1.txt, 2.txt, 3.txt, ...
                
                file_path = os.path.join(participant_folder, file) # eg, SAUCE_targetSentence/t01/1.txt
                
                with open(file_path, 'r') as input_file: # read SAUCE_targetSentence/t01/1.txt
                    text_line = input_file.read()
                    # print(text_line) # print it to check
    
                output_file.write(text_line) # write the line into SAUCE_targetSentence/t01/t01.txt
                output_file.write("\n")