In [6]:
# all imports
import pandas as pd
import csv
import os
from pydub import AudioSegment

In [7]:
def read_and_clean_session_results(session_file_name):
    """Load a session's ``results.csv`` and keep only the analysis columns.

    Parameters
    ----------
    session_file_name : str or path-like
        Path to the session data folder that contains ``results.csv``.
        Example:
        '/Users/oishanibandopadhyay/Documents/Revising Honors Project/session-67d94e1a42f12a9364065df2-data'

    Returns
    -------
    pandas.DataFrame
        An independent copy holding, in this order, the columns
        ``session_id``, ``participant_id``, ``stimuli_presented``,
        ``response_type``, ``response_name`` and ``response_value``.

    Raises
    ------
    FileNotFoundError
        If ``results.csv`` does not exist inside ``session_file_name``.
    KeyError
        If any of the six columns listed above is missing from the file.
    """
    results_file_name = os.path.join(session_file_name, 'results.csv')
    df = pd.read_csv(results_file_name)

    # Selecting the wanted columns already discards every other column, so a
    # separate drop() is unnecessary (the previous drop() result was thrown
    # away anyway, and it raised KeyError when an irrelevant column was absent).
    # .copy() detaches the result from the raw frame so later column
    # assignments on it do not trigger SettingWithCopyWarning.
    clean_df = df[['session_id', 'participant_id', 'stimuli_presented',
                   'response_type', 'response_name', 'response_value']].copy()

    # Return cleaned dataframe
    return clean_df

# Test run: load one session folder's results.csv, bind the cleaned frame to
# the notebook-level name `df`, and display it as the cell output.
df = read_and_clean_session_results('/Users/oishanibandopadhyay/Documents/Revising Honors Project/session-67d94e1a42f12a9364065df2-data')
df

Unnamed: 0,session_id,participant_id,stimuli_presented,response_type,response_name,response_value
0,67d94e1a42f12a9364065df2,b5bdd1b55fdf1a07a1cc3907eb6ade997ee127e8708b58...,LangBgQ1,choice-response,LangBgOptionsResp,['Other (you may specify in the next slide)']
1,67d94e1a42f12a9364065df2,b5bdd1b55fdf1a07a1cc3907eb6ade997ee127e8708b58...,LangBgQOther,text-response,LangBgOtherResp,"Korean, Spanish"
2,67d94e1a42f12a9364065df2,b5bdd1b55fdf1a07a1cc3907eb6ade997ee127e8708b58...,LangBgQ2,choice-response,LangBgOptionsResp,['Other (you may specify in the next slide)']
3,67d94e1a42f12a9364065df2,b5bdd1b55fdf1a07a1cc3907eb6ade997ee127e8708b58...,LangBgQOther,text-response,LangBgOtherResp,Korean
4,67d94e1a42f12a9364065df2,b5bdd1b55fdf1a07a1cc3907eb6ade997ee127e8708b58...,LangBgQ3,choice-response,LangBgOptionsResp,['None besides English']
...,...,...,...,...,...,...
250,67d94e1a42f12a9364065df2,9e4c7d9ba7377ea14d0e228c38df883deaf62f3f4d1837...,LocPrompt39,audio-response,LocPromptResp39,53c7dbc41f259d6f7157d364-81-LocPromptResp39.mp4
251,67d94e1a42f12a9364065df2,9e4c7d9ba7377ea14d0e228c38df883deaf62f3f4d1837...,LocPrompt22,audio-response,LocPromptResp22,53c7dbc41f259d6f7157d364-82-LocPromptResp22.mp4
252,67d94e1a42f12a9364065df2,9e4c7d9ba7377ea14d0e228c38df883deaf62f3f4d1837...,LocPrompt29,audio-response,LocPromptResp29,53c7dbc41f259d6f7157d364-83-LocPromptResp29.mp4
253,67d94e1a42f12a9364065df2,9e4c7d9ba7377ea14d0e228c38df883deaf62f3f4d1837...,LocPrompt27,audio-response,LocPromptResp27,53c7dbc41f259d6f7157d364-84-LocPromptResp27.mp4


In [8]:
def add_lang_bg_3(df):
    """Attach each participant's LangBgQ3 answer to every one of their rows.

    Two columns are added to ``df`` (the frame is modified in place and also
    returned):

    * ``lang_bg_q3_resp`` - the ``response_value`` the participant gave for
      the ``LangBgQ3`` stimulus (NaN if the participant has no such row).
    * ``lang_bg_q3_english_only`` - True when that answer is a string
      containing 'None besides English', otherwise False.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``participant_id``, ``stimuli_presented`` and
        ``response_value`` columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the two columns added.
    """
    # Rows where the participant answered LangBgQ3.
    lang_bg_q3_rows = df.loc[df['stimuli_presented'] == 'LangBgQ3']

    # participant_id -> LangBgQ3 answer. Series.map() requires a uniquely
    # indexed mapper, so if a participant has several LangBgQ3 rows only one
    # is kept. The last row is used on the assumption that it is the most
    # recent answer - TODO confirm this is the desired tie-break.
    lang_bg_q3_map = (
        lang_bg_q3_rows
        .drop_duplicates(subset='participant_id', keep='last')
        .set_index('participant_id')['response_value']
    )

    df['lang_bg_q3_resp'] = df['participant_id'].map(lang_bg_q3_map)

    # Binary encoding; the isinstance guard keeps NaN (no answer) as False.
    df['lang_bg_q3_english_only'] = df['lang_bg_q3_resp'].apply(
        lambda resp: isinstance(resp, str) and 'None besides English' in resp
    )

    return df

# Test run: add the LangBgQ3 columns to `df` and display the result.
# NOTE: this rebinds `df` to the function's own output, so the cell depends on
# `df` having been created by an earlier cell.
df = add_lang_bg_3(df)
df

Unnamed: 0,session_id,participant_id,stimuli_presented,response_type,response_name,response_value,lang_bg_q3_resp,lang_bg_q3_english_only
0,67d94e1a42f12a9364065df2,b5bdd1b55fdf1a07a1cc3907eb6ade997ee127e8708b58...,LangBgQ1,choice-response,LangBgOptionsResp,['Other (you may specify in the next slide)'],['None besides English'],True
1,67d94e1a42f12a9364065df2,b5bdd1b55fdf1a07a1cc3907eb6ade997ee127e8708b58...,LangBgQOther,text-response,LangBgOtherResp,"Korean, Spanish",['None besides English'],True
2,67d94e1a42f12a9364065df2,b5bdd1b55fdf1a07a1cc3907eb6ade997ee127e8708b58...,LangBgQ2,choice-response,LangBgOptionsResp,['Other (you may specify in the next slide)'],['None besides English'],True
3,67d94e1a42f12a9364065df2,b5bdd1b55fdf1a07a1cc3907eb6ade997ee127e8708b58...,LangBgQOther,text-response,LangBgOtherResp,Korean,['None besides English'],True
4,67d94e1a42f12a9364065df2,b5bdd1b55fdf1a07a1cc3907eb6ade997ee127e8708b58...,LangBgQ3,choice-response,LangBgOptionsResp,['None besides English'],['None besides English'],True
...,...,...,...,...,...,...,...,...
250,67d94e1a42f12a9364065df2,9e4c7d9ba7377ea14d0e228c38df883deaf62f3f4d1837...,LocPrompt39,audio-response,LocPromptResp39,53c7dbc41f259d6f7157d364-81-LocPromptResp39.mp4,['Other (you may specify in the next slide)'],False
251,67d94e1a42f12a9364065df2,9e4c7d9ba7377ea14d0e228c38df883deaf62f3f4d1837...,LocPrompt22,audio-response,LocPromptResp22,53c7dbc41f259d6f7157d364-82-LocPromptResp22.mp4,['Other (you may specify in the next slide)'],False
252,67d94e1a42f12a9364065df2,9e4c7d9ba7377ea14d0e228c38df883deaf62f3f4d1837...,LocPrompt29,audio-response,LocPromptResp29,53c7dbc41f259d6f7157d364-83-LocPromptResp29.mp4,['Other (you may specify in the next slide)'],False
253,67d94e1a42f12a9364065df2,9e4c7d9ba7377ea14d0e228c38df883deaf62f3f4d1837...,LocPrompt27,audio-response,LocPromptResp27,53c7dbc41f259d6f7157d364-84-LocPromptResp27.mp4,['Other (you may specify in the next slide)'],False


In [9]:
def add_prompts_from_stimulus_definitions(
        df,
        stimulus_definitions_path='/Users/oishanibandopadhyay/Documents/Revising Honors Project/stimulus-definitions.csv'):
    """Add a ``prompt_text`` column holding the text of each presented stimulus.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``stimuli_presented`` column whose values are stimulus
        names.
    stimulus_definitions_path : str or path-like, optional
        CSV file with at least the columns ``name`` and ``content``. Defaults
        to the project's original stimulus-definitions file so existing
        single-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and index as ``df`` plus
        ``prompt_text``. Stimuli without a definition get NaN. ``df`` itself
        is not modified, and an existing ``prompt_text`` column is overwritten
        rather than duplicated, so the call can be repeated safely.
    """
    # read in stimulus definitions file
    stimulus_definitions = pd.read_csv(stimulus_definitions_path)

    # Build a name -> content lookup. Only these two columns are needed, so
    # any other column (such as 'type') is simply ignored. If a name is
    # defined more than once the first definition wins; this keeps the lookup
    # unique so rows of df can never be multiplied.
    prompt_lookup = (
        stimulus_definitions
        .drop_duplicates(subset='name', keep='first')
        .set_index('name')['content']
    )

    # Left-join semantics: every row of df is kept, unmatched stimuli get NaN.
    return df.assign(prompt_text=df['stimuli_presented'].map(prompt_lookup))

# Test run: attach the prompt text for each stimulus to `df` and display it.
# NOTE: this rebinds `df` to the function's own output, so the cell depends on
# `df` having been created by an earlier cell.
df = add_prompts_from_stimulus_definitions(df)
df

Unnamed: 0,session_id,participant_id,stimuli_presented,response_type,response_name,response_value,lang_bg_q3_resp,lang_bg_q3_english_only,prompt_text
0,67d94e1a42f12a9364065df2,b5bdd1b55fdf1a07a1cc3907eb6ade997ee127e8708b58...,LangBgQ1,choice-response,LangBgOptionsResp,['Other (you may specify in the next slide)'],['None besides English'],True,"Which languages, if any, do you speak besides ..."
1,67d94e1a42f12a9364065df2,b5bdd1b55fdf1a07a1cc3907eb6ade997ee127e8708b58...,LangBgQOther,text-response,LangBgOtherResp,"Korean, Spanish",['None besides English'],True,"If you selected Other, please name the other l..."
2,67d94e1a42f12a9364065df2,b5bdd1b55fdf1a07a1cc3907eb6ade997ee127e8708b58...,LangBgQ2,choice-response,LangBgOptionsResp,['Other (you may specify in the next slide)'],['None besides English'],True,Which languages were spoken around you where y...
3,67d94e1a42f12a9364065df2,b5bdd1b55fdf1a07a1cc3907eb6ade997ee127e8708b58...,LangBgQOther,text-response,LangBgOtherResp,Korean,['None besides English'],True,"If you selected Other, please name the other l..."
4,67d94e1a42f12a9364065df2,b5bdd1b55fdf1a07a1cc3907eb6ade997ee127e8708b58...,LangBgQ3,choice-response,LangBgOptionsResp,['None besides English'],['None besides English'],True,Which languages do you think have an influence...
...,...,...,...,...,...,...,...,...,...
250,67d94e1a42f12a9364065df2,9e4c7d9ba7377ea14d0e228c38df883deaf62f3f4d1837...,LocPrompt39,audio-response,LocPromptResp39,53c7dbc41f259d6f7157d364-81-LocPromptResp39.mp4,['Other (you may specify in the next slide)'],False,She is leaving for Savannah tomorrow.
251,67d94e1a42f12a9364065df2,9e4c7d9ba7377ea14d0e228c38df883deaf62f3f4d1837...,LocPrompt22,audio-response,LocPromptResp22,53c7dbc41f259d6f7157d364-82-LocPromptResp22.mp4,['Other (you may specify in the next slide)'],False,They knew where Anirudh would be on a Friday e...
252,67d94e1a42f12a9364065df2,9e4c7d9ba7377ea14d0e228c38df883deaf62f3f4d1837...,LocPrompt29,audio-response,LocPromptResp29,53c7dbc41f259d6f7157d364-83-LocPromptResp29.mp4,['Other (you may specify in the next slide)'],False,She is leaving for Calcutta tomorrow.
253,67d94e1a42f12a9364065df2,9e4c7d9ba7377ea14d0e228c38df883deaf62f3f4d1837...,LocPrompt27,audio-response,LocPromptResp27,53c7dbc41f259d6f7157d364-84-LocPromptResp27.mp4,['Other (you may specify in the next slide)'],False,"Macha what are you doing, come off to the movi..."


In [5]:
def session_audio_to_wav(session_name):
    """Convert every recording in a session's ``audio`` folder to mono WAV.

    Converted files are written to ``<session_name>/audio_wav`` (created if
    needed) under the original base name with a ``.wav`` extension. Progress
    and problems are reported with ``print``; nothing is returned.

    Files are skipped when they are smaller than 1024 bytes (treated as empty
    or truncated) or when a converted ``.wav`` already exists. Sub-directories
    of the ``audio`` folder are ignored. A failure on one file is printed and
    does not stop the remaining conversions.

    Parameters
    ----------
    session_name : str or path-like
        Path to the session data folder that contains the ``audio`` folder.
    """
    # input recordings live in <session>/audio
    input_audio_folder = os.path.join(session_name, 'audio')

    # converted wav files go to <session>/audio_wav
    output_audio_folder = os.path.join(session_name, 'audio_wav')
    os.makedirs(output_audio_folder, exist_ok=True)

    # sorted() only makes the processing/log order deterministic
    for file_name in sorted(os.listdir(input_audio_folder)):
        full_input_path = os.path.join(input_audio_folder, file_name)

        # only regular files can be decoded; skip nested folders
        if not os.path.isfile(full_input_path):
            continue

        # check if file is empty or too small to be processed
        if os.path.getsize(full_input_path) < 1024:
            print(f'\n Input audio file too small or empty: {full_input_path} \n')
            continue

        # target path: same base name, .wav extension, inside audio_wav
        file_path_without_extension = os.path.splitext(file_name)[0]
        file_path_with_wav = os.path.join(
            output_audio_folder, f'{file_path_without_extension}.wav')

        # nothing to do if this file was already converted on an earlier run
        if os.path.exists(file_path_with_wav):
            print(f'Wav converted file exists at: {file_path_with_wav}')
            continue

        try:
            # using AudioSegment from pydub to decode the input file
            audio = AudioSegment.from_file(full_input_path)

            # convert to mono (earphone recordings can be stereo or have more
            # channels); set_channels returns a NEW segment, so it must be
            # re-assigned or the conversion is silently lost
            audio = audio.set_channels(1)

            # export() defaults to mp3, so the format has to be stated
            # explicitly for the .wav file to really contain WAV data
            audio.export(file_path_with_wav, format='wav')
            print(f"Audio file converted successfully: \n {file_path_with_wav}")

        # best-effort: report the error message and carry on with the next file
        except Exception as e:
            print(f'Error converting {full_input_path}: {e}')

# Testing: convert the recordings of one session folder; files that already
# have a .wav counterpart in audio_wav are only reported, not re-converted.
session_audio_to_wav('/Users/oishanibandopadhyay/Documents/Revising Honors Project/session-67d94e1a42f12a9364065df2-data')

Wav converted file exists at: /Users/oishanibandopadhyay/Documents/Revising Honors Project/session-67d94e1a42f12a9364065df2-data/audio_wav/c3281b525f62d78af053c76e-17-LocPromptResp31.wav
Wav converted file exists at: /Users/oishanibandopadhyay/Documents/Revising Honors Project/session-67d94e1a42f12a9364065df2-data/audio_wav/c3281b525f62d78af053c76e-48-ExpPromptResp6.wav
Wav converted file exists at: /Users/oishanibandopadhyay/Documents/Revising Honors Project/session-67d94e1a42f12a9364065df2-data/audio_wav/c3281b525f62d78af053c76e-58-DialPromptResp13.wav
Wav converted file exists at: /Users/oishanibandopadhyay/Documents/Revising Honors Project/session-67d94e1a42f12a9364065df2-data/audio_wav/ef360fc757c3a1d77fdf1b72-33-HardPromptResp11.wav
Wav converted file exists at: /Users/oishanibandopadhyay/Documents/Revising Honors Project/session-67d94e1a42f12a9364065df2-data/audio_wav/ef360fc757c3a1d77fdf1b72-17-LocPromptResp31.wav
Wav converted file exists at: /Users/oishanibandopadhyay/Documen