# Get city code and folder name

In [1]:
import pandas as pd
import os
from tqdm import tqdm
import re
import logging
import shutil

# Preprocess data and train/test split

In [None]:
import os
import random
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np
import librosa
from tqdm import tqdm
import torchaudio
import pandas as pd
from sklearn.utils import shuffle

def get_audio_files(root):
    '''
    root - root directory
    '''
    audio_files = []
    for path, subdirs, files in os.walk(root):
        for name in files:
            if name.endswith('.wav'):
                audio_files.append(os.path.join(path, name))
    return audio_files

def get_city_code(filenames):
    '''
    filenames - list of filenames
    '''
    city_codes = []
    for audio_file in filenames:
        base = os.path.basename(audio_file) # 70366_WG_WJ_404.2.wav
        city = base.split('_')[0]
        city_codes.append(city)
    return city_codes

# root = '/home/projects/vokquant/data/dicla/augmented/'
root = '/home/projects/vokquant/data/dicla/augmented_ofai/'
audio_files = get_audio_files(root)

# read csv file with test speakers
df_test_speakers = pd.read_csv('/home/projects/vokquant/data/dicla/speaker_test_set_ofai.csv', sep='\t')
# get the list of speaker ids
test_speaker_ids = df_test_speakers['Sigle'].tolist()
print("test speakers: ", test_speaker_ids)

# Remove Test data from the audio files
print("all files before split: ", len(audio_files))
train_audio_files = [audio_file for audio_file in audio_files if not any(speaker_id in audio_file for speaker_id in test_speaker_ids)]
test_data = [audio_file for audio_file in audio_files if any(speaker_id in audio_file for speaker_id in test_speaker_ids)]
print("len(train_audio_files): ", len(train_audio_files))
print("len(test_data): ", len(test_data))


# ['AN_MJ', 'WE_WJ', 'AR_MA', 'HC_WJ', 'AL_MJ', 'JS_WA', 'SL_MJ', 'PE_WA', 'BD_MA', 'SU_WJ', 'BX_MJ', 'WD_WA', 'FO_MJ', 'ME_WA', 'AP_MA', 'EZ_WA']
# all files before split:  248489
# len(train_audio_files):  234821
# len(test_data):  13668

['AN_MJ', 'WE_WJ', 'AR_MA', 'HC_WJ', 'AL_MJ', 'JS_WA', 'SL_MJ', 'PE_WA', 'BD_MA', 'SU_WJ', 'BX_MJ', 'WD_WA', 'FO_MJ', 'ME_WA', 'AP_MA', 'EZ_WA', 'WI_WA', 'GU_WJ', 'CS_MJ']
all files before split:  186082
len(train_audio_files):  174192
len(test_data):  11890


In [3]:

test_city_codes = get_city_code(test_data)
# get unique city codes
unique_test_city_codes = list(set(test_city_codes))
print("unique_test_city_codes: ", unique_test_city_codes)
# Shuffle and split the audio files into train and test
# random.shuffle(audio_files)
# train_files = audio_files[:int(0.95 * len(audio_files))]
# test_files = audio_files[int(0.95 * len(audio_files)):]
train_audio_files = shuffle(train_audio_files, random_state=42)
train_files = train_audio_files
test_data = shuffle(test_data, random_state=42)
test_files = test_data

# Get the city codes for train and test files
y_train = get_city_code(train_files)
y_test = get_city_code(test_files)

assert len(y_train) == len(train_files)
assert len(y_test) == len(test_files)

# Encode the city codes
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

# Split the train set into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(train_files, y_train, test_size=0.05, random_state=42)


unique_test_city_codes:  ['41102', '61032', '31401', '10702', '50304', '30860', '80109', '40702', '50621', '10925', '40605', '21002', '62144', '20321', '70710', '70402', '80227', '90001', '70419']


In [4]:
print("len(X_train)", len(X_train))
print("len(X_val)", len(X_val))
print("len(y_train)", len(y_train))
print("len(y_val)", len(y_val))
print("len(test_files)", len(test_files))
print("len(y_test)", len(y_test))
print("len(le.classes_)", len(le.classes_))

# check if city codes in test files are in the train files
for city_code in unique_test_city_codes:
    if city_code in le.classes_:
        continue
    else:
        print(f"Warning: {city_code} is not in the train set")
        

print("le.classes_", le.classes_)


len(X_train) 165482
len(X_val) 8710
len(y_train) 165482
len(y_val) 8710
len(test_files) 11890
len(y_test) 11890
len(le.classes_) 111
le.classes_ ['10401' '10428' '10612' '10702' '10903' '10925' '20321' '20604' '20619'
 '20622' '20914' '21002' '30501' '30719' '30860' '30910' '31035' '31110'
 '31204' '31207' '31401' '31405' '31551' '31617' '31652' '31814' '31916'
 '32002' '32210' '32309' '32324' '32518' '32519' '40402' '40410' '40423'
 '40605' '40621' '40702' '40719' '40806' '40914' '41102' '41342' '41411'
 '41501' '41706' '41804' '50206' '50210' '50212' '50304' '50413' '50423'
 '50502' '50506' '50509' '50612' '50617' '50618' '50621' '50626' '60350'
 '61032' '61115' '61251' '61254' '61257' '61627' '61628' '61743' '61756'
 '62105' '62135' '62144' '62216' '62390' '70208' '70217' '70221' '70326'
 '70334' '70362' '70366' '70402' '70406' '70419' '70504' '70516' '70606'
 '70615' '70622' '70627' '70706' '70709' '70710' '70734' '70804' '70824'
 '70825' '70908' '70920' '80105' '80109' '80128' '80

# Generate csv files

In [5]:
import librosa

def create_csv_file(file_path, X, y):
    with open(file_path, 'w') as f:
        f.write('ID,utt_id,wav,wav_format,text,duration,accent\n')
        for i in tqdm(range(len(X))):
            basename = os.path.basename(X[i])
            basename = basename.split('.wav')[0]
            label = y[i]
            label = le.inverse_transform([label])[0]
            duration = librosa.get_duration(path=X[i])
            duration = round(duration, 3)
            f.write(f'{i},{basename},{X[i]},wav,,{duration},{label}\n')

os.makedirs('/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented_ofai', exist_ok=True)
create_csv_file('/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented_ofai/train_regions.csv', X_train, y_train)
create_csv_file('/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented_ofai/dev_regions.csv', X_val, y_val)
create_csv_file('/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented_ofai/test_regions.csv', test_files, y_test)

#create devolpment set
os.makedirs('/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented_ofai_dev', exist_ok=True)
create_csv_file('/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented_ofai_dev/train_regions.csv', X_train[:1000], y_train[:1000])
create_csv_file('/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented_ofai_dev/dev_regions.csv', X_val[:100], y_val[:100])
create_csv_file('/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented_ofai_dev/test_regions.csv', test_files[:100], y_test[:100])


100%|██████████| 165482/165482 [00:13<00:00, 12519.56it/s]
100%|██████████| 8710/8710 [00:00<00:00, 12847.39it/s]
100%|██████████| 11890/11890 [00:00<00:00, 12738.24it/s]
100%|██████████| 1000/1000 [00:00<00:00, 12821.52it/s]
100%|██████████| 100/100 [00:00<00:00, 12603.83it/s]
100%|██████████| 100/100 [00:00<00:00, 12309.04it/s]


# Set up Dialektfamilie Dict

In [6]:
import pandas as pd
import os

file_path = '/home/projects/vokquant/data/dicla/DiÖ_PP02_Korpus_Dokumentation_final_Dialaketfamilie.xlsx'
df = pd.read_excel(file_path)

# folder_list = os.listdir('/home/projects/vokquant/data/dicla/augmented/')
mapping_family_dict = dict(zip(df['Gemeindekennziffer'], df['Dialektfamilie']))
# remove nan
city_code_dict = {k: v for k, v in mapping_family_dict.items() if pd.notna(v)}
# 
print(city_code_dict)
#make keys to string and without .0
city_code_dict = {str(k).split('.')[0]: v for k, v in city_code_dict.items()}
print(city_code_dict)

# add [41342] to the city_code_dict as 'Mittelbairisch'
city_code_dict['41342'] = 'Mittelbairisch'


{41501.0: 'Mittelbairisch', 61254.0: 'Süd-/Mittelbairisch', 30501.0: 'Mittelbairisch', 61756.0: 'Süd-/Mittelbairisch', 31401.0: 'Mittelbairisch', 10702.0: 'Süd-/Mittelbairisch', 41102.0: 'Mittelbairisch', 40402.0: '(West-)Mittelbairisch', 70706.0: 'Südbairisch', 40702.0: '(West-)Mittelbairisch', 70710.0: 'Südbairisch', 90001.0: 'Mittelbairisch', 30910.0: 'Mittelbairisch', 41706.0: 'Mittelbairisch', 50304.0: '(West-)Mittelbairisch', 70804.0: 'Bairisch-Alemannisch', 10401.0: 'Süd-/Mittelbairisch', 80105.0: 'Alemannisch', 70504.0: 'Süd-/Mittelbairisch', 62105.0: 'Süd-/Mittelbairisch', 70402.0: 'Süd-/Mittelbairisch', 41804.0: 'Mittelbairisch', 20604.0: 'Südbairisch', 10903.0: 'Süd-/Mittelbairisch', 61627.0: 'Südbairisch', 80212.0: 'Alemannisch', 10502.0: 'Süd-/Mittelbairisch', 21002.0: 'Südbairisch', 70908.0: 'Südbairisch', 80109.0: 'Alemannisch', 70606.0: 'Bairisch-Alemannisch', 40806.0: '(West-)Mittelbairisch', 61628.0: 'Süd-/Mittelbairisch', 40410.0: '(West-)Mittelbairisch', 31810.0: 'M

# Split into 10 seconds

In [11]:
from tqdm import tqdm
import re
import numpy as np
import os
import ipdb

def add_offset_for_long_audios(csv_path, max_audio_duration, make_state_level=True, Dialektfamilie=False, city_code_dict=None):
    with open(csv_path, 'r') as f:
        # if files longer than max_duration seconds, then split them into max_duration seconds
        lines = f.readlines()
        print(len(lines))
        index = 0
        index_list = []
        lines_to_add = []
        line_count = 0
        lines_to_add.append('ID,utt_id,wav,wav_format,text,duration,offset,accent\n')
        make_state_level = make_state_level
        for line in tqdm(lines[1:]):
            # print("line: ", line)
            line_count += 1
            line = line.strip()
            line = line.split(',')
            wav_file = line[2]
            duration = float(line[5])
            # # number 1 is for Burgenland, 2 for Kärnten, 3 for Niederösterreich, 4 for Oberösterreich, 5 for Salzburg, 6 for Steiermark, 7 for Tirol, 8 for Vorarlberg
            if make_state_level==True:
                if line[6][0] == '1':    # check first number of line[6]
                    line[6] = 'bgld'
                elif line[6][0] == '2':
                    line[6] = 'ktn'
                elif line[6][0] == '3':
                    line[6] = 'noe'
                elif line[6][0] == '4':
                    line[6] = 'ooe'
                elif line[6][0] == '5':
                    line[6] = 'sbg'
                elif line[6][0] == '6':
                    line[6] = 'stmk'
                elif line[6][0] == '7':
                    line[6] = 't'
                elif line[6][0] == '8':
                    line[6] = 'vbg'
                elif line[6][0] == '9':
                    line[6] = 'w'
                else:
                    print("Error: No state found")
                    break                  
            if Dialektfamilie==True:
                city_code = line[6]
                # ipdp.set_trace()
                if city_code in city_code_dict:
                    line[6] = city_code_dict[city_code]
                    # print(f"city_code: {city_code} and Dialektfamilie: {city_code_dict[city_code]}")
                else:
                    print(f"Error: {city_code} not found in city_code_dict")
                    break
            # print(f"Duration: {duration} for {wav_file}")
            if duration > max_audio_duration:
                split_files = []
                for i in range(0, int(duration), max_audio_duration):
                    start = i   # = offset_time
                    end = i + max_audio_duration
                    # split_file = wav_file.split('.wav')[0] + f'_{start}_{end}.wav'
                    offset_time = start
                    ##
                    if duration-start>max_audio_duration:
                        # print("duration: max_duration")
                        duration_chunk = max_audio_duration
                    else:
                        # print(f"duration: {duration-start}")
                        duration_chunk = np.round(duration-start, 3)
                    ##
                    # if longer than max_audio then copy name and add offset to offset_column
                    offset_time = start
                    lines_to_add.append(f'{line[0]}, {line[1]}, {line[2]}, {line[3]}, {line[4]}, {duration_chunk}, {offset_time}, {line[6]}\n')
                # handle the last part of the audio file if is less than 1 second
                if duration % max_audio_duration != 0 and duration % max_audio_duration >= 0.5 and duration % max_audio_duration < 1:   # only add for durations like max_duration.6 otherwise it is covered by above. Everything where int(duration) % max_duration >= 1 is covered by above. So this is only for 0.5 seconds segments after max_duration, 20, 30, ...
                    # this will use durations 0.5 to 1 seconds
                    set_min_duration = 1
                    # if set_min_duration >= 1  we do not need this part and can continue
                    # this part was added to handle the case where the last part of the audio file is less than 1 second
                    # but i do not need this anymore, as i only want audios with more than 1 second
                    if set_min_duration < 1:
                        continue
                    else:
                        start = int(duration) - (int(duration) % max_audio_duration)
                        if duration - start >= 0.5:
                            duration_chunk = np.round(duration-start, 3)
                            offset_time = start
                            lines_to_add.append(f'{line[0]}, {line[1]}, {line[2]}, {line[3]}, {line[4]}, {duration_chunk}, {offset_time}, {line[6]}\n')
                        else:
                            # print("Last part of the audio file is less than 0.5 seconds")
                            continue
            else:
                # leave everything as it is and add an offset of 0
                lines_to_add.append(f'{line[0]}, {line[1]}, {line[2]}, {line[3]}, {line[4]}, {line[5]}, 0, {line[6]}\n')
                
    # Separate the header from the data
    header = lines_to_add[0]
    data_lines = lines_to_add[1:]
    print(f"len(data_lines): {len(data_lines)}")
    # check for duration < 0.95
    data_lines_new = []
    for line in data_lines:
        parts = line.split(',')
        # print(float(parts[5]))
        if float(parts[5]) < 0.95:
            # print(f"Error: Duration is less than 0.95 seconds for {parts}")
            continue
        else:
            data_lines_new.append(line)
    data_lines = data_lines_new
    # Reset the ID column, starting from 0
    new_data_lines = []
    for i, line in enumerate(data_lines):
        parts = line.split(',')
        new_id = i  # IDs start from 0
        new_line = f'{new_id},{",".join(parts[1:])}'
        new_data_lines.append(new_line)
        
    # Combine header with new data lines
    updated_lines_to_add = [header] + new_data_lines
    
    # remove double blank spaces
    updated_lines_to_add = [re.sub(' +', '', line) for line in updated_lines_to_add]  # plus means one or more spaces

    return updated_lines_to_add

# Write the updated data to a new file
def write_csv_file(file_path, lines_to_add):
    with open(file_path, 'w') as f:
        for line in lines_to_add:
            f.write(line)
# Use the function
split_duration = 5
make_state_level = False
copy_files_inplace = True
Dialektfamilie = True

train_regions_file = '/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented_ofai/train_regions.csv'
dev_regions_file = '/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented_ofai/dev_regions.csv'
test_regions_file = '/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented_ofai/test_regions.csv'
if make_state_level==True and Dialektfamilie==False:
    save_dir = '/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented_ofai_state'
    os.makedirs(save_dir, exist_ok=True)
    updated_lines_to_add_train = add_offset_for_long_audios(train_regions_file, split_duration, make_state_level=True, Dialektfamilie=False)
    updated_lines_to_add_dev = add_offset_for_long_audios(dev_regions_file, split_duration, make_state_level=True, Dialektfamilie=False)
    updated_lines_to_add_test = add_offset_for_long_audios(test_regions_file, split_duration, make_state_level=True, Dialektfamilie=False)
    # save to file
    write_csv_file(f'{save_dir}/train_augmented_offset.csv', updated_lines_to_add_train)
    write_csv_file(f'{save_dir}/dev_augmented_offset.csv', updated_lines_to_add_dev)
    write_csv_file(f'{save_dir}/test_augmented_offset.csv', updated_lines_to_add_test)
    if copy_files_inplace==True:
        write_csv_file(f'{save_dir}/train.csv', updated_lines_to_add_train)
        write_csv_file(f'{save_dir}/dev.csv', updated_lines_to_add_dev)
        write_csv_file(f'{save_dir}/test.csv', updated_lines_to_add_test)
elif make_state_level==False and Dialektfamilie==False:
    save_dir = '/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented_ofai_regions'
    os.makedirs(save_dir, exist_ok=True)
    updated_lines_to_add_train = add_offset_for_long_audios(train_regions_file, split_duration, make_state_level=False, Dialektfamilie=False)
    updated_lines_to_add_dev = add_offset_for_long_audios(dev_regions_file, split_duration, make_state_level=False, Dialektfamilie=False)
    updated_lines_to_add_test = add_offset_for_long_audios(test_regions_file, split_duration, make_state_level=False, Dialektfamilie=False)
    # save to file
    write_csv_file(f'{save_dir}/train_augmented_regions_offset.csv', updated_lines_to_add_train)
    write_csv_file(f'{save_dir}/dev_augmented_regions_offset.csv', updated_lines_to_add_dev)
    write_csv_file(f'{save_dir}/test_augmented_regions_offset.csv', updated_lines_to_add_test)
    if copy_files_inplace==True:
        write_csv_file(f'{save_dir}/train.csv', updated_lines_to_add_train)
        write_csv_file(f'{save_dir}/dev.csv', updated_lines_to_add_dev)
        write_csv_file(f'{save_dir}/test.csv', updated_lines_to_add_test)
elif Dialektfamilie==True:
    save_dir = '/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented_ofai_family'
    os.makedirs(save_dir, exist_ok=True)
    updated_lines_to_add_train = add_offset_for_long_audios(train_regions_file, split_duration, make_state_level=False, Dialektfamilie=True, city_code_dict=city_code_dict)
    updated_lines_to_add_dev = add_offset_for_long_audios(dev_regions_file, split_duration, make_state_level=False, Dialektfamilie=True, city_code_dict=city_code_dict)
    updated_lines_to_add_test = add_offset_for_long_audios(test_regions_file, split_duration, make_state_level=False, Dialektfamilie=True, city_code_dict=city_code_dict)
    # save to file
    write_csv_file(f'{save_dir}/train_augmented_family_offset.csv', updated_lines_to_add_train)
    write_csv_file(f'{save_dir}/dev_augmented_family_offset.csv', updated_lines_to_add_dev)
    write_csv_file(f'{save_dir}/test_augmented_family_offset.csv', updated_lines_to_add_test)
    if copy_files_inplace==True:
        write_csv_file(f'{save_dir}/train.csv', updated_lines_to_add_train)
        write_csv_file(f'{save_dir}/dev.csv', updated_lines_to_add_dev)
        write_csv_file(f'{save_dir}/test.csv', updated_lines_to_add_test)

165483


  0%|          | 0/165482 [00:00<?, ?it/s]

100%|██████████| 165482/165482 [00:00<00:00, 1575671.71it/s]

len(data_lines): 170190





8711


100%|██████████| 8710/8710 [00:00<00:00, 1557485.84it/s]


len(data_lines): 8951
11891


100%|██████████| 11890/11890 [00:00<00:00, 1709583.99it/s]

len(data_lines): 12156





# Copy unaugmented test files into augmented data folder:

In [None]:
import shutil
copy_not_augmented_test_files = False    # copy test files from not augmented to augmented folder
if copy_not_augmented_test_files==True:
    shutil.copy('/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_ofai_regions/test_regions_offset.csv', '/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented_ofai_regions/test.csv')
    shutil.copy('/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_ofai_state/test_offset.csv', '/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented_ofai_state/test.csv')
    shutil.copy('/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_ofai_dialektfamilie/test_dialektfamilie_offset.csv', '/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented_ofai_family/test.csv')

In [None]:
# create devolpment set
updated_lines_to_add_train_dev = updated_lines_to_add_train[:1000]
updated_lines_to_add_dev_dev = updated_lines_to_add_dev[:100]
updated_lines_to_add_test_dev = updated_lines_to_add_test[:100]

# Write the updated data to a new file
if make_state_level==True:
    save_dir_dev = '/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented_ofai_dev'
    os.makedirs(save_dir_dev, exist_ok=True)
    write_csv_file(f'{save_dir_dev}/train_augmented_offset.csv', updated_lines_to_add_train_dev)
    write_csv_file(f'{save_dir_dev}/dev_augmented_offset.csv', updated_lines_to_add_dev_dev)
    write_csv_file(f'{save_dir_dev}/test_augmented_offset.csv', updated_lines_to_add_test_dev)
elif make_state_level==False:
    save_dir_dev = '/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented_ofai_dev_regions'
    os.makedirs(save_dir_dev, exist_ok=True)
    write_csv_file(f'{save_dir_dev}/train_augmented_regions_offset.csv', updated_lines_to_add_train_dev)
    write_csv_file(f'{save_dir_dev}/dev_augmented_regions_offset.csv', updated_lines_to_add_dev_dev)
    write_csv_file(f'{save_dir_dev}/test_augmented_regions_offset.csv', updated_lines_to_add_test_dev)

# with open(f'{save_dir_dev}/train_augmented_offset.csv', 'w') as f:
#     for line in updated_lines_to_add_train_dev:
#         f.write(line)
# with open(f'{save_dir_dev}/dev_augmented_offset.csv', 'w') as f:
#     for line in updated_lines_to_add_dev_dev:
#         f.write(line)
# with open(f'{save_dir_dev}/test_augmented_offset.csv', 'w') as f:
#     for line in updated_lines_to_add_test_dev:
#         f.write(line)

# Zip with pigz

In [None]:
# tar -cf - /home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented_ofai | pigz -1 > /home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented_ofai/at_augmented_ofai.tar.gz
# do with os.system
os.system('tar -cf - /home/projects/vokquant/data/dicla/augmented/ | pigz -1 > /home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented_ofai/at_augmented_ofai_mono.tar.gz')


tar: Removing leading `/' from member names


0

# Make Files Mono

In [12]:
import os
import torchaudio
import librosa

# BE CAREFUL: Librosa always outputs only one channel, while torchaudio outputs the number of channels in the audio file

# audio = '/home/projects/vokquant/data/dicla/augmented/TS_WA/80128_TS_WA_454.01.wav'
# audio_librosa, sr = librosa.load(audio, sr=None)
# print(audio_librosa.shape)
# torchaudio_1 = torchaudio.load(audio)
# print(torchaudio_1[0].shape)

folder = '/home/projects/vokquant/data/dicla/augmented/'
count = 0
for path, subdirs, files in os.walk(folder):
    for name in files:
        if name.endswith('.wav'):
            audio_file = os.path.join(path, name)
            audio, sr = torchaudio.load(audio_file)
            if sr != 16000:
                print(f"{audio_file} has sample rate {sr}")
                continue
            if len(audio.shape) > 1:
                # print(f"Converting {audio_file} to mono")
                audio = audio.mean(dim=0, keepdim=True)
                safe_path = os.path.join(path, name)
                # print(f"Saving to {safe_path}")
                torchaudio.save(safe_path, audio, sr)
                # count += 1
            else:
                continue
                # print(f"{audio_file} is already mono")

In [15]:
# get file size in GB for 'home/projects/vokquant/data/dicla/augmented/'
def get_size(start_path = '.'):
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(start_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            total_size += os.path.getsize(fp)
    return total_size

size = get_size('/home/projects/vokquant/data/dicla/augmented/')
print("size in GB: ", size / (1024**3))

size in GB:  10.543154884129763


# generate MP3

In [None]:
import os
# upload to cloud
# curl bashupload.com -T /home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented_ofai/at_augmented_ofai.tar.gz
os.system('curl bashupload.com -T /home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented_ofai/at_augmented_ofai.tar.gz')

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
 40 8068M    0     0   40 3264M      0  5425k  0:25:22  0:10:16  0:15:06 5034k

<html>
<head><title>500 Internal Server Error</title></head>
<body bgcolor="white">
<center><h1>500 Internal Server Error</h1></center>
<hr><center>nginx</center>
</body>
</html>


 40 8068M    0   186   40 3266M      0  5425k  0:25:22  0:10:16  0:15:06 5188k


0

In [None]:
# make to mp3

folder = '/home/projects/vokquant/data/dicla/augmented/'
count = 0
for path, subdirs, files in os.walk(folder):
    for name in files:
        if name.endswith('.wav'):
            audio_file = os.path.join(path, name)
            audio, sr = torchaudio.load(audio_file)
            # safe as mp3
            # convert
            mp3_file = audio_file.replace('.wav', '.mp3')
            new_safe_path = audio_file.replace('augmented', 'augmented_mp3')
            print(f"New safe path: {new_safe_path}")
            # torchaudio.save(mp3_file, audio, sr, format='mp3')
            

# workaround for files without augmentation
I want to have the same train/test set and just remove the spk_xx from the filenames

In [None]:
import pandas as pd
import re

stage = 'test'

# Load the CSV file
file_path = '/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented_ofai_regions/' + stage + '.csv'
df = pd.read_csv(file_path)

# Define a function to remove the "_spk_(number)" pattern
def remove_spk_number(text):
    text = re.sub(r'_spk_\d+', '', text)
    text = re.sub('augmented', 'processed_16khz_renamed_ofai', text)
    return text

# Apply the function to the 'utt_id' and 'wav' columns
df['utt_id'] = df['utt_id'].apply(remove_spk_number)
df['wav'] = df['wav'].apply(remove_spk_number)

# Save the modified CSV file
output_path = '/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_not_augmented_regions/' + stage + '.csv'
output_path = '/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_not_augmented_regions/' + stage + '_offset.csv'

write = False
if write:
    df.to_csv(output_path, index=False)

print("Pattern removed and file saved as", output_path)


Pattern removed and file saved as /home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_not_augmented_regions/test_offset.csv


# 2 seconds for test files

In [2]:
test_csv = '/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented_ofai_regions/test.csv'
new_max_duration = 2.0
new_data_lines = []
with open(test_csv, 'r') as f:
    lines = f.readlines()
    header = lines[0]
    data = lines[1:]
    for line in data:
        parts = line.split(',')
        wav_file = parts[2]
        duration = float(parts[5])
        if duration < new_max_duration:
            # print(f"Error: Duration is less than 2 seconds for {wav_file}")
            continue
        else:
            new_data_lines.append(line)
            # continue

print("new_data_lines: ", new_data_lines)
print("len(new_data_lines): ", len(new_data_lines))
# write to csv 
with open('/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented_ofai_regions/test_' + str(int(new_max_duration)) + 'sec_offset.csv', 'w') as f:
    f.write(header)
    for line in new_data_lines:
        f.write(line)


new_data_lines:  ['1,90001_WI_WA_falter061001bis061231_003637,/home/projects/vokquant/data/dicla/processed_16khz_renamed_ofai/WI_WA/90001_WI_WA_falter061001bis061231_003637.wav,wav,,3.11,0,90001\n', '2,50304_BD_MA_441.01,/home/projects/vokquant/data/dicla/processed_16khz_renamed_ofai/BD_MA/50304_BD_MA_441.01.wav,wav,,2.045,0,50304\n', '5,10702_AP_MA_351.01,/home/projects/vokquant/data/dicla/processed_16khz_renamed_ofai/AP_MA/10702_AP_MA_351.01.wav,wav,,2.48,0,10702\n', '6,10925_EZ_WA_ex_14_7,/home/projects/vokquant/data/dicla/processed_16khz_renamed_ofai/EZ_WA/10925_EZ_WA_ex_14_7.wav,wav,,2.194,0,10925\n', '7,50621_SU_WJ_441.10_4,/home/projects/vokquant/data/dicla/processed_16khz_renamed_ofai/SU_WJ/50621_SU_WJ_441.10_4.wav,wav,,2.295,0,50621\n', '13,21002_PE_WA_010.01_8,/home/projects/vokquant/data/dicla/processed_16khz_renamed_ofai/PE_WA/21002_PE_WA_010.01_8.wav,wav,,5.147,0,21002\n', '14,62144_AL_MJ_525.01_ATL,/home/projects/vokquant/data/dicla/processed_16khz_renamed_ofai/AL_MJ/6214

# Count average amount of utterances per Speaker

In [None]:
import numpy as np
test_csv = '/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented_ofai_regions/train.csv'
speaker_list = []
with open(test_csv, 'r') as f:
    lines = f.readlines()
    header = lines[0]
    data = lines[1:]
    for line in data:
        parts = line.split(',')
        utt_id = '_'.join(parts[1].split('_')[1:3])
        speaker_list.append(utt_id)

speakers = list(set(speaker_list))
print("len(speakers): ", len(speakers))

speaker_count = {} # count utterances per speaker
for speaker in speakers:
    count = speaker_list.count(speaker)
    speaker_count[speaker] = count
    
print("speaker_count: ", speaker_count)
average = sum(speaker_count.values()) / len(speaker_count)
print("average number of utterances per speaker: ", np.round(average, 1))


len(speakers):  279
speaker_count:  {'GP_MJ': 578, 'EI_WA': 680, 'RC_MA': 505, 'LE_MA': 521, 'AT_MJ': 676, 'GM_MA': 561, 'GY_MJ': 461, 'GH_WA': 556, 'NA_MJ': 664, 'FO_MA': 365, 'RH_WA': 559, 'KS_WJ': 456, 'GY_MA': 704, 'IR_MA': 644, 'ML_WA': 532, 'HR_MA': 450, 'RH_MJ': 646, 'SP_MA': 611, 'MI_MA': 540, 'PK_WA': 689, 'VA_MJ': 391, 'HG_WJ': 473, 'PE_MA': 673, 'TU_WJ': 331, 'RE_MJ': 603, 'VR_MJ': 597, 'WE_WA': 658, 'HU_WA': 618, 'WG_MA': 629, 'PE_WJ': 449, 'ST_WJ': 425, 'WZ_WJ': 641, 'RB_WA': 448, 'UB_WA': 623, 'WT_WJ': 595, 'HU_MJ': 383, 'RS_WJ': 396, 'HO_WA': 513, 'PU_WJ': 638, 'UW_WA': 739, 'HU_WJ': 343, 'ML_WJ': 365, 'TK_WA': 765, 'HO_WJ': 392, 'SS_WJ2': 403, 'MO_MA': 640, 'FA_MA': 508, 'WE_MJ': 465, 'TS_WJ': 421, 'RA_MA': 672, 'AB_WA': 617, 'MO_WJ': 391, 'MA_MA': 456, 'NA_MA': 653, 'RR_MA': 645, 'FA_MJ': 376, 'KO_MJ': 661, 'WN_MA': 465, 'DB_MA': 509, 'TS_MA': 366, 'DB_MJ': 638, 'LS_WA': 853, 'IR_MJ': 478, 'GT_MA': 627, 'AV_WA': 770, 'GL_MA': 578, 'VI_MA': 695, 'LI_MA': 493, 'AD_MJ': 4

In [None]:
import numpy as np
train_csv = '/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented_ofai_regions/train.csv'
val_csv = '/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented_ofai_regions/dev.csv'
test_csv = '/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented_ofai_regions/test.csv'

# combine
with open(train_csv, 'r') as f:
    train_lines = f.readlines()
    train_header = train_lines[0]
    train_data = train_lines[1:]
with open(val_csv, 'r') as f:
    val_lines = f.readlines()
    val_header = val_lines[0]
    val_data = val_lines[1:]
with open(test_csv, 'r') as f:
    test_lines = f.readlines()
    test_header = test_lines[0]
    test_data = test_lines[1:]
# combine to one list
all_csv = [train_header] + train_data + val_data + test_data

# get duration per speaker
# duration_dict = {}
# with open(test_csv, 'r') as f:
#     lines = f.readlines()
#     header = lines[0]
#     data = lines[1:]
#     for line in data:
#         parts = line.split(',')
#         utt_id = '_'.join(parts[1].split('_')[1:3])
#         duration = float(parts[5])
#         if utt_id in duration_dict:
#             duration_dict[utt_id] += duration
#         else:
#             duration_dict[utt_id] = duration
# repeat for all_csv
duration_dict = {}
for line in all_csv:
    parts = line.split(',')
    utt_id = '_'.join(parts[1].split('_')[1:3])
    duration = float(parts[5])
    if utt_id in duration_dict:
        duration_dict[utt_id] += duration
    else:
        duration_dict[utt_id] = duration
print("duration_dict: ", duration_dict)
average_duration = sum(duration_dict.values()) / len(duration_dict)
print("average duration per speaker: ", np.round(average_duration, 1))

duration_dict:  {'ST_MA': 1550.298999999998, 'SQ_MJ': 1310.3589999999995, 'WE_MJ': 912.1859999999994, 'PU_MA': 1069.7269999999996, 'HU_WJ': 760.0639999999999, 'HU_WA': 1404.2719999999993, 'TK_MJ': 692.5839999999989, 'HO_WJ': 852.8789999999996, 'BR_WJ': 1188.883, 'VI_MA': 1444.6809999999994, 'NS_WA': 1797.2559999999974, 'KS_WA': 1301.7810000000006, 'KG_MJ': 917.6149999999996, 'NS_MA': 1489.302, 'KA_WJ': 1156.1989999999987, 'SR_MJ': 1171.1229999999998, 'BD_WA': 1222.3889999999985, 'AV_WA': 1878.2500000000002, 'HR_MJ': 888.1409999999998, 'LS_WJ': 856.6329999999995, 'BD_MJ': 919.3949999999999, 'MI_WJ': 1424.681999999999, 'GW_WJ': 1028.1160000000004, 'MT_WA': 942.5879999999989, 'KO_WA': 1568.6939999999975, 'PU_WA': 1481.7959999999982, 'AP_MJ': 996.5700000000005, 'HB_MJ': 968.6039999999989, 'VR_MJ': 1068.5239999999992, 'WP_MA': 1590.6770000000001, 'LI_WJ': 1083.268999999999, 'AB_WJ': 908.0249999999999, 'SW_MJ': 1062.8849999999982, 'NS_MJ': 859.9780000000013, 'UW_MA': 1242.119999999997, 'SU_M