# Get city code and folder name

In [5]:
# original filename: nn_baseline2.ipynb
import pandas as pd
import os
from tqdm import tqdm
import re
import logging
import shutil

# Preprocess data and train/test split

In [6]:
import os
import random
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np
import librosa
from tqdm import tqdm
import torchaudio
import pandas as pd
from sklearn.utils import shuffle

def get_audio_files(root):
    '''
    root - root directory
    '''
    audio_files = []
    for path, subdirs, files in os.walk(root):
        for name in files:
            if name.endswith('.wav'):
                audio_files.append(os.path.join(path, name))
    return audio_files

def get_city_code(filenames):
    '''
    filenames - list of filenames
    '''
    city_codes = []
    for audio_file in filenames:
        base = os.path.basename(audio_file) # 70366_WG_WJ_404.2.wav
        city = base.split('_')[0]
        city_codes.append(city)
    return city_codes

root = '/home/projects/vokquant/data/dicla/augmented/'
audio_files = get_audio_files(root)

# read csv file with test speakers
df_test_speakers = pd.read_csv('/home/projects/vokquant/data/dicla/speaker_test_set.csv', sep='\t')
# get the list of speaker ids
speaker_ids = df_test_speakers['Sigle'].tolist()
print(speaker_ids)

# Remove Test data from the audio files
print("all files before split: ", len(audio_files))
train_audio_files = [audio_file for audio_file in audio_files if not any(speaker_id in audio_file for speaker_id in speaker_ids)]
test_data = [audio_file for audio_file in audio_files if any(speaker_id in audio_file for speaker_id in speaker_ids)]
print("len(train_audio_files): ", len(train_audio_files))
print("len(test_data): ", len(test_data))




['AN_MJ', 'WE_WJ', 'AR_MA', 'HC_WJ', 'AL_MJ', 'JS_WA', 'SL_MJ', 'PE_WA', 'BD_MA', 'SU_WJ', 'BX_MJ', 'WD_WA', 'FO_MJ', 'ME_WA', 'AP_MA', 'EZ_WA']
all files before split:  169389
len(train_audio_files):  163671
len(test_data):  5718


In [7]:

test_city_codes = get_city_code(test_data)
# get unique city codes
unique_test_city_codes = list(set(test_city_codes))
print("unique_test_city_codes: ", unique_test_city_codes)
# Shuffle and split the audio files into train and test
# random.shuffle(audio_files)
# train_files = audio_files[:int(0.95 * len(audio_files))]
# test_files = audio_files[int(0.95 * len(audio_files)):]
train_audio_files = shuffle(train_audio_files, random_state=42)
train_files = train_audio_files
test_data = shuffle(test_data, random_state=42)
test_files = test_data

# Get the city codes for train and test files
y_train = get_city_code(train_files)
y_test = get_city_code(test_files)

assert len(y_train) == len(train_files)
assert len(y_test) == len(test_files)

# Encode the city codes
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

# Split the train set into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(train_files, y_train, test_size=0.05, random_state=42)


unique_test_city_codes:  ['20321', '70402', '70419', '10702', '80227', '50304', '10925', '40605', '50621', '62144', '41102', '61032', '31401', '80109', '30860', '21002']


In [8]:
print("len(X_train)", len(X_train))
print("len(X_val)", len(X_val))
print("len(y_train)", len(y_train))
print("len(y_val)", len(y_val))
print("len(test_files)", len(test_files))
print("len(y_test)", len(y_test))
print("len(le.classes_)", len(le.classes_))

# check if city codes in test files are in the train files
for city_code in unique_test_city_codes:
    if city_code in le.classes_:
        continue
    else:
        print(f"Warning: {city_code} is not in the train set")
        

print("le.classes_", le.classes_)


len(X_train) 155487
len(X_val) 8184
len(y_train) 155487
len(y_val) 8184
len(test_files) 5718
len(y_test) 5718
len(le.classes_) 109
le.classes_ ['10401' '10428' '10612' '10702' '10903' '10925' '20321' '20604' '20619'
 '20622' '20914' '21002' '30501' '30719' '30860' '30910' '31035' '31110'
 '31204' '31207' '31401' '31405' '31551' '31617' '31652' '31814' '31916'
 '32002' '32210' '32309' '32324' '32518' '32519' '40402' '40410' '40423'
 '40605' '40621' '40702' '40719' '40806' '40914' '41102' '41342' '41411'
 '41501' '41706' '41804' '50206' '50210' '50212' '50304' '50413' '50423'
 '50502' '50506' '50509' '50612' '50617' '50618' '50621' '50626' '60350'
 '61032' '61115' '61251' '61254' '61257' '61627' '61628' '61743' '61756'
 '62105' '62135' '62144' '62216' '62390' '70208' '70217' '70221' '70326'
 '70334' '70362' '70366' '70402' '70406' '70419' '70504' '70516' '70606'
 '70615' '70622' '70627' '70706' '70709' '70734' '70804' '70824' '70825'
 '70908' '70920' '80105' '80109' '80128' '80212' '8022

# Generate csv files

In [12]:
import librosa

def create_csv_file(file_path, X, y):
    with open(file_path, 'w') as f:
        f.write('ID,utt_id,wav,wav_format,text,duration,accent\n')
        for i in tqdm(range(len(X))):
            basename = os.path.basename(X[i])
            basename = basename.split('.wav')[0]
            label = y[i]
            label = le.inverse_transform([label])[0]
            duration = librosa.get_duration(path=X[i])
            duration = round(duration, 3)
            f.write(f'{i},{basename},{X[i]},wav,,{duration},{label}\n')

os.makedirs('/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented', exist_ok=True)
create_csv_file('/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented/train_regions.csv', X_train, y_train)
create_csv_file('/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented/dev_regions.csv', X_val, y_val)
create_csv_file('/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented/test_regions.csv', test_files, y_test)

#create devolpment set
os.makedirs('/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented_dev', exist_ok=True)
create_csv_file('/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented_dev/train_regions.csv', X_train[:1000], y_train[:1000])
create_csv_file('/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented_dev/dev_regions.csv', X_val[:100], y_val[:100])
create_csv_file('/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented_dev/test_regions.csv', test_files[:100], y_test[:100])


100%|██████████| 155487/155487 [00:12<00:00, 12819.03it/s]
100%|██████████| 8184/8184 [00:00<00:00, 12990.34it/s]
100%|██████████| 5718/5718 [00:00<00:00, 13069.47it/s]
100%|██████████| 1000/1000 [00:00<00:00, 12971.97it/s]
100%|██████████| 100/100 [00:00<00:00, 12140.86it/s]
100%|██████████| 100/100 [00:00<00:00, 12298.93it/s]


# Split into 10 seconds

In [16]:
from tqdm import tqdm
import re
import numpy as np
import os


def add_offset_for_long_audios(csv_path, max_audio_duration, make_state_level=True):
    with open(csv_path, 'r') as f:
        # if files longer than 10 seconds, then split them into 10 seconds
        lines = f.readlines()
        print(len(lines))
        index = 0
        index_list = []
        lines_to_add = []
        line_count = 0
        lines_to_add.append('ID,utt_id,wav,wav_format,text,duration,offset,accent\n')
        make_state_level = make_state_level
        for line in tqdm(lines[1:]):
            # print("line: ", line)
            line_count += 1
            line = line.strip()
            line = line.split(',')
            wav_file = line[2]
            duration = float(line[5])
            # # number 1 is for Burgenland, 2 for Kärnten, 3 for Niederösterreich, 4 for Oberösterreich, 5 for Salzburg, 6 for Steiermark, 7 for Tirol, 8 for Vorarlberg
            if make_state_level==True:
                if line[6][0] == '1':    # check first number of line[6]
                    line[6] = 'bgld'
                elif line[6][0] == '2':
                    line[6] = 'ktn'
                elif line[6][0] == '3':
                    line[6] = 'noe'
                elif line[6][0] == '4':
                    line[6] = 'ooe'
                elif line[6][0] == '5':
                    line[6] = 'sbg'
                elif line[6][0] == '6':
                    line[6] = 'stmk'
                elif line[6][0] == '7':
                    line[6] = 't'
                elif line[6][0] == '8':
                    line[6] = 'vbg'
                elif line[6][0] == '9':
                    line[6] = 'w'
                else:
                    print("Error: No state found")
                    break                  
                
            # print(f"Duration: {duration} for {wav_file}")
            if duration > max_audio_duration:
                split_files = []
                for i in range(0, int(duration), max_audio_duration):
                    start = i   # = offset_time
                    end = i + max_audio_duration
                    # split_file = wav_file.split('.wav')[0] + f'_{start}_{end}.wav'
                    offset_time = start
                    ##
                    if duration-start>10:
                        # print("duration: 10")
                        duration_chunk = max_audio_duration
                    else:
                        # print(f"duration: {duration-start}")
                        duration_chunk = np.round(duration-start, 3)
                    ##
                    # if longer than max_audio then copy name and add offset to offset_column
                    offset_time = start
                    lines_to_add.append(f'{line[0]}, {line[1]}, {line[2]}, {line[3]}, {line[4]}, {duration_chunk}, {offset_time}, {line[6]}\n')
                # handle the last part of the audio file if is less than 1 second
                if duration % max_audio_duration != 0 and duration % max_audio_duration >= 0.5 and duration % max_audio_duration < 1:   # only add for durations like 10.6 otherwise it is covered by above. Everything where int(duration) % 10 >= 1 is covered by above. So this is only for 0.5 seconds segments after 10, 20, 30, ...
                    # this will use durations 0.5 to 1 seconds
                    set_min_duration = 1
                    # if set_min_duration >= 1  we do not need this part and can continue
                    # this part was added to handle the case where the last part of the audio file is less than 1 second
                    # but i do not need this anymore, as i only want audios with more than 1 second
                    if set_min_duration < 1:
                        continue
                    else:
                        start = int(duration) - (int(duration) % max_audio_duration)
                        if duration - start >= 0.5:
                            duration_chunk = np.round(duration-start, 3)
                            offset_time = start
                            lines_to_add.append(f'{line[0]}, {line[1]}, {line[2]}, {line[3]}, {line[4]}, {duration_chunk}, {offset_time}, {line[6]}\n')
                        else:
                            # print("Last part of the audio file is less than 0.5 seconds")
                            continue
            else:
                # leave everything as it is and add an offset of 0
                lines_to_add.append(f'{line[0]}, {line[1]}, {line[2]}, {line[3]}, {line[4]}, {line[5]}, 0, {line[6]}\n')
                
    # Separate the header from the data
    header = lines_to_add[0]
    data_lines = lines_to_add[1:]
    # check for duration < 0.95
    data_lines_new = []
    for line in data_lines:
        parts = line.split(',')
        # print(float(parts[5]))
        if float(parts[5]) < 0.95:
            # print(f"Error: Duration is less than 0.95 seconds for {parts}")
            continue
        else:
            data_lines_new.append(line)
    data_lines = data_lines_new
    # Reset the ID column, starting from 0
    new_data_lines = []
    for i, line in enumerate(data_lines):
        parts = line.split(',')
        new_id = i  # IDs start from 0
        new_line = f'{new_id},{",".join(parts[1:])}'
        new_data_lines.append(new_line)
        
    # Combine header with new data lines
    updated_lines_to_add = [header] + new_data_lines
    
    # remove double blank spaces
    updated_lines_to_add = [re.sub(' +', '', line) for line in updated_lines_to_add]  # plus means one or more spaces

    return updated_lines_to_add

# Write the updated data to a new file
def write_csv_file(file_path, lines_to_add):
    with open(file_path, 'w') as f:
        for line in lines_to_add:
            f.write(line)
# Use the function
split_duration = 10
make_state_level = False
copy_files_inplace = True

if make_state_level==True:
    save_dir = '/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented'
    os.makedirs(save_dir, exist_ok=True)
    updated_lines_to_add_train = add_offset_for_long_audios(os.path.join(save_dir, 'train_regions.csv'), split_duration, make_state_level=True)
    updated_lines_to_add_dev = add_offset_for_long_audios(os.path.join(save_dir, 'dev_regions.csv'), split_duration, make_state_level=True)
    updated_lines_to_add_test = add_offset_for_long_audios(os.path.join(save_dir, 'test_regions.csv'), split_duration, make_state_level=True)
    # save to file
    write_csv_file(f'{save_dir}/train_augmented_offset.csv', updated_lines_to_add_train)
    write_csv_file(f'{save_dir}/dev_augmented_offset.csv', updated_lines_to_add_dev)
    write_csv_file(f'{save_dir}/test_augmented_offset.csv', updated_lines_to_add_test)
    if copy_files_inplace==True:
        write_csv_file(f'{save_dir}/train.csv', updated_lines_to_add_train)
        write_csv_file(f'{save_dir}/dev.csv', updated_lines_to_add_dev)
        write_csv_file(f'{save_dir}/test.csv', updated_lines_to_add_test)
elif make_state_level==False:
    save_dir = '/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented_regions'
    os.makedirs(save_dir, exist_ok=True)
    updated_lines_to_add_train = add_offset_for_long_audios(os.path.join(save_dir, 'train_regions.csv'), split_duration, make_state_level=False)
    updated_lines_to_add_dev = add_offset_for_long_audios(os.path.join(save_dir, 'dev_regions.csv'), split_duration, make_state_level=False)
    updated_lines_to_add_test = add_offset_for_long_audios(os.path.join(save_dir, 'test_regions.csv'), split_duration, make_state_level=False)
    # save to file
    write_csv_file(f'{save_dir}/train_augmented_state_offset.csv', updated_lines_to_add_train)
    write_csv_file(f'{save_dir}/dev_augmented_state_offset.csv', updated_lines_to_add_dev)
    write_csv_file(f'{save_dir}/test_augmented_state_offset.csv', updated_lines_to_add_test)
    if copy_files_inplace==True:
        write_csv_file(f'{save_dir}/train.csv', updated_lines_to_add_train)
        write_csv_file(f'{save_dir}/dev.csv', updated_lines_to_add_dev)
        write_csv_file(f'{save_dir}/test.csv', updated_lines_to_add_test)

# with open(f'{save_dir}/train_augmented_offset.csv', 'w') as f:
#     for line in updated_lines_to_add_train:
#         f.write(line)
# with open(f'{save_dir}/dev_augmented_offset.csv', 'w') as f:
#     for line in updated_lines_to_add_dev:
#         f.write(line)
# with open(f'{save_dir}/test_augmented_offset.csv', 'w') as f:
#     for line in updated_lines_to_add_test:
#         f.write(line)
# copy_files_inplace = False
# if copy_files_inplace == True:
#     shutil.copyfile('/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented/train_augmented_offset.csv', '/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented/train.csv')
#     shutil.copyfile('/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented/dev_augmented_offset.csv', '/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented/dev.csv')
#     shutil.copyfile('/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented/test_augmented_offset.csv', '/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented/test.csv')    
    
# create devolpment set
updated_lines_to_add_train_dev = updated_lines_to_add_train[:1000]
updated_lines_to_add_dev_dev = updated_lines_to_add_dev[:100]
updated_lines_to_add_test_dev = updated_lines_to_add_test[:100]

# Write the updated data to a new file
if make_state_level==True:
    save_dir_dev = '/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented_dev'
    os.makedirs(save_dir_dev, exist_ok=True)
    write_csv_file(f'{save_dir_dev}/train_augmented_offset.csv', updated_lines_to_add_train_dev)
    write_csv_file(f'{save_dir_dev}/dev_augmented_offset.csv', updated_lines_to_add_dev_dev)
    write_csv_file(f'{save_dir_dev}/test_augmented_offset.csv', updated_lines_to_add_test_dev)
elif make_state_level==False:
    save_dir_dev = '/home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented_dev_regions'
    os.makedirs(save_dir_dev, exist_ok=True)
    write_csv_file(f'{save_dir_dev}/train_augmented_regions_offset.csv', updated_lines_to_add_train_dev)
    write_csv_file(f'{save_dir_dev}/dev_augmented_regions_offset.csv', updated_lines_to_add_dev_dev)
    write_csv_file(f'{save_dir_dev}/test_augmented_regions_offset.csv', updated_lines_to_add_test_dev)

# with open(f'{save_dir_dev}/train_augmented_offset.csv', 'w') as f:
#     for line in updated_lines_to_add_train_dev:
#         f.write(line)
# with open(f'{save_dir_dev}/dev_augmented_offset.csv', 'w') as f:
#     for line in updated_lines_to_add_dev_dev:
#         f.write(line)
# with open(f'{save_dir_dev}/test_augmented_offset.csv', 'w') as f:
#     for line in updated_lines_to_add_test_dev:
#         f.write(line)

155488


  0%|          | 0/155487 [00:00<?, ?it/s]

100%|██████████| 155487/155487 [00:00<00:00, 2136554.46it/s]


8185


100%|██████████| 8184/8184 [00:00<00:00, 2325307.14it/s]


5719


100%|██████████| 5718/5718 [00:00<00:00, 2377853.49it/s]


# Zip with pigz

In [16]:
# tar -cf - /home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented | pigz -1 > /home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented/at_augmented.tar.gz
# do with os.system
os.system('tar -cf - /home/projects/vokquant/data/dicla/augmented/ | pigz -1 > /home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented/at_augmented_mono.tar.gz')


tar: Removing leading `/' from member names


0

# Make Files Mono

In [12]:
import os
import torchaudio
import librosa

# BE CAREFUL: Librosa always outputs only one channel, while torchaudio outputs the number of channels in the audio file

# audio = '/home/projects/vokquant/data/dicla/augmented/TS_WA/80128_TS_WA_454.01.wav'
# audio_librosa, sr = librosa.load(audio, sr=None)
# print(audio_librosa.shape)
# torchaudio_1 = torchaudio.load(audio)
# print(torchaudio_1[0].shape)

folder = '/home/projects/vokquant/data/dicla/augmented/'
count = 0
for path, subdirs, files in os.walk(folder):
    for name in files:
        if name.endswith('.wav'):
            audio_file = os.path.join(path, name)
            audio, sr = torchaudio.load(audio_file)
            if sr != 16000:
                print(f"{audio_file} has sample rate {sr}")
                continue
            if len(audio.shape) > 1:
                # print(f"Converting {audio_file} to mono")
                audio = audio.mean(dim=0, keepdim=True)
                safe_path = os.path.join(path, name)
                # print(f"Saving to {safe_path}")
                torchaudio.save(safe_path, audio, sr)
                # count += 1
            else:
                continue
                # print(f"{audio_file} is already mono")

In [15]:
# get file size in GB for 'home/projects/vokquant/data/dicla/augmented/'
def get_size(start_path = '.'):
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(start_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            total_size += os.path.getsize(fp)
    return total_size

size = get_size('/home/projects/vokquant/data/dicla/augmented/')
print("size in GB: ", size / (1024**3))

size in GB:  10.543154884129763


# generate MP3

In [17]:
import os
# upload to cloud
# curl bashupload.com -T /home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented/at_augmented.tar.gz
os.system('curl bashupload.com -T /home/projects/vokquant/accent-recog-slt2022/CommonAccent/data/at_augmented/at_augmented.tar.gz')

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
 40 8068M    0     0   40 3264M      0  5425k  0:25:22  0:10:16  0:15:06 5034k

<html>
<head><title>500 Internal Server Error</title></head>
<body bgcolor="white">
<center><h1>500 Internal Server Error</h1></center>
<hr><center>nginx</center>
</body>
</html>


 40 8068M    0   186   40 3266M      0  5425k  0:25:22  0:10:16  0:15:06 5188k


0

In [None]:
# make to mp3

folder = '/home/projects/vokquant/data/dicla/augmented/'
count = 0
for path, subdirs, files in os.walk(folder):
    for name in files:
        if name.endswith('.wav'):
            audio_file = os.path.join(path, name)
            audio, sr = torchaudio.load(audio_file)
            # safe as mp3
            # convert
            mp3_file = audio_file.replace('.wav', '.mp3')
            new_safe_path = audio_file.replace('augmented', 'augmented_mp3')
            print(f"New safe path: {new_safe_path}")
            # torchaudio.save(mp3_file, audio, sr, format='mp3')
            

(30407,)
torch.Size([1, 30407])
