<a href="https://colab.research.google.com/github/aswit3/Dumb-People-Voice-Recovery/blob/master/Audio_Augmentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Noise Injection

In [0]:
import numpy as np
def noise_injection(data, noise_factor):
    noise = np.random.randn(len(data))
    augmented_data = data + noise_factor * noise
    # Cast back to same data type
    augmented_data = augmented_data.astype(type(data[0]))
    return augmented_data

# Shifting Time


In [0]:
def shifting_time(data, sampling_rate, shift_max, shift_direction):
    shift = np.random.randint(sampling_rate * shift_max)
    if shift_direction == 'right':
        shift = -shift
    elif self.shift_direction == 'both':
        direction = np.random.randint(0, 2)
        if direction == 1:
            shift = -shift
    augmented_data = np.roll(data, shift)
    # Set to silence for heading/ tailing
    if shift > 0:
        augmented_data[:shift] = 0
    else:
        augmented_data[shift:] = 0
    return augmented_data

# Changing Pitch

In [0]:
import librosa
def change_pitch(data, sampling_rate, pitch_factor):
    return librosa.effects.pitch_shift(data, sampling_rate, pitch_factor)

# Changing Speed

In [0]:
import librosa
def change_speed(data, speed_factor):
    return librosa.effects.time_stretch(data, speed_factor)

# MultiProcessing

In [0]:
from multiprocessing import Pool, cpu_count

def parallelize_dataframe(df, func, n_cores=4):
    df_split = np.array_split(df, n_cores)
    pool = Pool(n_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

In [0]:
import os

os.chdir("/content/drive/My Drive/just")

In [0]:
import pandas as pd

df_train = pd.read_csv("train.csv")
df_dev = pd.read_csv("dev.csv")
df_test = pd.read_csv("test.csv")

In [0]:
class AudioAugmentation:
    def read_audio_file(self, file_path):
        input_length = 16000
        data = librosa.core.load(file_path)[0]
        if len(data) > input_length:
            data = data[:input_length]
        else:
            data = np.pad(data, (0, max(0, input_length - len(data))), "constant")
        return data

    def add_noise(self, data):
        noise = np.random.randn(len(data))
        data_noise = data + 0.005 * noise
        return data_noise
    def shift(self, data):
        return np.roll(data, 1600)
    def stretch(self, data, rate=1):
        input_length = 16000
        data = librosa.effects.time_stretch(data, rate)
        if len(data) > input_length:
            data = data[:input_length]
        else:
            data = np.pad(data, (0, max(0, input_length - len(data))), "constant")
        return data

    def write_audio_file(self, file, data, sample_rate=16000):
        librosa.output.write_wav(file, data, sample_rate)

In [0]:
import librosa
import numpy as np

aa = AudioAugmentation()
tmp = pd.DataFrame()
def noise_augmentation_fn(wav_filename):
    filename = wav_filename.replace(".wav", "_noise.wav")
    data = aa.read_audio_file(wav_filename)
    data = aa.add_noise(data)
    aa.write_audio_file(filename, data)
    return filename

def helper_fn(df_split):
    global tmp
    tmp["wav_filename"] = df_split["wav_filename"].apply(noise_augmentation_fn)
    tmp["wav_filesize"] = tmp["wav_filename"].apply(os.path.getsize)
    tmp["transcript"] = df_split["transcript"]
    return tmp

In [0]:
tmp = pd.DataFrame()
df_train_noise = parallelize_dataframe(df_train, helper_fn)
tmp = pd.DataFrame()
df_dev_noise = parallelize_dataframe(df_dev, helper_fn)
tmp = pd.DataFrame()
df_test_noise = parallelize_dataframe(df_test, helper_fn)

In [0]:
df_train_noise["wav_filename"][0]

'/content/drive/My Drive/just/DeepSpeech-0.6.1/train/welcome-54_noise.wav'

# shifting

In [0]:
def shift_augmentation_fn(wav_filename):
    filename = wav_filename.replace(".wav", "_shift.wav")
    data = aa.read_audio_file(wav_filename)
    data = aa.add_noise(data)
    aa.write_audio_file(filename, data)
    return filename

def helper_fn(df_split):
    global tmp
    tmp["wav_filename"] = df_split["wav_filename"].apply(shift_augmentation_fn)
    tmp["wav_filesize"] = tmp["wav_filename"].apply(os.path.getsize)
    tmp["transcript"] = df_split["transcript"]
    return tmp

tmp = pd.DataFrame()
df_train_shift = parallelize_dataframe(df_train, helper_fn)
tmp = pd.DataFrame()
df_dev_shift = parallelize_dataframe(df_dev, helper_fn)
tmp = pd.DataFrame()
df_test_shift = parallelize_dataframe(df_test, helper_fn)

# stretch

In [0]:
def stretch_augmentation_fn(wav_filename):
    filename = wav_filename.replace(".wav", "_stretch.wav")
    data = aa.read_audio_file(wav_filename)
    data = aa.add_noise(data)
    aa.write_audio_file(filename, data)
    return filename

def helper_fn(df_split):
    global tmp
    tmp["wav_filename"] = df_split["wav_filename"].apply(stretch_augmentation_fn)
    tmp["wav_filesize"] = tmp["wav_filename"].apply(os.path.getsize)
    tmp["transcript"] = df_split["transcript"]
    return tmp

tmp = pd.DataFrame()
df_train_stretch = parallelize_dataframe(df_train, helper_fn)
tmp = pd.DataFrame()
df_dev_stretch = parallelize_dataframe(df_dev, helper_fn)
tmp = pd.DataFrame()
df_test_stretch = parallelize_dataframe(df_test, helper_fn)

# Combine all 

In [0]:
df_train_all = pd.concat([df_train, df_train_noise, df_train_shift, df_train_stretch], axis=0)

In [0]:
df_train_all.shape

(2324, 3)

In [0]:
df_dev_all = pd.concat([df_dev, df_dev_noise, df_dev_shift, df_dev_stretch], axis=0)
df_test_all = pd.concat([df_test, df_test_noise, df_test_shift, df_test_stretch], axis=0)

In [0]:
df_train_all.to_csv("train_all.csv", index=False)

In [0]:
df_dev_all.to_csv("dev_all.csv", index=False)

In [0]:
df_test_all.to_csv("test_all.csv", index=False)

In [0]:
df_dev_all[df_dev_all["wav_filesize"] < 400]

Unnamed: 0,wav_filename,wav_filesize,transcript


In [0]:
df_train_all["wav_filesize"]

0      191708
1      111712
2       73716
3      125712
4      117712
        ...  
576    128058
577    128058
578    128058
579    128058
580    128058
Name: wav_filesize, Length: 2324, dtype: int64

In [0]:
import librosa
import IPython.display as ipd

In [0]:
def load_audio_file(file_path):
    input_length = 16000
    data = librosa.core.load(file_path)[0] #, sr=16000
    if len(data)>input_length:
        data = data[:input_length]
    else:
        data = np.pad(data, (0, max(0, input_length - len(data))), "constant")
    return data

In [0]:
data = load_audio_file("../input/train/audio/off/1df483c0_nohash_0.wav")
ipd.Audio(data, rate=16000)