In [None]:
import os 
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt 
import pylab
from PIL import Image

# Colab-only: mount Google Drive at /content/drive so its files can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
class Augment:
    """Cut audio files into fixed-length clips and save augmented log-mel spectrograms.

    For every clip the class saves the plain spectrogram, Gaussian-noised
    copies of it, and spectrograms of pitch-shifted versions of the clip
    (each again with noised copies).

    Recognised ``config`` keys (all optional):
        pitch_shift_count:    number of pitch-shifted variants per clip
                              (should be even; steps are +/-1 .. +/-count/2).
        gaussian_noise_count: number of noised copies per spectrogram.
        gaussian_noise_stdev: standard deviation of the added noise.
        save_npys:            write ``.npy`` arrays.
        save_jpgs:            write ``.jpg`` renderings.
        duration:             clip length in seconds.
    """

    def __init__(self, config):
        self.mfcc_data = []  # not used by any method in this class
        self.bands = 128     # mel bands (rows of a spectrogram)
        self.frames = 128    # time frames (columns) every spectrogram is fixed to
        # Number of save requests seen so far, per language directory.
        self.counts = {
            "english": 0,
            "hindi": 0,
            "mandarin": 0,
            "tagalog": 0,
            "other": 0
        }
        self.samples = 1000  # per-language cap on saved items

        self.pitch_shift_count = config.get("pitch_shift_count",0) #should be even
        self.gaussian_noise_count = config.get("gaussian_noise_count",0)
        self.gaussian_noise_stdev = config.get("gaussian_noise_stdev",1) #1 dB
        self.save_npys = config.get("save_npys",True)
        self.save_jpgs = config.get("save_jpgs",True)
        # NOTE(review): the original read an undefined `self.duration`; the
        # 3.0 s default is an assumption -- confirm the intended clip length.
        self.duration = config.get("duration", 3.0)

        # Seeded once per instance: a run is reproducible, successive noise
        # draws differ, and NumPy's global RNG state is left untouched.
        self._rng = np.random.RandomState(1)

    def windows(self,data,window_size):
        """Yield ``(start, end)`` index pairs of half-overlapping windows.

        Windows start every ``window_size // 2`` items for as long as
        ``start < len(data)``; ``end`` may exceed ``len(data)`` for the last
        windows, so callers must check the slice length themselves.
        """
        step = max(1, window_size // 2)  # never 0, otherwise the loop would not advance
        start = 0
        while start < len(data):
            yield start, start + window_size
            start += step

    def _shift_steps(self):
        """Return the non-zero semitone steps in ``[-count/2, +count/2]``."""
        boundary = self.pitch_shift_count // 2
        return [step for step in range(-boundary, boundary + 1) if step != 0]

    def pitch_shifter(self, mfcc, sr):
        """Return one pitch-shifted copy of the input per configured step.

        Args:
            mfcc: array handed unchanged to ``librosa.effects.pitch_shift``.
                That function operates on an audio time series, so despite
                the parameter name the in-class caller passes the raw clip.
            sr: sampling rate of the signal.

        Returns:
            List of shifted arrays, ordered like ``_shift_steps()``.
        """
        # Keyword arguments work on both older and newer librosa releases.
        return [
            librosa.effects.pitch_shift(mfcc, sr=sr, n_steps=step)
            for step in self._shift_steps()
        ]

    def gaussian_noiser(self, mfcc):
        """Return ``gaussian_noise_count`` copies of ``mfcc`` with Gaussian noise added."""
        return [
            mfcc + self._rng.normal(0, self.gaussian_noise_stdev, mfcc.shape)
            for _ in range(self.gaussian_noise_count)
        ]

    def get_MFCC(self,speech):
        """Return the dB-scaled mel spectrogram of an audio time series."""
        mel = librosa.feature.melspectrogram(y=speech)
        # NOTE(review): melspectrogram returns a power spectrogram by default,
        # for which librosa.power_to_db is the matching conversion. Kept as
        # amplitude_to_db to preserve the original scaling -- confirm.
        logspec = librosa.amplitude_to_db(mel)
        return logspec

    def resize_mfcc(self,mfcc):
        """Pad or trim the time axis (axis 1) of ``mfcc`` to ``self.frames`` columns."""
        resized_mfcc = librosa.util.fix_length(mfcc, size=self.frames, axis=1)
        # Stacking a zero-row array adds no rows; kept from the original so the
        # returned array is produced the same way as before.
        resized_mfcc = np.vstack((np.zeros((0, self.frames)), resized_mfcc))
        return resized_mfcc

    def save_data(self,data,abs_dir,sub_dir, name_only):
        """Save one spectrogram under ``<abs_dir>/data_mfcc_npy/<language>/``.

        ``<language>`` is the second ``/``-separated component of ``sub_dir``.
        Each call bumps the counter for that language (or ``"other"`` when the
        language has no counter); once the counter has reached
        ``self.samples`` nothing more is written.

        Returns:
            Always ``True``.
        """
        elems = sub_dir.split('/')
        language = elems[1]
        new_sub_dir = os.path.join("data_mfcc_npy", language)

        key = language if language in self.counts else "other"
        num = self.counts[key]
        self.counts[key] += 1
        if num >= self.samples:
            return True

        out_dir = os.path.join(abs_dir, new_sub_dir)

        if self.save_npys:
            os.makedirs(out_dir, exist_ok=True)  # np.save does not create directories
            new_filename_npy = name_only + "_" + str(num) + '.npy'
            np.save(os.path.join(out_dir, new_filename_npy), data)

        if self.save_jpgs:
            os.makedirs(out_dir, exist_ok=True)
            new_filename_jpg = name_only + "_" + str(num) + '.jpg'
            save_path_jpg = os.path.join(out_dir, new_filename_jpg)
            pylab.axis('off')
            pylab.axes([0., 0., 1., 1.], frameon=False, xticks=[], yticks=[])
            librosa.display.specshow(data)
            pylab.savefig(save_path_jpg, bbox_inches=None, pad_inches=0)
            pylab.close()
        return True

    def uniform_clip_split(self, abs_dir, sub_dir, file_name, duration=None):
        """Split one audio file into equal clips and save all their variants.

        Args:
            abs_dir: base directory for both input and output.
            sub_dir: input directory relative to ``abs_dir``.
            file_name: audio file inside ``sub_dir``.
            duration: clip length in seconds; defaults to ``self.duration``.

        Raises:
            ValueError: if the clip length works out to zero samples or fewer.
        """
        if duration is None:
            duration = self.duration

        file_path = os.path.join(abs_dir, sub_dir, file_name)
        name_only = file_name.split(".mp3")[0]
        speech, sr = librosa.load(file_path, sr=22050)

        num_frames = int(duration * sr)  # samples per clip
        if num_frames <= 0:
            raise ValueError("duration must cover at least one sample, got %r" % (duration,))

        shift_steps = self._shift_steps()

        # Floor division drops the trailing partial clip, so every slice is full length.
        for clip_idx in range(len(speech) // num_frames):
            start = clip_idx * num_frames
            clip = speech[start:start + num_frames]

            # generate and save unaltered MFCC
            raw_mfcc = self.resize_mfcc(self.get_MFCC(clip))
            self.save_data(raw_mfcc,abs_dir,sub_dir,name_only)

            # generate and save noisy, unshifted MFCCs
            for k, noisy_mfcc in enumerate(self.gaussian_noiser(raw_mfcc)):
                self.save_data(noisy_mfcc,abs_dir,sub_dir,(name_only+"_noisy%d"%(k)))

            # generate and save pitch-shifted MFCCs - both noisy and non-noisy.
            # The shift is applied to the audio clip; the spectrogram is then
            # computed from the shifted audio.
            shifted_clips = self.pitch_shifter(clip, sr)
            for shift, shifted_clip in zip(shift_steps, shifted_clips):
                shifted_mfcc = self.resize_mfcc(self.get_MFCC(shifted_clip))
                shifted_name = name_only+("_shifted%d"%shift)
                self.save_data(shifted_mfcc,abs_dir,sub_dir,shifted_name)

                for j, noisy_shifted_mfcc in enumerate(self.gaussian_noiser(shifted_mfcc)):
                    noisy_shifted_name = shifted_name + ("_noisy%d"%j)
                    self.save_data(noisy_shifted_mfcc, abs_dir, sub_dir, noisy_shifted_name)

    def create_mfccs(self, sub_dir):
        """Process every file in ``<cwd>/<sub_dir>`` with ``uniform_clip_split``.

        Returns:
            Always ``True``.
        """
        abs_dir = os.getcwd()
        # Sorted so the files that fall under the per-language cap are the same on every run.
        for file_name in sorted(os.listdir(os.path.join(abs_dir,sub_dir))):
            self.uniform_clip_split(abs_dir,sub_dir,file_name)

        return True

In [None]:
# Augmentation settings read by Augment.__init__ through config.get(...).
config = {
    "pitch_shift_count": 4,
    "gaussian_noise_count": 2,
    "gaussian_noise_stdev": 1,
    "save_jpgs": False,
    "save_npys": True,
}

data = Augment(config)

# (progress message, input sub-directory) pairs, handled in this order.
jobs = (
    ("Processing Hindi...", "data_specific/hindi"),
    ("Processing Mandarin...", "data_specific/mandarin"),
    ("Processing Tagalog...", "data_specific/tagalog"),
    ("Processing English...", "data_specific/english"),
)
for message, sub_dir in jobs:
    print(message)
    data.create_mfccs(sub_dir)

UnboundLocalError: ignored