In [1]:
from utils import *
import array 

from pydub import AudioSegment
import numba 
%matplotlib inline

In [2]:
# Every directory visited under the training audio root contributes its last
# path component as a candidate label; the root itself contributes "".
# NOTE(review): os.walk order is filesystem-dependent, so the label -> id
# mapping derived from this list may differ between machines.
all_labels = []
for dirpath, _subdirs, _filenames in os.walk("data/train/audio/"):
    all_labels.append(dirpath.split('/')[-1])

# Directory names that are not real classes.
exclusions = ["", "_background_noise_"]
POSSIBLE_LABELS = list(filter(lambda label: label not in exclusions, all_labels))

In [3]:
# POSSIBLE_LABELS = 'yes no up down left right on off stop go silence unknown'.split()
# Two-way lookup between integer class ids and label names; ids follow the
# order of POSSIBLE_LABELS.
id2name = dict(enumerate(POSSIBLE_LABELS))
name2id = dict((label, idx) for idx, label in id2name.items())
len(id2name)

33

In [4]:
# Show the raw directory names collected by os.walk, before exclusions are applied.
all_labels

['',
 'silence',
 'left',
 'eight',
 'silence_many',
 'no',
 'tree',
 'nine',
 'bed',
 'dog',
 '_background_noise_',
 'house',
 'cat',
 'bird',
 'four',
 'zero',
 'on',
 'right',
 'sheila',
 'train',
 'six',
 'seven',
 'down',
 'one',
 'go',
 'happy',
 'two',
 'yes',
 'up',
 'three',
 'five',
 'marvin',
 'stop',
 'wow',
 'off']

In [5]:
def load_data(data_dir):
    """Index the training audio under ``data_dir`` and split it by speaker.

    Every file matching ``<data_dir>/train/audio/*/*wav`` whose path matches
    the ``<label>/<uid>_...wav`` pattern becomes one row
    ``(label, label_id, user_id, wav_file)``. Files under
    ``_background_noise_`` are skipped. Rows labelled ``"unknown"`` or
    ``"silence"`` go to their own frames; of the rest, rows whose user id
    appears in ``train/validation_list.txt`` go to validation and all others
    to train.

    Parameters
    ----------
    data_dir : str
        Root directory containing ``train/audio`` and
        ``train/validation_list.txt``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, valid_df, unknown_df, silent_df)``, each with columns
        ``['label', 'label_id', 'user_id', 'wav_file']``.

    Raises
    ------
    KeyError
        If a label directory is not present in the module-level ``name2id``.
    IOError / OSError
        If ``validation_list.txt`` cannot be opened.
    """
    # Seed NumPy's global RNG. This must be a call: assigning to
    # ``np.random.seed`` replaces the function and seeds nothing.
    np.random.seed(1)

    # Three groups: optional prefix, label (parent directory), user id.
    # Both "/" and "\" are accepted as path separators.
    pattern = re.compile(r"(.+[\/\\])?(\w+)[\/\\]([^_]+)_.+wav")
    all_files = glob(os.path.join(data_dir, 'train/audio/*/*wav'))

    with open(os.path.join(data_dir, 'train/validation_list.txt'), 'r') as fin:
        validation_files = fin.readlines()

    # User ids of every speaker listed in the validation file.
    valset = set()
    for entry in validation_files:
        r = pattern.match(entry)
        if r:
            valset.add(r.group(3))

    train, val, silent, unknown = [], [], [], []

    for entry in all_files:
        r = pattern.match(entry)
        if not r:
            continue
        label, uid = r.group(2), r.group(3)

        # we've already split up noise files into 1 sec chunks under 'silence' folder
        if label == '_background_noise_':
            continue

        label_id = name2id[label]
        sample = (label, label_id, uid, entry)

        if label == "unknown":
            unknown.append(sample)
        elif label == "silence":
            silent.append(sample)
        elif uid in valset:
            val.append(sample)
        else:
            train.append(sample)

    print('There are {} train and {} val samples'.format(len(train), len(val)))

    columns_list = ['label', 'label_id', 'user_id', 'wav_file']

    train_df = pd.DataFrame(train, columns=columns_list)
    valid_df = pd.DataFrame(val, columns=columns_list)
    silent_df = pd.DataFrame(silent, columns=columns_list)
    unknown_df = pd.DataFrame(unknown, columns=columns_list)

    return train_df, valid_df, unknown_df, silent_df

In [6]:
# Build the train / validation / unknown / silence frames from ./data/.
# Note the return order: unknown_df comes before silent_df.
train_df, valid_df, unknown_df, silent_df = load_data('./data/')

There are 58321 train and 6798 val samples


In [7]:
# Peek at the first rows to sanity-check the parsed label, user id and path.
train_df.head()

Unnamed: 0,label,label_id,user_id,wav_file
0,left,1,cb8f8307,./data/train/audio/left/cb8f8307_nohash_1.wav
1,left,1,b7a0754f,./data/train/audio/left/b7a0754f_nohash_2.wav
2,left,1,0132a06d,./data/train/audio/left/0132a06d_nohash_3.wav
3,left,1,f92e49f3,./data/train/audio/left/f92e49f3_nohash_4.wav
4,left,1,88053e92,./data/train/audio/left/88053e92_nohash_1.wav


### create wav.scp

In [8]:
def create_wav_scp(df, path):
    """Write ``<path>wav.scp`` with one sox pipe command per row of ``df``.

    Each line is ``<id> sox -t wav <abs path> -r 16k -b 16 -t wav - remix - |``
    where ``<id>`` is the file's base name without its last four characters
    and with every ``_`` replaced by ``-``.

    ``path`` is used as a plain string prefix, so it must end with a path
    separator to address a directory.

    NOTE(review): the id written here uses dashes, while ``create_segments``
    writes its second column with underscores; confirm the two are meant to
    agree.
    """
    line_template = "{} sox -t wav {} -r 16k -b 16 -t wav - remix - |\n"
    with open(path + "wav.scp", "w") as scp_file:
        for wav_path in df.wav_file:
            stem = os.path.basename(wav_path)[:-4]
            record_id = stem.replace("_", "-")
            scp_file.write(line_template.format(record_id, os.path.abspath(wav_path)))


In [9]:
# Write wav.scp for the training split. The target directory must already
# exist: the function opens the file directly and does not create directories.
create_wav_scp(train_df,"data/kaldi/train/")

### create segments 

In [52]:
# Scratch check: print the logit of a few probabilities and accumulate their sum in `s`.
# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.special` has been loaded.
import scipy.special

x = [0.1,0.2,0.3,0.4]
s = 0
for i in x:
    # print(...) with a single argument behaves the same on Python 2 and 3
    print(scipy.special.logit(i))
    s += scipy.special.logit(i)

-2.19722457734
-1.38629436112
-0.847297860387
-0.405465108108


In [53]:
import subprocess
import re

In [63]:
def create_segments(df,path):
    """Write ``<path>segments`` with one ``segm_<stem> <stem> 0 <duration>`` line per row.

    ``<stem>`` is the wav file's base name without its last four characters.
    The duration is obtained by running ``ffmpeg -i <wav_file>`` and parsing
    the ``Duration: HH:MM:SS.ss,`` field of its report; hours and minutes are
    folded into the value, so it is the total length in seconds.

    ``path`` is used as a plain string prefix, so it must end with a path
    separator to address a directory.

    NOTE(review): the second column keeps underscores, while ``create_wav_scp``
    writes its ids with dashes; confirm the two are meant to agree.

    Raises
    ------
    OSError
        If the ``ffmpeg`` executable cannot be started.
    ValueError
        If ffmpeg's output contains no parsable ``Duration:`` field.
    """
    duration_re = re.compile(
        r"Duration:\s{1}(?P<hours>\d+?):(?P<minutes>\d+?):(?P<seconds>\d+\.\d+?),")

    with open(path+"segments","w") as f:
        for wav_file in df.wav_file:
            # stderr is merged into stdout so the whole ffmpeg report is captured.
            process = subprocess.Popen(['ffmpeg',  '-i', wav_file],
                                       stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
            output, _ = process.communicate()
            # On Python 3 the pipe yields bytes, which a str pattern cannot search.
            if isinstance(output, bytes):
                output = output.decode('utf-8', 'replace')

            match = duration_re.search(output)
            if match is None:
                raise ValueError("ffmpeg reported no duration for {!r}".format(wav_file))

            duration = (int(match.group('hours')) * 3600
                        + int(match.group('minutes')) * 60
                        + float(match.group('seconds')))

            stem = os.path.basename(wav_file)[:-4]
            f.write("segm_{} {} 0 {}\n".format(stem, stem, duration))


In [64]:
# Write the segments file for the training split (spawns one ffmpeg process per row).
create_segments(train_df,"data/kaldi/train/")

### create utt2spk

In [66]:
def create_utt2spk(df, path):
    """Write ``<path>utt2spk`` with one ``segm_<stem> <label>_<user_id>`` line per row.

    ``<stem>`` is the wav file's base name without its last four characters.
    ``path`` is used as a plain string prefix.
    """
    with open(path + "utt2spk", "w") as out:
        for wav_path, label, user_id in zip(df.wav_file, df.label, df.user_id):
            stem = os.path.basename(wav_path)[:-4]
            speaker = label + '_' + user_id
            out.write("segm_{} {}\n".format(stem, speaker))


In [67]:
# Write the utterance -> speaker map for the training split.
create_utt2spk(train_df,"data/kaldi/train/")

### create text file

In [70]:
def create_text(df, path):
    """Write ``<path>text`` with one ``segm_<stem> <label>`` line per row.

    ``<stem>`` is the wav file's base name without its last four characters;
    the label is the row's ``label`` value. ``path`` is used as a plain
    string prefix.
    """
    with open(path + "text", "w") as out:
        for wav_path, label in zip(df.wav_file, df.label):
            stem = os.path.basename(wav_path)[:-4]
            out.write("segm_{} {}\n".format(stem, label))


In [71]:
# Write the transcript file for the training split (the transcript is the label word).
create_text(train_df,"data/kaldi/train/")

#### create validation dataframe

In [8]:
#augment validation set with silence and unknown files, made with step=250 when generating silence files
# Number of extra clips to add: 10% of the current validation set, rounded down.
extra_data_size = int(len(valid_df) * 0.1)

# unknown_val = unknown_df.sample(extra_data_size,random_state=1)
# unknown_df = unknown_df[~unknown_df.index.isin(unknown_val.index.values)]

# Draw the validation silence clips with a fixed seed, then remove those rows
# from the silence pool so the two sets do not overlap.
silent_val = silent_df.sample(n=extra_data_size, random_state=1)
silent_df = silent_df.drop(silent_val.index)

# NOTE(review): this cell is not idempotent. Each re-run appends another batch
# to valid_df and shrinks silent_df further. The concatenated frame also keeps
# the original index labels of both inputs.
valid_df = pd.concat([valid_df, silent_val])

In [9]:
# silence_files = train_df[train_df.label == 'silence']
# train_df      = train_df[train_df.label != 'silence']

In [10]:
%%time

silence_files_AS = [AudioSegment.from_wav(x) for x in silent_df.wav_file.values]

filler = AudioSegment.silent(duration=1000, frame_rate = 16000)


CPU times: user 216 ms, sys: 96 ms, total: 312 ms
Wall time: 2.4 s


In [11]:
# Class balance of the training split.
train_df.label.value_counts()

one       2140
two       2137
stop      2134
nine      2134
yes       2116
zero      2116
five      2115
up        2115
seven     2114
go        2112
right     2111
on        2110
eight     2109
three     2108
six       2107
left      2106
no        2105
off       2101
down      2095
four      2092
marvin    1586
wow       1579
house     1577
dog       1576
bird      1569
tree      1567
cat       1565
sheila    1558
happy     1553
bed       1516
Name: label, dtype: int64

In [12]:
# Class balance of the validation split, including the silence rows appended to valid_df.
valid_df.label.value_counts()

silence    679
four       280
no         270
down       264
seven      263
six        262
yes        261
go         260
up         260
zero       260
on         257
right      256
off        256
three      248
left       247
stop       246
eight      243
five       242
two        236
one        230
nine       230
bed        197
happy      189
sheila     176
house      173
dog        170
cat        168
wow        166
tree       166
bird       162
marvin     160
Name: label, dtype: int64

### Feature extraction, augmentation, caching

In [13]:

def augment_wav(wav,pval=0.5):
    """Randomly perturb one clip: gain change, 100 ms time shift, noise overlay.

    Steps, in order:
      1. With probability ``pval``, change the gain by a random amount in
         [-10, 10] dB.
      2. Pass the clip through ``fill_to_1sec`` (helper defined outside this
         cell; presumably pads/trims to exactly 1000 ms -- TODO confirm).
      3. Always shift the audio by 100 ms, left or right with equal
         probability, filling the vacated end with silence.
      4. With probability ``pval``, overlay a random clip from the
         module-level ``silence_files_AS``. Noise louder than 10 dB below the
         clip is attenuated to that level first.

    Parameters
    ----------
    wav : pydub.AudioSegment
        Clip to augment.
    pval : float
        Probability used independently for the gain change and the overlay.

    Returns
    -------
    pydub.AudioSegment
        The augmented clip.

    Notes
    -----
    Randomness comes from both ``random`` and ``np.random``; seed both for
    reproducible output. ``random.choice`` raises ``IndexError`` when
    ``silence_files_AS`` is empty.
    """
    sample_rate = 16000
    L = 1000  # clip length in milliseconds (pydub slices by milliseconds)

    #adjust speed, with 50% chance
#     wav = speed_change(wav,1.+random.uniform(-1, 1)*0.05) if np.random.random() < pval else wav

    # adjust volume; the gain is drawn even when it ends up unused, which
    # keeps the RNG consumption independent of the outcome
    db_adjustment = random.uniform(-1, 1)*10
    wav = wav + db_adjustment if np.random.random() < pval else wav

    # fill to 1 second
    wav = fill_to_1sec(wav)

    # shift the audio by 100 ms
    shift_length = 100
    if np.random.random() < 0.5:
        # shift left: drop the first 100 ms and pad the end with silence
        wav = wav[shift_length:] + AudioSegment.silent(shift_length,frame_rate=sample_rate)
    else:
        # shift right: prepend silence and drop the last 100 ms
        wav = AudioSegment.silent(shift_length,frame_rate=sample_rate) + wav[:L-shift_length]

    # blend original file with background noise
    if np.random.random() < pval:
        noise = random.choice(silence_files_AS)
        db_delta = (wav.dBFS - noise.dBFS) -10.

        if db_delta< 0: #reduce intensity of loud background; if it's too silent, leave it be
            noise = noise  + db_delta
        wav = wav.overlay(noise)

    return wav



def process_wav_file(record, reshape=False, augment=False,pval=0.5 ,output_format='logmel',n_mels=128 ):
    """Load one clip, optionally augment it, and convert it to features.

    Parameters
    ----------
    record : str or row-like
        Either a path to a wav file (then the label is taken to be
        ``"test"``), or an object exposing ``wav_file`` and ``label``. A
        row-like record that contains ``"raw_AS_wav"`` supplies an
        already-decoded segment, which is used instead of reading the file.
    reshape : bool
        Forwarded to ``log_mel`` for the ``'logmel'`` format only.
    augment : bool
        Apply ``augment_wav`` unless the label is ``"silence"``. A bare path
        always has the label ``"test"``, so it is augmented whenever
        ``augment`` is true.
    pval : float
        Forwarded to ``augment_wav``.
    output_format : str
        ``'logmel'``: result of ``log_mel``.
        ``'mfcc'``: ``np.stack`` of 40 MFCCs and their first- and second-order
        deltas along a new first axis.
        ``'cqt'``: ``librosa.cqt`` of the samples at sr=16000.
        Anything else: the raw samples from ``AS_to_raw``.
    n_mels : int
        Forwarded to ``log_mel``.

    Returns
    -------
    The feature array for the chosen ``output_format``.

    Notes
    -----
    ``fill_to_1sec``, ``AS_to_raw`` and ``log_mel`` are helpers defined
    outside this cell.
    """
    try:
        string_types = (basestring,)  # Python 2: covers both str and unicode
    except NameError:
        string_types = (str,)         # Python 3

    if isinstance(record, string_types):  # bare path, e.g. test files
        fname = record
        label = "test"
        cached_wav = None
    else:
        fname = record.wav_file
        label = record.label
        # Only row-like records are checked for a cached segment; on a plain
        # string the `in` test would be a substring search of the path.
        cached_wav = record.raw_AS_wav if "raw_AS_wav" in record else None

    if cached_wav is not None:
        wav = cached_wav
    else:
        wav = AudioSegment.from_wav(fname.replace("\\","/"))

    if label != "silence" and augment:  # no augmentation for silence clips
        wav = augment_wav(wav,pval)
    else:  # make sure segment is 1 second
        wav = fill_to_1sec(wav)

    samples = AS_to_raw(wav)

    if output_format == "logmel":
        output = log_mel(samples,reshape=reshape,n_mels=n_mels)

    elif output_format == "mfcc":
        log_S = log_mel(samples,reshape=False,n_mels=n_mels)
        mfcc = librosa.feature.mfcc(S=log_S, n_mfcc=40)
        delta1 = librosa.feature.delta(mfcc, order=1)
        delta2 = librosa.feature.delta(mfcc, order=2)

        output = np.stack([mfcc,delta1,delta2])

    elif output_format == "cqt":
        output = librosa.cqt(samples, sr=16000)
    else:
        output = samples

    return output


#### load existing datasets

In [5]:
# %%time 

# train_df = pickle.load( open("cache/train_df_waug.pik","rb"))
# valid_df = pickle.load( open("cache/valid_df.pik","rb"))
# silent_df = pickle.load(open("cache/silent_df.pik","rb"))
# unknown_df = pickle.load(open("cache/unknown_df_waug.pik","rb"))
# test_df =  pickle.load(open("cache/test_df.pik","rb"))

CPU times: user 5.16 s, sys: 6.71 s, total: 11.9 s
Wall time: 1min 4s


In [4]:
#ignore augmentations 
# train_df = train_df.iloc[:train_df.shape[0]/5]
# unknown_df = unknown_df.iloc[:unknown_df.shape[0]/5]

In [66]:
# Persist the four splits. Each file is opened in a `with` block so the handle
# is closed (and the data flushed) as soon as the dump finishes.
# The "cache/" directory must already exist.
for _cache_path, _frame in [
    ("cache/train_df.pik", train_df),
    ("cache/valid_df.pik", valid_df),
    ("cache/silent_df.pik", silent_df),
    ("cache/unknown_df.pik", unknown_df),
]:
    with open(_cache_path, "wb") as _fout:
        pickle.dump(_frame, _fout, protocol=pickle.HIGHEST_PROTOCOL)

### extract logmel 

In [14]:
# Compute features for every clip and store them in a new "raw" column.
# process_wav_file receives the bare path string, with no augmentation and
# n_mels=256; output_format is left at its default ('logmel').
%time valid_df["raw"]  = valid_df.wav_file.apply(lambda x : process_wav_file(x,augment=False,n_mels=256))
%time train_df["raw"]  = train_df.wav_file.apply(lambda x : process_wav_file(x,augment=False,n_mels=256))


CPU times: user 2min 43s, sys: 14 s, total: 2min 57s
Wall time: 1min 33s
CPU times: user 17min 56s, sys: 1min 30s, total: 19min 27s
Wall time: 10min 11s


In [15]:
# Persist the frames that now carry the "raw" feature column. `with` closes
# each file handle as soon as its dump finishes.
with open("cache/train_df_all_labels.pik","wb") as _fout:
    pickle.dump(train_df, _fout, protocol=pickle.HIGHEST_PROTOCOL)
with open("cache/valid_df_all_labels.pik","wb") as _fout:
    pickle.dump(valid_df, _fout, protocol=pickle.HIGHEST_PROTOCOL)
# pickle.dump(silent_df, open("cache/silent_df_all_labels.pik","wb"),protocol=pickle.HIGHEST_PROTOCOL)
# pickle.dump(unknown_df, open("cache/unknown_df.pik","wb"),protocol=pickle.HIGHEST_PROTOCOL)

#### precompute augmentations 


In [34]:
#precomputing augs for faster neural net training
def precompute_augmentations(df,num_repeats=4,n_mels=128):
    """Return ``df`` followed by ``num_repeats`` augmented copies of its rows.

    The rows of ``df`` are repeated ``num_repeats`` times; for every repeated
    row the ``raw`` column is recomputed by calling ``process_wav_file`` on
    the row's ``wav_file`` path with ``augment=True``. The original rows are
    kept unchanged at the top of the result, which therefore has
    ``(1 + num_repeats) * len(df)`` rows. Index labels are repeated, not
    renumbered.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``wav_file`` column. It is not modified.
    num_repeats : int
        Number of augmented copies per row. Values below 1 yield a plain
        copy of ``df``.
    n_mels : int
        Forwarded to ``process_wav_file``. It should match the value used to
        compute any ``raw`` features already present in ``df``; otherwise the
        column will mix feature sizes.

    Returns
    -------
    pandas.DataFrame
    """
    if num_repeats < 1:
        # pd.concat([]) would raise; nothing to augment.
        return df.copy()

    df_aug = pd.concat([df]*num_repeats)
    df_aug['raw'] = df_aug.wav_file.apply(
        lambda x: process_wav_file(x, augment=True, n_mels=n_mels))

    return pd.concat([df, df_aug])



In [None]:
# Append augmented copies to the training frame.
# NOTE(review): not idempotent. train_df is overwritten with the enlarged
# frame, so each re-run of this cell multiplies its size again.
# NOTE(review): the "raw" features already in train_df were extracted with
# n_mels=256, whereas precompute_augmentations extracts with n_mels=128 by
# default; confirm the mixed feature sizes are intended.
%time train_df = precompute_augmentations(train_df)

CPU times: user 50min 24s, sys: 54.1 s, total: 51min 18s
Wall time: 26min 1s


In [None]:
# Persist the augmented training frame. This writes to the same path as the
# un-augmented feature dump in this notebook, so that file is replaced.
# `with` closes the handle as soon as the dump finishes.
with open("cache/train_df_all_labels.pik","wb") as _fout:
    pickle.dump(train_df, _fout, protocol=pickle.HIGHEST_PROTOCOL)
