# One minute files feature extraction

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import csv
import os
import numpy as np
import collections

import scipy.io.wavfile
from scipy.io import loadmat
import collections

import sys, os
sys.path.append(os.path.expanduser('~/projects/engaged_hackathon/'))
from engaged.features import features as engaged_features
from engaged.features import frequency

In [2]:
# getting a list of all the files
base_path = '/home/michael/projects/engaged_hackathon_data/raw_data/one_minute_files'
files = os.listdir(base_path + '/detection_challenge')
files = [xx.split('.')[0] for xx in files if 'sampled' not in xx]

spectrogram_parameters = {
    'nfft': 1024,
    'window_width': 0.03,
    'overlap': 0.01,
    }

print len(files)

362


## Creating and saving spectrograms

For each input file, compute and save a spectrogram

In [3]:
spec_path = '/home/michael/projects/engaged_hackathon_data/detection/spectrograms/'
import skimage 


for count, fname in enumerate(files):
        
    # load in wav and convert to spectrogram
    sr, wav = scipy.io.wavfile.read(base_path + '/25_Random/' + fname + '.wav')  
    spec, spec_sampl_rate = frequency.spectrogram(wav, sr, **spectrogram_parameters)
    
    # do some smoorthing and denoising  
    spec = skimage.filters.gaussian_filter(spec, 1.0)
    spec -= np.median(spec, axis=1)[:, None]
    spec[spec<0] = 0
    
    # save to disk
    savepath = spec_path + fname + '_spec.mat'
    D = dict(spectrogram=spec, sample_rate=spec_sampl_rate)
    scipy.io.savemat(savepath, D)
    
    for height in [75, 128, 256]:
        
        # do a reduced size spectrogram
        factor = (float(height)/float(spec.shape[0]))
        new_width = int(factor * float(spec.shape[1]))
        output_shape = (height, new_width)
        small_spec = skimage.transform.resize(spec, output_shape)

        # save to disk
        savepath = spec_path + fname + '_spec_' + str(height) + '.mat'
        D = dict(spectrogram=small_spec, sample_rate=spec_sampl_rate * factor)
        D['wav_sample_rate'] = sr
        D['wav_shape'] = wav.shape
        scipy.io.savemat(savepath, D)
        
    if count % 10 == 0:
        print count,
    

0 10 20 30 40 50 60 70 80 90 100 110 120 130 140 150 160 170 180 190 200 210 220 230 240 250 260 270 280 290 300 310 320 330 340 350 360


## Compare loading times vs generation times

In [1]:
from time import time

number = 10

tic = time()
for fname in files[:number]:
    sr, wav = scipy.io.wavfile.read(base_path + '/25_Random/' + fname + '.wav')  
    spec, spec_sampl_rate = frequency.spectrogram(wav, sr, **spectrogram_parameters)

print "Generation takes %fs" % ((time() - tic) / number)

tic = time()
for fname in files[:number]:
    spec = scipy.io.loadmat(spec_path + fname + '_spec.mat')

print "Loading big takes %fs" %  ((time() - tic) / number)

tic = time()
for fname in files[:number]:
    spec = scipy.io.loadmat(spec_path + fname + '_smallspec.mat')
    
print "Loading small takes %fs" %  ((time() - tic) / number)

print wav.shape

NameError: name 'files' is not defined

## Choosing locations and saving to disk

For each file, choose a list of where to extract features from

In [16]:
# for each file, compute features for each second and assign to biotic/not_biotic lists
# we will worry about anthropogenic later
# max_files = 50

# maximum num slices of each class to take from each file
max_from_each_file = 2000
# files_to_use = files[:max_files]

features_savepath = '/home/michael/projects/engaged_hackathon_data/detection/biotic_anthrop/'


def choose_locations(idxs, maximum, balance=False):
    """
    Given a binary array, the function returns a list of positive and negative locations sampled
    at random from the list.
    Returns 'maximum' locations unless there there are fewer than maximum locations in
    idxs, in which case all are returned
    If balance is true then the classes are balanced to the smaller class size to
    ensure an equal number of each
    """
    false_idxs = np.where(idxs==0)[0]
    true_idxs = np.where(idxs==1)[0]
    
    if false_idxs.shape[0] > maximum and maximum is not None:
        false_idxs = np.random.choice(false_idxs, maximum, replace=False)
        
    if true_idxs.shape[0] > maximum and maximum is not None:
        true_idxs = np.random.choice(true_idxs, maximum, replace=False)
        
    if false_idxs.shape[0] != true_idxs.shape[0] and balance:
        raise Exception("Not implemented!")
        
    return true_idxs, false_idxs
    

for count, fname in enumerate(files):
        
    # load in ground truth - these give positions in terms of the wav file...
    gt = loadmat(base_path + '/detection_challenge/' + fname + '.mat')
      
    # sample positive and negative locations in the file for each sound type
    for soundtype in ['biotic', 'anthropogenic']:
        
        true_idxs, false_idxs = choose_locations(gt[soundtype][0], max_from_each_file)
        idxs = np.hstack([true_idxs, false_idxs])
        labels = np.hstack([np.ones(true_idxs.shape), np.zeros(false_idxs.shape)])
        
        D = dict(true_idxs=true_idxs, false_idxs=false_idxs, 
                 idxs=idxs, labels=labels, sample_rate=gt['sample_rate'])
        savepath = base_path + '/detection_challenge/' + fname + \
            '_sampled_' + soundtype + '.mat'
        scipy.io.savemat(savepath, D)

    if count % 20 == 0:
        print count,

0 20 40 60 80 100 120 140 160 180 200 220 240 260 280 300 320 340 360


In [62]:
def extract_1d_patches(array, locations, hww):
    """
    Extract vertical patches from the array, at the locations given.
    Each slice has a half window width hww
    
    Returns an array of shape:
    (len(locations), array.shape[0], hww*2+1)
    """
    # pad the array to account for overspill
    offset_idxs_np = np.array(locations).ravel() + hww
    extra1 = np.tile(array[:, 0], (hww, 1)).T
    extra2 = np.tile(array[:, -1], (hww, 1)).T
    a_temp = np.hstack((extra1, array, extra2))
    
    # set up the array of index locations to extract from
    idxs = [offset_idxs_np]
    for offset in range(1, hww+1):
        idxs.insert(0, offset_idxs_np-offset)
        idxs.append(offset_idxs_np+offset)
    new_idx = np.vstack(idxs).T.ravel()
    
    # extract the patches and do the appropriate reshapgin
    
    new_shape = (array.shape[0], offset_idxs_np.shape[0], hww*2 + 1)
    to_return = a_temp[:, new_idx].reshape(new_shape).transpose((1, 0, 2))
    return to_return

In [None]:
# Extracting patches from each spectrogram


In [2]:
# Combining all strips into training and test sets...
# Then each running script can just load in a set of train/test patches

# Do train/test split and combine training data
from sklearn.cross_validation import train_test_split

train_files, test_files = train_test_split(
    range(len(files_to_use)), random_state=0, train_size=0.1, test_size=0.05)

tX, tY = zip(*XY)

data = {}
data['X_train'] = np.vstack([tX[idx] for idx in train_files])
data['y_train'] = np.hstack([tY[idx] for idx in train_files]).astype(np.int32)
data['X_test'] = np.vstack([tX[idx] for idx in test_files])
data['y_test'] = np.hstack([tY[idx] for idx in test_files]).astype(np.int32)

for key in ['X_train', 'X_test']:
    tshape = data[key].shape
    data[key] = data[key].reshape((tshape[0], -1, tshape[1], tshape[2]))
    data[key] = data[key].astype(np.float32)

def balance_classes(X, Y):
    positives = np.where(Y == 0)[0]
    negatives = np.where(Y == 1)[0]
    max_examples = min(len(positives), len(negatives))
    
    if len(positives) > max_examples:
        positives = np.random.choice(positives, max_examples, replace=False)
    if len(negatives) > max_examples:
        negatives = np.random.choice(negatives, max_examples, replace=False)
        
    X = np.vstack((X[negatives, :], X[positives, :]))
    new_Y = np.hstack((Y[negatives], Y[positives])) 
    
    shuffle_idxs = np.random.permutation(X.shape[0])
    
    return X[shuffle_idxs, :], new_Y[shuffle_idxs]
    
# balance the classes...
for key in ['train', 'test']:
    data['X_' + key], data['y_' + key] = balance_classes(data['X_' + key], data['y_' + key])
    
for key, val in data.iteritems():
    print key, val.shape, val.dtype
    print val.min(), val.max(), val.sum()/float(val.shape[0])

for idx in range(10):
    plt.subplot(3, 4, idx+1)
    plt.imshow(data['X_train'][idx, 0, :, :])


NameError: name 'files_to_use' is not defined