# Extracting training and testing patches from the spectrograms

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import csv
import os
import numpy as np
import collections

import scipy.io.wavfile
from scipy.io import loadmat
from sklearn.cross_validation import train_test_split
import collections

import sys, os
sys.path.append(os.path.expanduser('~/projects/engaged_hackathon/'))
from engaged.features import frequency
from scipy.ndimage.interpolation import zoom
from time import time
import cPickle as pickle


In [2]:
# getting a list of all the files
base_path = '/home/michael/projects/engaged_hackathon_data/raw_data/one_minute_files'
files = os.listdir(base_path + '/detection_challenge')
files = [xx.split('.')[0] for xx in files if 'sampled' not in xx]
print len(files)

362


In [3]:
# Doing the full train/test split
savedir = '/home/michael/projects/engaged_hackathon_data/detection/train_test_patches/'

train_files, test_files = train_test_split(
    range(len(files)), random_state=0, train_size=0.7, test_size=0.3)

# Also save which are the training and test files
train_filenames = [files[idx] for idx in train_files]
test_filenames = [files[idx] for idx in test_files]

D = dict(train_files=train_filenames, test_files=test_filenames, 
         train_file_idxs=train_files, test_file_idxs=test_files)
print D['train_files'][:2]
scipy.io.savemat(savedir + 'split.mat', D)

['W112NN-13548_20130708_2146', 'W84LA-013548_20130627_1125']


## For each file, extract 1D patches at random locations

Ensure that the same number of positive and negative patches are extracted from each file!

In [4]:
# setting options
should_I_balance = False
max_from_each_file = 2000
savename = 'unbalanced_256'
spec_name = '_spec_256.mat'
should_I_subsample = True
max_samples = {'train': 100000, 'test': 30000}
# del tX, tY, XY

In [5]:
np.random.seed(10)

def choose_locations(idxs, maximum, random_sample=True, balance=False):
    """
    Given a binary array, the function returns a list of positive and negative locations sampled
    at random from the list.
    Returns 'maximum' locations unless there there are fewer than maximum locations in
    idxs, in which case all are returned
    If balance is true then the classes are balanced to the smaller class size to
    ensure an equal number of each
    """
    
    if random_sample:
        idxs = idxs.astype(int)
        
        to_ignore = np.random.choice(
            idxs.shape[0], idxs.shape[0]-maximum, replace=False)
        idxs[to_ignore] = 2
        
    false_idxs = np.where(idxs==0)[0]
    true_idxs = np.where(idxs==1)[0]
        
    if false_idxs.shape[0] > maximum and maximum is not None and balance:
        print "Sholt"
        false_idxs = np.random.choice(false_idxs, maximum, replace=False)
        
    if true_idxs.shape[0] > maximum and maximum is not None and balance:
        true_idxs = np.random.choice(true_idxs, maximum, replace=False)
        
    if false_idxs.shape[0] != true_idxs.shape[0] and balance:
        raise Exception("Not implemented!")
        
    return true_idxs, false_idxs
    

def extract_1d_patches(array, locations, hww):
    """
    Extract vertical patches from the array, at the locations given.
    Each slice has a half window width hww
    
    Returns an array of shape:
    (len(locations), array.shape[0], hww*2+1)
    """
    # pad the array to account for overspill
    offset_idxs_np = np.array(locations) + hww
    extra1 = np.tile(array[:, 0], (hww, 1)).T
    extra2 = np.tile(array[:, -1], (hww, 1)).T
    a_temp = np.hstack((extra1, array, extra2))
    
    # set up the array of index locations to extract from
    idxs = [offset_idxs_np]
    for offset in range(1, hww+1):
        idxs.insert(0, offset_idxs_np-offset)
        idxs.append(offset_idxs_np+offset)
    new_idx = np.vstack(idxs).T.ravel()
    
    # extract the patches and do the appropriate reshapgin
    new_shape = (array.shape[0], offset_idxs_np.shape[0], hww*2 + 1)
    to_return = a_temp[:, new_idx].reshape(new_shape).transpose((1, 0, 2))
    return to_return

spec_path = '/home/michael/projects/engaged_hackathon_data/detection/spectrograms/'


def process_file(inputs):
    
    count, fname = inputs
    
    # load in wav and convert to spectrogram
    spec = scipy.io.loadmat(spec_path + fname + spec_name)['spectrogram']
        
    # load in ground truth
    gt = loadmat(base_path + '/detection_challenge/' + fname + '.mat')
    
    # convert the labels to the sampling rate of the spectrogram
    zoom_factor = float(spec.shape[1]) / float(gt['biotic'][0].shape[0])
    gt_biotic = zoom(gt['biotic'][0], zoom_factor).astype(bool)
    gt_anthrop = zoom(gt['anthropogenic'][0], zoom_factor).astype(bool)
    
    # choosing where to extract from..
    tic = time()    
    true_idxs, false_idxs = choose_locations(gt_biotic, max_from_each_file)
    idxs = np.hstack([true_idxs, false_idxs])
    labels = np.hstack(
        [np.ones(true_idxs.shape), np.zeros(false_idxs.shape)]).astype(np.int32)
    
    # let's group the X here and create a suitable Y vector...
    patches = extract_1d_patches(spec, idxs, hww=9).astype(np.float32)

    if count % 5 == 0:
        print count,
    
    return patches, labels

print len(files)


362


In [6]:
XY = map(process_file, enumerate(train_filenames))
tX, tY = zip(*XY)

del XY
data = {}
data['X_train'] = np.vstack(tX)
data['y_train'] = np.hstack(tY).astype(np.int32).ravel()
del tX, tY



0 5 10 15 20 25 30 35 40 45 50 55 60 65 70 75 80 85 90 95 100 105 110 115 120 125 130 135 140 145 150 155 160 165 170 175 180 185 190 195 200 205 210 215 220 225 230 235 240 245 250




In [16]:
print data['X_train'].shape
print data['y_train'].shape

if should_I_balance:
    # balance the classes...
    for key in ['train']:
        data['X_' + key], data['y_' + key] = balance_classes(data['X_' + key], data['y_' + key])

if should_I_subsample:
    for key in ['train']:
        data['X_' + key], data['y_' + key] = subsample_data(data['X_' + key], data['y_' + key], max_samples[key])

print data['X_train'].shape
print data['y_train'].shape


(506000, 1, 256, 19)
(506000,)
(100000, 1, 256, 19)
(100000,)


In [17]:
print max_samples['train']

100000


In [18]:
XY = map(process_file, enumerate(test_filenames))
tX, tY = zip(*XY)
data['X_test'] = np.vstack(tX)
data['y_test'] = np.hstack(tY).astype(np.int32).ravel()
del tX, tY


if should_I_balance:
    # balance the classes...
    for key in ['test']:
        data['X_' + key], data['y_' + key] = balance_classes(data['X_' + key], data['y_' + key])

if should_I_subsample:
    for key in ['test']:
        data['X_' + key], data['y_' + key] = subsample_data(data['X_' + key], data['y_' + key], max_samples[key])
     

0 5 10 15 20 25 30 35 40 45 50 55 60 65 70 75 80 85 90 95 100 105


In [7]:
# print np.sum([xx.shape[0] for xx in tX])
# print np.sum([xx for xx in tY])
# del XY

# Train/test split and combining data

The split is done at the file level. In the future, I should really do this at location level.

In [13]:
def balance_classes(X, Y):
    """
    Returns X, Y, where there are equal numbers of Y==0 as Y==1.
    """
    positives = np.where(Y == 0)[0]
    negatives = np.where(Y == 1)[0]
    max_examples = min(len(positives), len(negatives))
    
    if len(positives) > max_examples:
        positives = np.random.choice(positives, max_examples, replace=False)
    if len(negatives) > max_examples:
        negatives = np.random.choice(negatives, max_examples, replace=False)
        
    new_X = np.vstack((X[negatives, :], X[positives, :]))
    new_Y = np.hstack((Y[negatives], Y[positives])) 
    
    # return the output in a random order
    shuffle_idxs = np.random.permutation(new_X.shape[0])
    
    return new_X[shuffle_idxs, :], new_Y[shuffle_idxs]

def subsample_data(X, Y, num):
    if num > X.shape[0]:
        return X, Y
    else:
        to_use = np.random.choice(X.shape[0], num, replace=False)
        return X[to_use, :], Y[to_use]

# Extracting the data for each side of the split
# data = {}
# data['X_train'] = np.vstack([tX[idx] for idx in train_files])
# data['y_train'] = np.hstack([tY[idx] for idx in train_files]).astype(np.int32).ravel()
# data['X_test'] = np.vstack([tX[idx] for idx in test_files])
# data['y_test'] = np.hstack([tY[idx] for idx in test_files]).astype(np.int32).ravel()

# Ensuring data is the correct shape
for key in ['X_train', 'X_test']:
    tshape = data[key].shape
    data[key] = data[key].reshape((tshape[0], -1, tshape[1], tshape[2]))
    data[key] = data[key].astype(np.float32)

KeyError: 'X_test'

In [19]:
   
# Print details to screen
for key, val in data.iteritems():
    print key, val.shape, val.dtype, val.mean()

X_test (30000, 256, 19) float32 0.158177
X_train (100000, 1, 256, 19) float32 0.164131
y_train (100000,) int32 0.12489
y_test (30000,) int32 0.102133333333


In [20]:
if 'tX' in vars(): del tX
if 'tY' in vars(): del tY
if 'XY' in vars(): del XY

In [21]:
# splitting up to make the saving possible (https://github.com/numpy/numpy/issues/2396)
data2 = {}
data2['y_train'] = data['y_train']
data2['y_test'] = data['y_test']
data2['X_test'] = data['X_test']

num_secs = np.ceil((data['X_train'].size * 4) / float((2**(32-1))))
data2['X_train_split'] = np.array_split(data['X_train'], num_secs, axis=0)

In [22]:
# save full dataset to disk

print savedir + savename + '.pkl'

# with open(savedir + savename + '.pkl', 'w') as f:
#     pickle.dump(data, f, -1)
scipy.io.savemat(savedir + savename, data2, do_compression=False, oned_as='column')

/home/michael/projects/engaged_hackathon_data/detection/train_test_patches/unbalanced_256.pkl


In [23]:
print data['X_test'].shape

(30000, 256, 19)


In [None]:
print 378488*128*19*(32/8) / 1e9

## Create a subset of the train/test dataset and save to disk 

In [24]:
# Take a subset of the full train/test split and save to disk
maxi = {}
maxi['train'] = 4000
maxi['test'] = 2000

small_data = {}

for key in ['train', 'test']:
    N = data['X_' + key].shape[0]
    idxs = np.random.permutation(N)[:maxi[key]]
    small_data['X_' + key] = data['X_' + key][idxs, :]
    small_data['y_' + key] = data['y_' + key][idxs]

#     data['X_' + key], data['y_' + key] = balance_classes(data['X_' + key], data['y_' + key])

for key, val in small_data.iteritems():
    print key, val.shape
    
scipy.io.savemat(savedir + savename + '_small.mat', small_data, do_compression=True, oned_as='column')

X_test (2000, 256, 19)
X_train (4000, 1, 256, 19)
y_train (4000,)
y_test (2000,)


In [8]:
print float(6) / float(4)
print (6 + 4 // 2) // 4

1.5
2
