# Extracting training and testing patches from the spectrograms

In [31]:
%matplotlib inline
import matplotlib.pyplot as plt
import csv
import os
import numpy as np
import collections

import scipy.io.wavfile
from scipy.io import loadmat
from sklearn.cross_validation import train_test_split
import collections

import sys, os
sys.path.append(os.path.expanduser('~/projects/engaged_hackathon/'))
from engaged.features import frequency
from scipy.ndimage.interpolation import zoom
from time import time


In [8]:
# getting a list of all the files
base_path = '/home/michael/projects/engaged_hackathon_data/raw_data/one_minute_files'
files = os.listdir(base_path + '/detection_challenge')
files = [xx.split('.')[0] for xx in files if 'sampled' not in xx]
print len(files)

362


## For each file, extract 1D patches at random locations

Ensure that the same number of positive and negative patches are extracted from each file!

In [9]:
def choose_locations(idxs, maximum, balance=False):
    """
    Given a binary array, the function returns a list of positive and negative locations sampled
    at random from the list.
    Returns 'maximum' locations unless there there are fewer than maximum locations in
    idxs, in which case all are returned
    If balance is true then the classes are balanced to the smaller class size to
    ensure an equal number of each
    """
    false_idxs = np.where(idxs==0)[0]
    true_idxs = np.where(idxs==1)[0]
    
    if false_idxs.shape[0] > maximum and maximum is not None:
        false_idxs = np.random.choice(false_idxs, maximum, replace=False)
        
    if true_idxs.shape[0] > maximum and maximum is not None:
        true_idxs = np.random.choice(true_idxs, maximum, replace=False)
        
    if false_idxs.shape[0] != true_idxs.shape[0] and balance:
        raise Exception("Not implemented!")
        
    return true_idxs, false_idxs
    

def extract_1d_patches(array, locations, hww):
    """
    Extract vertical patches from the array, at the locations given.
    Each slice has a half window width hww
    
    Returns an array of shape:
    (len(locations), array.shape[0], hww*2+1)
    """
    # pad the array to account for overspill
    offset_idxs_np = np.array(locations) + hww
    extra1 = np.tile(array[:, 0], (hww, 1)).T
    extra2 = np.tile(array[:, -1], (hww, 1)).T
    a_temp = np.hstack((extra1, array, extra2))
    
    # set up the array of index locations to extract from
    idxs = [offset_idxs_np]
    for offset in range(1, hww+1):
        idxs.insert(0, offset_idxs_np-offset)
        idxs.append(offset_idxs_np+offset)
    new_idx = np.vstack(idxs).T.ravel()
    
    # extract the patches and do the appropriate reshapgin
    new_shape = (array.shape[0], offset_idxs_np.shape[0], hww*2 + 1)
    to_return = a_temp[:, new_idx].reshape(new_shape).transpose((1, 0, 2))
    return to_return


max_from_each_file = 500
spec_path = '/home/michael/projects/engaged_hackathon_data/detection/spectrograms/'


def process_file(inputs):
    
    count, fname = inputs
    
    # load in wav and convert to spectrogram
    spec = scipy.io.loadmat(spec_path + fname + '_smallspec.mat')['spectrogram']
        
    # load in ground truth
    gt = loadmat(base_path + '/detection_challenge/' + fname + '.mat')
    
    # convert the labels to the sampling rate of the spectrogram
    zoom_factor = float(spec.shape[1]) / float(gt['biotic'][0].shape[0])
    gt_biotic = zoom(gt['biotic'][0], zoom_factor).astype(bool)
    gt_anthrop = zoom(gt['anthropogenic'][0], zoom_factor).astype(bool)
    
    # choosing where to extract from..
    tic = time()    
    true_idxs, false_idxs = choose_locations(gt_biotic, max_from_each_file)
    idxs = np.hstack([true_idxs, false_idxs])
    labels = np.hstack(
        [np.ones(true_idxs.shape), np.zeros(false_idxs.shape)])
    
    # let's group the X here and create a suitable Y vector...
    patches = extract_1d_patches(spec, idxs, hww=9)

    if count % 5 == 0:
        print count,
    
    return patches, labels

print len(files)

XY = map(process_file, enumerate(files))
tX, tY = zip(*XY)

362
0 5 10 15 20 25 30 35 40 45 50 55 60 65 70 75 80 85 90 95 100 105 110 115 120 125 130 135 140 145 150 155 160 165 170 175 180 185 190 195 200 205 210 215 220 225 230 235 240 245 250 255 260 265 270 275 280 285 290 295 300 305 310 315 320 325 330 335 340 345 350 355 360


# Train/test split and combining data

The split is done at the file level. In the future, I should really do this at location level.

In [39]:
def balance_classes(X, Y):
    """
    Returns X, Y, where there are equal numbers of Y==0 as Y==1.
    """
    positives = np.where(Y == 0)[0]
    negatives = np.where(Y == 1)[0]
    max_examples = min(len(positives), len(negatives))
    
    if len(positives) > max_examples:
        positives = np.random.choice(positives, max_examples, replace=False)
    if len(negatives) > max_examples:
        negatives = np.random.choice(negatives, max_examples, replace=False)
        
    new_X = np.vstack((X[negatives, :], X[positives, :]))
    new_Y = np.hstack((Y[negatives], Y[positives])) 
    
    # return the output in a random order
    shuffle_idxs = np.random.permutation(new_X.shape[0])
    
    return new_X[shuffle_idxs, :], new_Y[shuffle_idxs]
    

# Doing the full train/test split
train_files, test_files = train_test_split(
    range(len(files)), random_state=0, train_size=0.7, test_size=0.3)

# Extracting the data for each side of the split
data = {}
data['X_train'] = np.vstack([tX[idx] for idx in train_files])
data['y_train'] = np.hstack([tY[idx] for idx in train_files]).astype(np.int32).ravel()
data['X_test'] = np.vstack([tX[idx] for idx in test_files])
data['y_test'] = np.hstack([tY[idx] for idx in test_files]).astype(np.int32).ravel()

# Ensuring data is the correct shape
for key in ['X_train', 'X_test']:
    tshape = data[key].shape
    data[key] = data[key].reshape((tshape[0], -1, tshape[1], tshape[2]))
    data[key] = data[key].astype(np.float32)

# balance the classes...
for key in ['train', 'test']:
    data['X_' + key], data['y_' + key] = balance_classes(data['X_' + key], data['y_' + key])
    
# Print details to screen
for key, val in data.iteritems():
    print key, val.shape, val.dtype, val.mean()

# save full dataset to disk
savedir = '/home/michael/projects/engaged_hackathon_data/detection/train_test_patches/'
scipy.io.savemat(savedir + 'full.mat', data, do_compression=True, oned_as='column')

# Also save which are the training and test files
train_filenames = [files[idx] for idx in train_files]
test_filenames = [files[idx] for idx in test_files]
D = dict(train_files=train_filenames, test_files=test_filenames, train_file_idxs=train_files, test_file_idxs=test_files)
scipy.io.savemat(savedir + 'split.mat', D)

X_test (18286, 1, 75, 19) float32 0.170399
X_train (50836, 1, 75, 19) float32 0.175369
y_train (50836,) int32 0.5
y_test (18286,) int32 0.5


## Create a subset of the train/test dataset and save to disk 

In [37]:
# Take a subset of the full train/test split and save to disk
maxi = {}
maxi['train'] = 4000
maxi['test'] = 2000

small_data = {}

for key in ['train', 'test']:
    N = data['X_' + key].shape[0]
    idxs = np.random.permutation(N)[:maxi[key]]
    small_data['X_' + key] = data['X_' + key][idxs, :]
    small_data['y_' + key] = data['y_' + key][idxs]

#     data['X_' + key], data['y_' + key] = balance_classes(data['X_' + key], data['y_' + key])

for key, val in small_data.iteritems():
    print key, val.shape
    
scipy.io.savemat(savedir + 'small.mat', small_data, do_compression=True, oned_as='column')

X_test (2000, 1, 75, 19)
X_train (4000, 1, 75, 19)
y_train (4000,)
y_test (2000,)
