# Creating detection dataset

A one-off notebook that creates a label for each moment of the one-minute dataset, indicating whether it contains human noise, animal noise, or both.

In [97]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import matplotlib.pyplot as plt

import os
import sys
import cPickle as pickle

from data_helpers import load_annotations

# Output directory: each annotation file is pickled here under its original
# file name, and the split YAML is written to the sibling '../splits/' folder.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
where_to_save = '/media/michael/Seagate/engage/alison_data/golden_set/extracted/annotations/'
# Input directory of label files; its listing drives both the annotation
# caching loop and the train/test split.
base_path = '/media/michael/Seagate/engage/alison_data/golden_set/labels/Golden/'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [59]:
# Cache the annotations: for every label file in base_path, load it once and
# pickle the result into where_to_save under the same file name.
for fname in os.listdir(base_path):
    savename = where_to_save + fname
    if os.path.exists(savename):
        # already cached on a previous run
        continue

    # load the annotation
    annots, wav, sample_rate = load_annotations(fname)

    # save to disk; protocol -1 selects the highest (binary) pickle protocol,
    # so the file must be opened in binary mode
    with open(savename, 'wb') as f:
        pickle.dump((annots, wav, sample_rate), f, -1)

In [99]:
# Creating a data split
import collections
from sklearn.cross_validation import train_test_split, LabelKFold
import numpy as np
import yaml

# Seed NumPy's global RNG up front.
# NOTE(review): no cell visible here draws from it -- confirm it is still needed.
np.random.seed(10)

# Recording names are the label file names with the '-sceneRect.csv' suffix
# removed. os.listdir() returns entries in arbitrary order, so sort the names
# to keep the generated split file identical from run to run.
files = sorted(xx.replace('-sceneRect.csv', '') for xx in os.listdir(base_path))

# dict mapping locations to filenames; the location key is the text before the
# first '-' in the recording name
loc_to_fname = collections.defaultdict(list)
for fname in files:
    loc_to_fname[fname.split('-')[0]].append(fname)

In [100]:
# Build the cross-validation folds over *locations* rather than files: each
# location is passed to LabelKFold as its own label, and the train/test index
# sets it yields are mapped back to locations and then to their files.
splits = []

# A sorted plain list: indexable by position (dict.keys() is a non-indexable
# view on Python 3) and in a stable order regardless of dict iteration order.
locs = sorted(loc_to_fname)

# n_folds is spelled out so it visibly matches the '3-fold' output file name.
for train_idx, test_idx in LabelKFold(locs, n_folds=3):
    train_locs = [locs[xx] for xx in train_idx]
    test_locs = [locs[xx] for xx in test_idx]

    # expand each location into the recordings that belong to it
    train_files = [xx for loc in train_locs for xx in loc_to_fname[loc]]
    test_files = [xx for loc in test_locs for xx in loc_to_fname[loc]]

    split = dict(train_files=train_files, test_files=test_files, train_locs=train_locs, test_locs=test_locs)
    splits.append(split)

# The splits live in a 'splits' folder next to the annotations folder; create
# it on first use so the dump below cannot fail on a missing directory.
split_dir = os.path.realpath(os.path.join(where_to_save, '..', 'splits'))
if not os.path.isdir(split_dir):
    os.makedirs(split_dir)

# One YAML list entry per fold, written in block style.
with open(os.path.join(split_dir, '3-fold_split.yaml'), 'w') as f:
    yaml.dump(splits, f, default_flow_style=False)
