# Data Preparation

Download bounding box data from here: https://www.kaggle.com/c/the-nature-conservancy-fisheries-monitoring/discussion/25902

Notes: 

After running this notebook, you can verify the file counts with `ls | wc -l` (for example, `ls train/*/*.jpg | wc -l` gives the number of images across all of the training subdirectories).

## Imports and Configuration

In [1]:
import sys, os
sys.path.append(os.path.abspath('../util'))

# core imports
from keras_tf_util import *
from keras.applications.vgg19 import VGG19, preprocess_input, decode_predictions

Using TensorFlow backend.


In [2]:
# configure various jupyter defaults
%matplotlib notebook
plt.rcParams['figure.figsize'] = (9,6)

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# configure autoreload to automatically reload modules when files are changed
%load_ext autoreload
%autoreload 2

## Setup

In [3]:
# Root directory that holds the downloaded zip archives and every directory
# this notebook generates (train/, valid/, test_stg1/, sample/).
current_dir = os.getcwd()
DATA_HOME_DIR = current_dir + '/data/'

# Number of images moved out of train/ to form the validation set.
n_validation_files = 500
# Number of images copied into sample/train and sample/valid respectively.
n_sample_train_files = 400
n_sample_val_files = 200

# When True, the cells below delete the generated directories and rebuild
# them from the zip archives; set to False to leave the data untouched.
rebuild_from_data_download = True

# Seed NumPy's global RNG so the np.random.permutation shuffles in the cells
# below pick the same validation/sample files on every Restart & Run All.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

## Unzip datasets

In [4]:
if rebuild_from_data_download:
    # cleanup: remove every directory a previous run may have left behind.
    # 'valid' and 'test_stg1' are the names the later cells actually read and
    # write; 'val' and 'test' are still removed in case an older layout exists.
    # Without this, a rebuild would mix stale validation/test files with the
    # freshly extracted ones.
    for stale_dir in ('train', 'test_stg1', 'test', 'valid', 'val', 'sample'):
        stale_path = DATA_HOME_DIR + stale_dir
        if os.path.exists(stale_path):
            shutil.rmtree(stale_path)

    # unzip training and test datasets into DATA_HOME_DIR
    for archive_name in ('train.zip', 'test_stg1.zip'):
        with zipfile.ZipFile(DATA_HOME_DIR + archive_name, 'r') as zip_ref:
            zip_ref.extractall(DATA_HOME_DIR)

## Create validation, test, and sample directories

In [5]:
if rebuild_from_data_download == True:
    # names of the entries directly under train/ (one per image class directory)
    class_names = [os.path.basename(class_dir) for class_dir in glob(DATA_HOME_DIR + 'train/*')]

    # validation: one sub-directory per training class
    # (make_dir is a project helper; presumably it creates the path -- verify in keras_tf_util)
    for class_name in class_names:
        make_dir(DATA_HOME_DIR + 'valid/' + class_name)

    # test: a single 'unknown' sub-directory
    make_dir(DATA_HOME_DIR + 'test_stg1/unknown')

    # sample: the same class layout again, under sample/train and sample/valid
    for class_name in class_names:
        make_dir(DATA_HOME_DIR + 'sample/train/' + class_name)
        make_dir(DATA_HOME_DIR + 'sample/valid/' + class_name)

## Move validation and test data into appropriate sub-directories

In [6]:
if rebuild_from_data_download:
    # move n_validation_files randomly chosen images from TRAINING into VALIDATION
    train_images = glob(DATA_HOME_DIR + 'train/*/*.jpg')
    if n_validation_files > len(train_images):
        # fail before any file is moved instead of hitting an IndexError
        # half-way through and leaving a partially split dataset behind
        raise ValueError(
            'n_validation_files ({0}) exceeds the {1} training images found'.format(
                n_validation_files, len(train_images)))

    for image_path in np.random.permutation(train_images)[:n_validation_files]:
        # keep the "<class dir>/<file name>" tail so the image lands in the
        # matching class directory under valid/
        class_and_file = '{0}/{1}'.format(
            os.path.basename(os.path.dirname(image_path)), os.path.basename(image_path))
        os.rename(image_path, DATA_HOME_DIR + 'valid/' + class_and_file)

    # move TEST images into /unknown subdirectory
    unknown_dir = DATA_HOME_DIR + 'test_stg1/unknown'
    for test_path in glob(DATA_HOME_DIR + 'test_stg1/*'):
        # the glob also matches the unknown/ directory itself; never try to
        # move it into itself
        if os.path.abspath(test_path) == os.path.abspath(unknown_dir):
            continue
        shutil.move(test_path, unknown_dir)
    

## Copy subset of training and validation data into /sample

In [7]:
def _copy_random_sample(source_pattern, dest_root, n_files):
    """Copy `n_files` randomly chosen files matching `source_pattern` under `dest_root`.

    Each file keeps its "<parent directory>/<file name>" tail, so it is written
    to `dest_root + '<parent directory>/<file name>'`. The destination
    sub-directories must already exist (shutil.copyfile does not create them).

    Raises:
        ValueError: if fewer than `n_files` files match `source_pattern`;
            raised before anything is copied.
    """
    candidates = glob(source_pattern)
    if n_files > len(candidates):
        raise ValueError(
            'requested {0} files but only {1} match {2}'.format(
                n_files, len(candidates), source_pattern))

    for source_path in np.random.permutation(candidates)[:n_files]:
        relative_path = '{0}/{1}'.format(
            os.path.basename(os.path.dirname(source_path)), os.path.basename(source_path))
        shutil.copyfile(source_path, dest_root + relative_path)


if rebuild_from_data_download:
    # copy n_sample_train_files from TRAINING into SAMPLE/TRAIN
    _copy_random_sample(
        DATA_HOME_DIR + 'train/*/*.jpg', DATA_HOME_DIR + 'sample/train/', n_sample_train_files)

    # copy n_sample_val_files from VALIDATION into SAMPLE/VALID
    _copy_random_sample(
        DATA_HOME_DIR + 'valid/*/*.jpg', DATA_HOME_DIR + 'sample/valid/', n_sample_val_files)