In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import time

import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

sys.path.append(os.path.join(os.environ['REPO_DIR'], 'utilities'))
from utilities2015 import *
from metadata import *
from data_manager import *
from learning_utilities import *

because the backend has already been chosen;
matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.



Setting environment for AWS compute node


No vtk
File does not exist: /shared/CSHL_data_processed/MD635/MD635_anchor.txt
File does not exist: /shared/CSHL_data_processed/MD635/MD635_sorted_filenames.txt
File does not exist: /shared/CSHL_data_processed/MD635/MD635_cropbox.txt
File does not exist: /shared/CSHL_data_processed/MD635/MD635_cropbox.txt


In [3]:
dataset_settings

Unnamed: 0_level_0,network_model,stain,margins,num_sample_per_class,stacks
dataset_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
20,Inception-BN,nissl,200/500,1000,MD585
21,Inception-BN,nissl,200/500,1000,MD589
22,Inception-BN,nissl,200/500,1000,MD594
99,cell,nissl,500,1000,MD589


In [4]:
dataset_id = 99
dataset_properties = dataset_settings.loc[dataset_id]

num_samples_per_label = dataset_properties['num_sample_per_class']
stacks = dataset_properties['stacks'].split('/')

In [5]:
structures_to_sample = all_known_structures

negative_labels_to_sample = [s + '_negative' for s in structures_to_sample]

margins_to_sample = map(int, str(dataset_properties['margins']).split('/'))

surround_positive_labels_to_sample = [convert_to_surround_name(s, margin=m, suffix=surr_l) 
                             for m in margins_to_sample
                             for s in structures_to_sample 
                             for surr_l in structures_to_sample
                             if surr_l != s]

surround_noclass_labels_to_sample = [convert_to_surround_name(s, margin=m, suffix='noclass') 
                             for m in margins_to_sample
                             for s in structures_to_sample]

labels_to_sample = structures_to_sample + negative_labels_to_sample + \
surround_positive_labels_to_sample + surround_noclass_labels_to_sample + \
['noclass']

In [7]:
from cell_utilities import *

In [22]:
def sample_regions_one_section_by_label(stack, sec, margins_to_sample, labeled_contours, num_sample_per_label=10):
    """
    Sample region addresses in a particular section and associate them with different labels (positive, surround, negative, noclass, foreground, background, etc.).
    
    Args:
        region_contours (list of nx2 arrays): list of contour vertices.
        margins_to_sample (list of ints):
        labeled_contours (dict of nx2 arrays): {label: contour vertices}
        
    Returns:
        dict of 3-tuples (stack, section, region_index)
        
    """

    training_addresses = {}

    if is_invalid(stack=stack, sec=sec):
        return

    region_contours = load_cell_classifier_data(what='region_contours', stack=stack, sec=sec, ext='bp')
    region_labels = label_regions(stack=stack, section=sec, 
                                  region_contours=region_contours,
                                  surround_margins=margins_to_sample,
                                  labeled_contours=labeled_contours)

    for label, region_indices in region_labels.iteritems():
        if label == 'bg' or len(region_indices) == 0:
            continue
        sampled_region_indices = np.random.choice(region_indices, min(num_sample_per_label, len(region_indices)), replace=False)
        training_addresses[label] = [(stack, sec, ridx) for ridx in sampled_region_indices]

    return training_addresses

In [23]:
def generate_dataset_cell_based(num_samples_per_label, stacks, labels_to_sample):
    """
    Generate dataset.
    - Extract addresses
    - Map addresses to features
    - Remove None features
    
    Returns:
        features, addresses, labels_found
    """
    
    # Extract addresses
    
    addresses = defaultdict(list)

    labels_found = set([])

    t = time.time()
    
    for stack in stacks:
        
        first_sec, last_sec = metadata_cache['section_limits'][stack]
    
        t1 = time.time()

        contours_df, _ = DataManager.load_annotation_v3(stack=stack)
        labeled_contours = contours_df[(contours_df['orientation'] == 'sagittal') & (contours_df['downsample'] == 1)].drop_duplicates(subset=['section', 'name', 'side', 'filename', 'downsample', 'creator'])
        labeled_contours = convert_annotation_v3_original_to_aligned_cropped(labeled_contours, stack=stack)

        sys.stderr.write('Load annotation. Time: %.2s seconds.\n' % (time.time() - t1))

        t1 = time.time()
    
        pool = Pool(NUM_CORES/2)
        addresses_by_section_curr_stack = \
        pool.map(lambda sec: sample_regions_by_label(stack=stack, sec=sec, margins_to_sample=margins_to_sample,
                                            labeled_contours=labeled_contours[labeled_contours['section'] == sec],
                                           num_sample_per_label=10), 
                 range(first_sec, last_sec+1))
        pool.close()
        pool.join()

        addresses_by_section += addresses_by_section_curr_stack

        sys.stderr.write('Sample training addresses. Time: %.2s seconds.\n' % (time.time() - t1)) # 13 seconds.
        
        addresses = defaultdict(list)
        for addrs_by_label in addresses_by_section:
            if addrs_by_label is None: continue
            for label, addrs in addrs_by_label.iteritems():
                addresses[label] += addrs
        
    addresses.default_factory = None
    
    sys.stderr.write('Sample addresses: %.2f seconds\n' % (time.time() - t))
    
    # Map addresses to features
    
    t = time.time()
    # test_features = apply_function_to_dict(lambda x: addresses_to_features(x, model_name=model_name), test_addresses)
    features = apply_function_to_dict(lambda x: addresses_to_features_parallel(x, model_name=model_name, n_processes=4), addresses)
    sys.stderr.write('Map addresses to features: %.2f seconds\n' % (time.time() - t))
    
    # Remove features that are None

    for name in labels_found:
        valid = [(ftr, addr) for ftr, addr in zip(features[name], addresses[name])
                    if ftr is not None]
        res = zip(*valid)
        features[name] = np.array(res[0])
        addresses[name] = res[1]
        
    return features, addresses, labels_found

In [24]:
features, addresses, labels_found = generate_dataset_cell_based(num_samples_per_label=num_samples_per_label, 
                                                     stacks=stacks,
                                                     labels_to_sample=labels_to_sample)

'No object named structures in the file'


Annotation has no structures.
Load annotation. Time: 2. seconds.


TypeError: sample_regions_by_label() takes at least 5 arguments (5 given)