In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import time

import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

sys.path.append(os.path.join(os.environ['REPO_DIR'], 'utilities'))
from utilities2015 import *
from metadata import *
from data_manager import *
from learning_utilities import *

because the backend has already been chosen;
matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.



Setting environment for AWS compute node


No vtk
File does not exist: /shared/CSHL_data_processed/MD635/MD635_anchor.txt
File does not exist: /shared/CSHL_data_processed/MD635/MD635_sorted_filenames.txt
File does not exist: /shared/CSHL_data_processed/MD635/MD635_cropbox.txt
File does not exist: /shared/CSHL_data_processed/MD635/MD635_cropbox.txt


In [3]:
dataset_settings

Unnamed: 0_level_0,network_model,stain,margins,num_sample_per_class,stacks
dataset_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
20,Inception-BN,nissl,200/500,1000,MD585
21,Inception-BN,nissl,200/500,1000,MD589
22,Inception-BN,nissl,200/500,1000,MD594
99,cell,nissl,500,1000,MD589


In [4]:
dataset_id = 99
dataset_properties = dataset_settings.loc[dataset_id]

num_samples_per_label = dataset_properties['num_sample_per_class']
stacks = dataset_properties['stacks'].split('/')

In [5]:
structures_to_sample = all_known_structures

negative_labels_to_sample = [s + '_negative' for s in structures_to_sample]

margins_to_sample = map(int, str(dataset_properties['margins']).split('/'))

surround_positive_labels_to_sample = [convert_to_surround_name(s, margin=m, suffix=surr_l) 
                             for m in margins_to_sample
                             for s in structures_to_sample 
                             for surr_l in structures_to_sample
                             if surr_l != s]

surround_noclass_labels_to_sample = [convert_to_surround_name(s, margin=m, suffix='noclass') 
                             for m in margins_to_sample
                             for s in structures_to_sample]

labels_to_sample = structures_to_sample + negative_labels_to_sample + \
surround_positive_labels_to_sample + surround_noclass_labels_to_sample + \
['noclass']

In [6]:
from cell_utilities import *

In [7]:
def sample_regions_one_section_by_label(stack, sec, margins_to_sample, labeled_contours, num_samples_per_label=None):
    """
    Sample region addresses in a particular section and associate them with different labels (positive, surround, negative, noclass, foreground, background, etc.).
    
    Args:
        region_contours (list of nx2 arrays): list of contour vertices.
        margins_to_sample (list of ints):
        labeled_contours (dict of nx2 arrays): {label: contour vertices}
        
    Returns:
        dict of 3-tuple list: {label: list of (stack, section, region_index)}        
    """

    addresses = {}

    if is_invalid(stack=stack, sec=sec):
        return

    region_contours = load_cell_classifier_data(what='region_contours', stack=stack, sec=sec, ext='bp')
    region_labels = label_regions(stack=stack, section=sec, 
                                  region_contours=region_contours,
                                  surround_margins=margins_to_sample,
                                  labeled_contours=labeled_contours)

    for label, region_indices in region_labels.iteritems():
        if label == 'bg' or len(region_indices) == 0:
            continue
            
        if num_samples_per_label is None:
            addresses[label] = [(stack, sec, ridx) for ridx in region_indices]
        else:
            sampled_region_indices = np.random.choice(region_indices, min(num_samples_per_label, len(region_indices)), replace=False)
            addresses[label] = [(stack, sec, ridx) for ridx in sampled_region_indices]

    return addresses

In [8]:
def load_cell_based_features_one_section_(stack, section, region_indices):
    """
    Load pre-computed cell-based features for a list of regions on a particular section.
    """
    
    region_features_all_regions = load_cell_classifier_data(what='region_features', stack=stack, sec=section, ext='hdf')
    # Loading hdf ~ 2 seconds.
    
    features1 = np.asarray([rf['largeOrientationHist'] for rf in region_features_all_regions])
    features2 = np.asarray([rf['largeSizeHist'] for rf in region_features_all_regions])
    features3 = np.asarray([rf['largeLargeLinkLenHist'] for rf in region_features_all_regions])
    features4 = np.asarray([rf['largeSmallLinkLenHist'] for rf in region_features_all_regions])
    
        
    f1 = features1[region_indices]
    f1n = f1/f1.sum(axis=1)[:,None].astype(np.float)
    
    f2 = features2[region_indices]
    f2n = f2/f2.sum(axis=1)[:,None].astype(np.float)
    
    f3 = features3[region_indices]
    f3n = f3/f3.sum(axis=1)[:,None].astype(np.float)
    
    f4 = features4[region_indices]
    f4n = f4/f4.sum(axis=1)[:,None].astype(np.float)
    
    features = np.c_[f1n, f2n, f3n, f4n]
    
    return features

In [11]:
def generate_dataset_cell_based(num_samples_per_label, stacks, labels_to_sample):
    """
    Generate dataset.
    - Extract addresses
    - Map addresses to features
    - Remove None features
    
    Returns:
        features, addresseslab
    """
    
    # Sample addresses
    
    addresses = defaultdict(list)
    labels_found = set([])
    addresses_by_section_by_label = []
    
    t = time.time()
    
    for stack in stacks:
        
        first_sec, last_sec = metadata_cache['section_limits'][stack]
    
        t1 = time.time()

        contours_df, _ = DataManager.load_annotation_v3(stack=stack)
        labeled_contours = contours_df[(contours_df['orientation'] == 'sagittal') & (contours_df['downsample'] == 1)].drop_duplicates(subset=['section', 'name', 'side', 'filename', 'downsample', 'creator'])
        labeled_contours = convert_annotation_v3_original_to_aligned_cropped(labeled_contours, stack=stack)

        sys.stderr.write('Load annotation. Time: %.2f seconds.\n' % (time.time() - t1))

        t1 = time.time()
        
        # Sample addresses from each section

        pool = Pool(NUM_CORES/2)
        addresses_by_section_by_label_curr_stack = \
        pool.map(lambda sec: sample_regions_one_section_by_label(stack=stack, sec=sec, 
                                                                 margins_to_sample=margins_to_sample,
                                                                 labeled_contours=labeled_contours[labeled_contours['section']==sec],
                                                                 num_samples_per_label=30),
                 range(first_sec, last_sec+1))
        pool.close()
        pool.join()
                
        addresses_by_section_by_label += addresses_by_section_by_label_curr_stack

        sys.stderr.write('Sample addresses (stack %s): %.2s seconds.\n' % (stack, time.time() - t1))
        
    # Aggregate addresses sampled form each section
    
    addresses_by_label = defaultdict(list)
    for addrs_by_label in addresses_by_section_by_label:
        if addrs_by_label is not None:
            for label, addrs in addrs_by_label.iteritems():
                addresses_by_label[label] += addrs
    addresses_by_label.default_factory = None
    
    if num_samples_per_label is not None:
        import random
        addresses_by_label = {label: random.sample(addrs, min(num_samples_per_label/len(stacks), len(addrs))) 
                              for label, addrs in addresses_by_label.iteritems()}
    
    sys.stderr.write('Sample addresses: %.2f seconds\n' % (time.time() - t))
    
    # Remove unwanted labels
    addresses_by_label = {label: addrs for label, addrs in addresses_by_label.iteritems() if label in labels_to_sample}
    
    # Map addresses to features
    
    t = time.time()
    
    load_cell_based_features_given_address_list = lambda addrs: smart_map(addrs, keyfunc=lambda (st, se, ri): (st, se),
                       func=lambda (st, se), gr: load_cell_based_features_one_section_(st, se, [ri for _,_,ri in gr]))
    features_by_label = apply_function_to_dict(load_cell_based_features_given_address_list, addresses_by_label)
    features_by_label = apply_function_to_dict(np.asarray, addresses_by_label)
    
    sys.stderr.write('Map addresses to features: %.2f seconds\n' % (time.time() - t))
    
    # Remove features that are None

    for name in labels_found:
        valid = [(ftr, addr) for ftr, addr in zip(features_by_label[name], addresses_by_label[name])
                    if ftr is not None]
        res = zip(*valid)
        features_by_label[name] = np.array(res[0])
        addresses_by_label[name] = res[1]
    
    return features_by_label, addresses_by_label

In [12]:
features, addresses, labels_found = generate_dataset_cell_based(num_samples_per_label=num_samples_per_label, 
                                                     stacks=stacks,
                                                     labels_to_sample=['5N'])

'No object named structures in the file'


Annotation has no structures.
Load annotation. Time: 1.40 seconds.
Analyzing section 92..
Analyzing section 93..
Analyzing section 94..
Analyzing section 95..
Analyzing section 96..
Analyzing section 101..
Analyzing section 97..
Analyzing section 102..
Analyzing section 98..
Analyzing section 103..
Analyzing section 110..
Analyzing section 99..
Analyzing section 104..
Analyzing section 100..
Analyzing section 111..
Analyzing section 119..
Analyzing section 105..
Analyzing section 112..
Analyzing section 128..
Analyzing section 120..
Analyzing section 106..
Analyzing section 113..
Analyzing section 137..
Analyzing section 107..
Analyzing section 129..
Analyzing section 121..
Analyzing section 114..
Analyzing section 146..
Analyzing section 108..
Analyzing section 130..
Analyzing section 115..
Analyzing section 122..
Analyzing section 155..
Analyzing section 109..
Analyzing section 138..
Analyzing section 116..
Analyzing section 123..
Analyzing section 147..
Analyzing section 131..
Analy

NameError: global name 'addresses_by_section_curr_stack' is not defined

In [None]:
CLF

In [None]:
# Save training features
features_fp = os.path.join(CLF_ROOTDIR, 'datasets', 'dataset_%d' % dataset_id, 'patch_features.hdf')
create_parent_dir_if_not_exists(features_fp)
save_hdf_v2(features, features_fp)
upload_from_ec2_to_s3(features_fp)
# train_feat_dir = create_if_not_exists(os.path.join(CLF_ROOTDIR, 'datasets', 'dataset_%d' % dataset, 'patch_features'))
# for label, feats in training_features.iteritems():
#     bp.pack_ndarray_file(feats, os.path.join(train_feat_dir, label + '.bp'))

# Save training addresses
addresses_fp = os.path.join(CLF_ROOTDIR, 'datasets', 'dataset_%d' % dataset_id, 'patch_addresses.pkl')
save_pickle(addresses, addresses_fp)
upload_from_ec2_to_s3(addresses_fp)