In [None]:
%matplotlib inline

import sys
from pathlib import Path

from tqdm import tqdm_notebook as tqdm
import numpy as np
import pandas as pd

from bananas.utils import images
from bananas.utils.arrays import unique
from bananas.dataset import DataSet, DataType, Feature

# Root path of project relative to this notebook
ROOT = Path('..')

# Make the project's `scripts` directory importable. Inserting at index 1
# leaves the existing sys.path[0] entry first in the module search order.
sys.path.insert(1, str(ROOT / 'scripts'))
# NOTE(review): these wildcard imports hide where names used later in the
# notebook (e.g. `Box`, `ImageAugmenterLoader`) come from -- presumably one of
# these two modules; confirm and consider importing them explicitly.
from datamodels import *
from utils import *

### Read patient data from local file

In [None]:
# Columns this notebook actually uses: the diagnosis label (target) and the
# path of each pre-processed drawing (feature).
feat_keys = ['processed_path']
group_columns = ['diagnosis']

# Read patient data; the first CSV column is used as the index. Every other
# column is discarded right away, so none of them is parsed or converted.
df = pd.read_csv(ROOT / 'datasets' / 'subject_diagnosis.csv', index_col=0)
df = df[group_columns + feat_keys].copy()

# Drop incomplete rows *before* touching the paths: a missing path is read as
# NaN, which cannot be joined onto ROOT and would raise instead of being dropped.
df = df.dropna()

# Normalize all feature columns: prefix each path with ROOT and store it as a
# plain string (`ROOT / x` accepts a str, so no intermediate Path is needed).
for col in feat_keys:
    df[col] = df[col].apply(lambda x: str(ROOT / x))

df.head()

### Load custom MLP model

In [None]:
from quick_draw_learner import QDClassifier

### Build and train...

In [None]:
from itertools import product, combinations

# Search space for each hyperparameter
kernel_sizes = [2 ** exponent - 1 for exponent in (3, 4, 5)]
batch_sizes = [24, 32]
test_splits = [.2, .25]
validation_splits = [.2, .25]
skip_cats_opts = ['pico', 'muelle', 'minimental']

# Every subset of the categories that still leaves at least one of them in
# play (subset sizes 0 .. len - 1), smallest subsets first
skip_cats_combos = [
    combo
    for size in range(len(skip_cats_opts))
    for combo in combinations(skip_cats_opts, size)]

# Deliberately unseeded so the order in which hyperparameters are tried
# differs from run to run
rnd = np.random.RandomState()

# Build the full grid, then visit it in random order
parameter_combinations = list(product(
    kernel_sizes, batch_sizes, test_splits, validation_splits, skip_cats_combos))
rnd.shuffle(parameter_combinations)

# Label (and the column holding it) that the classifier learns to detect
target_label = 'SANO'
target_column = 'diagnosis'

In [None]:
from bananas.sampling.cross_validation import DataSplit
from bananas.statistics import scoring
from bananas.statistics.scoring import ScoringFunction

# Store results in a list to display them later
trial_results = []

# One trial per hyperparameter combination: build a dataset, train a learner
# on it and score the learner on the left-out subset.
for kernel_size, batch_size, test_split, validation_split, skip_cats in tqdm(parameter_combinations, leave=False):

    # Every trial starts its seed search from the same value
    random_seed = 0

    # Flag the images that belong to a category skipped in this trial; a
    # category is recognized by the 'processed/<category>' fragment in the path
    mask = df['processed_path'].astype(str).apply(
        lambda impath: any(('processed/%s' % cat) in impath for cat in skip_cats))

    # Create a single feature containing all remaining image data
    # NOTE(review): `HIGH_DIMENSIOAL` spelling kept as found -- confirm it
    # matches the member name declared by `DataType`.
    features = [Feature(
        ImageAugmenterLoader(df.loc[~mask, 'processed_path'].values),
        kind=DataType.HIGH_DIMENSIOAL,
        sample_size=10,
        random_seed=random_seed)]

    # Define target feature: True where the diagnosis equals the target label
    target_feature = Feature(
        (df.loc[~mask, target_column] == target_label).values, random_seed=random_seed)

    # Fraction of positive samples over the whole target feature. It depends
    # only on `target_feature`, so it is computed once per trial rather than
    # on every retry of the loop below.
    true_class_balance = sum(target_feature[:]) / len(target_feature)

    # NOTE(review): this loop has no upper bound on attempts; it only ends
    # once some seed yields a left-out subset whose balance is close enough.
    while True:

        # Build dataset, making sure that we have a left-out validation subset
        dataset = DataSet(
            features,
            name=target_label,
            target=target_feature,
            random_seed=random_seed,
            batch_size=batch_size,
            test_split=test_split,
            validation_split=validation_split)

        # Compute the class balance of the left-out (VALIDATION) subset to tell
        # what minimum accuracy we should beat; the learner is scored on this
        # same subset after training
        test_idx = dataset.sampler.subsamplers[DataSplit.VALIDATION].data
        test_classes = target_feature[test_idx]
        test_class_balance = sum(test_classes) / len(test_classes)

        # Rebuild dataset unless test class balance is within 5% of ground truth
        if abs(test_class_balance - true_class_balance) < .05: break

        # Keep changing the seed to avoid getting stuck
        random_seed += 1

    # Instantiate learner, reusing the seed that produced the accepted split
    learner = QDClassifier(
        kernel_size=kernel_size,
        random_seed=random_seed, 
        verbose=False)

    # Train learner using train dataset
    learner.train(dataset.input_fn, progress=True, max_steps=200)

    # Test learner predictions using left-out dataset
    X, y = dataset[test_idx]
    y = learner.label_encoder_.transform(y)
    y_ = learner.predict_proba(X)
    score_auroc = scoring.score_auroc(y, y_)
    score_accuracy = scoring.score_accuracy(y, y_)

    # Store trial results; the naive baseline always predicts the majority
    # class of the left-out subset
    naive_accuracy = max(test_class_balance, 1 - test_class_balance)
    trial_results.append({
        'Subset Splits': (test_split, validation_split),
        'Kernel size': kernel_size,
        'Batch size': batch_size,
        'Skipped Categories': ', '.join(skip_cats),
        'Δ Naive Classifier': score_accuracy - naive_accuracy,
        'Accuracy': score_accuracy,
        'Area under ROC': score_auroc,
    })

In [None]:
# Ten best trials, ranked by accuracy (best first)
(
    pd.DataFrame.from_records(trial_results)
    .sort_values('Accuracy', ascending=False)
    .head(10)
)