In [1]:
%matplotlib inline
import random
from pathlib import Path

from tqdm import tqdm_notebook as tqdm
import numpy as np
import pandas as pd

from bananas.dataset import DataSet, DataType, Feature

# Root path of project relative to this notebook
ROOT = Path('..')

sys.path.insert(1, str(ROOT / 'scripts'))
from datamodels import *
from utils import *

### Read subject data from local file

In [2]:
df = pd.read_csv(ROOT / 'datasets' / 'subject_drawings_grouped.csv', index_col=0)
df.head()

Unnamed: 0,processed_path_casa,processed_path_circulo,processed_path_cruz,processed_path_cuadrado,processed_path_cubo,processed_path_minimental,processed_path_muelle,processed_path_pico,processed_path_triangulo,diagnosis
002_1,../processed/casaPsic_002Ev1.pdf_pg-18.jpg,../processed/circuloPsic_002Ev1.pdf_pg-17.jpg,../processed/cruzPsic_002Ev1.pdf_pg-17.jpg,../processed/cuadradoPsic_002Ev1.pdf_pg-17.jpg,../processed/cuboPsic_002Ev1.pdf_pg-18.jpg,../processed/minimentalPsic_002Ev1.pdf_pg-3.jpg,../processed/muellePsic_002Ev1.pdf_pg-16.jpg,../processed/picoPsic_002Ev1.pdf_pg-16.jpg,../processed/trianguloPsic_002Ev1.pdf_pg-17.jpg,SANO
002_2,../processed/casaPsic_002Ev2.pdf_pg-10.jpg,../processed/circuloPsic_002Ev2.pdf_pg-9.jpg,../processed/cruzPsic_002Ev2.pdf_pg-9.jpg,../processed/cuadradoPsic_002Ev2.pdf_pg-9.jpg,../processed/cuboPsic_002Ev2.pdf_pg-10.jpg,../processed/minimentalPsic_002Ev2.pdf_pg-3.jpg,../processed/muellePsic_002Ev2.pdf_pg-8.jpg,../processed/picoPsic_002Ev2.pdf_pg-8.jpg,../processed/trianguloPsic_002Ev2.pdf_pg-9.jpg,SANO
003_1,../processed/casaPsic_003Ev1.pdf_pg-16.jpg,../processed/circuloPsic_003Ev1.pdf_pg-14.jpg,../processed/cruzPsic_003Ev1.pdf_pg-14.jpg,../processed/cuadradoPsic_003Ev1.pdf_pg-14.jpg,../processed/cuboPsic_003Ev1.pdf_pg-16.jpg,../processed/minimentalPsic_003Ev1.pdf_pg-5.jpg,../processed/muellePsic_003Ev1.pdf_pg-15.jpg,../processed/picoPsic_003Ev1.pdf_pg-15.jpg,../processed/trianguloPsic_003Ev1.pdf_pg-14.jpg,SANO
003_3,../processed/casaPsic_003Ev3.pdf_pg-16.jpg,../processed/circuloPsic_003Ev3.pdf_pg-14.jpg,../processed/cruzPsic_003Ev3.pdf_pg-14.jpg,../processed/cuadradoPsic_003Ev3.pdf_pg-14.jpg,../processed/cuboPsic_003Ev3.pdf_pg-16.jpg,../processed/minimentalPsic_003Ev3.pdf_pg-4.jpg,../processed/muellePsic_003Ev3.pdf_pg-18.jpg,../processed/picoPsic_003Ev3.pdf_pg-18.jpg,../processed/trianguloPsic_003Ev3.pdf_pg-14.jpg,SANO
004_1,../processed/casaPsic_004Ev1.pdf_pg-9.jpg,../processed/circuloPsic_004Ev1.pdf_pg-8.jpg,../processed/cruzPsic_004Ev1.pdf_pg-8.jpg,../processed/cuadradoPsic_004Ev1.pdf_pg-8.jpg,../processed/cuboPsic_004Ev1.pdf_pg-9.jpg,../processed/minimentalPsic_004Ev1.pdf_pg-23.jpg,../processed/muellePsic_004Ev1.pdf_pg-7.jpg,../processed/picoPsic_004Ev1.pdf_pg-7.jpg,../processed/trianguloPsic_004Ev1.pdf_pg-8.jpg,SANO


### Load custom MLP model

In [3]:
from quick_draw_learner import QDClassifier

### Build and train...

In [4]:
from itertools import product, combinations
    
# Define all possible hyperparameters
batch_sizes = [24, 32]
test_splits = [.2, .25]
validation_splits = [.2, .25]
skip_cats_opts = [
    'pico',
    'muelle',
    'minimental']
skip_cats_combos = sum([list(combinations(skip_cats_opts, i))
                        for i in range(len(skip_cats_opts))], [])

# Initialize random number generator without seed to randomize hyperparamters
rnd = np.random.RandomState()

# Cross product all hyperparameters
parameter_combinations = list(product(
    batch_sizes, test_splits, validation_splits, skip_cats_combos))
rnd.shuffle(parameter_combinations)

target_label = 'SANO'
target_column = 'diagnosis'
category_columns = [col for col in df.columns if col != target_column]
category_columns

['processed_path_casa',
 'processed_path_circulo',
 'processed_path_cruz',
 'processed_path_cuadrado',
 'processed_path_cubo',
 'processed_path_minimental',
 'processed_path_muelle',
 'processed_path_pico',
 'processed_path_triangulo']

In [None]:
from bananas.sampling.cross_validation import DataSplit
from bananas.statistics import scoring
from bananas.statistics.scoring import ScoringFunction

# Store results in a list to display them later
trial_results = []

for batch_size, test_split, validation_split, skip_cats in tqdm(parameter_combinations, leave=False):

    # Re-initialize seed every time
    random_seed = 0

    # Create a single feature containing all image data
    cats = [cat for cat in category_columns if ('processed_path_%s' % cat) not in skip_cats]
    features = [Feature(
        ImageAugmenterMultiLoader(df[cats].values),
        kind=DataType.HIGH_DIMENSIOAL,
        sample_size=10,
        random_seed=random_seed)]

    # Define target feature
    target_feature = Feature(
        (df[target_column] == target_label).values, random_seed=random_seed)

    while True:

        # Build dataset, making sure that we have a left-out validation subset
        dataset = DataSet(
            features,
            name=target_label,
            target=target_feature,
            random_seed=random_seed,
            batch_size=batch_size,
            test_split=test_split,
            validation_split=validation_split)

        # Compute test class balance to tell what minimum accuracy we should beat
        test_idx = dataset.sampler.subsamplers[DataSplit.VALIDATION].data
        test_classes = target_feature[test_idx]
        test_class_balance = sum(test_classes) / len(test_classes)

        # Rebuild dataset unless test class balance is within 5% of ground truth
        true_class_balance = sum(target_feature[:] / len(target_feature))
        if abs(test_class_balance - true_class_balance) < .05: break

        # Keep changing the seed to avoid getting stuck
        random_seed += 1

    # Instantiate learner using pre-trained model
    learner = QDClassifier(input_channel_count=len(cats),
                           random_seed=random_seed, verbose=False)

    # Train learner using train dataset
    learner.train(dataset.input_fn, progress=True, max_steps=100)

    # Test learner predictions using left-out validation dataset
    X, y = dataset[test_idx]
    y = learner.label_encoder_.transform(y)
    y_ = learner.predict_proba(X)
    score_auroc = scoring.score_auroc(y, y_)
    score_accuracy = scoring.score_accuracy(y, y_)
    score_precision = scoring.score_precision(y, y_)
    score_recall = scoring.score_recall(y, y_)

    # Store trial results
    naive_accuracy = max(test_class_balance, 1 - test_class_balance)
    trial_results.append({
        'Subset splits': (test_split, validation_split),
        'Skipped categories': ', '.join(skip_cats),
        'Batch size': batch_size,
        'Δ Naive Classifier': score_accuracy - naive_accuracy,
        'Accuracy': score_accuracy,
        'Precision': score_precision,
        'Recall': score_recall,
        'Area under ROC': score_auroc,
    })

In [7]:
pd.DataFrame.from_records(trial_results) \
    .sort_values('Accuracy', ascending=False).head(10)

Unnamed: 0,Subset splits,Skipped categories,Batch size,Δ Naive Classifier,Accuracy,Precision,Recall,Area under ROC
36,"(0.25, 0.25)",,32,0.238095,0.761905,0.0,0.0,0.715795
11,"(0.25, 0.25)","muelle, minimental",32,0.190476,0.714286,0.0,0.0,0.574081
37,"(0.25, 0.25)",muelle,32,0.142857,0.666667,0.0,0.0,0.588771
47,"(0.25, 0.25)","pico, muelle",32,0.095238,0.619048,0.0,0.0,0.644855
26,"(0.25, 0.25)",minimental,32,0.047619,0.571429,0.0,0.0,0.629545
31,"(0.25, 0.25)",pico,32,0.047619,0.571429,0.0,0.0,0.579339
4,"(0.25, 0.25)","pico, minimental",32,0.047619,0.571429,0.0,0.0,0.586157
23,"(0.25, 0.25)","pico, minimental",24,0.047619,0.571429,0.0,0.0,0.611364
22,"(0.25, 0.25)",muelle,24,0.0,0.52381,0.0,0.0,0.567045
40,"(0.25, 0.25)","pico, muelle",24,0.0,0.52381,0.0,0.0,0.559091
