## Config

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
from common import *
from competitions import dogscats;
from pathlib import Path

'''
Command Line Tool
1) User 'create_project' page with name and directory ?
2) Redirect to the label page (with project_name as the argument) <--- interesting to figure this out
3) Call 'init_dataset' and display images on page (refactor so it doesn't require label names ahead) - simpler ATM
4) Manually kick off the infinite loop and monitor
5) Start labeling images and watch the infinite loop
6) Make sure model accuracy is increasing and label counts
12) Create new project and test end-to-end that things are updating/training/saving etc.


Uncertainty Filtering
13) Update 'Next' call to return most uncertain images (update Flask API)
    -load labelai.csv prediction into Pandas DF
    -filter for unlabeled images (no userTags)
    -if len(unlabeled) < N_REQUESTED
        return all_unlabeled
    -call get_most_uncertain(df, n)
    -if len(uncertain) < N_REQUESTED
        -get_basic_unlabeled(df, n)
14) Get_most_uncertain(df, n)   <------- binary classification for now
    -Filter for unlabeled with modelPreds
    -shuffle
    -min_prob = .4, max_prob = .6
    -fnames = []
    while len(fnames) < n:
        for i in range(len(df)):
            -if modelProb > min and modelProb < max:
                fnames.append(df.iloc[i].pop()) <--- need to pop()
        min_prob -= .1
        max_prob += .1
    return fnames
''';

ImportError: No module named torch

In [None]:
# Input image directories and the labels output directory, all read from the project config (`cfg`).
TRAIN_PATH = cfg.PATHS['datasets']['inputs']['trn_jpg'] 
TEST_PATH = cfg.PATHS['datasets']['inputs']['tst_jpg'] 
LABELS_PATH = os.path.join(cfg.PATHS['project'], 'labels')

# Machine-level settings; make_config stores this dict under the 'hardware' key.
# NOTE(review): no cell visible here applies 'random_seed' to torch/numpy/random -- confirm it is used.
HARDWARE_CONFIG = {
    'hostname': socket.gethostname(),
    'random_seed': 3,
    'gpu_device':0
}
# Select the configured CUDA device; this runs at cell execution time and requires a GPU.
torch.cuda.set_device(HARDWARE_CONFIG['gpu_device'])
# NOTE(review): `cudnn` is presumably torch.backends.cudnn (it arrives via `from common import *`) -- confirm.
cudnn.benchmark = True

# Dataset settings: read by TRANSFORMS (image size), make_config (experiment name
# and the 'data' entry) and the Train cell (label names).
DATA_CONFIG = dict(
    img_rescale=256,  # used as both dimensions of the Scale transform
    dset_fold='labelai',
    n_classes=len(dogscats.LABEL_NAMES),
    label_names=dogscats.LABEL_NAMES,
)

# Training hyper-parameters: read by the loader/optimizer/LR helpers and stored
# under the 'training' key of the experiment config.
TRAIN_CONFIG = dict(
    initial_lr=1e-4,
    weight_decay=1e-4,
    n_epochs=50,
    n_cycles=9,
    early_stop_metric=metric.Loss().name,
    max_patience=5,
    batch_size=32,
    threshold=0.5,
    save_weights_cadence=1,  # every epoch
    lr_schedule={50: 1e-4},
)
# Free-form extras stored under the 'other' key of the experiment config.
OTHER_CONFIG = dict()

# One preprocessing pipeline per dataset split. Every split rescales to a square
# of side DATA_CONFIG['img_rescale'], converts to a tensor and applies
# data_aug.IMAGENET_NORMALIZE; only the training split adds a random horizontal flip.
TRANSFORMS = {
    dset: torchsample.transforms.Compose(
        [transforms.Scale(size=[DATA_CONFIG['img_rescale'],
                                DATA_CONFIG['img_rescale']])]
        + ([transforms.RandomHorizontalFlip()] if dset == c.TRAIN else [])
        + [transforms.ToTensor(), data_aug.IMAGENET_NORMALIZE]
    )
    for dset in (c.TRAIN, c.VAL, c.TEST, c.UNLABELED)
}

def get_labels_fpath(name):
    """Return the path of `labels.json` for project `name` under cfg.PATHS['labels']."""
    project_dir = os.path.join(cfg.PATHS['labels'], name)
    return os.path.join(project_dir, 'labels.json')

def get_scores_fpath(name):
    """Return the path of `metrics.json` for project `name` under cfg.PATHS['labels']."""
    parts = (cfg.PATHS['labels'], name, 'metrics.json')
    return os.path.join(*parts)

def get_preds_fpath(name):
    """Return the path of `predictions.json` for project `name` under cfg.PATHS['labels']."""
    labels_root = cfg.PATHS['labels']
    return os.path.join(labels_root, name, 'predictions.json')

def get_uncertainty_fpath(name):
    """Return the path of `rankings.csv` for project `name` under cfg.PATHS['labels']."""
    project_dir = os.path.join(cfg.PATHS['labels'], name)
    return os.path.join(project_dir, 'rankings.csv')

def init_dataset(name, input_dir, file_ext, label_names=None):
    """Create a new label fold for a project and save it to the project's labels file.

    Every file id found in `input_dir` starts out in the 'unlabeled' split; the
    'trn', 'val', 'tst' and 'metrics' sections start empty.

    Args:
        name: project name; also selects the output location via get_labels_fpath.
        input_dir: directory scanned for input files.
        file_ext: file extension recorded in the fold.
        label_names: optional iterable of label names; stored sorted.

    Returns:
        The fold dict that was written to disk.
    """
    # Only the ids are needed here; the full paths are discarded.
    _, ids = utils.files.get_paths_to_files(input_dir, strip_ext=True)
    fold = {
        'name': name,
        'file_ext': file_ext,
        'inputs_dir': input_dir,
        'label_names': sorted(label_names) if label_names is not None else [],
        'trn': {},
        'val': {},
        'tst': {}, #auditing purposes
        'unlabeled': {id_: id_ for id_ in ids}, #these need to be queried and popped by key
        'metrics': {},
        'created': time.strftime("%m/%d/%Y %H:%M:%S", time.localtime())
    }
    fold_fpath = get_labels_fpath(name)
    # Create the directory the file is actually written to. The labels file lives
    # under cfg.PATHS['labels'], which need not be the same root as LABELS_PATH.
    os.makedirs(os.path.dirname(fold_fpath), exist_ok=True)
    utils.files.save_json(fold_fpath, fold)
    return fold

def make_entry(labels=None, model_labels=None, model_probs=None):
    """Build a label entry dict; any argument left as None becomes a fresh empty list."""
    def _or_empty(value):
        # A new list per call avoids sharing one default between entries.
        return [] if value is None else value

    return {
        'labels': _or_empty(labels),
        'model_labels': _or_empty(model_labels),
        'model_probs': _or_empty(model_probs),
    }

def add_or_update_entry(fold, dset, id_, entry):
    """Store `entry` under `id_` in the `dset` section of `fold`, replacing any existing value."""
    dset_entries = fold[dset]
    dset_entries[id_] = entry

def move_unlabeled_to_labeled(fold, dset, id_, entry):
    """Remove `id_` from the 'unlabeled' section and store `entry` for it under `dset`.

    Raises KeyError if `id_` is not currently in the 'unlabeled' section.
    """
    fold['unlabeled'].pop(id_)
    add_or_update_entry(fold, dset, id_, entry)

def get_model(fold):
    """Build a pretrained ResNet-34 with a new classifier head and move it to the GPU.

    The head has one output per entry in fold['label_names'].
    """
    # NOTE(review): n_freeze=10**5 presumably freezes every backbone layer -- confirm in models.resnet.
    backbone = models.resnet.get_resnet34(pretrained=True, n_freeze=10**5, verbose=False)
    backbone = models.builder.cut_model(backbone, -1)
    n_classes = len(fold['label_names'])
    head = models.builder.get_classifier(in_feat=512, n_classes=n_classes,
                                         activation=nn.Softmax(), p=0.5)
    return models.resnet.SimpleResnet(backbone, head).cuda()

def get_loader(fold, dset, shuffle):
    """Return a data loader over the `dset` split of `fold`, using that split's transforms."""
    fpaths, targs = metadata.get_fpaths_targs_from_label_fold(fold, dset)
    dataset = datasets.datasets.FileDataset(fpaths, 'pil', targs, TRANSFORMS[dset])
    batch_size = TRAIN_CONFIG['batch_size']
    return data_loaders.get_data_loader(dataset, batch_size, shuffle=shuffle,
                                        n_workers=4, pin_memory=True)

def get_criterion():
    """Return the loss function used for training (`F.binary_cross_entropy`), uncalled."""
    criterion = F.binary_cross_entropy
    return criterion

def get_optimizer(model):
    """Return an Adam optimizer over the parameters of `model` that require gradients.

    Learning rate and weight decay come from TRAIN_CONFIG.
    """
    trainable = (p for p in model.parameters() if p.requires_grad)
    return optim.Adam(trainable, TRAIN_CONFIG['initial_lr'],
                      weight_decay=TRAIN_CONFIG['weight_decay'])

def get_lr_adjuster():
    """Return a ScheduledLR built from TRAIN_CONFIG's initial rate and 'lr_schedule'."""
    initial_lr = TRAIN_CONFIG['initial_lr']
    schedule = TRAIN_CONFIG['lr_schedule']
    return learning_rates.ScheduledLR(initial_lr, 'epoch', schedule)

def get_trainer(crit, optim, lr_adjuster):
    """Return a trainers.Trainer for the given criterion, optimizer and LR adjuster.

    `crit` is passed as both of the first two positional arguments.
    NOTE(review): presumably the training and validation criteria -- confirm in trainers.Trainer.
    """
    trainer = trainers.Trainer(crit, crit, optim, lr_adjuster)
    return trainer

def load_weights(model, exp_name, epoch=None):
    """Load the weights saved by experiment `exp_name` at `epoch` into `model`.

    Args:
        model: model object handed to exp_utils.load_weights_by_exp_and_epoch.
        exp_name: name of the experiment whose weights are loaded.
        epoch: epoch whose weights to load. Must be given; the previous version
            read an undefined name `epoch` and raised NameError.

    Raises:
        ValueError: if `epoch` is not provided.
    """
    if epoch is None:
        raise ValueError("load_weights requires an explicit `epoch` to load")
    exp_utils.load_weights_by_exp_and_epoch(model, exp_name, epoch)
    
def make_config(proj_name, model, optimizer, criterion, lr_adjuster):
    """Assemble the experiment config dict for one training run.

    The experiment name comes from exp_utils.generate_display_name, given the
    project name and a list of tags: the class names of the model, optimizer
    and LR adjuster, then image size, learning rate, weight decay, batch size
    and dataset fold.
    """
    class_names = [utils.general.get_class_name(obj)
                   for obj in (model, optimizer, lr_adjuster)]
    hyperparam_tags = [
        'img{}'.format(DATA_CONFIG['img_rescale']),
        'lr{}'.format(TRAIN_CONFIG['initial_lr']),
        'wd{}'.format(TRAIN_CONFIG['weight_decay']),
        'bs{}'.format(TRAIN_CONFIG['batch_size']),
    ]
    name_parts = class_names + hyperparam_tags + [str(DATA_CONFIG['dset_fold'])]
    experiment_name = exp_utils.generate_display_name(proj_name, name_parts)

    tracked_metrics = [metric.Loss(), metric.Accuracy(), metric.F2Score()]
    aux_metrics = [metric.AuxiliaryMetric('LearningRate', 'lr'),
                   metric.AuxiliaryMetric('SystemMemory', 'mb')]
    visualizers = [Viz(experiment_name)]

    config = {
        'name': experiment_name,
        'parent_dir': cfg.PATHS['experiments'],
        'metrics': tracked_metrics,
        'aux_metrics': aux_metrics,
        'visualizers': visualizers,
        'data': DATA_CONFIG,
        'training': TRAIN_CONFIG,
        'other': OTHER_CONFIG,
        'transforms': TRANSFORMS[c.TRAIN],
        'hardware': HARDWARE_CONFIG,
        'model': model,
        'optimizer': optimizer,
        'lr_adjuster': lr_adjuster,
        'criterion': criterion,
    }
    return config

def create_experiment(config):
    """Create and initialise an Experiment named config['name'], print its name, and return it."""
    name = config['name']
    experiment = Experiment(name, cfg.PATHS['experiments'])
    experiment.init(config)
    print(experiment.name)
    return experiment

def resume_experiment(name):
    """Open the experiment called `name`, call its resume() non-verbosely, and return it."""
    experiment = Experiment(name, cfg.PATHS['experiments'])
    experiment.resume(verbose=False)
    return experiment

def create_project(name, img_path, label_names):
    """Create a new labeling project and return its freshly initialised fold.

    Args:
        name: project name.
        img_path: directory containing the project's JPG images.
        label_names: iterable of label names for the project.

    Returns:
        The fold dict produced by init_dataset.
    """
    # init_dataset takes (name, input_dir, file_ext, label_names) and derives the
    # labels-file path from `name` itself, so the path is not passed in.
    return init_dataset(name, img_path, c.JPG_EXT, label_names)

def get_img_count(fold, dset):
    """Return the number of image ids stored in the `dset` section of `fold`."""
    ids = fold[dset].keys()
    return len(ids)

def get_img_counts(proj_name):
    """Load the fold for `proj_name` and return its image count per split.

    The result is keyed by c.TRAIN, c.VAL, c.TEST and c.UNLABELED.
    """
    fold = load_fold(proj_name)
    splits = (c.TRAIN, c.VAL, c.TEST, c.UNLABELED)
    return {dset: get_img_count(fold, dset) for dset in splits}

def load_scores(fpath):
    """Load the scores document at `fpath`, or return an empty one if the file is missing.

    The empty document has the keys "experiments", "latest" and "counts", each an empty dict.
    """
    if not os.path.isfile(fpath):
        return {"experiments": {}, "latest": {}, "counts": {}}
    return utils.files.load_json(fpath)
    
def get_preds(exp, loader, threshold=None):
    """Run the experiment's model over `loader` and threshold the probabilities.

    Args:
        exp: experiment whose `model` attribute is used for inference.
        loader: data loader to predict on.
        threshold: decision threshold passed to predictions.get_predictions.
            Defaults to TRAIN_CONFIG['threshold'] so that predictions follow the
            configured threshold instead of a separate hard-coded 0.5.

    Returns:
        A `(probs, preds)` tuple.
    """
    if threshold is None:
        threshold = TRAIN_CONFIG['threshold']
    probs = predictions.get_probabilities(exp.model, loader)
    preds = predictions.get_predictions(probs, threshold)
    return probs, preds

def save_scores(exp, proj_name, loader):
    """Evaluate `exp` on `loader` and merge the results into the project's scores file.

    Updates three sections of the scores document and writes it back:
    - "experiments"[exp.name]: the experiment's metrics history plus a 'created' timestamp
    - "counts": current image counts per split for the project
    - "latest": each of the experiment's metrics evaluated on `loader`

    Args:
        exp: experiment providing `name`, `history.metrics_history`, `metrics` and the model.
        proj_name: project whose scores file is updated.
        loader: data loader to evaluate on; targets are read from `loader.dataset.targets`.
    """
    print("Saving scores")
    probs, preds = get_preds(exp, loader)
    targs = loader.dataset.targets
    loss = metric_utils.get_cross_entropy_loss(probs, targs)

    scores_fpath = get_scores_fpath(proj_name)
    scores = load_scores(scores_fpath)
    # Work on a copy: stamping 'created' onto the original would mutate the
    # experiment's own metrics history.
    exp_scores = dict(exp.history.metrics_history)
    exp_scores['created'] = time.strftime(
        "%m/%d/%Y %H:%M:%S", time.localtime())
    scores["experiments"][exp.name] = exp_scores
    scores["counts"] = get_img_counts(proj_name)
    for m in exp.metrics:
        scores["latest"][m.name] = m.evaluate(
            loss, preds, probs, targs)
    utils.files.save_json(scores_fpath, scores)
    
def load_fold(name):
    """Load and return the label fold stored in the labels file of project `name`."""
    return utils.files.load_json(get_labels_fpath(name))

def save_fold(fold):
    """Write `fold` to the labels file of the project named by fold['name'].

    Returns whatever utils.files.save_json returns.
    """
    project_name = fold['name']
    target_fpath = get_labels_fpath(project_name)
    return utils.files.save_json(target_fpath, fold)

def uncertainty_sort(preds_df):
    """
    Placeholder for uncertainty-based ranking of predictions; not implemented yet.

    The body consists of this docstring only, so calling the function ignores
    `preds_df` and returns None.

    Candidate strategies noted by the author:
    single argmax - closest to .5
    margin - top two argmax different
    entropy - (no description given)
    """
    
def build_argmax_df(probs, ids, labels):
    """Build a per-sample probability table ordered from least to most confident.

    Args:
        probs: 2-D array-like with one row per sample and one column per label.
        ids: row index values, one per sample.
        labels: column names, one per column of `probs` (any iterable).

    Returns:
        DataFrame indexed by `ids` with one column per label plus "max_val"
        (the largest probability in the row), sorted ascending by "max_val".
    """
    probs = np.asarray(probs)
    # Take the row maximum directly. Going through argmax indices cast to uint8
    # wraps around for more than 256 columns and picks the wrong value.
    max_vals = probs.max(axis=1).reshape(-1, 1)
    probs_w_max_vals = np.concatenate([probs, max_vals], axis=1)
    # list() lets callers pass labels as a tuple or other iterable.
    columns = list(labels) + ["max_val"]
    pred_df = pd.DataFrame(data=probs_w_max_vals, index=ids, columns=columns)
    return pred_df.sort_values(by="max_val")

def save_preds(exp, proj_name):
    """Predict on every file under TRAIN_PATH and save the results for `proj_name`.

    Writes two files: the uncertainty ranking CSV (from build_argmax_df) and a
    predictions JSON mapping each file id to its predicted tags and probabilities.
    Inference uses the validation transforms.

    Returns:
        A `(probs, preds)` tuple as produced by get_preds.
    """
    print("Saving predictions")
    fpaths, ids = utils.files.get_paths_to_files(TRAIN_PATH, strip_ext=True)
    dataset = datasets.datasets.FileDataset(fpaths, 'pil', None, TRANSFORMS[c.VAL])
    loader = data_loaders.get_data_loader(
        dataset, TRAIN_CONFIG['batch_size'], n_workers=2)
    probs, preds = get_preds(exp, loader)
    tags = metadata.get_tags_from_preds(
        preds, exp.config.data['label_names'])

    # Ranking table, least confident rows first.
    ranking_df = build_argmax_df(probs, ids, dogscats.LABEL_NAMES)
    ranking_df.to_csv(get_uncertainty_fpath(proj_name))

    # One record per file id, in the same order as `ids`.
    pred_doc = {
        id_: {'labels': tags[row], 'probs': probs[row].tolist()}
        for row, id_ in enumerate(ids)
    }
    utils.files.save_json(get_preds_fpath(proj_name), pred_doc)
    return probs, preds

# Upper bound on polling iterations in run_project.
MAX_RUNS = 1000
def run_project(proj_name):
    """Poll the project's labels file and retrain whenever the training set has grown.

    Runs at most MAX_RUNS iterations. Each iteration reloads the fold from disk;
    if it holds more training images than at the last training run, a new
    experiment is run, otherwise the loop sleeps for 10 seconds.
    """
    last_trained_count = 0
    for _ in range(MAX_RUNS):
        fold = utils.files.load_json(get_labels_fpath(proj_name))
        current_count = get_img_count(fold, c.TRAIN)
        if current_count <= last_trained_count:
            print("No new trn images, sleeping")
            time.sleep(10)
            continue
        print("Found new trn images")
        # The returned experiment is not kept, so it can be released right away.
        run_experiment(proj_name)
        last_trained_count = current_count

def run_experiment(proj_name):
    """Train a new experiment on the project's current labels and save its outputs.

    Steps: load the fold, build train/val loaders, model, criterion, optimizer
    and LR adjuster, train, reload the best epoch's weights, then save
    predictions and scores.

    Returns:
        The trained experiment.
    """
    print("Starting Experiment")
    fold_fpath = get_labels_fpath(proj_name)
    fold = utils.files.load_json(fold_fpath)

    trn_loader = get_loader(fold, c.TRAIN, shuffle=True)
    val_loader = get_loader(fold, c.VAL, shuffle=False)

    model = get_model(fold)
    criterion = get_criterion()
    optimizer = get_optimizer(model)
    lr_adjuster = get_lr_adjuster()
    trainer = get_trainer(criterion, optimizer, lr_adjuster)

    config = make_config(proj_name, model, optimizer, criterion, lr_adjuster)
    exp = create_experiment(config)
    exp.train(trainer, trn_loader, val_loader)

    # Evaluate with the best epoch's weights rather than the final epoch's.
    exp_utils.load_weights_by_exp_and_epoch(exp.model, exp.name,
                                            exp.best_epoch)
    save_preds(exp, fold['name'])
    save_scores(exp, proj_name, val_loader)
    return exp

## Train

In [None]:
# Create (or overwrite) the labels file for the test project with every image
# under TRAIN_PATH marked as unlabeled.
PROJECT_NAME = 'test_project'
_ = init_dataset(PROJECT_NAME, TRAIN_PATH, c.JPG_EXT, label_names=DATA_CONFIG['label_names'])

In [3]:
# Start the polling/training loop; this blocks for up to MAX_RUNS iterations.
# NOTE(review): the recorded NameError shows this cell was run before the definitions cell.
run_project(PROJECT_NAME)

NameError: name 'run_project' is not defined

## Predict

In [None]:
# Load model from exp epoch
# NOTE(review): `exp` is not assigned in any earlier cell visible here (run_project
# does not return or keep one) -- presumably left over from an interactive session; confirm.
exp.load_model_state(epoch=49)
model = exp.model

In [None]:
# OR load custom model weights
# NOTE(review): RESUME_EXP_NAME and `model` are not defined in any cell visible here -- confirm their source.
exp_name = RESUME_EXP_NAME
w_path = os.path.join(cfg.PATHS['experiments'], exp_name, 'weights', 'weights-30.th')
models.utils.load_weights(model, w_path)

In [None]:
# Time inference over `tst_loader`.
# NOTE(review): the result is named `val_probs` although the loader is named for the
# test split, and `tst_loader` is not defined in any cell visible here -- confirm the intended split.
%time val_probs = predictions.get_probabilities(model, tst_loader)

In [None]:
# Accuracy at a fixed 0.5 threshold against `tst_targs` (not defined in any cell visible here).
metric_utils.get_accuracy(val_probs > 0.5, tst_targs)

In [None]:
# Sanity check: every id in the fold's 'val' section must map to a JPG path under
# TRAIN_PATH that appears both in `val_fpaths` and in the validation loader's dataset.
# NOTE(review): `label_fold`, `val_loader` and `val_fpaths` are not defined in any cell visible here.
ks = list(label_fold['val'].keys())
print(len(ks), len(val_loader.dataset.fpaths))
for k in ks:
    fpath = os.path.join(TRAIN_PATH, k+c.JPG_EXT)
    assert fpath in val_fpaths
    assert fpath in val_loader.dataset.fpaths

In [None]:
# Print each validation file name next to the tags decoded from its one-hot target.
# NOTE(review): `labels` and `val_targs` are not defined in any cell visible here.
for f,v in zip(val_fpaths, val_targs):
    print(os.path.basename(f),metadata.convert_one_hot_to_tags(v, labels))

In [None]:
# Plot a shuffled sample of predictions (fixed 0.5 threshold) for the files in `tst_fpaths`.
utils.imgs.plot_sample_preds(tst_fpaths, val_probs > 0.5, tst_targs, labels, shuffle=True)

In [None]:
# Save `val_probs` to the prediction file for basename 'my_exp' and the VAL split.
pred_fpath = predictions.get_prediction_fpath(basename='my_exp', dset=c.VAL)
_ = predictions.save_or_append_pred_to_file(pred_fpath, val_probs)

In [None]:
# Score the validation predictions at the configured decision threshold, then
# plot sample predictions at that same threshold so the figure matches the metrics.
thresh = TRAIN_CONFIG['threshold']
acc = metric_utils.get_accuracy(val_probs > thresh, val_targs)
f2 = metric_utils.get_metric_in_blocks(val_probs > thresh, val_targs, 
                                       1000, metric_utils.get_f2_score)
loss = metric_utils.get_cross_entropy_loss(val_probs, val_targs)
print("Acc",acc,"F2",f2,"BCE",loss)
utils.imgs.plot_sample_preds(val_fpaths, val_probs > thresh, val_targs, 
                             dogscats.LABEL_NAMES)

In [None]:
# Time inference over the test loader and keep the probabilities as `tst_probs`.
%time tst_probs = predictions.get_probabilities(model, tst_loader)

In [None]:
# Save the test probabilities, reload them from disk as a numpy array, and plot
# sample predictions (None is passed in the targets position).
# NOTE(review): `thresh` is only assigned in other cells -- this cell depends on one of them having run.
pred_fpath = predictions.get_prediction_fpath(basename='my_exp', dset=c.TEST)
_ = predictions.save_or_append_pred_to_file(pred_fpath, tst_probs)
tst_probs = predictions.load_pred(pred_fpath, numpy=True)
utils.imgs.plot_sample_preds(tst_fpaths, tst_probs > thresh, 
                             None, dogscats.LABEL_NAMES)

## Evaluate

In [None]:
# Review Experiment
# Opens a previously saved experiment by its hard-coded name and plots its history.
exp_name = 'BaselineSimpleResnet-Adam-ScheduledLR-img256-lr0.001-wd0.0005-bs64-fold4K-id84E8D'
exp = Experiment(exp_name, cfg.PATHS['experiments'])
exp.review(verbose=False)
exp.history.plot()

In [None]:
# Load Pred
# Reads saved probabilities from the predictions directory and thresholds them
# at TRAIN_CONFIG['threshold'].
fname = 'my_exp_val.bc'
thresh = TRAIN_CONFIG['threshold']
probs = predictions.load_pred(os.path.join(cfg.PATHS['predictions'], fname))
preds = predictions.get_predictions(probs, thresh)

In [None]:
# View preds, probs, and targets
# Builds the evaluation DataFrame used by the cells below and shows its first five rows.
eval_df = evaluate.get_evaluate_df(preds, probs, val_targs, 
                                   val_fpaths, dogscats.LABEL_NAMES)
eval_df[:5]

In [None]:
# View preds by label
# Selects rows for the 'dog' label twice -- once by target label, once by predicted
# label -- and shows the first five rows of the target-based selection.
LABEL = 'dog'
dog_preds_by_targ = evaluate.get_preds_by_target_label(
    eval_df, LABEL, condensed=False)
dog_preds_by_pred = evaluate.get_preds_by_predicted_label(
    eval_df, LABEL, condensed=False)
dog_preds_by_targ[:5]

In [None]:
# View preds by probability
# Four slices of eval_df for the 'dog' label, selected by a 0/1 value and a
# probability range.
# NOTE(review): the third argument is presumably the target value (1 = dog) -- confirm in `evaluate`.

# Confident dogs and right (TP)
confident_dogs_tp = evaluate.get_preds_by_target_and_prob(
    eval_df, 'dog', 1, p_min=0.9, p_max=1.0)
# Confident dogs and wrong (FP)
confident_dogs_fp = evaluate.get_preds_by_target_and_prob(
    eval_df, 'dog', 0, p_min=0.9, p_max=1.0)
# Unconfident dogs and right (TN)
unconfident_dogs_tn = evaluate.get_preds_by_target_and_prob(
    eval_df, 'dog', 0, p_min=0.0, p_max=0.1)
# Unconfident dogs and wrong (FN)
unconfident_dogs_fn = evaluate.get_preds_by_target_and_prob(
    eval_df, 'dog', 1, p_min=0.0, p_max=0.1)

# Annotation errors?
evaluate.plot_predictions(unconfident_dogs_fn, dogscats.LABEL_NAMES)
unconfident_dogs_fn

In [None]:
# View predictions and probabilities
# Plots from the full evaluation DataFrame rather than a filtered slice.
evaluate.plot_predictions(eval_df, dogscats.LABEL_NAMES)

In [None]:
# Confusion Matrix
# Plotted from eval_df for the labels in dogscats.LABEL_NAMES.
evaluate.plot_label_level_cms(eval_df, dogscats.LABEL_NAMES)

In [None]:
# Plot ROC Curve
# `probs` is converted to a numpy array before being passed with the validation targets.
evaluate.plot_roc_curve(np.array(probs), val_targs)

In [2]:
# Create an example project whose name and image directory are both 'example_data';
# the label names are passed as a set literal.
# NOTE(review): the recorded NameError shows this cell was run before the definitions cell.
create_project('example_data', 'example_data', {'thing'})

NameError: name 'create_project' is not defined