In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules are
# re-imported before each cell runs; edits to project code are then picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import torch
from collections import defaultdict
from sklearn.svm import SVC
from tqdm import tqdm
from scipy.optimize import brute, fmin

from src.utils.utils import MyLightningCLI, TrainerWandb, get_X_y_groups, PenalizedBinaryFBetaScore
from src.scripts.runs_info import get_runs_info

In [3]:
# Training length forwarded to the trainer through the CLI overrides below.
max_epochs = 200

# Command-line overrides for the patch-attention classifier, kept one flag per
# line and joined with single spaces into the string the CLI call later splits.
best_pac_params = ' '.join(
    [
        '--data.init_args.batch_size=32',
        '--data.init_args.sampler=weighted_upsampling',
        '--model.init_args.attention_dropout=0.1101479509435598',
        '--model.init_args.attention_hidden_dim=64',
        '--model.init_args.attention_num_heads=8',
        '--model.init_args.label_smoothing=0.19365209170598277',
        '--model.lr=0.0008549720251132047',
        f'--trainer.max_epochs={max_epochs}',
    ]
)

In [4]:
def optimize_weights(probas: np.ndarray, target: np.ndarray, step=0.01):
    """Maximize the penalized F1 score of a weighted average of `probas`.

    One weight is searched per column of `probas`. The first ``n_models - 1``
    weights are free parameters; the last one is ``1 - sum(free weights)`` so
    the weights always sum to one. A brute-force grid search with spacing
    `step` is followed by a Nelder-Mead (`fmin`) polish started from the best
    grid point.

    Weight vectors with a negative component are scored as ``inf`` so that
    neither the grid (which, for three or more models, contains points whose
    free weights sum to more than one) nor the unbounded polish step can
    return something that is not a convex combination of the models.

    Args:
        probas: 2-D array, one row per sample and one column per model.
        target: ground-truth labels, one per row of `probas`
            (passed unchanged to the scorer).
        step: grid spacing for every free weight over ``[0, 1)``.

    Returns:
        Tuple ``(max_score, argmax_score_w)``: the best score found and the
        free weights that achieve it (the last model's weight is implied).
        For a single-column `probas` the weights array is empty.
    """
    # These tensors do not depend on the candidate weights: build them once
    # instead of on every objective evaluation.
    probas_t = torch.tensor(probas)
    target_t = torch.tensor(target)

    def f(weights):
        # Append the implied last weight so the full vector sums to one.
        weights = np.append(weights, 1 - weights.sum())
        if (weights < 0).any():
            # Outside the simplex: not a weighted average, reject.
            return np.inf
        # A fresh scorer per evaluation, so no state carries over between calls.
        scorer = PenalizedBinaryFBetaScore('soft', beta=1.0)
        wa_proba = (probas_t * torch.tensor(weights)[None, :]).sum(1)
        scorer.update(wa_proba, target_t)
        # brute/fmin minimize, so return the negated score.
        return -scorer.compute().item()

    n_free = probas.shape[1] - 1
    if n_free < 1:
        # Single model: nothing to optimize, its weight is necessarily 1.
        no_free_weights = np.empty(0)
        return -f(no_free_weights), no_free_weights

    ranges = (slice(0, 1, step),) * n_free
    argmax_score_w, max_score, *_ = brute(f, ranges=ranges, finish=fmin, full_output=True)

    return -max_score, argmax_score_w

In [None]:
def oldest_checkpoint(filenames):
    """Return the checkpoint path with the largest training step.

    Despite its name, this picks the checkpoint written *last* in training
    (highest ``step``), not the earliest one; the name is kept for callers.

    File names are expected to contain ``step={step}``, e.g.
    ``epoch={epoch}-step={step}.ckpt``. Only the file name is inspected, so
    ``=`` characters in parent directories do not matter, and anything after
    the step digits (such as a ``-v1`` suffix) is ignored.

    Args:
        filenames: checkpoint paths to choose from.

    Returns:
        The path whose file name carries the largest step. Among paths with
        the same step, the one appearing last in `filenames` is returned.

    Raises:
        ValueError: if a file name has no ``step=<digits>`` component.
        IndexError: if `filenames` is empty.
    """
    import os
    import re

    def step_of(path):
        # Look only at the file name; directories may legitimately contain '='.
        match = re.search(r'step=(\d+)', os.path.basename(path))
        if match is None:
            raise ValueError(f'Cannot parse step from checkpoint path: {path!r}')
        return int(match.group(1))

    # sorted() is stable, so ties keep their input order and the last one wins.
    return sorted(filenames, key=step_of)[-1]

# Build a two-level lookup: outer (test) fold index -> inner fold index -> the
# checkpoint selected by `oldest_checkpoint` for that run.
runs_info = get_runs_info('./visiomel', './wandb')
fold_to_ckpt_path = defaultdict(dict)
for run_info in runs_info:
    outer_fold = run_info['fold_index_test']
    inner_fold = run_info['fold_index']
    selected_ckpt = oldest_checkpoint(run_info['checkpoint_paths'])
    fold_to_ckpt_path[outer_fold][inner_fold] = selected_ckpt

In [None]:
# Nested cross-validation: for every outer (test) fold, collect the predictions
# of the patch-attention model from each inner fold, fit one SVC, and search
# for the blend weights of the two models.
outer_cv_results = defaultdict(dict)
for outer_fold_index in tqdm(range(5)):
    data = defaultdict(dict)

    for inner_fold_index in tqdm(range(5)):
        # CLI overrides selecting the inner/outer fold pair and its checkpoint.
        cli_args = [
            '--config', '/workspace/visiomel-2023/run/configs/swin-patch-attention-classifier.yaml',
            *best_pac_params.split(),
            '--data.init_args.k', '5',
            '--data.init_args.fold_index', str(inner_fold_index),
            '--data.init_args.k_test', '5',
            '--data.init_args.fold_index_test', str(outer_fold_index),
            '--data.init_args.num_workers', '10',
            '--data.init_args.num_workers_saturated', '10',
            '--ckpt_path', fold_to_ckpt_path[outer_fold_index][inner_fold_index],
        ]
        cli = MyLightningCLI(
            trainer_class=TrainerWandb,
            save_config_kwargs={'config_filename': 'config_pl.yaml', 'overwrite': True},
            args=cli_args,
            run=False,
        )

        # Patch attention model: concatenate the per-batch predictions, apply
        # softmax over dim 1 and keep column 1 (presumably the positive class
        # -- confirm against the model's label order).
        batch_outputs = cli.trainer.predict(datamodule=cli.datamodule, return_predictions=True)
        scores = torch.concat(batch_outputs, dim=0).float()
        y_proba = torch.softmax(scores, dim=1).numpy()
        data['pac'][inner_fold_index] = y_proba[:, 1]

    # Single SVC model on embeddings on outer train data.
    # train + val & test are the same for each inner fold, so the datamodule of
    # the last cli built above is reused here.
    clf = SVC(kernel='linear', C=0.01, probability=True, class_weight='balanced', random_state=0)
    outer_datamodule = cli.datamodule
    train_val_data = pd.concat([outer_datamodule.train_dataset.data, outer_datamodule.val_dataset.data])
    X, y, _ = get_X_y_groups(train_val_data)
    X_test, y_test, _ = get_X_y_groups(outer_datamodule.test_dataset.data)
    clf.fit(X, y)
    y_proba = clf.predict_proba(X_test)
    data['svc'][-1] = y_proba[:, 1]

    # Ground truth
    data['gt'][-1] = y_test

    # Ensemble: average the inner-fold patch-attention predictions, then search
    # for the blend weights against the SVC.
    # NOTE(review): the weights are tuned on the outer test-fold labels, so
    # `max_score` is an optimistic figure rather than a held-out estimate.
    pac_mean = sum(data['pac'].values()) / len(data['pac'])
    df = pd.DataFrame({'pac': pac_mean, 'svc': data['svc'][-1], 'gt': data['gt'][-1]})
    max_score, argmax_score_w = optimize_weights(df[['pac', 'svc']].values, df['gt'].values)
    outer_cv_results[outer_fold_index].update(
        {
            'max_score': max_score,
            'argmax_score_w': argmax_score_w,
            'df': df,
            'data': data,
        }
    )

In [None]:
# Show the best ensemble score and its weight(s); only the first outer fold is
# reported here.
for outer_fold_index in range(1):
    fold_result = outer_cv_results[outer_fold_index]
    print(fold_result['max_score'], fold_result['argmax_score_w'])

0.42307692766189575 0.8888888888888888
