In [8]:
#!/usr/bin/env python3
"""Utilities for cross-validation.
Note: TempDataset/folds-10.pkl stores the splits used for 10-fold cross-validation. Keep it to replicate our results."""

import numpy as np
import glob
from os.path import basename, join
from sklearn.model_selection import StratifiedKFold
import pickle


def _load_split(in_dir, file_names, file_labels):
    """Loads the feature files of one split and expands labels to one per feature row."""
    features = []
    labels = []
    for name, label in zip(file_names, file_labels):
        x = np.load(join(in_dir, name))
        features.append(x)
        # every row stored in the file shares the label of that file
        labels.append([label] * len(x))
    return np.vstack(features), np.concatenate(labels)


def load_data(in_dir, folds=None, split=None):
    """Builds train/test data from preprocessed features for a given split

    # Arguments
        in_dir: Input directory containing *.npy CNN feature files.
        folds: None or list of splits dict{
                                             "train": {
                                                         "x": train files list,
                                                         "y": train labels},
                                             "test": {
                                                         "x": test files list,
                                                         "y": test labels}}
                                        }
        split: None or split number (index into `folds`).
    # Returns
        Train/test data (features and labels) for a given split, if `folds` is given:
            (x_train, y_train, x_test, y_test), where labels are repeated once
            per feature row of the file they belong to.
        Test data (only features) and file names, if `folds` is None (or empty).
            Files are taken in sorted name order, so the result is reproducible.
    """
    if folds:
        train = folds[split]["train"]
        test = folds[split]["test"]
        x_train, y_train = _load_split(in_dir, train["x"], train["y"])
        x_test, y_test = _load_split(in_dir, test["x"], test["y"])
        return x_train, y_train, x_test, y_test

    # glob returns matches in arbitrary order; sort them so features and
    # file names come back in the same order on every run and platform.
    files = sorted(glob.glob(join(in_dir, "*.npy")))
    x = [np.load(f) for f in files]
    return np.vstack(x), np.array([basename(f) for f in files])


def make_folds(features_dir=join("TempDataset", "Features"),
               out_file="TempDataset/folds-10.pkl",
               n_splits=10,
               random_state=None):
    """Creates stratified splits based on the feature directory listing

    The class of a file is taken from its name prefix: "normal" -> 0, "cancer" -> 1.

    # Arguments
        features_dir: Directory containing the *.npy feature files.
        out_file: Path of the pickle file the folds are written to.
        n_splits: Number of stratified folds.
        random_state: Seed for the shuffling done by StratifiedKFold. With None
            (default) every call produces a different split; pass an int to get
            reproducible folds.
    # Dumps
        folds: list of splits dict{
                                   "train": {
                                               "x": train files list,
                                               "y": train labels},
                                   "test": {
                                               "x": test files list,
                                               "y": test labels}}
                                }
    # Raises
        ValueError: if no feature files are found, or if a file name starts with
            neither of the known class prefixes.
    """
    # position in this tuple is the class label
    class_prefixes = ("normal", "cancer")

    # glob returns matches in arbitrary order; sort so that a given
    # random_state always yields the same folds.
    files = np.array(sorted(basename(f) for f in glob.glob(join(features_dir, "*.npy"))))
    if len(files) == 0:
        raise ValueError("No *.npy feature files found in {!r}".format(features_dir))

    labels = []
    for name in files:
        for label, prefix in enumerate(class_prefixes):
            if name.startswith(prefix):
                labels.append(label)
                break
        else:
            # An unknown prefix used to fall through to class 0 silently,
            # which mislabels the file; fail loudly instead.
            raise ValueError(
                "Cannot derive a class label from file name {!r}: expected one of the "
                "prefixes {}".format(str(name), class_prefixes))
    labels = np.array(labels)

    folds = []
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for train_index, test_index in skf.split(files, labels):
        f_train, f_test = files[train_index], files[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        folds.append({"train": {"x": f_train, "y": y_train}, "test": {"x": f_test, "y": y_test}})

    with open(out_file, "wb") as f:
        pickle.dump(folds, f)


In [9]:
import os

# Directory whose files get the class prefix, and the prefix itself.
# Built with os.path.join so the path has no backslash escapes and also
# works on non-Windows systems.
CLASS1_FEATURES_DIR = os.path.join("TempDataset", "Class1", "Features")
CLASS1_PREFIX = "cancer_"

# Snapshot of the *.npy file names taken before renaming.
files = np.array([basename(f) for f in glob.glob(os.path.join(CLASS1_FEATURES_DIR, "*.npy"))])
for filename in os.listdir(CLASS1_FEATURES_DIR):
    # Skip files that already carry the prefix so that re-running this cell
    # does not produce names like "cancer_cancer_<name>".
    if filename.startswith(CLASS1_PREFIX):
        continue
    src = os.path.join(CLASS1_FEATURES_DIR, filename)
    dst = os.path.join(CLASS1_FEATURES_DIR, CLASS1_PREFIX + filename)
    os.rename(src, dst)


In [19]:
# Uncomment to regenerate the folds file before loading it below:
#make_folds()

# --- Output locations -------------------------------------------------------
LGBM_MODELS_ROOT = "LGBMs"                 # pickled LightGBM models are written below this directory
CROSSVAL_PREDICTIONS_ROOT = "predictions"  # per-fold prediction pickles are written below this directory

# --- Experiment grid iterated by the training cell --------------------------
NN_MODELS = ["ResNet"]
SCALES = [0.5]
CROP_SIZES = [224]

# --- Training / evaluation settings -----------------------------------------
# Number of feature rows per image assumed when scores are reshaped and averaged.
AUGMENTATIONS_PER_IMAGE = 50
# Passed to LightGBM as "num_class".
# NOTE(review): make_folds only assigns labels 0 and 1 - confirm that 4 is intended.
NUM_CLASSES = 4
# Base value from which the per-run LightGBM seeds are derived.
RANDOM_STATE = 1
# Number of seeds the cross-validation is repeated with.
N_SEEDS = 5
# Forwarded to lgb.train as verbose_eval.
VERBOSE_EVAL = False

# --- Cross-validation splits ------------------------------------------------
# NOTE: pickle.load can execute arbitrary code; only load a folds file you trust.
with open("TempDataset/folds-10.pkl", "rb") as f:
    FOLDS = pickle.load(f)


def _mean(x, mode="arithmetic"):
    """
    Calculates mean probabilities across augmented data
    # Arguments
        x: Numpy 3D array of probability scores, (N, AUGMENTATIONS_PER_IMAGE, NUM_CLASSES)
        mode: type of averaging, can be "arithmetic" or "geometric"
    # Returns
        Mean probabilities 2D array (N, NUM_CLASSES)
    """
    assert mode in ["arithmetic", "geometric"]
    if mode == "arithmetic":
        x_mean = x.mean(axis=1)
    else:
        x_mean = np.exp(np.log(x + 1e-7).mean(axis=1))
        x_mean = x_mean / x_mean.sum(axis=1, keepdims=True)
    return x_mean


In [23]:
    import pickle
    import numpy as np
    import lightgbm as lgb
    from sklearn.metrics import accuracy_score
    from os.path import join, exists
    from os import makedirs
    import argparse

    
    # Cross-validated LightGBM training: for every seed and every fold a model
    # is trained on the fold's train features, pickled, scored on the fold's
    # test features, and the per-image accuracy is reported.
    learning_rate = 0.1
    num_round = 70  # number of boosting rounds passed to lgb.train
    param = {
        "objective": "multiclass",
        "num_class": NUM_CLASSES,
        "metric": ["multi_logloss", "multi_error"],
        "verbose": 0,
        "learning_rate": learning_rate,
        "num_leaves": 191,
        "feature_fraction": 0.46,
        "bagging_fraction": 0.69,
        # NOTE(review): per the LightGBM docs bagging_freq=0 disables bagging,
        # so bagging_fraction / bagging_seed have no effect - confirm intended.
        "bagging_freq": 0,
        "max_depth": 7,
    }
    # Built with join() rather than a backslash literal: no invalid "\F" escape
    # sequence, and the path also resolves on non-Windows systems.
    PREPROCESSED_ROOT = join("TempDataset", "Features")
    for SCALE in SCALES:
        print("SCALE:", SCALE)
        for NN_MODEL in NN_MODELS:
            print("NN_MODEL:", NN_MODEL)
            for CROP_SZ in CROP_SIZES:
                print("PATCH_SZ:", CROP_SZ)
                # The same feature directory is used for every grid combination;
                # SCALE / NN_MODEL / CROP_SZ only affect the output file names.
                INPUT_DIR = PREPROCESSED_ROOT
                acc_all_seeds = []
                for seed in range(N_SEEDS):
                    accuracies = []
                    for fold in range(len(FOLDS)):
                        # distinct, deterministic LightGBM seeds per (seed, fold) pair
                        feature_fraction_seed = RANDOM_STATE + seed * 10 + fold
                        bagging_seed = feature_fraction_seed + 1
                        param.update({"feature_fraction_seed": feature_fraction_seed, "bagging_seed": bagging_seed})

                        print("Fold {}/{}, seed {}".format(fold + 1, len(FOLDS), seed))
                        x_train, y_train, x_test, y_test = load_data(INPUT_DIR, FOLDS, fold)
                        train_data = lgb.Dataset(x_train, label=y_train)
                        test_data = lgb.Dataset(x_test, label=y_test)
                        # NOTE(review): verbose_eval was removed from lgb.train in
                        # LightGBM 4.0; on newer versions use
                        # callbacks=[lgb.log_evaluation(...)] instead.
                        gbm = lgb.train(param, train_data, num_round, valid_sets=[test_data], verbose_eval=VERBOSE_EVAL)

                        # pickle model
                        model_file = "lgbm-{}-{}-{}-f{}-s{}.pkl".format(NN_MODEL, SCALE, CROP_SZ, fold, seed)
                        model_root = join(LGBM_MODELS_ROOT, NN_MODEL)
                        # exist_ok avoids the check-then-create race of exists() + makedirs()
                        makedirs(model_root, exist_ok=True)
                        with open(join(model_root, model_file), "wb") as f:
                            pickle.dump(gbm, f)

                        scores = gbm.predict(x_test)
                        # Group the rows per image; this relies on every test file
                        # contributing exactly AUGMENTATIONS_PER_IMAGE feature rows.
                        scores = scores.reshape(-1, AUGMENTATIONS_PER_IMAGE, NUM_CLASSES)
                        preds = {
                            "files": FOLDS[fold]["test"]["x"],
                            "y_true": y_test,
                            "scores": scores,
                        }
                        preds_file = "lgbm_preds-{}-{}-{}-f{}-s{}.pkl".format(NN_MODEL, SCALE, CROP_SZ,
                                                                              fold, seed)
                        preds_root = join(CROSSVAL_PREDICTIONS_ROOT, NN_MODEL)
                        makedirs(preds_root, exist_ok=True)
                        with open(join(preds_root, preds_file), "wb") as f:
                            pickle.dump(preds, f)

                        mean_scores = _mean(scores, mode="arithmetic")
                        y_pred = np.argmax(mean_scores, axis=1)
                        # y_test holds one label per feature row; keep one per image
                        y_true = y_test[::AUGMENTATIONS_PER_IMAGE]
                        acc = accuracy_score(y_true, y_pred)
                        print("Accuracy:", acc)
                        accuracies.append(acc)

                    acc_seed = np.array(accuracies).mean()  # acc of a seed
                    acc_all_seeds.append(acc_seed)
                    print("{}-{}-{} Accuracies: [{}], mean {:5.3}".format(NN_MODEL, SCALE, CROP_SZ,
                                                                          ", ".join("{:5.3}".format(a) for a in accuracies),
                                                                          acc_seed))
                print("Accuracy of all seeds {:5.3}".format(np.array(acc_all_seeds).mean()))


SCALE: 0.5
NN_MODEL: ResNet
PATCH_SZ: 224
Fold 1/10, seed 0
Accuracy: 0.5714285714285714
Fold 2/10, seed 0
Accuracy: 0.5714285714285714
Fold 3/10, seed 0
Accuracy: 1.0
Fold 4/10, seed 0
Accuracy: 1.0
Fold 5/10, seed 0
Accuracy: 0.8333333333333334
Fold 6/10, seed 0
Accuracy: 0.8333333333333334
Fold 7/10, seed 0
Accuracy: 0.8333333333333334
Fold 8/10, seed 0
Accuracy: 0.6666666666666666
Fold 9/10, seed 0
Accuracy: 1.0
Fold 10/10, seed 0
Accuracy: 1.0
ResNet-0.5-224 Accuracies: [0.571, 0.571,   1.0,   1.0, 0.833, 0.833, 0.833, 0.667,   1.0,   1.0], mean 0.831
Fold 1/10, seed 1
Accuracy: 0.5714285714285714
Fold 2/10, seed 1
Accuracy: 0.5714285714285714
Fold 3/10, seed 1


KeyboardInterrupt: 