# Training and evaluating classifiers

NOTE: You cannot take advantage of multi-GPU parallel processing from within a Jupyter notebook, so tasks that require or benefit from multiple GPUs should be run from .py scripts instead.

## Option 1: Training with a Project

### Full code

You should have your Project set up already.

In [None]:
# 1. import libraries & set vars
import os

# The backend has to be selected BEFORE slideflow is imported: slideflow picks
# its backend-specific model code when the package is loaded, so setting
# SF_BACKEND afterwards is too late to take effect.
os.environ['SF_BACKEND'] = 'torch'

import slideflow as sf

# 2. initialize project
project_root = '/home/pearsonlab/DATA/PROJECTS/TEST_PROJECT'
P = sf.Project(root=project_root)

# 3. set hyperparameters
hp = sf.ModelParams(
    tile_px=299,
    tile_um=302,
    epochs=[5],  # will save a model and results after each epoch in the list (i.e. [1,3,5])
    toplayer_epochs=0,
    model='xception',
    pooling='avg',
    # Loss names are backend-specific. 'CrossEntropy' is the torch-backend name;
    # the Keras name 'sparse_categorical_crossentropy' belongs to the
    # TensorFlow backend.
    loss='CrossEntropy',
    learning_rate=0.0001,
    learning_rate_decay=0.98,
    learning_rate_decay_steps=512,
    batch_size=64,  # typically you want as large of a batch size as you can fit in memory
    hidden_layers=1,
    hidden_layer_width=500,
    optimizer='Adam',
    early_stop=True,  # prevents overfitting, highly recommended
    early_stop_patience=0,  # you can set what epoch you want to allow for early stopping
    early_stop_method='accuracy',
    manual_early_stop_epochs=None,
    manual_early_stop_batch=None,
    training_balance='category',
    validation_balance='none',
    trainable_layers=0,
    l1=0,
    l2=0,
    l1_dense=None,
    l2_dense=None,
    dropout=0.2,
    uq=False,  # uncertainty quantification, adds much more time to training
    # random augmentations: x=horizontal flip, y=vertical flip, r=rotate,
    # j=jpeg compression, b=Gaussian blur, n=Stain Normalizer
    augment='xyrjb',
    normalizer=None,  # 'reinhard_fast' is usually best
    normalizer_source=None,
    include_top=False,
    drop_images=False,
)

# 4. train model, will save in project models/ folder
train_filters = {"dataset": ["train"], "exclude": ["no"]}
results = P.train(
    outcomes="tumor_type",
    params=hp,
    filters=train_filters,
    # exp_label becomes part of the name of the resulting trained model folder.
    # Saved model name will be format "00001-exp_label-outcome-epoch1"
    exp_label='test_tutorial',
    pretrain='imagenet',
    save_predictions=True,
    # if doing k-fold cross validation
    val_strategy='k-fold',
    val_k_fold=3,
    # validate_on_batch determines how often a validation step occurs. You can
    # mess with this to get better early stopping results, depends on size of
    # dataset
    validate_on_batch=100,
)

# 5. evaluate model, will save in project eval/ folder
test_filters = {"dataset": ["test"]}
P.evaluate(model="/path/to/trained_model_epoch1", outcomes="tumor_type", filters=test_filters)

Alternatively, you can provide a dataset to `P.train()` instead of using filters on the Project.

In [None]:
dataset = P.dataset(tile_px=299, tile_um=302)
dataset = dataset.filter({"exclude": ["no"]})

# Pick ONE of the two options below; as written, Option 2 runs last and
# therefore overrides the datasets produced by Option 1.

# Option 1: use filters
train_dataset = dataset.filter({"dataset": ["train"]})
val_dataset = dataset.filter({"dataset": ["val"]})
test_dataset = dataset.filter({"dataset": ["test"]})

# Option 2: use split
# Dataset.split() takes a validation fraction and returns a (training,
# validation) pair, so a three-way split is done in two stages: hold out the
# test set first, then split the remainder into train / val.
trainval_dataset, test_dataset = dataset.split(
    model_type='categorical',
    labels='tumor_type',
    val_strategy='fixed',
    val_fraction=0.1,  # 10% of all slides held out for testing
)
train_dataset, val_dataset = trainval_dataset.split(
    model_type='categorical',
    labels='tumor_type',
    val_strategy='fixed',
    val_fraction=1 / 9,  # 1/9 of the remaining 90% = 10% of all slides
)

# Train on the prepared training dataset, validating on the validation dataset
results = P.train(outcomes="tumor_type", params=hp, dataset=train_dataset, val_dataset=val_dataset)

# Evaluate the trained model on the held-out test dataset
P.evaluate(
    model="/path/to/trained_model_epoch1",
    outcomes="tumor_type",
    dataset=test_dataset,
)

### Walkthrough

## Option 2: Training with a Trainer

There are still bugs with this, but we are working on it.

In [None]:
# Build a dataset
import os
from pprint import pprint

# Select the backend BEFORE importing slideflow: slideflow picks its
# backend-specific code when the package is loaded, so setting SF_BACKEND
# after the import is too late to take effect.
os.environ['SF_BACKEND'] = 'torch'

import slideflow as sf

tiles_savedir = '/home/pearsonlab/PROJECTS/CHRONIC_ENDOMETRITIS/data/cells/extracted_cells_tiles/'
tfr_savedir = '/home/pearsonlab/PROJECTS/CHRONIC_ENDOMETRITIS/data/cells/extracted_cells_tfr/ALL/'
tile_px = 96
tile_um = '40x'
tile_file_format = 'jpg'

# Create a dataset
dataset = sf.Dataset(
    config='/home/pearsonlab/PROJECTS/CHRONIC_ENDOMETRITIS/data/cells/cell_classifier/datasets.json',
    sources=['UCH_ENDOMETRITIS_EVAL_YOLO', 'UCH_ENDOMETRITIS_NEG_YOLO', 'UCH_ENDOMETRITIS_LOW_YOLO'],
    annotations='/home/pearsonlab/PROJECTS/CHRONIC_ENDOMETRITIS/bennett_plasmacells_anns.csv',
    # filters={'group': 'eval'},  # filters can be given here, or applied afterwards with dataset.filter(...)
    tile_px=tile_px,
    tile_um=tile_um,
)
dataset.summary()

In [None]:
# Get labels
labels, unique_labels = dataset.labels('cell_class')

# Prep dataset splits.
# NOTE: the splits used to be made randomly, e.g.
#   (train_ds, val_ds) = dataset.filter({'group': 'train'}).split(
#       model_type='categorical', labels='cell_class',
#       val_strategy='fixed', val_fraction=0.3)
# but that would split the patients up and could leave the validation set with
# no negatives. That call is no longer executed (its results were immediately
# overwritten); train / val are selected explicitly by patient instead.
train_ds = dataset.filter({'patient': ['NPC_Control_1', 'PC_Control_1', 'NPC_Control_2', 'PC_Control_2']})
val_ds = dataset.filter({'patient': ['NPC_Control_3', 'PC_Control_3', 'NPC_Control_4', 'PC_Control_4']})
test_ds = dataset.filter({'group': 'test'})
eval_ds = dataset.filter({'slide': ['LowP1', 'LowP2', 'LowP3', 'LowP4']})

In [None]:
from os.path import basename

models_dir = '/home/pearsonlab/PROJECTS/CHRONIC_ENDOMETRITIS/data/cells/cell_classifier/models'
eval_dir = '/home/pearsonlab/PROJECTS/CHRONIC_ENDOMETRITIS/data/cells/cell_classifier/eval'
model_name = "test_5e_bs64_vob40_uq"
exp_label = "Lows"
outdir = f'{eval_dir}/eval-{model_name}-{exp_label}'

# Derive the model locations from models_dir / model_name so that changing
# either one cannot leave a stale hard-coded path behind.
model_dir = os.path.join(models_dir, model_name)                # folder holding all saved epochs
model_path = os.path.join(model_dir, f'{model_name}_epoch4')    # the epoch being evaluated

# read in params.json and edit to add in the info we need
config = sf.util.get_model_config(model_path)
config.update({
    'outcomes': ['cell_class'],
    'outcome_labels': {"0": "NPC", "1": "PC"},
    'input_features': None,
    'input_feature_sizes': None,
    'input_feature_labels': None,
    'model_type': 'categorical',
})

# NOTE: this overwrites the existing params.json files in place.
sf.util.write_json(config, os.path.join(model_path, 'params.json'))
# also save in the higher directory
sf.util.write_json(config, os.path.join(model_dir, 'params.json'))

# Build a Trainer from the saved model configuration, then load the model
config = sf.util.get_model_config(model_path)
hp = sf.ModelParams.from_dict(config['hp'])
run_name = f"eval-{model_name}-{exp_label}"
trainer = sf.model.Trainer(
    hp=hp,
    outdir=outdir,
    labels=labels,
    outcome_names=['cell_class'],
    name=run_name,
    config=config,
    load_method='weights',
)
trainer.load(model_path)

# Evaluate on the evaluation dataset, saving predictions as CSV with UQ enabled
results_dict = trainer.evaluate(
    eval_ds,
    batch_size=64,
    save_predictions='csv',
    uq=True,
)