# Training and evaluating classifiers

NOTE: You cannot take advantage of multi-GPU parallel processing from within a Jupyter notebook, so tasks that require or benefit from multiple GPUs should be run from .py scripts instead.

### Import libraries

Always the first step.

In [4]:
# Set environment variables with os package.
# These are set before slideflow is imported further down in this cell.
import os
os.environ['SF_BACKEND'] = 'torch' # Alternative is 'tensorflow'
os.environ['SF_SLIDE_BACKEND'] = 'cucim' # Alternative is 'libvips'
os.environ['CUDA_VISIBLE_DEVICES'] = '0' # Set which GPU(s) to use

# Check if GPU is available
if os.environ['SF_BACKEND']=='torch':
    import torch
    print('GPU available: ', torch.cuda.is_available())
    print('GPU count: ', torch.cuda.device_count())
    # current_device() / get_device_name() raise when no CUDA device is usable,
    # so only query them once availability has been confirmed. This keeps the
    # check cell from crashing on a CPU-only node.
    if torch.cuda.is_available():
        print('GPU current: ', torch.cuda.current_device())
        print('GPU name: ', torch.cuda.get_device_name(torch.cuda.current_device()))
    else:
        print('No CUDA device is visible to PyTorch.')
elif os.environ['SF_BACKEND']=='tensorflow':
    import tensorflow as tf
    print("GPU: ", len(tf.config.list_physical_devices('GPU')))

# import slideflow
import slideflow as sf
from slideflow import simclr
from slideflow.slide import qc

# Set verbose logging
import logging
logging.getLogger('slideflow').setLevel(logging.INFO)
# NOTE(review): TF_CPP_MIN_LOG_LEVEL is normally read when TensorFlow is first
# imported; in the 'tensorflow' branch above TensorFlow is already loaded by
# this point, so confirm this setting has the intended effect there.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '10'
# Send stderr output to the original stdout stream.
import sys
sys.stderr = sys.__stdout__

# Check if slideflow was properly installed
sf.about()

In [None]:
# Quick and easy project setup on Randi

# # Set root paths
# username = "skochanny" # change me
# root_path = f'/scratch/{username}/PROJECTS'
# labshare_path = '/gpfs/data/pearson-lab/'
# project_name = "TEST_PROJECT"
# relative_annotation_path = 'DL_OTHER/TEST_PROJECTS/lung-adeno-v-squam/annotations.csv' # do not have leading / i.e. "/DL_OTHER..." it messes up os.path.join
# relative_slide_path = 'DL_OTHER/TEST_PROJECTS/lung-adeno-v-squam/slides'
# relative_roi_path = 'DL_OTHER/TEST_PROJECTS/lung-adeno-v-squam/roi'

# # Create a new project, if one does not already exist
# project_root_path = os.path.join(root_path, project_name)
# project = sf.create_project(
#         root = project_root_path,
#         annotations = os.path.join(labshare_path, relative_annotation_path),
#         name = 'LUADvsLUSC', # if you already have a created datasets.json, you can put the source name here
#         slides = os.path.join(labshare_path, relative_slide_path),
#         # rois = os.path.join(labshare_path, relative_roi_path),
#         tiles = os.path.join(project_root_path, "tiles"),
#         tfrecords = os.path.join(project_root_path, "tfrecords")
#     )

# # Notes:
# # - There is an argument ```rois```, which is broken: it wants the ROIs to be a tar.gz file instead of a directory, so you need to manually edit the datasets.json file afterwards.
# # - Last time I did this, the ```name``` arg for the project name didn't work and I had to manually edit that as well.
# # - I used ```os.path.join()``` above but you can also use f-strings (e.g. ```f"{root_path}/{project_name}"```) to format strings.

<a id='import'></a>
### Getting Started with a Slideflow Project

We are starting this tutorial under the assumption that you have already initialized a slideflow project. Once the project has been created and you have specified the paths to datasets, annotation files, etc., we begin by initializing a Slideflow Project object.

In [None]:
# Locations used by the rest of the tutorial.
# `username` selects the per-user scratch area; change it to your own.
username = "skochanny"
project_name = "TEST_PROJECT"
labshare_path = '/gpfs/data/pearson-lab/'

# Per-user folder that holds all projects, and this project's folder inside it.
root_path = '/scratch/' + username + '/PROJECTS'
project_root_path = "/".join((root_path, project_name))

Make the Project class object. 

In [None]:
# Open the project stored at project_root_path.
# Double-check that this path points at the right project folder before running.
P = sf.Project(root=project_root_path)

## Option 1: Training with a Project

### Full code

You should have your Project set up already.

In [None]:
# 1. import libraries & set vars
import os
# Select the deep learning backend BEFORE importing slideflow, in the same
# order as the setup cell at the top of this notebook, so that a standalone
# script picks up the intended backend.
os.environ['SF_BACKEND'] = 'torch'
import slideflow as sf

# 2. initialize project
project_root = '/home/pearsonlab/DATA/PROJECTS/TEST_PROJECT'
P = sf.Project(root=project_root)

# 3. set hyperparameters
hp = sf.ModelParams(
    tile_px=299,
    tile_um=302,
    epochs=[5], # will save a model and results after each epoch in the list (e.g. [1,3,5])
    toplayer_epochs=0,
    model='xception',
    pooling='avg',
    # NOTE(review): this is the Keras-style loss name; confirm it is accepted
    # when SF_BACKEND is 'torch'.
    loss='sparse_categorical_crossentropy',
    learning_rate=0.0001,
    learning_rate_decay=0.98,
    learning_rate_decay_steps=512,
    batch_size=64, # typically you want as large of a batch size as you can fit in memory
    hidden_layers=1,
    hidden_layer_width=500,
    optimizer='Adam',
    early_stop=True, # prevents overfitting, highly recommended
    early_stop_patience=0, # you can set what epoch you want to allow for early stopping
    early_stop_method='accuracy',
    manual_early_stop_epochs=None,
    manual_early_stop_batch=None,
    training_balance='category',
    validation_balance='none',
    trainable_layers=0,
    l1=0,
    l2=0,
    l1_dense=None,
    l2_dense=None,
    dropout=0.2,
    uq=False, # uncertainty quantification, adds much more time to training
    augment='xyrjb', # all random augmentations: x=horizontal flip, y=vertical flip, r=rotate, j=jpeg compression, b=Gaussian blur, n=Stain Normalizer
    normalizer=None, # 'reinhard_fast' is usually best
    normalizer_source=None,
    include_top=False,
    drop_images=False)

# 4. train model, will save in project models/ folder
results = P.train(
    outcomes="tumor_type",
    exp_label='test_tutorial', # this will become part of the name of the resulting trained model folder. Saved model name will be format "00001-exp_label-outcome-epoch1"
    filters={"dataset": ["train"], "exclude": ["no"]},
    pretrain='imagenet',
    save_predictions=True,
    params=hp,
    # if doing k-fold cross validation
    val_strategy='k-fold',
    val_k_fold=3,
    validate_on_batch=100 # this determines how often a validation step occurs. You can mess with this to get better early stopping results, depends on size of dataset
    )

# 5. evaluate model, will save in project eval/ folder
# Replace the placeholder model path with one of the models saved in step 4.
# NOTE(review): training also filters on "exclude"; add that key here if the
# same slides should be left out of the evaluation.
P.evaluate(
    model="/path/to/trained_model_epoch1",
    outcomes="tumor_type",
    filters={"dataset": ["test"]}
)

If you have issues downloading the Xception model during model initialization, you can add the following lines to the beginning of the script to skip SSL certificate verification when downloading the model:

In [None]:
import ssl
# SECURITY: this replaces the default HTTPS context factory with the
# unverified one for the rest of the process. Only use it as a temporary
# workaround on a trusted network, and remove it once the download succeeds.
ssl._create_default_https_context = ssl._create_unverified_context

Alternatively, you can provide a dataset to ```P.train()``` instead of using filters on the Project.

In [None]:
# Build the project dataset at the tile size used for training and drop
# anything whose "exclude" annotation is not "no".
dataset = P.dataset(tile_px=299, tile_um=302)
dataset = dataset.filter({"exclude": ["no"]})

# Option 1: use filters (relies on a "dataset" column in the annotations)
train_dataset = dataset.filter({"dataset": ["train"]})
val_dataset = dataset.filter({"dataset": ["val"]})
test_dataset = dataset.filter({"dataset": ["test"]})

# Option 2: use split
# NOTE: this rebinds the three names assigned in Option 1; keep only the
# option you want to use.
# split() produces a (training, validation) pair, so an 80/10/10 split takes
# two calls: first hold out the test set, then carve validation out of the rest.
trainval_dataset, test_dataset = dataset.split(
    model_type='categorical',
    labels='tumor_type',
    val_strategy='fixed',
    val_fraction=0.1,  # 10% of the data held out for testing
)
train_dataset, val_dataset = trainval_dataset.split(
    model_type='categorical',
    labels='tumor_type',
    val_strategy='fixed',
    val_fraction=1/9,  # 1/9 of the remaining 90% = 10% of the total
)

# Train
results = P.train(
    outcomes="tumor_type",
    params=hp,
    dataset=train_dataset,
    val_dataset=val_dataset,
)

# Evaluate
# Replace the placeholder model path with the model saved by the training step.
P.evaluate(
  model="/path/to/trained_model_epoch1",
  outcomes="tumor_type",
  dataset=test_dataset
)

## Option 2: Training with a Trainer

There are still bugs with this, but we are working on it. This option is also more complicated and not recommended for beginners, but if you need to do more advanced things, you can use it.

In [None]:
# Build a dataset
import os
from pprint import pprint
# Select the deep learning backend BEFORE importing slideflow, in the same
# order as the setup cell at the top of this notebook.
os.environ['SF_BACKEND'] = 'torch'
import slideflow as sf

tiles_savedir = '/home/pearsonlab/PROJECTS/CHRONIC_ENDOMETRITIS/data/cells/extracted_cells_tiles/'
tfr_savedir = '/home/pearsonlab/PROJECTS/CHRONIC_ENDOMETRITIS/data/cells/extracted_cells_tfr/ALL/'
tile_px = 96
tile_um = '40x'
tile_file_format = 'jpg'

# Create a dataset directly from a datasets.json config, a list of sources
# and an annotations file.
dataset = sf.Dataset(
    config='/home/pearsonlab/PROJECTS/CHRONIC_ENDOMETRITIS/data/cells/cell_classifier/datasets.json',
    sources=['UCH_ENDOMETRITIS_EVAL_YOLO', 'UCH_ENDOMETRITIS_NEG_YOLO', 'UCH_ENDOMETRITIS_LOW_YOLO'],
    annotations='/home/pearsonlab/PROJECTS/CHRONIC_ENDOMETRITIS/bennett_plasmacells_anns.csv',
    # filters={'group': 'eval'}, # You can provide filters to the dataset here, or you can call dataset.filter() afterwards
    tile_px=tile_px,
    tile_um=tile_um,
)
dataset.summary()

In [None]:
# Get labels for the 'cell_class' outcome
labels, unique_labels = dataset.labels('cell_class')

# prep dataset splits
# NOTE: I was previously splitting randomly, i.e.
#   dataset.filter({'group': 'train'}).split(model_type='categorical', labels='cell_class', val_strategy='fixed', val_fraction=0.3)
# but that would split the patients up and could leave the validation set with
# no negatives, which was a problem. That call was removed because its result
# was immediately overwritten; training and validation sets are pinned to
# explicit patients instead.
train_ds = dataset.filter({'patient': ['NPC_Control_1', 'PC_Control_1', 'NPC_Control_2', 'PC_Control_2']})
val_ds = dataset.filter({'patient': ['NPC_Control_3', 'PC_Control_3', 'NPC_Control_4', 'PC_Control_4']})
test_ds = dataset.filter({'group': 'test'})
eval_ds = dataset.filter({'slide': ['LowP1', 'LowP2', 'LowP3', 'LowP4']})

In [None]:
from os.path import basename
models_dir = '/home/pearsonlab/PROJECTS/CHRONIC_ENDOMETRITIS/data/cells/cell_classifier/models'
eval_dir = '/home/pearsonlab/PROJECTS/CHRONIC_ENDOMETRITIS/data/cells/cell_classifier/eval'
model_name = "test_5e_bs64_vob40_uq"
exp_label = "Lows"

# Derive every path from models_dir / eval_dir / model_name so that changing
# model_name cannot leave the output folder and the evaluated model pointing
# at different experiments.
eval_name = f"eval-{model_name}-{exp_label}"
outdir = f'{eval_dir}/{eval_name}'
model_dir = os.path.join(models_dir, model_name)
model_path = os.path.join(model_dir, f'{model_name}_epoch4')

# read in params.json and edit to add in the info we need
config = sf.util.get_model_config(model_path)
config['outcomes'] = ['cell_class']
config['outcome_labels'] = {"0": "NPC", "1": "PC"}
config["input_features"] = None
config["input_feature_sizes"] = None
config["input_feature_labels"] = None
config["model_type"] = "categorical"
sf.util.write_json(config, os.path.join(model_path, 'params.json'))
# also save in the higher directory
sf.util.write_json(config, os.path.join(model_dir, 'params.json'))

# Build trainer instead & then load model
# Re-read the config that was just written and rebuild the hyperparameters from it.
config = sf.util.get_model_config(model_path)
hp = sf.ModelParams.from_dict(config['hp'])
trainer = sf.model.Trainer(hp=hp,
                 outdir=outdir,
                 labels=labels,
                 outcome_names=['cell_class'],
                 name=eval_name,
                 config=config,
                 load_method='weights',
                 )
trainer.load(model_path)

# Evaluate on the held-out evaluation slides; predictions are saved as CSV
# and uncertainty quantification is enabled.
results_dict = trainer.evaluate(eval_ds,
                                batch_size=64,
                                save_predictions='csv',
                                uq=True,
                                )