In [1]:
import numpy as np, pandas as pd
import train as t
from src import helper_models as hm, helper_data as hd, helper_tts as htts
from importlib import reload
import tensorflow.keras as keras
import glob




Using TensorFlow backend.


In [2]:
# Reload the custom modules so that live edits to their source files are picked up.
# The helpers are reloaded first and `t` last: importlib.reload() does not refresh
# names that another module bound with `from ... import ...`, so re-executing `t`
# after the helpers lets it re-import from the already-refreshed modules.
# NOTE(review): this assumes `train` imports the helpers and not vice versa -- confirm.
# Using a loop instead of a bare trailing `reload(...)` also keeps the cell from
# echoing the module repr, which contains an absolute local path.
for module in (hd, hm, htts, t):
    reload(module)


<module 'src.helper_tts' from '/mnt/volume2Tb/Dropbox/PROJECTS/MACHINE-LEARNING/AQUASCOPE/plankifier/src/helper_tts.py'>

# Initialize simulation

In [3]:
# Create the simulation object that the following cells configure and run.
sim = t.Ctrain()

# Init parameters
Parameters must always be updated through the `UpdateParams()` method

In [4]:
def SetModelParams(kind):
    '''
    A quick way to set compatible user parameters of data and model kinds.

    Input:  kind = either 'image', 'feat' or 'mixed'
    Output: tuple (model_image, model_feat, datakind, ttkind, aug)
    Raises: ValueError if `kind` is not one of the supported values
            (instead of silently returning None, which made the caller's
            tuple unpacking fail with an unrelated TypeError).
    '''

    # kind -> (model_image, model_feat, datakind, ttkind, aug)
    presets = {
        'image': ('conv2', None,  'image', 'image', True),
        'feat':  (None,    'mlp', 'feat',  'feat',  False),
        'mixed': ('conv2', 'mlp', 'mixed', 'mixed', False),
    }

    try:
        return presets[kind]
    except (KeyError, TypeError):
        # KeyError: unknown string; TypeError: unhashable argument (e.g. a list)
        raise ValueError(
            "Unknown kind {!r}: expected one of 'image', 'feat', 'mixed'".format(kind)
        ) from None
        

In [5]:
# Pick a mutually compatible set of model/data options for an image-only run.
model_image, model_feat, datakind, ttkind, aug = SetModelParams('image')

# Gather every setting in one place, then hand them to UpdateParams() in a single call.
run_params = dict(
    datapaths=[
        './data/1_zooplankton_0p5x/training/zooplankton_trainingset_2020.04.28/',
        './data/1_zooplankton_0p5x/training/zooplankton_trainingset_2020.07.06/',
    ],
    outpath='out_example',
    L=128,
    aug=aug,
    model_feat=model_feat,
    model_image=model_image,
    datakind=datakind,
    ttkind=ttkind,
    class_select=['chaoborus', 'bosmina', 'unknown_plankton'],  # author's note: None is the alternative value
)
sim.UpdateParams(**run_params)


Create output directory only after you've set the right `outpath`

In [6]:
# Run this only after `outpath` has been set through UpdateParams().
sim.CreateOutDir()


In [7]:
# Sanity check: show the data directories currently stored in the parameters.
print(sim.params.datapaths)


['./data/1_zooplankton_0p5x/training/zooplankton_trainingset_2020.04.28/', './data/1_zooplankton_0p5x/training/zooplankton_trainingset_2020.07.06/']


# Load data

In [8]:
# Pick up any live edits to the data helpers before loading.
reload(hd)

# The arguments are spelled out for clarity; per the author's note they are the defaults.
sim.LoadData(
    L=sim.params.L,
    class_select=sim.params.class_select,
    datakind=sim.params.datakind,
)


datapaths: ['./data/1_zooplankton_0p5x/training/zooplankton_trainingset_2020.04.28/', './data/1_zooplankton_0p5x/training/zooplankton_trainingset_2020.07.06/']
classes from datapaths: ['dinobryon', 'ceratium', 'fragilaria', 'asplanchna', 'polyarthra', 'aphanizomenon', 'keratella_cochlearis', 'kellikottia', 'unknown_plankton', 'hydra', 'daphnia_skins', 'fish', 'rotifers', 'diaphanosoma', 'conochilus', 'diatom_chain', 'uroglena', 'dirt', 'trichocerca', 'synchaeta', 'asterionella', 'unknown', 'copepod_skins', 'daphnia', 'nauplius', 'bosmina', 'maybe_cyano', 'eudiaptomus', 'paradileptus', 'leptodora', 'chaoborus', 'keratella_quadrata', 'cyclops', 'filament']
class: chaoborus (10)
class: bosmina (80)
class: unknown_plankton (71)


# Create sets (test and train)

In [9]:
# testSplit is presumably the fraction of the data reserved for the test set -- confirm in train.py.
sim.UpdateParams(testSplit=0.25)
# Build the train and test sets for the data kind chosen in the parameters.
sim.CreateTrainTestSets(ttkind=sim.params.ttkind)


# Create and Train Model
Custom parameter changes are made by acting directly on the params class through the `UpdateParams()` method.

In [10]:
# Limit the run to 10 epochs in total, then train.
sim.UpdateParams(totEpochs=10)
sim.Train()

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training took 0.12042247851689657 minutes
Saving the last model. These are not the best weights, they are the last ones. For the best weights use the callback output (bestweights.hdf5)]


# Classification Report

In [11]:
# Print the per-class precision/recall/f1 report for the current model.
sim.Report()

                  precision    recall  f1-score   support

         bosmina       0.68      0.88      0.76        24
       chaoborus       0.00      0.00      0.00         2
unknown_plankton       0.50      0.33      0.40        15

        accuracy                           0.63        41
       macro avg       0.39      0.40      0.39        41
    weighted avg       0.58      0.63      0.59        41



  _warn_prf(average, modifier, msg_start, len(result))


We now manually extract the training loss corresponding to the best weights, so that we can make sure that restarting the simulation works.

In [12]:
def BestLoss(history):
    '''
    Training loss at the epoch where the validation loss reached its minimum.

    history: mapping with per-epoch 'loss' and 'val_loss' sequences
             (here: the `history` dict of the training history object).
    '''
    best_epoch = np.argmin(history['val_loss'])  # first index of the smallest validation loss
    return history['loss'][best_epoch]
def InitLoss(history):
    '''
    Returns the training loss of the first recorded epoch, i.e. history['loss'][0].

    history: mapping with a per-epoch 'loss' sequence
             (here: the `history` dict of the training history object).
    '''
    return history['loss'][0]

# Report the first-epoch loss and the loss at the best validation epoch of the run just finished.
first_run_history = sim.history.history

print('Initial training loss:', InitLoss(first_run_history))
print('Best    training loss:', BestLoss(first_run_history))


Initial training loss: 1.0787554184595745
Best    training loss: 0.7561219984834845


# Start again from scratch
If we train again, the simulation does not start again where it ended, but it starts from scratch.
Since the default initialization is random, the initial value will be close to that of the previous run, but not the same.

In [13]:
# Train again with unchanged parameters; no weights file is specified here.
sim.Train()

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training took 0.05980354150136312 minutes
Saving the last model. These are not the best weights, they are the last ones. For the best weights use the callback output (bestweights.hdf5)]


In [14]:
# Same report as before, now for the run that was restarted from scratch.
scratch_run_history = sim.history.history

print('Initial training loss:', InitLoss(scratch_run_history))
print('Best    training loss:', BestLoss(scratch_run_history))


Initial training loss: 1.5577392144636675
Best    training loss: 0.853489404374903


# Start from a previous state
We have two ways of not starting from scratch. We can either *(a)* define the full model and load the weight configuration, or *(b)* load the full model. We could also load the full model and then load a different weight configuration *(b+a)*.

I also show how to play around with some input parameters.



## (a) Define model and load weights from disk
We don't update the parameters, because we keep the same ones as in the previous runs, for comparison. The only thing we need to do, before calling `sim.Train()`, is to specify the file with the weight configuration through the `load_weights` parameter.

We will load the weights that minimized the test loss in the previous run.
You will now see that the initial loss is lower than that of the two previous runs.

In [15]:
# Point the next Train() call at the best-weights file written during the previous run.
# The parameter is set through UpdateParams() rather than by assigning to
# sim.params directly, as this notebook requires for every parameter change.
sim.UpdateParams(load_weights=sim.params.outpath + '/bestweights.hdf5')
sim.Train()

Loading weights from  out_example/bestweights.hdf5
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training took 0.06245201826095581 minutes
Saving the last model. These are not the best weights, they are the last ones. For the best weights use the callback output (bestweights.hdf5)]


In [16]:
# Report the losses of the run that started from the loaded weights.
print('Initial training loss:', InitLoss(sim.history.history))
print('Best    training loss:', BestLoss(sim.history.history))


Initial training loss: 0.8301698890599337
Best    training loss: 0.7404123942057291


## (b) Loading full model from disk (CHECKPOINTING)
This is essentially what we do when checkpointing. At the end of each run, the entire model is saved (the default name is `'keras_model.h5'`). If we want to restart from there, we just need to load that model.

Since in this case we are checkpointing, we also show how to handle the simulation times.
Since, when restarting from a checkpoint, one usually starts a new simulation from scratch, we will **create a new `Ctrain` instance that loads all the parameters**.


In [17]:
# Everything needed for the restart is read from the output directory of the previous run.
prev_outdir = sim.params.outpath

# Path of the full model saved at the end of the previous run
model_from_previous = prev_outdir + '/' + sim.params.saveModelName

# params.npy is presumably a pickled parameters object wrapped in a 0-d array,
# hence allow_pickle=True and .item() to unwrap it.
# NOTE(review): unpickling can execute code -- only load files produced by your own runs.
params_from_previous = np.load(prev_outdir + '/params.npy', allow_pickle=True).item()

# Class names stored alongside the parameters
classes_from_previous = np.load(prev_outdir + '/classes.npy')

In [18]:
# Create a new object, just as if we were starting a new simulation that loads a previous checkpoint
sim2 = t.Ctrain()
sim2.params = params_from_previous
sim2.CreateOutDir()

# Read the settings from sim2's own restored parameters rather than from `sim`,
# so that the checkpointed parameters are what is actually used here.
sim2.LoadData(
    L=sim2.params.L,
    class_select=sim2.params.class_select,
    datakind=sim2.params.datakind,
)

# NOTE(review): this draws a train/test split for sim2; unless the split is seeded
# identically to sim's, sim2's test set may contain images sim was trained on -- confirm.
sim2.CreateTrainTestSets(ttkind=sim2.params.ttkind)

datapaths: ['./data/1_zooplankton_0p5x/training/zooplankton_trainingset_2020.04.28/', './data/1_zooplankton_0p5x/training/zooplankton_trainingset_2020.07.06/']
classes from datapaths: ['dinobryon', 'ceratium', 'fragilaria', 'asplanchna', 'polyarthra', 'aphanizomenon', 'keratella_cochlearis', 'kellikottia', 'unknown_plankton', 'hydra', 'daphnia_skins', 'fish', 'rotifers', 'diaphanosoma', 'conochilus', 'diatom_chain', 'uroglena', 'dirt', 'trichocerca', 'synchaeta', 'asterionella', 'unknown', 'copepod_skins', 'daphnia', 'nauplius', 'bosmina', 'maybe_cyano', 'eudiaptomus', 'paradileptus', 'leptodora', 'chaoborus', 'keratella_quadrata', 'cyclops', 'filament']
class: chaoborus (10)
class: bosmina (80)
class: unknown_plankton (71)


**Load model without training it, and make a classification report**. You can see that the model is loaded correctly, since it gives better than random predictions.

In [19]:
# Load the saved model together with the best-weights file from the previous run's
# output directory, then evaluate it without any further training.
best_weights_file = sim.params.outpath + '/bestweights.hdf5'
sim2.LoadModel(modelfile=model_from_previous, bestweights=best_weights_file)
sim2.Report()

                  precision    recall  f1-score   support

         bosmina       0.73      0.92      0.81        24
       chaoborus       0.00      0.00      0.00         2
unknown_plankton       0.64      0.47      0.54        15

        accuracy                           0.71        41
       macro avg       0.46      0.46      0.45        41
    weighted avg       0.66      0.71      0.67        41



  _warn_prf(average, modifier, msg_start, len(result))


Now let's prolong the run for some more epochs.
Since we are prolonging a run, we also want to make sure that the epoch numbering is consistent.

In [20]:
# Number of additional epochs to run on top of the previous run
n_epochs_new = 20

# Resume the epoch counter right after the last epoch recorded by `sim`
# (or at 0 if nothing was recorded), and extend the total number of epochs
# accordingly (otherwise it won't run).
# NOTE(review): these two parameters are assigned directly although the notebook
# asks for UpdateParams(); kept as in the original -- confirm UpdateParams() accepts them.
previous_epochs = sim.history.epoch
if len(previous_epochs) > 0:
    sim2.params.initial_epoch = previous_epochs[-1] + 1
else:
    sim2.params.initial_epoch = 0
sim2.params.totEpochs = sim2.params.initial_epoch + n_epochs_new

# Train() should not load weights from a file here, so load_weights is set to None.
# To load weights instead, give it the file name (usually 'bestweights.hdf5').
sim2.UpdateParams(load_weights=None)

In [21]:
# The "before" epochs come from `sim` (the previous run); sim2 has not been trained yet.
print('Epochs before running:', sim.history.epoch)
sim2.Train()
# The "after" epochs come from sim2 and should continue where the previous run stopped.
print('Epochs after running:', sim2.history.epoch)


Epochs before running: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Training took 0.12333152294158936 minutes
Saving the last model. These are not the best weights, they are the last ones. For the best weights use the callback output (bestweights.hdf5)]
Epochs after running: [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]


# Use second model to predict on new, unseen data
Choose a class from the ones the classifier can recognize, and see whether the model is able to recognize it when taking unseen images in the validation dataset.

In [22]:

# We only do this for 'image' models, because the format of the validation directory is wrong and I don't want to cope with that now
if ttkind == 'image':
    # Use the second selected class as the one to look for in the unseen images
    target = sim2.params.class_select[1]
    print('target:', target)

    testdir = 'data/1_zooplankton_0p5x/validation/tommy_validation/images/' + target

    # glob returns matches in arbitrary order: sort them so that the printed
    # predictions are listed in a reproducible order.
    im_names = np.array(sorted(glob.glob(testdir + '/*.jpeg')), dtype=object)

    # Fail early with a clear message instead of an obscure error further down
    if len(im_names) == 0:
        raise FileNotFoundError('No .jpeg images found in ' + testdir)

    npimages = hd.LoadImageList(im_names, L=sim2.params.L, show=False)

    probs = sim2.model.predict(npimages)
    predictions = probs.argmax(axis=1)  # The class that the classifier would bet on

    # Map the predicted indices back to class names
    print('Predictions:', sim2.tt.lb.classes_[predictions])

target: bosmina
Predictions: ['unknown_plankton' 'unknown_plankton' 'bosmina' 'unknown_plankton'
 'bosmina' 'bosmina' 'bosmina' 'bosmina' 'unknown_plankton']
