# Tutorial for multiclass classification on continuous variables using _ImaGene_

In this example, the aim is to estimate the selection coefficient on a given _locus_ using multiclass classification.
Please refer to the tutorial for binary classification for an in-depth explanation of each step.
Please also refer to the main paper for the rationale behind this approach.

In [None]:
import os
import gzip
import _pickle as pickle

import numpy as np
import scipy.stats

import skimage.transform
from keras import models, layers, activations, optimizers, regularizers
from keras.utils import plot_model
from keras.models import load_model

import itertools
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

In [None]:
%run -i ImaGene.py

In [None]:
# The simulations are stored in 10 separate folders (batches): batches 1 to 9
# are used for training (one epoch each) and batch 10 is kept for testing.
nr_batches = 10

for i in range(1, nr_batches + 1):

    # the first batch is also used to print summaries and to build the network
    first_batch = (i == 1)

    # simulations from one-epoch demographic model
    myfile = ImaFile(simulations_folder='/home/mfumagal/Data/ImaGene/Continuous/Simulations' + str(i) + '.Epoch1', nr_samples=128, model_name='Marth-1epoch-CEU')

    # only 100 replicates per class since I simulated many classes here, all discrete values from 0 to 400
    mygene = myfile.read_simulations(parameter_name='selection_coeff_hetero', max_nrepl=100)
    if first_batch:
        mygene.summary()

    # manipulate data (see the binary classification tutorial and ImaGene.py
    # for the exact meaning of each step)
    mygene.filter_freq(0.02)
    mygene.sort('rows_freq')
    mygene.sort('cols_freq')
    mygene.resize((128, 128))
    mygene.convert(verbose=False)

    # we assign 11 classes out of all the data simulated
    mygene.set_classes(nr_classes=11)
    if first_batch:
        print(mygene.classes)
    # and we assign targets corresponding to the previously set classes
    mygene.set_targets()

    # randomise data
    mygene.subset(get_index_random(mygene))

    # targets have to be converted into categorical data; here we can use some extra options to, for instance, impose a Gaussian distribution on the true targets
    mygene.targets = to_categorical(mygene.targets, wiggle=0, sd=0.5)

    # at first iteration we build the model
    # note that, as an illustration, there is no hidden fully-connected layer:
    # the flattened convolutional output feeds the softmax output layer directly
    if first_batch:

        model = models.Sequential([
                    # the input shape is taken from the data itself (all axes but the first)
                    layers.Conv2D(filters=32, kernel_size=(3,3), strides=(1,1), activation='relu', kernel_regularizer=regularizers.l1_l2(l1=0.005, l2=0.005), padding='valid', input_shape=mygene.data.shape[1:4]),
                    layers.MaxPooling2D(pool_size=(2,2)),
                    layers.Conv2D(filters=64, kernel_size=(3,3), strides=(1,1), activation='relu', kernel_regularizer=regularizers.l1_l2(l1=0.005, l2=0.005), padding='valid'),
                    layers.MaxPooling2D(pool_size=(2,2)),
                    layers.Conv2D(filters=128, kernel_size=(3,3), strides=(1,1), activation='relu', kernel_regularizer=regularizers.l1_l2(l1=0.005, l2=0.005), padding='valid'),
                    layers.MaxPooling2D(pool_size=(2,2)),
                    layers.Flatten(),
                    # one output unit per class
                    layers.Dense(units=len(mygene.classes), activation='softmax')])
        model.compile(optimizer='adam',
                    loss='categorical_crossentropy',
                    metrics=['accuracy'])

        # the name summarises the architecture: three [convolution + pooling] blocks
        mynet = ImaNet(name='[C32+P]+[C64+P]+[C128+P]')

    print(i)
    if i < nr_batches:
        # training for all batches but the last one; 10% of each batch is
        # held out by Keras for validation
        score = model.fit(mygene.data, mygene.targets, batch_size=32, epochs=1, verbose=1, validation_split=0.10)
        mynet.update_scores(score)
    else:
        # testing on the last batch; mygene is left holding the testing data
        mynet.test = model.evaluate(mygene.data, mygene.targets, batch_size=None, verbose=1)
        mynet.predict(mygene, model)

In [None]:
# write the outcome of the run to the Data/ folder, in this order:
#   1. the final (trained) model
#   2. the testing data (the last mygene object)
#   3. the network
for artefact, destination in (
        (model, 'Data/model.cont.h5'),
        (mygene, 'Data/mygene.cont'),
        (mynet, 'Data/mynet.cont')):
    artefact.save(destination)

In [None]:
# assess the training
# (presumably plots the scores passed to mynet.update_scores() after each
# training batch -- see ImaGene.py)
mynet.plot_train()

In [None]:
# print the testing results [loss, accuracy], i.e. the value returned by
# model.evaluate() on the last batch of simulations
print(mynet.test)

In [None]:
# plot a confusion matrix (on the last mygene object which represents the testing data);
# mygene.classes holds the classes assigned earlier with mygene.set_classes()
mynet.plot_cm(mygene.classes)

A plot of the probability distribution can be easily obtained by, for instance, gathering MCMC samples (using `import pymc3`) and plotting them as a histogram. MCMC samples can also be used to obtain Bayes factors and highest posterior density intervals (HPDI).
However, it is not guaranteed that this approach is better than using a regression as the final layer. More tests need to be conducted.