# Estimation of continuous variables with regression using _ImaGene_

In this example, the aim is to estimate a continuous parameter using regression.
Please refer to the tutorial for binary and multiclass classification for an in-depth explanation of each step and case study, and to the tutorial for multiclass classification on continuous variables for data generation and processing.

In [None]:
import os
import gzip
import _pickle as pickle

import numpy as np
import scipy.stats
import arviz

import tensorflow as tf
from tensorflow import keras
from keras import models, layers, activations, optimizers, regularizers
from keras.utils.vis_utils import plot_model
from keras.models import load_model

import skimage.transform
import itertools
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import pydot

In [None]:
# Execute ImaGene.py inside this notebook's namespace (-i) so that the names it
# defines become available to the cells below (presumably ImaFile, ImaNet,
# load_imagene, load_imanet and get_index_random, which are used here but never imported).
%run -i ../ImaGene.py

### 1. Read data from VCF file and store it into _ImaGene_ objects

As an illustration, we sort the image twice, ordering first the rows and then the columns by frequency. We also filter out singletons (variants with a frequency below 0.01) and do not resize the image.

In [None]:
# Read the LCT region (CEU, 198 samples) from its VCF file into an ImaGene object.
file_LCT = ImaFile(nr_samples=198, VCF_file_name='LCT.CEU.vcf')
gene_LCT = file_LCT.read_VCF()

# Filter by frequency (threshold 0.01), then order rows and columns by frequency.
gene_LCT.filter_freq(0.01)
for sort_key in ('rows_freq', 'cols_freq'):
    gene_LCT.sort(sort_key)

# Convert with flip=True, then show the resulting image and its summary.
gene_LCT.convert(flip=True)
gene_LCT.plot()
gene_LCT.summary();

### 2. Run and process simulations to be used for training the neural network

In [None]:
# Folder that contains the simulations; change it to your own path, e.g.:
# path_sim = '/home/mfumagal/Data/ImaGene/Tutorials/'
# path_sim='/mnt/quobyte/ImaGene/' # for workshop spp1819
# Keep the trailing '/': the training cell builds folder names by plain string
# concatenation (path_sim + 'Continuous/Simulations' + str(i)).
path_sim = './'

See tutorial `03_multiclass_for_continuous`. Here we assume that the simulations are stored in `path_sim + 'Continuous/'`, with one `Simulations<i>` sub-folder per batch (`i` from 1 to 10).

### 3. Implement, train and evaluate the neural network

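The pipeline for training and testing is the following: simulation batches 1 to 9 are used for training (one epoch each), and batch 10 is held out for testing.
We resize all simulated images to match the dimensions of the real data.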

In [None]:
# Simulations are split into batches stored in path_sim + 'Continuous/Simulations<i>'.
# All batches but the last are used for training (one epoch each); the last
# batch is held out for testing.
nr_batches = 10
nr_samples = 198

for i in range(1, nr_batches + 1):

    # simulations
    file_sim = ImaFile(simulations_folder=path_sim + 'Continuous/Simulations' + str(i), nr_samples=nr_samples, model_name='Marth-3epoch-CEU')

    # max_nrepl=3 keeps only a few simulations so that this example runs quickly
    # (NOTE: presumably a cap per simulated parameter value; confirm in ImaGene.py)
    gene_sim = file_sim.read_simulations(parameter_name='selection_coeff_hetero', max_nrepl=3)

    # manipulate data like the real data, then resize to the real image dimensions
    gene_sim.filter_freq(0.01)
    gene_sim.sort('rows_freq')
    gene_sim.sort('cols_freq')
    gene_sim.resize((nr_samples, int(gene_LCT.dimensions[1][0])))
    gene_sim.convert(flip=True)

    # randomise data
    gene_sim.subset(get_index_random(gene_sim))

    # at the first iteration we build the model, because the input shape is
    # taken from the first processed batch;
    # note that the last layer has a linear activation function (the Dense
    # default), which is what we need for regression
    if i == 1:

        model = models.Sequential([
            layers.Conv2D(filters=32, kernel_size=(3,3), strides=(1,1), activation='relu', kernel_regularizer=regularizers.l1_l2(l1=0.005, l2=0.005), padding='valid', input_shape=gene_sim.data.shape[1:]),
            layers.MaxPooling2D(pool_size=(2,2)),
            layers.Conv2D(filters=64, kernel_size=(3,3), strides=(1,1), activation='relu', kernel_regularizer=regularizers.l1_l2(l1=0.005, l2=0.005), padding='valid'),
            layers.MaxPooling2D(pool_size=(2,2)),
            layers.Conv2D(filters=128, kernel_size=(3,3), strides=(1,1), activation='relu', kernel_regularizer=regularizers.l1_l2(l1=0.005, l2=0.005), padding='valid'),
            layers.MaxPooling2D(pool_size=(2,2)),
            layers.Flatten(),
            layers.Dense(units=1),
        ])
        # mean squared error as loss, mean absolute error as additional metric
        model.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])

        net_LCT = ImaNet(name='[C32+P]+[C64+P]+[C128+P]')

    print(i)
    if i < nr_batches:
        # training for all batches but the last one
        score = model.fit(gene_sim.data, gene_sim.targets, batch_size=32, epochs=1, verbose=1, validation_split=0.10)
        net_LCT.update_scores(score)
    else:
        # testing on the last batch; model.evaluate returns [loss, mae]
        net_LCT.test = model.evaluate(gene_sim.data, gene_sim.targets, batch_size=None, verbose=1)
        net_LCT.predict(gene_sim, model)

In [None]:
# set the working directory where the model, data and network are saved, e.g.
# path='/home/mfumagal/Data/ImaGene/Tutorials/Data/' # my local machine
# path='./' # for workshop spp1819
# Keep the trailing '/': file names are appended to it by plain string concatenation.
path = './'

In [None]:
# Persist the outcome of the run under `path`:
# 1) the final (trained) Keras model,
model.save(path + 'model.cont.h5')
# 2) the testing data (the last simulation batch held in gene_sim),
gene_sim.save(path + 'gene_sim.cont')
# 3) the network object holding the training scores and the test results.
net_LCT.save(path + 'net_LCT.cont');

Recall that to load all these files you can use the following commands.

In [None]:
# Restore the three objects written by the saving cell: the Keras model,
# the network object and the testing data.
model = load_model(path + 'model.cont.h5')
net_LCT = load_imanet(path + 'net_LCT.cont')
gene_sim = load_imagene(path + 'gene_sim.cont')

In [None]:
# assess the training: plot the history collected through net_LCT.update_scores()
# (presumably loss and MAE for the training and validation sets; confirm in ImaGene.py)
net_LCT.plot_train();

In [None]:
# print the testing results returned by model.evaluate() on the held-out batch:
# [loss, mae], i.e. the mean squared error (the compiled loss) and the mean absolute error
print(net_LCT.test);

For the estimation of continuous variables, we can produce a scatter plot with `net.plot_scatter(MAP=False)`.

In [None]:
# scatter plot based on the test-set predictions stored by net_LCT.predict()
# (presumably true vs. predicted values; confirm in ImaGene.py)
net_LCT.plot_scatter(MAP=False);

### 4. Deploy the trained network on your genomic data of interest

In [None]:
# Estimate the parameter ('selection_coeff_hetero') for the real LCT data with the trained network.
# predict() returns one row per input image and the model has a single output
# unit, so [0][0] extracts the scalar estimate for the first (presumably only) image.
model.predict(gene_LCT.data, batch_size=None)[0][0]