# Quick start for binary classification

Quick start for binary classification

In [None]:
# bash generate_data_binary.sh

In [None]:
import os
import gzip

import numpy as np
import scipy.stats

import skimage.transform
from keras import models, layers, activations, optimizers, regularizers
from keras.utils import plot_model
from keras.models import load_model

import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import pymc3 # this will be removed
import pydot # optional

In [None]:
%run -i ImaGene.py

First iteration.

Read simulations and store them into object.

In [None]:
# Point an ImaFile at the folder holding the simulations of the 3-epoch model.
# nr_samples and model_name are passed through unchanged; their exact meaning is
# defined in ImaGene.py (not visible here).
myfile = ImaFile(simulations_folder='/home/mfumagal/Data/ImaGene.binary/Simulations1.Epoch3', nr_samples=128, model_name='Marth-3epoch-CEU')

In [None]:
# Read the simulations into `mypop`, with 'selection_coeff_hetero' as the parameter
# of interest. NOTE(review): max_nrepl presumably caps the number of replicates
# read -- confirm in ImaGene.py.
mypop = myfile.read_simulations(parameter_name='selection_coeff_hetero', max_nrepl=2000)

Have a look at the data and the allele frequency for the selected allele.

In [None]:
# Report a summary of the object that was just read (content defined in ImaGene.py).
mypop.summary()

In [None]:
# Allele frequency of each image plotted against its target value.
# NOTE(review): the second argument (0.5) presumably is the relative position of
# the selected allele -- confirm against calculate_allele_frequency in ImaGene.py.
freqs = calculate_allele_frequency(mypop, 0.5)

plt.scatter(x=mypop.targets, y=freqs, marker='o')
plt.xlabel('Target')
plt.ylabel('Allele frequency')

Manipulate object to: major/minor polarisation, sort rows/cols, resize, convert to float.

In [None]:
# Pre-process the images in place. The order of the steps is the same as in the
# prose above: polarise, filter, sort rows then columns, resize, convert.
mypop.majorminor()
mypop.filter_freq(0.01)

# Sort rows first, then columns.
for sort_key in ('rows_freq', 'cols_freq'):
    mypop.sort(sort_key)

mypop.resize((128, 128))
mypop.convert(verbose=True)

Plot one image per class as an illustration and check the new dimensions.

In [None]:
# For every class, print its label and plot the first image carrying that target.
for sel in mypop.classes:
    print(sel)
    matching_idx = np.where(mypop.targets == sel)[0]
    mypop.plot(matching_idx[0])

# Check the dimensions after the pre-processing steps.
mypop.summary()

Select only images corresponding to specified classes (e.g. 0 and 300) and check new dimensions.

In [None]:
# Restrict the analysis to two classes (targets 0 and 300) and look up the
# indexes of the images whose target is one of them.
mypop.classes = np.array([0,300])
classes_idx = get_index_classes(mypop.targets, mypop.classes)
# Last expression of the cell: the notebook displays how many images were selected.
len(classes_idx)

In [None]:
# Keep only the images selected above, then check the new dimensions.
mypop.subset(classes_idx)
mypop.summary()

Randomly shuffle images.

In [None]:
# Shuffle the images: get_index_random supplies the index order and subset() applies it.
# NOTE(review): presumably a random permutation of all images -- confirm in ImaGene.py.
rnd_idx = get_index_random(mypop)
mypop.subset(rnd_idx)

Convert targets to the appropriate format for keras (vectorise them).

In [None]:
# Replace the targets by their binary encoding for the sigmoid output used below
# (the encoding itself is implemented by to_binary in ImaGene.py).
mypop.targets = to_binary(mypop.targets)

The data is ready to be used for the classification.
You can save it.

In [None]:
import _pickle as pickle

In [None]:
# Create the output folder for the processed images.
# makedirs(..., exist_ok=True) also creates missing parent folders and does not
# fail when the cell is re-run and the folder already exists.
os.makedirs('/home/mfumagal/Data/ImaGene.binary/Images1.Epoch3', exist_ok=True)

In [None]:
# Save: serialise the processed `mypop` object to disk with pickle.
with open('/home/mfumagal/Data/ImaGene.binary/Images1.Epoch3/mypop','wb') as fp:
    pickle.dump(mypop, fp)

In [None]:
# Load: restore a previously saved `mypop` object.
# NOTE: unpickling executes arbitrary code; only load files you created yourself.
with open('/home/mfumagal/Data/ImaGene.binary/Images1.Epoch3/mypop','rb') as fp:
    mypop = pickle.load(fp)

Build the model.

In [None]:
%run -i ImaGene.py

In [None]:
# Create the ImaNet object; the cells below attach the model to it and use it
# for training and plotting.
mynet = ImaNet()

In [None]:
# Network: three identical Conv(32, 3x3, relu) -> MaxPool(2x2) -> Dropout(0.5)
# stages followed by a single sigmoid unit for the binary decision.
# Only the first convolution declares the input shape (taken from the data).
cnn_layers = [
    layers.Conv2D(filters=32, kernel_size=(3,3), strides=(1,1), activation='relu',
                  padding='valid', input_shape=mypop.data.shape[1:4]),
    layers.MaxPooling2D(pool_size=(2,2)),
    layers.Dropout(rate=0.5),
]

# Second and third stages, created in the same order as in a literal listing.
for stage in range(2):
    cnn_layers.append(layers.Conv2D(filters=32, kernel_size=(3,3), strides=(1,1),
                                    activation='relu', padding='valid'))
    cnn_layers.append(layers.MaxPooling2D(pool_size=(2,2)))
    cnn_layers.append(layers.Dropout(rate=0.5))

cnn_layers.append(layers.Flatten())
# An intermediate Dense(units=128, activation='relu') layer is currently disabled.
cnn_layers.append(layers.Dense(units=1, activation='sigmoid'))

mynet.model = models.Sequential(cnn_layers)

In [None]:
# Configure training: RMSprop optimiser, binary cross-entropy loss (matches the
# single sigmoid output unit), and accuracy tracked as a metric.
mynet.model.compile(optimizer='rmsprop',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

In [None]:
# Show the network architecture (see ImaNet.plot_net in ImaGene.py for the output produced).
mynet.plot_net()

In [None]:
# Train for one epoch, holding out the last 10% of the images for validation.
# The returned History object is kept in `history` because a later cell copies
# `history.history` into mynet.history; without the assignment that cell fails
# with a NameError.
history = mynet.model.fit(mypop.data, mypop.targets, batch_size=64, epochs=1, verbose=1, validation_split=0.10)

In [None]:
# Start with an empty training history on the network object.
mynet.history = {}

In [None]:
# Copy the per-epoch metrics into mynet.history.
# Requires a variable `history` holding the object returned by model.fit().
mynet.history.update(history.history)

In [None]:
# Display the stored training history.
mynet.history

In [None]:
# Save the Keras model to 'net.h5' in the current working directory.
mynet.model.save('net.h5')

In [None]:
# Plot the training progress (see ImaNet.plot_train in ImaGene.py).
mynet.plot_train()

Iterate across all selection coefficients (s), sorting modes (m), models (e-th, if relevant) and repetitions (i-th).
For each combination, the first repetition initialises the model.
The last repetition is used for testing.

In [None]:
import os
import gzip

import numpy as np
import scipy.stats

import skimage.transform
from keras import models, layers, activations, optimizers, regularizers
from keras.utils import plot_model
from keras.models import load_model

import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import pymc3 # this will be removed
import pydot # optional

In [None]:
import pathlib
import _pickle as pickle

%run -i ImaGene.py
# get_ipython().run_line_magic('run', '-i /rds/general/user/mfumagal/home/Software/ImaGene/ImaGene.py')

# Train and test one network for every combination of
#   s: value of 'selection_coeff_hetero' used as the second class (the first is 0),
#   m: sorting applied to the images ('None', 'Rows', 'Cols' or 'RowsCols'),
#   e: epoch model (selects the 'Marth-<e>epoch-CEU' simulations).
# For each combination the repetitions i = 1..n_repetitions are processed in
# order: the first one builds the network, all but the last one train it for
# one epoch on a new batch of simulations, and the last one is used for testing.
n_repetitions = 10

for s in [100, 200, 300, 400]:

    for m in ['None', 'Rows', 'Cols', 'RowsCols']:

        for e in [1, 2, 3]:

            folder = '/home/mfumagal/Data/ImaGene/Binary/Results/Epoch' + str(e) + '/S' + str(s) + '/' + str(m)
            print(folder)
            pathlib.Path(folder).mkdir(parents=True, exist_ok=True)

            for i in range(1, n_repetitions + 1):

                print(str(s) + str(m) + str(e) + str(i))

                # Each repetition reads its own folder of simulations.
                myfile = ImaFile(simulations_folder='/home/mfumagal/Data/ImaGene/Binary/Simulations' + str(i)
                         + '.Epoch' + str(e), nr_samples=128, model_name='Marth-' + str(e) + 'epoch-CEU')
                mypop = myfile.read_simulations(parameter_name='selection_coeff_hetero', max_nrepl=20)

                mypop.majorminor()
                mypop.filter_freq(0.01)

                # Rows are sorted before columns when both are requested.
                if m in ('Rows', 'RowsCols'):
                    mypop.sort('rows_freq')
                if m in ('Cols', 'RowsCols'):
                    mypop.sort('cols_freq')

                mypop.resize((128, 128))
                mypop.convert()

                # Keep only the two classes being compared, then shuffle.
                mypop.classes = np.array([0,int(s)])
                classes_idx = get_index_classes(mypop.targets, mypop.classes)
                mypop.subset(classes_idx)

                rnd_idx = get_index_random(mypop)
                mypop.subset(rnd_idx)

                mypop.targets = to_binary(mypop.targets)

                if i == 1:
                    # First repetition: build and compile a new network
                    # (two Conv/Pool stages with L1+L2 kernel regularisation).
                    # Dropout layers and a third Conv/Pool stage were commented
                    # out in the original notebook and are omitted here.
                    mynet = ImaNet(name='CPx2')
                    mynet.model = models.Sequential([
                        layers.Conv2D(filters=32, kernel_size=(3,3), strides=(1,1), activation='relu', kernel_regularizer=regularizers.l1_l2(l1=0.01, l2=0.01), padding='valid', input_shape=mypop.data.shape[1:4]),
                        layers.MaxPooling2D(pool_size=(2,2)),
                        layers.Conv2D(filters=32, kernel_size=(3,3), strides=(1,1), activation='relu', kernel_regularizer=regularizers.l1_l2(l1=0.01, l2=0.01), padding='valid'),
                        layers.MaxPooling2D(pool_size=(2,2)),
                        layers.Flatten(),
                        layers.Dense(units=64, activation='relu'),
                        layers.Dense(units=1, activation='sigmoid')])
                    mynet.model.compile(optimizer='rmsprop',
                        loss='binary_crossentropy',
                        metrics=['accuracy'])
                    mynet.plot_net(summary=True, file=folder + '/net.png')
                else:
                    # Resume from the checkpoint written by the previous
                    # repetition. The result must be assigned to mynet.model;
                    # binding it to a separate name leaves the reload unused.
                    mynet.model = load_model(folder + '/net.h5')

                if i < n_repetitions:
                    # Training repetition: one epoch, 10% held out for validation.
                    score = mynet.model.fit(mypop.data, mypop.targets, batch_size=32, epochs=1, verbose=0, validation_split=0.10)
                    mynet.update_scores(score)
                    mynet.model.save(folder + '/net.h5')
                else:
                    # Last repetition: evaluate on data never used for training.
                    mynet.test = mynet.model.evaluate(mypop.data, mypop.targets, batch_size=None, verbose=0)
                    print(mynet.test)

            # save the latest data (testing data)
            with open(folder + '/mypop','wb') as fp:
                pickle.dump(mypop, fp)
            # save the latest network
            with open(folder + '/mynet','wb') as fp:
                pickle.dump(mynet, fp)

            # Drop the references before the next combination is processed.
            del mypop
            del mynet


Read results.

In [None]:
# Load the pickled data and network of each epoch model in turn. Every pass
# overwrites `mypop` and `mynet`, so after the loop they hold the objects of the
# last epoch model.
# NOTE(review): this reads from .../Results/Epoch<e>/ directly, not from the
# per-S / per-sorting sub-folders -- confirm this is the intended location.
for e in (1, 2, 3):

    folder = '/home/mfumagal/Data/ImaGene/Binary/Results/Epoch' + str(e)
    pop_path = folder + '/mypop'
    net_path = folder + '/mynet'

    with open(pop_path, 'rb') as fp:
        mypop = pickle.load(fp)

    with open(net_path, 'rb') as fp:
        mynet = pickle.load(fp)
        
    

In [None]:
# Load the pickled data and network of one epoch model (here e = 3).
e = 3
folder = '/home/mfumagal/Data/ImaGene/Binary/Results/Epoch' + str(e)

# Testing data saved alongside the network.
with open(folder + '/mypop', 'rb') as fp:
    mypop = pickle.load(fp)

# ImaNet object holding the trained model and its scores.
with open(folder + '/mynet', 'rb') as fp:
    mynet = pickle.load(fp)
    
    


In [None]:
# Report the test scores and the diagnostic plots of the loaded network.
# A bare expression is only echoed by the notebook when it is the last line of
# the cell, so the test scores are printed explicitly.
print(mynet.test)
mynet.plot_train()
mynet.plot_cm(mypop)
#mynet.plot_scatter(mypop)

In [None]:
import shutil

# Collect the saved model, its diagram and the pickled objects in the results folder.
results_dir = '/home/mfumagal/Data/ImaGene/Binary/Results/Epoch3'
os.makedirs(results_dir, exist_ok=True)

# Move the files written to the working directory. A missing file is reported
# rather than silently ignored, and the remaining steps still run.
for artefact in ('net.h5', 'net.png'):
    if os.path.exists(artefact):
        # Full destination path, so that an existing copy is replaced.
        shutil.move(artefact, os.path.join(results_dir, artefact))
    else:
        print('Warning: ' + artefact + ' not found, nothing moved')

with open(results_dir + '/mypop', 'wb') as fp:
    pickle.dump(mypop, fp)

with open(results_dir + '/mynet', 'wb') as fp:
    pickle.dump(mynet, fp)

In [None]:
# Make sure the results folder exists (parents included; no error if it already does).
os.makedirs('/home/mfumagal/Data/ImaGene/Binary/Results/Epoch3', exist_ok=True)

In [None]:
# Plot the training progress of the loaded network (see ImaNet.plot_train in ImaGene.py).
mynet.plot_train()

In [None]:
# Plot the confusion matrix for the data in `mypop`
# (NOTE(review): assumed from the method name -- see ImaNet.plot_cm in ImaGene.py).
mynet.plot_cm(mypop)

In [None]:
# Scatter plot for the data in `mypop` (see ImaNet.plot_scatter in ImaGene.py).
mynet.plot_scatter(mypop)

In [None]:
# Create the results folder of one epoch model.
e = 1
folder = '/home/mfumagal/Data/ImaGene/Binary/Results/Epoch' + str(e)
print(folder)
# os.makedirs replaces the shell call: `subprocess` is never imported in this
# notebook, and a list argument combined with shell=True passes only 'mkdir -p'
# to the shell, so the folder name was dropped.
os.makedirs(folder, exist_ok=True)