In [1]:
import os
import numpy as np
import tensorflow as tf
from keras import optimizers
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from segnet import segnet
from generator import DataGenerator

Using TensorFlow backend.


In [2]:
# hyperparameters
nv=int(2**16) # variants
na=2          # alleles
nc=7          # ancestry classes
ne=100        # number of epochs
gen=True      # use data generator
hor=False     # use multi-gpu
oce=True      # include oceanian samples
dev=False     # do we have a dev set

# reproducibility
# np.random.seed(23910464)
np.random.seed(1)

In [3]:
# configure horovod if we're using multiple gpus
# - note that this doesn't work within a jupyter notebook
# - but you can use it to assign one of the two gpus to the tf instance
# - ((probably))
if hor: 
    import horovod.keras as hvd 

    # initialize horovod instance -- this currently only works on galangal
    hvd.init()

    # assign GPUs to horovod 
    gpus = tf.config.list_physical_devices('GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        # tf.config in tf <= 1.6
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
    print(gpus)

In [4]:
# sanity check that we're on gpu -- use #1
os.environ["CUDA_VISIBLE_DEVICES"]="1"
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [5]:
# declare model
model=segnet(input_shape=(nv, na), n_classes=nc-1+int(oce), n_filters=8)

# and optimizer
if hor:
    adam=optimizers.Adam(lr=1e-5 * hvd.size())
    adam=hvd.DistributedOptimizer(adam)
else:
    adam=optimizers.Adam(lr=1e-4)

# now compile and show parameter summary
model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy']) 
print(model.summary())

Model: "segnet"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 65536, 2)     0                                            
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 65536, 2)     0           input_1[0][0]                    
__________________________________________________________________________________________________
conv1_down1 (Conv1D)            (None, 65536, 8)     264         dropout_1[0][0]                  
__________________________________________________________________________________________________
activation_1 (Activation)       (None, 65536, 8)     0           conv1_down1[0][0]                
_____________________________________________________________________________________________

In [6]:
# load data
data_root='/home/magu/deepmix/data/reference_panel/'
X = np.load(data_root+'unzipped/panel_chr20.G.npy')#, mmap_mode='r')
Y = np.load(data_root+'unzipped/panel_chr20.L.npy')#, mmap_mode='r')
S = np.load(data_root+'unzipped/panel_chr20.S.npy')
print([X.shape, Y.shape, X.shape])

[(2764, 516801, 2), (2764, 516801, 7), (2764, 516801, 2)]


In [7]:
# and train individuals
train=np.loadtxt('../data/reference-panel/train.strands.txt', dtype=str)
train_ix=[i for i,q in enumerate(S) if q in train]
np.random.shuffle(train_ix)
print(len(train_ix))

2290


In [8]:
# and some dev individuals, why not -- first pick their indexes
if dev:
    n=100
    S=np.load(data_root+'simulated/label/dev_10gen.result.npz')['S']
    s=np.random.choice(S, size=n, replace=False)

    # then load and subset -- AMR is the first ancestry label, ignored for now
    x_f=data_root+'simulated/numpy/dev_10gen.query.ALL_X.npz'
    y_f=data_root+'simulated/label/dev_10gen.result.npz'
    S_f=np.load(x_f)['S']
    X_dev=np.load(x_f)['G'][[np.where(S_f==(i))[0][0] for i in s],:nv,:na]
    S_f=np.load(y_f)['S']
    Y_dev=to_categorical(np.load(y_f)['L'][[np.where(S_f==(i))[0][0] for i in s],:nv], dtype='bool')[:,:,1:]
    print([X_dev.shape, Y_dev.shape])

In [10]:
# now try it out!
if dev: # dev assumes oce
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=25) # not implemented below

    # fit with generator, or not
    if gen:
        params={'X':X, 'Y':Y, 'dim':nv, 'batch_size':32, 'n_classes':nc, 'n_alleles':na}
        generator=DataGenerator(train_ix, **params)
        history=model.fit_generator(generator=generator, epochs=ne, validation_data=(X_dev, Y_dev), callbacks=[es])
    else:
        history=model.fit(X[train_ix,:nv,:na], Y[train_ix,:nv,:], batch_size=4, epochs=ne, callbacks=[es])
elif not oce:
    train_ixx=[i for i in train_ix if Y[i,0,:].dot(np.arange(nc))!=4]
    history=model.fit(X[train_ixx,:nv,:na], Y[train_ixx,:nv,:][:,:,[i for i in range(nc) if i!=4]], 
                      batch_size=4, epochs=ne)
else:
    history=model.fit(X[train_ix,:nv,:na], Y[train_ix,:nv,:], batch_size=4, epochs=ne)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100


Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [11]:
# save
model.save('chm21_short.h5')

In [None]:
_, dev_acc = model.evaluate(X_dev, Y_dev, verbose=0)

# 1.1) plot loss during training
import matplotlib.pyplot as plt
%matplotlib inline
plt.figure(1, (9,9))
plt.subplot(211)
plt.title('Loss during training')
plt.plot(history.history['loss'], label='train set')
plt.plot(history.history['val_loss'], label='dev set')
plt.legend()

# 1.2) plot accuracy during training
plt.subplot(212)
plt.title('Accuracy')
plt.plot(history.history['accuracy'], label='train set')
plt.plot(history.history['val_accuracy'], label='dev set')
plt.legend()

print(dev_acc)

In [None]:
Y_hat_p=model.predict(X_dev)
Y_hat=np.argmax(Y_hat_p, axis=-1)

In [None]:
Y_hat.shape

In [None]:
for i in range(Y_hat.shape[0]):
    print((i, [np.count_nonzero(Y_hat[i,:]==j) for j in range(Y_hat_p.shape[-1])], 
           [np.count_nonzero(Y_dev[i,:,:].argmax(axis=-1)==j) for j in range(Y_hat_p.shape[-1])]))

In [None]:
plt.figure(figsize=(12, 3))
Y_dev_lab = np.argmax(Y_dev, axis=-1)
plt.subplot(211)
plt.title('Dev set ground truths')
plt.imshow(Y_dev_lab[48:49,:].astype(int), aspect='auto')#, cmap='jet')

plt.subplot(212)
plt.title('Corresponding dev set predictions')
plt.imshow(Y_hat[48:49,:].astype(int), aspect='auto')#, cmap='jet')

In [None]:
np.where(np.argmax(Y_hat_p[48,:,:], axis=-1)==3)

In [None]:
plt.figure(figsize=(12, 3))
Y_dev_lab = np.argmax(Y_dev, axis=-1)
plt.subplot(211)
plt.title('Dev set ground truths')
plt.imshow(Y_dev_lab[48:49,29000:29500].astype(int), aspect='auto')#, cmap='jet')

plt.subplot(212)
plt.title('Corresponding dev set predictions')
plt.imshow(Y_hat[48:49,29000:29500].astype(int), aspect='auto')#, cmap='jet')

In [None]:
Y_hat_p[48,np.where(np.argmax(Y_hat_p[48,:,:], axis=-1)==3),:]

In [None]:
# seems like a crf-smoother (even a post-hoc one) could really help