In [6]:
from relationGraph import Relation, RelationGraph, MatrixOfRelationGraph
from autoencoder import seedy, AutoEncoder
import utilityFunctions as uf
from main import test_build_relation_graph_with_symertic_data, test_convert_graph_to_2D_matrix, test_get_matix_for_autoencoder, test_autoencoder
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from base import load_source
from os.path import join
import numpy as np

In [2]:
gene = 'Gene'
go_term = 'GO term'
exprc = 'Experimental condition'

data, rn, cn = load_source(join('dicty', 'dicty.gene_annnotations.csv.gz'))
ann = Relation(data=data, x_name=gene, y_name=go_term, name='ann',
               x_metadata=rn, y_metadata=cn)

data, rn, cn = load_source(join('dicty', 'dicty.gene_expression.csv.gz'))
expr = Relation(data=data, x_name=gene, y_name=exprc, name='expr',
                x_metadata=rn, y_metadata=cn)
expr.matrix = np.log(np.maximum(expr.matrix, np.finfo(np.float).eps))

data, rn, cn = load_source(join('dicty', 'dicty.ppi.csv.gz'))
ppi = Relation(data=data, x_name=gene, y_name=gene, name='ppi',
               x_metadata=rn, y_metadata=cn)

ann_t = ann.transpose()
expr_t = expr.transpose()

relationGraph = RelationGraph()
relationGraph.add_relations([ann, expr, ppi, ann_t, expr_t])
relationGraph.display_objects()
graph = relationGraph

-------------RelationGraph-------------
Gene	1219
3	ann-(1219, 116), expr-(1219, 282), ppi-(1219, 1219)
2	ann_T-(116, 1219), expr_T-(282, 1219)
GO term	116
1	ann_T-(116, 1219)
1	ann-(1219, 116)
Experimental condition	282
1	expr_T-(282, 1219)
1	expr-(1219, 282)



In [21]:
mrg = MatrixOfRelationGraph(graph=graph)
mrg.convert_to_2D_matrix()
mrg.display_metadata_2D_matrix()
data = mrg.density_data(.2)
print(data.shape)
fn = '/data/samples/' + str(data.shape[0]) + '_data.npz'
# fn = '/data/samples/' + str(data.shape[0]) + '_ord_data.npz'    // original data for prediciton
print(fn)
print(data.shape)

# f = np.load('/data/samples/org_data.npz')
# data = f[f.files[0]]

print(data.shape)

ann (1219, 116)
expr (1219, 282)
ppi (1219, 1219)
ann_T (116, 1219)
expr_T (282, 1219)
-------------2D Matrix-------------
Objects: Gene: (0, (0, 1218)), GO term: (1, (1219, 1334)), Experimental condition: (2, (1335, 1616))
[1. 1. 1.]
[1. 0. 0.]
[1. 0. 0.]

Gene: 244
GO term: 23
Experimental condition: 56
(323, 323)
/data/samples/323_data.npz
(323, 323)
(323, 323)


In [7]:
import numpy as np
import tempfile

class my_savez(object):
    def __init__(self, file):
        # Import is postponed to here since zipfile depends on gzip, an optional
        # component of the so-called standard library.
        import zipfile
        # Import deferred for startup time improvement
        import tempfile
        import os

        if isinstance(file, str):
            if not file.endswith('.npz'):
                file = file + '.npz'

        compression = zipfile.ZIP_STORED

        zip = self.zipfile_factory(file, mode="w", compression=compression)

        # Stage arrays in a temporary file on disk, before writing to zip.
        fd, tmpfile = tempfile.mkstemp(suffix='-numpy.npy')
        os.close(fd)

        self.tmpfile = tmpfile
        self.zip = zip
        self.i = 0

    def zipfile_factory(self, *args, **kwargs):
        import zipfile
        import sys
        if sys.version_info >= (2, 5):
            kwargs['allowZip64'] = True
        return zipfile.ZipFile(*args, **kwargs)

    def savez(self, *args, **kwds):
        import os
        import numpy.lib.format as format

        namedict = kwds
        for val in args:
            key = 'arr_%d' % self.i
            if key in namedict.keys():
                raise ValueError("Cannot use un-named variables and keyword %s" % key)
            namedict[key] = val
            self.i += 1

        try:
            for key, val in namedict.items():
                fname = key + '.npy'
                fid = open(self.tmpfile, 'wb')
                try:
                    format.write_array(fid, np.asanyarray(val))
                    fid.close()
                    fid = None
                    self.zip.write(self.tmpfile, arcname=fname)
                finally:
                    if fid:
                        fid.close()
        finally:
            os.remove(self.tmpfile)

    def close(self):
        self.zip.close()

# tmp = '/mag/test.npz'
# f = my_savez(tmp)
# for i in range(10):
#   array = np.zeros(10)
#   f.savez(array)
# f.close()

# # tmp.seek(0)

# tmp_read = np.load(tmp)
# print (tmp_read.files)
# for k, v in tmp_read.iteritems():
#      print (k, v)

In [20]:
import multiprocessing
import time
from pathlib import Path

def mp_worker(arr):
#     new_data = uf.sample_generator3(data, num_of_samples=100, density=0.7, 1)
    np.random.seed(arr[3])
    new_data = uf.sample_generator3(arr[0], num_of_samples=arr[1], density=arr[2])
    return new_data


def data_generator(data, n_samples, pools, density, filename):
    batch_size = 100
    p = multiprocessing.Pool(pools)
    gen_samples = np.empty((0, data.shape[0] * data.shape[1]))
    iterations = int(np.round((n_samples -1)/batch_size)) + 1
    
    params = [[data, batch_size, density, i] for i in range(iterations)]
    
    i = 1
    f = my_savez(filename)
    for result in p.imap(mp_worker, params):
        print('samples: ' + str(i * batch_size))
        i+=1
        f.savez(result)
    f.close()
    
    return gen_samples

# fn = '/data/samples/162_data.npz'
# d = data_generator(data, 50000, 8, 0.8, fn)
# fn = '/data/samples/org_data.npz'
# f = my_savez(fn)
# f.savez(data)
# f.close()

print('Finnish!!!')

Finnish!!!


In [4]:
x, y = data.shape
data=data.reshape(1, x * y)
ae = AutoEncoder(encoding_dim=20, data=data)
ae.encoder_decoder()
# ae.fit(batch_size=250, epochs=100)
# fn = '/mag/483_data.npz'
ae.fit_generator(fn, n_packs=10, epochs=200)
ae.save()

encoder = ae.load_encoder()
decoder = ae.load_decoder()

# test_data = np.asarray([data[0].flatten()])
test_data = np.asarray([data.flatten()])
# print(test_data)
# np.random.shuffle(test_data[0])
# print(test_data)
print(test_data.shape)

x = encoder.predict(test_data)
y = decoder.predict(x)

mse = mean_squared_error(test_data, y)
print('MSE: ' + str(mse))

[[4.31138956 4.33326959 4.7106287  ... 4.80063006 2.11492933 4.1626105 ]]
(1, 104329)
104329
/data/weights/323
Epoch 1/200
Epoch 2/200
Epoch 3/200
(1, 104329)


ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [7]:
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Masking
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping
import autoencoder as ae

def load_data(filename, n_packs):
    f = np.load(filename)
    files = f.files
    
    while n_packs > 0:
        rand_num = np.random.randint(len(files))
        x = f[files[rand_num]]
        print(x.shape)
        
        n_packs -= 1

        
x,y = data.shape
data=data.reshape(1, x * y)
input_dim = data.shape[1]
epochs = 200
encoding_dim = 20
n_packs = 250

model = Sequential()
model.add(Masking(mask_value=0, input_shape=(input_dim, )))
model.add(Dense(encoding_dim, input_shape=(input_dim, ), activation='relu'))
model.add(Dense(input_dim))
model.compile(loss='mse', optimizer='sgd')

log_dir = '/data/logs/'
callbacks = [
            TensorBoard(log_dir=log_dir, histogram_freq=0, write_graph=True, write_images=True),
            EarlyStopping(monitor='loss', min_delta=0, patience=3, verbose=0, mode='auto')
        ]

model.fit_generator(ae.data_generator(fn, n_packs), steps_per_epoch=n_packs, epochs=epochs, callbacks=callbacks)

model.save('/data/sequential/weights/model.h5')

# decoded_imgs = model.predict(data)

# mse = mean_squared_error(data, decoded_imgs)
# print('MSE: ' + str(mse))

Epoch 1/200


ResourceExhaustedError: OOM when allocating tensor with shape[20,104329] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[Node: training/SGD/mul_4 = Mul[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:GPU:0"](training/SGD/ReadVariableOp_11, training/SGD/mul_4/ReadVariableOp)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

	 [[Node: loss/mul/_47 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_282_loss/mul", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.


In [None]:
f = np.load(filename)
test_data = f[f.files[0]]

# prediction with normal data
model = load_model.save('/data/sequential/weights/model.h5')
y = model.predict(test_data)
mse = mean_squared_error(data, decoded_imgs)
print('MSE: ' + str(mse))
print()

# prediction with shuffled data
np.random.shuffle(test_data)
y = model.predict(test_data)
mse = mean_squared_error(data, decoded_imgs)
print('MSE: ' + str(mse))
print()
print('m')