In [1]:
#!/usr/bin/env python

'''
DESCRIPTION
-----------
    Training model and exporting trained model
    
RETURN
------
    {MODEL}.h5 : h5 file
        Trained model
    {MODEL-RESULT}.csv : csv file
        The model result with probabilities, prediction label and ground truth

EXPORTED FILE(s) LOCATION
-------------------------
    ./models/{NN or CV}/{EXPERIMENT}/{MODEL}.h5
    ./models/{NN or CV}/{EXPERIMENT}/{MODEL-RESULT}.csv
'''

# importing default libraries
# import os, argparse, sys
# sys.path.append('./')
import os, sys
# Resolve the project root as the parent of the directory the notebook was
# started from. '__file__' is deliberately a plain string here (notebooks do
# not define the __file__ variable): abspath() resolves it against the current
# working directory, so the two dirname() calls yield the parent of the cwd.
#
# Because the result depends on the cwd, and the cwd is changed right below,
# recomputing it on a second run of this cell would climb one more directory.
# Reusing an already-defined ROOT_DIR keeps the cell safe to re-run.
if 'ROOT_DIR' not in globals():
    ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath('__file__')))
os.chdir(ROOT_DIR)
# Avoid stacking duplicate entries on sys.path when the cell is re-executed.
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

In [2]:
# importing scripts in scripts folder
from scripts import config as src
# importing default libraries
import numpy as np
import pandas as pd
import datetime as dt

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import KFold, train_test_split, LeaveOneGroupOut, LeaveOneGroupOut
from sklearn.preprocessing import OneHotEncoder

import tensorflow as tf
from tensorflow import keras
from numba import cuda

# ---- Default values used for the paper design ----
# Training: number of epochs, mini-batch size, share of the training data
# held out by Keras for validation.
epochs_default, batch_default, val_split = 100, 10, 0.1

# Splitting: seed and shuffle flag shared by the splitters, hold-out fraction
# for train_test_split, number of folds for KFold.
rand_state, shuffle_ = 91, True
test_size = 0.3
kf_split = 5

# Wall-clock start time, formatted as HH:MM:SS.
time_start = dt.datetime.now().strftime('%H:%M:%S')

**** scripts/config.py IMPORTED!!!
**** PROJECT FOLDER ,  /home/pgundogdu/projects/signalization_prior_knowledge_based_nn


In [3]:
# ---- Experiment selection ----
# Alternative configuration kept for reference (melanoma experiment):
#   dataset = 'reference_log1p.pck' ; bio_knowledge = 'pbk_layer_hsa.txt'
#   NN_or_CV = 'NN'                 ; experiment = 'exper_melanoma'
experiment = 'exper_immune'
NN_or_CV = 'NN'
dataset = 'exper_immune_raw_sw_log1p.pck'
bio_knowledge = 'pbk_layer_hsa.txt'

# Output location: <DIR_MODELS>/<NN_or_CV>/<experiment>
loc_output = os.path.join(src.DIR_MODELS, NN_or_CV, experiment)
# Kept as the last expression so the notebook displays its return value.
# NOTE(review): define_folder presumably creates the folder if missing — confirm in scripts/config.py.
src.define_folder(loc_=loc_output)

FOLDER information,  ./models/NN/exper_immune/


'./models/NN/exper_immune/'

In [16]:
# ---- Load the expression dataset and the prior biological knowledge ----
# The file format is read from the extension of `dataset`. splitext keeps
# working when the file name contains more than one dot, where
# dataset.split('.')[1] would return a middle part of the name.
file_format = os.path.splitext(dataset)[1].lstrip('.')
print('FILE FORMAT, ', file_format)

path_dataset = os.path.join(src.DIR_DATA_PROCESSED, experiment, dataset)
if file_format == 'pck':
    # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
    df_raw = pd.read_pickle(path_dataset)
    # Cast every column except the last one (the label column) to float.
    df_raw = pd.concat([df_raw.iloc[:, :-1].astype(float), df_raw.iloc[:, -1]], axis=1)
else:
    # Fixed: this branch referenced the undefined name `srp` instead of `src`.
    df_raw = pd.read_csv(path_dataset)

# SORTING GENE LIST: order the gene columns alphabetically, label column stays last.
sort_genes = sorted(df_raw.columns[:-1])
sort_genes.extend(df_raw.columns[-1:])
df_raw = df_raw[sort_genes]

# Import the prior biological knowledge and keep only the rows whose index
# (gene name) is also a column of the dataset. The frame is sorted by index,
# which is the same ordering applied to the dataset columns above.
list_gene = None
# Defined up front so the summary below never hits an undefined (or stale,
# left over from a previous run) variable when no prior knowledge is used.
df_bio_filtered = None
if bio_knowledge is not None:
    df_bio = pd.read_csv(os.path.join(src.DIR_DATA_PROCESSED, bio_knowledge), index_col=0).sort_index()
    df_bio_filtered = df_bio.loc[df_bio.index.isin(df_raw.columns), :]

print('Dataset cell type, ', df_raw.groupby('cell_type').size())
print('\nDataset shape             , ', df_raw.shape)
if df_bio_filtered is not None:
    print('Biological knowledge shape, ', df_bio_filtered.shape)

# Printed side by side so the gene order of both frames can be compared by eye.
print('\nDataset gene order top 10              ,', list(df_raw.columns[:10]))
if df_bio_filtered is not None:
    print('Biological knowledge gene order top 10, ', list(df_bio_filtered.index[:10].values))

FILE FORMAT,  pck
Dataset cell type,  cell_type
DC              957
ILC             574
NK               74
Tcell         23484
macrophage      503
dtype: int64

Dataset shape             ,  (25592, 2090)
Biological knowledge shape,  (2089, 93)

Dataset gene order top 10              , ['a2m', 'abcb4', 'abcc2', 'abhd5', 'abi2', 'abl1', 'abl2', 'ablim1', 'ablim3', 'acaa1']
Biological knowledge gene order top 10,  ['a2m', 'abcb4', 'abcc2', 'abhd5', 'abi2', 'abl1', 'abl2', 'ablim1', 'ablim3', 'acaa1']


In [17]:
# Features: every column except the last one. Labels: the last column, kept
# two-dimensional (n_samples, 1) because that is the layout the encoder is fed.
X = df_raw.iloc[:, :-1].values
y = df_raw.iloc[:, -1:].values

# Fit the encoder on the labels, then produce the dense one-hot matrix.
ohe = OneHotEncoder().fit(y)
y_ohe = ohe.transform(y).toarray()

# Shapes of features and labels, one per line.
print(X.shape, y.shape, sep='\n')

(25592, 2089)
(25592, 1)


In [18]:
# Early stopping: end training when `val_loss` is no longer improving, where
# "no longer improving" means no better than 1e-5 less, for at least 3 epochs.
early_stopping = keras.callbacks.EarlyStopping(
    monitor="val_loss",
    min_delta=1e-5,
    patience=3,
    verbose=1,
)
callbacks = [early_stopping]

In [19]:
# ---- Build the train/test partitions ----
# Supported strategies: 'KFold', 'train_test_split', 'LOGO' (leave-one-group-out,
# with the class label used as the group). Every strategy fills the four lists
# below with one entry per split, so the training loop can treat them alike.
split = 'train_test_split'
X_train, y_train, X_test, y_test = [], [], [], []

if split == 'KFold':

    kf = KFold(n_splits=kf_split, shuffle=shuffle_, random_state=rand_state)

    print('KFold split applied!! The number of KFold is {}'.format(kf.get_n_splits()))
    for train_index, test_index in kf.split(X, y):
        print(train_index, len(train_index))

        X_train.append(X[train_index])
        X_test.append(X[test_index])
        y_train.append(y_ohe[train_index])
        y_test.append(y_ohe[test_index])

    # Folds can differ in length by one row, so they cannot be stacked into a
    # single rectangular array; report the shape of each fold instead.
    print([fold.shape for fold in X_train])

elif split == 'train_test_split':
    print('train_test_split split applied! Test size is, ', test_size)
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y
                                                    , test_size=test_size
                                                    , shuffle=shuffle_
                                                    , random_state=rand_state)

    X_train.append(Xtrain)
    X_test.append(Xtest)
    # The raw labels were split, so encode them with the already fitted encoder.
    y_train.append(ohe.transform(ytrain).toarray())
    y_test.append(ohe.transform(ytest).toarray())

    # Same (n_splits, n_samples, n_features) report as before, without copying
    # the whole training matrix into a new array just to read its shape.
    print((len(X_train),) + X_train[0].shape)

elif split == 'LOGO':
    logo = LeaveOneGroupOut()
    # y is (n_samples, 1); flatten it to use the class label as the group id.
    for train_index, test_index in logo.split(X, y, y.ravel()):
        print("TRAIN:", len(train_index), "TEST:", len(test_index))
        X_train.append(X[train_index])
        X_test.append(X[test_index])
        y_train.append(y_ohe[train_index])
        y_test.append(y_ohe[test_index])

else:
    # Fail loudly: an unrecognised name would otherwise leave the lists empty
    # and the training loop would silently do nothing.
    raise ValueError("Unknown split strategy {!r}; expected 'KFold', 'train_test_split' or 'LOGO'".format(split))

train_test_split split applied! Test size is,  0.3
(1, 17914, 2089)


In [21]:
# Placeholder for per-split prediction results. Nothing is appended to it at the
# moment because the evaluation/export step is disabled (see TODO below).
df_nn = pd.DataFrame()


def _fit_and_save(design_name, split_index, **design_kwargs):
    """Build, train and save one prior-knowledge network for a single split.

    Parameters
    ----------
    design_name : str
        Tag used in the exported file name ('a1' or 'a2').
    split_index : int
        Position of the split inside X_train / y_train.
    **design_kwargs
        Extra keyword arguments forwarded to src.proposed_NN
        (e.g. second_layer=True for design a2).

    Returns
    -------
    The fitted model object returned by src.proposed_NN.
    """
    started = dt.datetime.now()
    # NOTE(review): the full X / y are passed here, not the split; presumably
    # proposed_NN only reads their shapes to size the network — confirm.
    model = src.proposed_NN(X=X, y=y, bio_layer=df_bio_filtered, design_type='bio', **design_kwargs)
    model.fit(X_train[split_index], y_train[split_index]
              , epochs=epochs_default
              , batch_size=batch_default
              , verbose=1
              , callbacks=callbacks
              , validation_split=val_split)
    # Save right after training so a failure in a later model does not lose this one.
    file_name = 'model_' + design_name + '_' + dataset.split('.')[0] + '_' + split + '_' + str(split_index) + '.h5'
    model.save(os.path.join(loc_output, file_name))
    print('\nELAPSED TIME, ', dt.datetime.now() - started)
    return model


for i in range(len(X_train)):
    print('Split {} of {}'.format(i + 1, len(X_train)))
    # Design a1: default proposed_NN design; a2: same call with second_layer=True.
    model_a1 = _fit_and_save('a1', i)
    model_a2 = _fit_and_save('a2', i, second_layer=True)

# TODO(review): evaluation on the test split and the export of
# 'model_result_<split>.csv' are disabled, so only the .h5 models are written.
# When restoring them, predict on X_test[i] for both designs: the earlier
# disabled draft for a2 predicted on X_train[i] while taking the ground truth
# from y_test, so predictions and labels would not line up.

1
  -> Network designed with prior biological knowledge with 93 nodes in first hidden layer.
------------- NETWORK DESIGN - ARGUMENTS -------------
-- X.shape                  , (25592, 2089)
-- y.shape                  , (25592, 1)
-- bio_layer.shape          , (2089, 93)
-- design_type              , bio
------------- NETWORK DESIGN - CALCULATED -------------
-- input_size               , 2089
-- first_hidden_layer_size  , 93
-- size_output_layer        , 5


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3')
first_hidden_layer_size 93
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
layer1 (Dense)               (None, 93)                194370    
____________________________________________

In [22]:
# Close the CUDA contexts held by this kernel's thread via numba.
# Uses the `cuda` module already imported with the other imports at the top of
# the notebook instead of importing numba again in the last cell.
cuda.close()