In [1]:
#!/usr/bin/env python

'''
DESCRIPTION
-----------
    Training model and exporting trained model
    
RETURN
------
    {MODEL}.h5 : h5 file
        Trained model
    {MODEL-RESULT}.csv : csv file
        The model result with probabilities, prediction label and ground truth

EXPORTED FILE(s) LOCATION
-------------------------
    ./models/{NN or CV}/{EXPERIMENT}/{MODEL}.h5
    ./models/{NN or CV}/{EXPERIMENT}/{MODEL-RESULT}.csv
'''

# importing default libraries
# import os, argparse, sys
# sys.path.append('./')
import os, sys
def resolve_root_dir(start=None, marker='scripts'):
    """Return the project root directory for this notebook.

    The root is the directory that contains the ``marker`` sub-folder (the
    ``scripts`` package imported by the next cell). When ``start`` already
    contains it, ``start`` itself is returned; otherwise the parent of
    ``start`` is returned, which is what the original one-liner
    (``dirname(dirname(abspath('__file__')))``, i.e. the parent of the
    current working directory) always produced.

    Parameters
    ----------
    start : str or None
        Directory to start from; defaults to the current working directory.
    marker : str
        Name of the sub-folder that identifies the project root.

    Returns
    -------
    str
        Absolute path of the resolved root directory.
    """
    start = os.path.abspath(start or os.getcwd())
    # Already at the root (e.g. this cell is being re-run after the chdir below):
    # stay here instead of climbing one more level on every execution.
    if os.path.isdir(os.path.join(start, marker)):
        return start
    return os.path.dirname(start)


ROOT_DIR = resolve_root_dir()
os.chdir(ROOT_DIR)
# Guarded so that re-running the cell does not pile up duplicate entries.
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

In [2]:
# importing scripts in scripts folder
from scripts import settings as ssrp, dataset_scripts as dsrp, path_scripts as psrp, model_scripts as msrp, nn_design_scripts as nnsrp
# importing default libraries
import numpy as np
import pandas as pd
import datetime as dt

import warnings
# Silences every warning category for the whole kernel session. The call sits
# before the sklearn / tensorflow imports below, so warnings emitted while those
# libraries are imported are hidden as well.
# NOTE(review): this also hides deprecation and data-conversion warnings raised
# later in the notebook - consider narrowing the filter while debugging.
warnings.filterwarnings('ignore')

from sklearn.model_selection import KFold, train_test_split, LeaveOneGroupOut, RepeatedStratifiedKFold
from sklearn.preprocessing import OneHotEncoder

import tensorflow as tf
from tensorflow import keras
from numba import cuda

# ----------------------------------------------------------------------
# Default values for the paper design
# ----------------------------------------------------------------------
# Training defaults, forwarded to nnsrp.NN_design as size_epochs / size_batch.
epochs_default, batch_default = 100, 10

# Cross-validation set-up.
rand_state = 91    # random_state handed to the fold splitter
shuffle_ = True    # shuffle flag (not referenced by the visible cells)
n_split = 5        # number of splits
n_repeat = 1       # number of repetitions

# Wall-clock time (HH:MM:SS) at which this cell was executed.
time_start = dt.datetime.now().strftime('%H:%M:%S')

scripts/settings.py - PATHS IMPORTED!!!


In [3]:
# Experiment configuration -------------------------------------------------
# `dataset` is the file read from <DIR_DATA_PROCESSED>/<experiment>/ and
# `bio_knowledge` the file read from <DIR_DATA_PROCESSED>/ by the loading cell.
dataset = 'Immune_sw_log1p.pck'
bio_knowledge = 'pbk_layer_hsa.txt'
# Both values below become path components of the output location.
NN_or_CV = 'CV'
experiment = 'exper_pbmc'

# Output location: <DIR_MODELS>/<NN_or_CV>/<experiment>
output_parts = (ssrp.DIR_MODELS, NN_or_CV, experiment)
loc_output = os.path.join(*output_parts)
# The return value is the cell output (the folder path is displayed).
# NOTE(review): presumably define_folder creates the folder when it is
# missing - confirm in scripts/path_scripts.
psrp.define_folder(loc_=loc_output)

'./models/CV/exper_pbmc/'

In [5]:
# Derive the file format from the extension. os.path.splitext looks at the last
# dot only, so file names containing additional dots are still recognised
# (the previous `dataset.split('.')[1]` picked the text after the FIRST dot).
file_format = os.path.splitext(dataset)[1].lstrip('.')
print('FILE FORMAT, ', file_format)

path_dataset = os.path.join(ssrp.DIR_DATA_PROCESSED, experiment, dataset)
if file_format == 'pck':
    # NOTE: unpickling can execute arbitrary code - only load trusted files.
    df_processed = pd.read_pickle(path_dataset)
    # Cast every column except the last one to float; the last column is
    # re-attached unchanged.
    df_processed = pd.concat([df_processed.iloc[:, :-1].astype(float),
                              df_processed.iloc[:, -1]], axis=1)
else:
    df_processed = pd.read_csv(path_dataset)

# Sort the feature columns alphabetically and keep the last column last.
sort_genes = sorted(df_processed.columns[:-1])
sort_genes.extend(df_processed.columns[-1:])
df_processed = df_processed[sort_genes]

# Import the prior biological knowledge and keep only the rows whose index
# label also appears among the dataset columns.
list_gene = None
# Defined up-front so the summary below does not hit a NameError when no
# biological knowledge file is configured.
df_bio_filtered = None
if bio_knowledge is not None:
    df_bio = pd.read_csv(os.path.join(ssrp.DIR_DATA_PROCESSED, bio_knowledge), index_col=0).sort_index()
    df_bio_filtered = df_bio.loc[df_bio.index.isin(df_processed.columns), :]

print('Dataset cell type,\n',df_processed['cell_type'].value_counts())
print('\nDataset shape             , ', df_processed.shape)
print('Biological knowledge shape, ', None if df_bio_filtered is None else df_bio_filtered.shape)

FILE FORMAT,  pck
Dataset cell type,
 CD14      2500
Treg      2500
CD34      2500
Cyt       2500
CD56      2500
Memory    2500
CD19      2500
Name: cell_type, dtype: int64

Dataset shape             ,  (17500, 2349)
Biological knowledge shape,  (2348, 93)


In [6]:
ohe = OneHotEncoder()

# Features: every column except the last one. Kept as a DataFrame here only to
# compare its column labels with the biological-knowledge index.
X_frame = df_processed.iloc[:, :-1]

# Index.equals compares elements AND order, and simply returns False when the
# lengths differ (an element-wise `==` raises in that case).
if X_frame.columns.equals(df_bio_filtered.index):
    print('Dataset and biological info are same ordered!')
else:
    # Previously a mismatch passed silently; make it visible before training.
    print('WARNING: dataset columns and biological info index are NOT identically ordered!')

# The cells below index X positionally with the fold indices and use y, y_ohe,
# groups and the fitted encoder, so they are all defined here instead of
# relying on leftover kernel state.
X = X_frame.values
y = df_processed.iloc[:, -1:].values      # 2-D (n_samples, 1): last column of the dataset
y_ohe = ohe.fit_transform(y).toarray()    # dense one-hot matrix; also fits `ohe`
groups = y.ravel()                        # 1-D view of the labels

Dataset and biological info are same ordered!


In [34]:
# One list entry per cross-validation fold.
X_train, y_train, X_test, y_test = [], [], [], []

rskf = RepeatedStratifiedKFold(n_splits=n_split, n_repeats=n_repeat, random_state=rand_state)
for i, (train_index, test_index) in enumerate(rskf.split(X, y, groups=y)):
    # Timestamp followed by a peek at the first ten and last three indices.
    print(dt.datetime.now().time().strftime('%H:%M:%S'))
    for window in (slice(None, 10), slice(-3, None)):
        print(i, " - TRAIN:", train_index[window], "TEST:", test_index[window])

    # Fold sizes: test first, then train.
    print(len(test_index), len(train_index))

    # Features come from X, one-hot targets from y_ohe.
    for bucket, source, rows in ((X_train, X, train_index),
                                 (X_test, X, test_index),
                                 (y_train, y_ohe, train_index),
                                 (y_test, y_ohe, test_index)):
        bucket.append(source[rows])




18:55:00
0  - TRAIN: [ 0  3  4  6  8  9 10 13 14 16] TEST: [ 1  2  5  7 11 12 15 19 22 30]
0  - TRAIN: [17496 17497 17498] TEST: [17490 17495 17499]
3500 14000
18:55:01
1  - TRAIN: [0 1 2 3 4 5 6 7 8 9] TEST: [13 17 23 24 28 29 34 42 52 56]
1  - TRAIN: [17497 17498 17499] TEST: [17475 17488 17489]
3500 14000
18:55:01
2  - TRAIN: [ 0  1  2  3  5  6  7  9 11 12] TEST: [ 4  8 10 14 21 27 36 41 47 48]
2  - TRAIN: [17497 17498 17499] TEST: [17485 17492 17494]
3500 14000
18:55:01
3  - TRAIN: [ 0  1  2  4  5  7  8 10 11 12] TEST: [ 3  6  9 20 31 37 39 43 46 49]
3  - TRAIN: [17496 17498 17499] TEST: [17491 17493 17497]
3500 14000
18:55:01
4  - TRAIN: [ 1  2  3  4  5  6  7  8  9 10] TEST: [ 0 16 18 25 26 32 33 38 44 50]
4  - TRAIN: [17495 17497 17499] TEST: [17487 17496 17498]
3500 14000


In [None]:
def build_result_frame(y_pred, y_true_ohe, encoder, design, index_split, split):
    """Assemble the result table of one network design for one fold.

    Parameters
    ----------
    y_pred : array-like
        Per-class scores returned by the network for the test samples.
    y_true_ohe : array-like
        One-hot encoded ground truth of the same test samples.
    encoder : fitted encoder
        Provides ``categories_`` (column names) and ``inverse_transform``
        (scores / one-hot rows -> labels).
    design : str
        Value stored in the ``design`` column.
    index_split : int
        Value stored in the ``index_split`` column (fold index).
    split : int
        Value stored in the ``split`` column.

    Returns
    -------
    pandas.DataFrame
        One column per category, followed by ``prediction``, ``ground_truth``,
        ``design``, ``index_split`` and ``split``.
    """
    df_proba = pd.DataFrame(y_pred, columns=list(encoder.categories_[0]))
    df_pred = pd.DataFrame(encoder.inverse_transform(y_pred).ravel(), columns=['prediction'])
    df_ground_truth = pd.DataFrame(encoder.inverse_transform(y_true_ohe).ravel(), columns=['ground_truth'])
    df_result = pd.concat([df_proba, df_pred, df_ground_truth], axis=1)
    df_result['design'] = design
    df_result['index_split'] = index_split
    df_result['split'] = split
    return df_result


# Arguments shared by both network designs.
nn_kwargs = dict(groups=groups
                 , bio_layer=df_bio_filtered
                 , size_epochs=epochs_default
                 , size_batch=batch_default
                 , design_type='bio'
                 , val_split=0.1)

# Collect the per-fold tables and concatenate once at the end instead of
# re-copying a growing DataFrame on every iteration.
fold_frames = []
n_folds = len(X_train)
for i in range(n_folds):
    print('fold', i + 1, 'of', n_folds)

    # Design a1: default call.
    model_a1, y_pred_a1 = nnsrp.NN_design(train_X=X_train[i]
                                          , train_y=y_train[i]
                                          , test_X=X_test[i]
                                          , **nn_kwargs)
    # NOTE(review): the original code stored an undefined name `split` in the
    # 'split' column (NameError on a fresh kernel); the configured number of
    # splits is assumed to be the intended value - confirm.
    df_nn_a1 = build_result_frame(y_pred_a1, y_test[i], ohe, design='a1', index_split=i, split=n_split)
    fold_frames.append(df_nn_a1)

    # Design a2: same call with second_layer=True.
    model_a2, y_pred_a2 = nnsrp.NN_design(train_X=X_train[i]
                                          , train_y=y_train[i]
                                          , test_X=X_test[i]
                                          , second_layer=True
                                          , **nn_kwargs)
    df_nn_a2 = build_result_frame(y_pred_a2, y_test[i], ohe, design='a2', index_split=i, split=n_split)
    fold_frames.append(df_nn_a2)

# pd.concat raises on an empty list, so fall back to an empty frame.
df_nn = pd.concat(fold_frames) if fold_frames else pd.DataFrame()

# NOTE(review): `dataset` already carries its extension, so the file name ends
# in a double extension; the name is kept unchanged because other code may
# read it.
path_result = os.path.join(loc_output,'cv_result_'+dataset+'.pck')
df_nn.to_pickle(path_result)
print('file is exported in ', path_result)

In [35]:
# Display a stored cross-validation result table.
# NOTE(review): this hard-coded path points to the 'exper_human' experiment,
# not to the file exported by the training cell - confirm it is intentional.
# NOTE: unpickling can execute arbitrary code - only load trusted files.
path_previous_result = './models/CV/exper_human/model_result_cv.pck'
pd.read_pickle(path_previous_result)


Unnamed: 0,B.cell,Macrophage,NK,T.CD4,T.CD8,prediction,ground_truth,design,index_split
0,9.999958e-01,9.452867e-08,3.971469e-07,1.594969e-06,0.000002,B.cell,B.cell,a1,0
1,9.999980e-01,2.505487e-08,1.251975e-07,6.607949e-07,0.000001,B.cell,B.cell,a1,0
2,9.961768e-01,5.064089e-04,1.341920e-05,7.382916e-04,0.002565,B.cell,B.cell,a1,0
3,9.999955e-01,2.200485e-07,1.002529e-06,7.148037e-08,0.000003,B.cell,B.cell,a1,0
4,9.999388e-01,8.349419e-08,9.761348e-08,5.372179e-05,0.000007,B.cell,B.cell,a1,0
...,...,...,...,...,...,...,...,...,...
547,2.781300e-06,1.260625e-06,1.772120e-06,1.605211e-07,0.999994,T.CD8,T.CD8,a1,4
548,2.317980e-04,2.590363e-04,3.909196e-07,2.876044e-02,0.970748,T.CD8,T.CD8,a1,4
549,1.503143e-05,1.101189e-04,1.239591e-04,6.594583e-02,0.933805,T.CD8,T.CD8,a1,4
550,5.320607e-07,6.055379e-08,3.945011e-08,4.884876e-05,0.999951,T.CD8,T.CD8,a1,4
