In [1]:
import sys
import os
from os.path import abspath
import numpy as np
import pandas as pd
from utils.generate_network import generate_network
from utils.prepare_data import prepare_data
from utils.popphy_io import save_params, load_params
from utils.popphy_io import get_stat, get_stat_dict
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from models.PopPhy import PopPhyCNN
import warnings
from datetime import datetime
import json
import warnings
warnings.filterwarnings("ignore")

import tensorflow as tf
#from models.PopPhy2 import ResNet
from models.Siamese import ResNet

# Data Preparation

### Reading Configuration
Configure which dataset to read, the minimum threshold an OTU must reach (at least one individual sample must have at least the set threshold proportion), and the number of folds for k-fold cross-validation.

In [2]:
# Name of the dataset directory; the data path is built as "../data/" + dataset.
dataset = 'SKG-abx'
# Abundance cutoff: OTUs that stay below this value in every sample are dropped.
threshold = 0.1
# Number of folds for k-fold cross validation.
k = 5

### Reduce Features
Reduce the number of OTU features by filtering out every OTU for which no individual sample has a relative abundance at or above the set threshold.

In [3]:
# Locate the dataset directory and read the abundance table. The first column
# becomes the row index and the file is read without a header row.
path = "../data/" + dataset
data = pd.read_csv(f"{path}/abundance.tsv", index_col=0, sep='\t', header=None)

# Flag the rows whose value is below the threshold in every column, then
# remove those rows from the table by index label.
# NOTE(review): if the table holds raw counts rather than relative abundances,
# a threshold of 0.1 only removes rows that are zero everywhere -- confirm.
below_threshold_everywhere = (data < threshold).all(axis=1)
to_drop = data.loc[below_threshold_everywhere]
data = data.drop(index=to_drop.index)

# Display the filtered table as the cell output.
data

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,118,119,120,121,122,123,124,125,126,127
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Bacteroidales|f__Bacteroidaceae|g__Bacteroides|s__Bacteroides_acidifaciens,35,712,107,0,616,609,1129,397,209,1490,...,2941,1171,1891,716,588,26567,18283,2126,554,20731
k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacterales|f__Enterobacteriaceae|g__unclassified|s__unclassified_unclassified,0,0,9,0,0,0,0,0,0,0,...,4,2,0,0,0,9049,11919,0,0,11064
k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_murinus,294,258,318,258,403,446,914,761,6,78,...,920,721,437,509,6097,2,0,1543,1333,0
k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_johnsonii,15,135,92,470,125,54,30,26,52,151,...,1161,2464,50,1215,983,0,0,73,321,0
k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Bacteroidales|f__Rikenellaceae|g__Alistipes|s__Alistipes_unclassified,46,106,12,373,52,73,98,0,13,40,...,92,28,242,185,91,0,0,277,997,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Bacteroidales|f__Bacteroidaceae|g__Bacteroides|s__Bacteroides_unclassified,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridia UCG-014|f__unclassified|g__unclassified|s__unclassified_unclassified,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridia vadinBB60 group|f__unclassified|g__unclassified|s__unclassified_unclassified,0,0,0,0,0,0,0,0,0,0,...,6,0,0,0,0,0,0,0,0,0
k__Bacteria|p__Desulfobacterota|c__Desulfovibrionia|o__Desulfovibrionales|f__Desulfovibrionaceae|g__Desulfovibrio|s__Desulfovibrio_unclassified,0,0,0,0,0,0,0,0,0,0,...,0,0,0,12,0,0,0,0,0,0


### Create 2d Matrix Representing OTU Data
The PopPhy-CNN algorithm of Dai et al. (2019) creates a phylogenetic tree from the OTUs and populates the tree with the OTU abundances. This tree structure is then converted to a 2D matrix by pushing all parent nodes in the tree to the left and placing their children, from left to right, in the same order in which the parents were ordered.

In [4]:
# Build the model inputs from the filtered abundance table via the project
# helper prepare_data. Of the returned values, only `my_maps` and `labels` are
# consumed by the cells shown in this notebook.
# NOTE(review): presumably `my_maps` holds one 2D tree matrix per sample and
# `labels` one class label per sample -- confirm against utils.prepare_data.
my_maps, raw_x, tree_x, raw_features, tree_features, labels, label_set, g, feature_df = prepare_data(path, data)

# Preview the first entry of `my_maps` as a table.
pd.DataFrame(my_maps[0])

(692, 127)
There are 692 raw features...
Building tree structure...
Tree file not found...
Constructing tree..
Pruning Tree...
Populating trees...
There are 337 tree features...


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,580,581,582,583,584,585,586,587,588,589
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.014875,0.357585,0.046097,0.001915,0.078645,0.498527,0.002356,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.014875,0.100589,0.031811,0.0,0.225184,0.046097,0.0,0.001915,0.078645,0.465685,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.014875,0.006186,0.094404,0.031811,0.0,0.0,0.0,0.0,0.0,0.225184,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.014875,0.006186,0.094404,0.000295,0.031517,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.014875,0.006186,0.094404,0.000295,0.031517,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.014875,0.0,0.006186,0.0,0.0,0.0,0.0,0.094404,0.000295,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001473,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Creating training and test sets
Splitting data into k training and k test sets

In [5]:
# Build k train/test splits from the sample matrices and their labels.
#
# Outputs (lists of length k, aligned by position):
#   x_train / y_train -- all folds except the held-out one, concatenated
#   x_test  / y_test  -- the held-out fold
# groups_input / groups_target keep the individual folds.

# One-hot encode the labels (2 classes). The samples are taken as an array so
# that shuffling below produces a copy instead of reordering `my_maps` in
# place, which keeps this cell idempotent when it is re-run.
inputs = np.asarray(my_maps)
target = tf.keras.utils.to_categorical(labels, 2, dtype='int64')

n_samples = len(inputs)
if len(target) != n_samples:
    raise ValueError(
        "number of samples (%d) and labels (%d) differ" % (n_samples, len(target))
    )
if not 2 <= k <= n_samples:
    raise ValueError(
        "k must be between 2 and the number of samples (%d), got %r" % (n_samples, k)
    )

# Shuffle samples and labels with ONE shared permutation so they cannot get
# out of step. The seed is fixed so the split is reproducible between runs.
seed = 42
order = np.random.RandomState(seed).permutation(n_samples)
inputs = inputs[order]
target = target[order]

# Cut the shuffled data into k contiguous folds of n_samples // k items each;
# the last fold additionally takes whatever remainder is left.
k_size = n_samples // k
bounds = [fold * k_size for fold in range(k)] + [n_samples]
groups_input = [inputs[lo:hi] for lo, hi in zip(bounds[:-1], bounds[1:])]
groups_target = [target[lo:hi] for lo, hi in zip(bounds[:-1], bounds[1:])]

# For every fold i (visited from the last fold down to the first), train on
# the other k-1 folds and test on fold i. The training folds are taken in the
# order i-1, i-2, ... (wrapping around), which works for any k and never
# includes the held-out fold.
x_train = []
y_train = []
x_test = []
y_test = []
for i in range(k - 1, -1, -1):
    train_folds = [(i - offset) % k for offset in range(1, k)]
    x_train.append(np.concatenate([groups_input[j] for j in train_folds]))
    y_train.append(np.concatenate([groups_target[j] for j in train_folds]))

    x_test.append(groups_input[i])
    y_test.append(groups_target[i])

# Model

### Training model
Uses a ConvNet that employs skipped residual identity blocks borrowed from the classic ResNet model. A fresh model is trained on each fold's training set and evaluated on that fold's test set.

In [6]:
# Per-fold metrics; each entry is [auc_roc, auc_pr, f1, mcc] for one fold.
data_lst = []

# Train and evaluate one freshly initialised model per train/test split.
for x_train1, y_train1, x_test1, y_test1 in zip(x_train, y_train, x_test, y_test):
    # Input height/width are read from axes 1 and 2 of the training array.
    model = ResNet(height = x_train1.shape[1], width = x_train1.shape[2], channels = 1, classes = 2)
    model.init_model()

    # NOTE(review): the held-out fold is also passed to train(); if train()
    # uses it for early stopping or model selection, the metrics computed
    # below on the same fold are optimistic -- confirm in models.Siamese.
    model.train(x_train1, y_train1, x_test1, y_test1, dataset, use_weights = False)
    y_pred = model.predict(x_test1)
    auc_roc, auc_pr, f1, mcc = model.evaluate(y_test1, y_pred)
    data_lst.append([auc_roc, auc_pr, f1, mcc])

    #model.model.save_weights(path + "/model_weights.h5")

    # Show the true one-hot labels next to the predictions for this fold.
    print(y_test1)
    print(y_pred)

# Architecture of the last trained model. summary() prints the table itself
# and returns None, so it is not wrapped in print() (that would add a stray
# "None" line to the output).
model.model.summary()

Metal device set to: Apple M1 Pro
Epoch 1/65
Epoch 2/65
Epoch 3/65
Epoch 4/65
Epoch 5/65
Epoch 6/65
Epoch 7/65
Epoch 8/65
Epoch 9/65
Epoch 10/65
Epoch 11/65
Epoch 12/65
Epoch 13/65
Epoch 14/65
Epoch 15/65
Epoch 16/65
Epoch 17/65
Epoch 18/65
Epoch 19/65
Epoch 20/65
Epoch 21/65
Epoch 22/65
Epoch 23/65
Epoch 24/65
Epoch 25/65
Epoch 26/65
Epoch 27/65
Epoch 28/65
Epoch 29/65
Epoch 30/65
Epoch 31/65
Epoch 32/65
Epoch 33/65
Epoch 34/65
Epoch 35/65
Epoch 36/65
Epoch 37/65
Epoch 38/65
Epoch 39/65
Epoch 40/65
Epoch 41/65
Epoch 42/65
Epoch 43/65
Epoch 44/65
Epoch 45/65
Epoch 46/65
Epoch 47/65
Epoch 48/65
Epoch 49/65
Epoch 50/65
Epoch 51/65
Epoch 52/65
Epoch 53/65
Epoch 54/65
Epoch 55/65
Epoch 56/65
Epoch 57/65
Epoch 58/65
Epoch 59/65
Epoch 60/65
Epoch 61/65
Epoch 62/65
Epoch 63/65
Epoch 64/65
Epoch 65/65
auc_roc: 0.9444444444444444
auc_pr: 0.9582166947975773
f1_score: 0.16666666666666666
mcc: 0.0
[[0 1]
 [0 1]
 [0 1]
 [0 1]
 [1 0]
 [0 1]
 [0 1]
 [0 1]
 [1 0]
 [0 1]
 [1 0]
 [1 0]
 [0 1]
 [1 0]
 [0

# Saving Metrics

Option to save the results of all k folds and the weights of the last model into the same directory as the data.

In [7]:
# Tabulate the cross-validation metrics: one row per metric and one column per
# fold, with the folds labelled "1" .. "k".
metric_names = ['auc(roc)', 'auc(pr)', 'f1', 'mcc']
col = list(map(str, range(1, k + 1)))

results_df = pd.DataFrame(data_lst, columns=metric_names).T
results_df.columns = col

#results_df.to_csv(path + "/results.csv")
results_df

Unnamed: 0,1,2,3,4,5
auc(roc),0.944444,0.883333,0.853333,0.547794,0.785088
auc(pr),0.958217,0.86775,0.847373,0.585223,0.729551
f1,0.166667,0.310294,0.228571,0.155152,0.665668
mcc,0.0,0.166667,0.0,0.0,0.256606


In [8]:
#model.model.save_weights(path + "/model_weights.h5")


## 