In [1]:
import sys
import os
from os.path import abspath
import numpy as np
import pandas as pd
from utils.generate_network import generate_network
from utils.prepare_data import prepare_data
from utils.popphy_io import save_params, load_params
from utils.popphy_io import get_stat, get_stat_dict
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from models.PopPhy import PopPhyCNN
import warnings
from datetime import datetime
import json
import warnings
warnings.filterwarnings("ignore")

import tensorflow as tf
#from models.PopPhy2 import ResNet
from models.PopPhy2 import ResNet

# Data Preparation

### Reading Configuration
Configure which dataset to read in, the minimum relative-abundance threshold for keeping an OTU (at least one individual sample must reach the threshold), and the number of folds k for k-fold cross-validation.

In [2]:
# Name of the dataset sub-directory; it is appended to "../data/" when the
# abundance table is loaded and is also passed to the model's train() call.
dataset = 'T2D'
# Relative-abundance cutoff: OTU rows whose values are all below this are dropped.
# With 0, no row can be entirely below the cutoff unless it holds negative values.
threshold = 0
# Number of cross-validation folds (used by the manual split and StratifiedKFold).
k = 5

### Reduce Features
Reduce the number of OTU features by filtering out OTUs for which no individual sample has a relative abundance of at least the set threshold (i.e. every sample is below the threshold).

In [3]:
# Read the tab-separated abundance table of the selected dataset. The file has
# no header row, and its first column is used as the row index.
path = f"../data/{dataset}"
data = pd.read_csv(f"{path}/abundance.tsv", sep='\t', header=None, index_col=0)

# Collect the rows in which every value lies below the threshold, then remove
# them from the table by their index labels.
below_threshold_everywhere = data.lt(threshold).all(axis=1)
to_drop = data.loc[below_threshold_everywhere]
data = data.drop(to_drop.index)

data

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,431,432,433,434,435,436,437,438,439,440
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanobrevibacter|s__Methanobrevibacter_smithii,0.33364,0.49776,0.0,0.0,0.49446,0.0,0.0,0.0,0.0,0.00000,...,0.0,1.76247,0.0,2.96027,7.44320,0.02598,2.78607,2.46789,6.72433,0.0
k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanobrevibacter|s__Methanobrevibacter_unclassified,0.00000,0.12802,0.0,0.0,0.06786,0.0,0.0,0.0,0.0,0.00000,...,0.0,0.00000,0.0,0.00000,0.00000,0.00000,0.00000,0.00000,0.07156,0.0
k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanosphaera|s__Methanosphaera_stadtmanae,0.00000,0.00000,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.00000,...,0.0,0.55541,0.0,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.0
k__Bacteria|p__Acidobacteria|c__Acidobacteriia|o__Acidobacteriales|f__Acidobacteriaceae|g__Acidobacteriaceae_unclassified,0.00000,0.00000,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.00000,...,0.0,0.00000,0.0,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.0
k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Actinomycetaceae|g__Actinomyces|s__Actinomyces_graevenitzii,0.00000,0.00000,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.01089,...,0.0,0.00000,0.0,0.00000,0.06781,0.00000,0.00000,0.00000,0.00000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
k__Bacteria|p__Planctomycetes|c__Planctomycetia|o__Planctomycetales|f__Planctomycetaceae|g__Rhodopirellula|s__Rhodopirellula_unclassified,0.00000,0.00000,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.00000,...,0.0,0.00000,0.0,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.0
k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Vibrionales|f__Vibrionaceae|g__Vibrio|s__Vibrio_furnissii,0.00000,0.00000,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.00000,...,0.0,0.00000,0.0,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.0
k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o__Bacteroidales|f__Bacteroidaceae|g__Bacteroides|s__Bacteroides_sp_2_2_4,0.00000,0.00000,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.00000,...,0.0,0.00000,0.0,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.0
k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Bacillaceae|g__Lysinibacillus|s__Lysinibacillus_fusiformis,0.00000,0.00000,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.00000,...,0.0,0.00000,0.0,0.00000,0.02423,0.00000,0.00000,0.00000,0.00000,0.0


### Create 2d Matrix Representing OTU Data
The PopPhy-CNN algorithm of Dai et al. (2019) builds a phylogenetic tree from the OTUs and populates it with the OTU abundances. This tree structure is then converted to a 2D matrix: at each level of the tree, the parent nodes are pushed to the left, and their children are placed in the next row in the same left-to-right order as their parents.

In [4]:
# Build every representation of the dataset from the filtered abundance table.
# prepare_data is a project helper (utils.prepare_data); per its printed log it
# reports the raw feature count, builds or loads the tree, and populates it.
# my_maps is indexed below as (sample, row, column) -- presumably one 2-D
# tree matrix per sample; TODO confirm against prepare_data.
my_maps, raw_x, tree_x, raw_features, tree_features, labels, label_set, g, feature_df = prepare_data(path, data)

# norms = np.linalg.norm(my_maps, axis=2, keepdims=True)
# my_maps = my_maps / norms
# Display the first entry of my_maps as a table for visual inspection.
pd.DataFrame(my_maps[0])

(606, 440)
There are 606 raw features...
Building tree structure...
Found tree file...
Populating trees...
There are 1054 tree features...


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,386,387,388,389,390,391,392,393,394,395
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.003336,0.996664,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.003336,0.0,0.0,0.0,0.02876,0.494275,0.079702,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.003336,0.0,0.0,0.0,0.019336,0.006233,0.0,0.003191,0.494275,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.003336,0.0,0.0,0.0,0.019336,0.0,0.006233,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.003336,0.0,0.0,0.0,0.0,0.0,0.0,0.019336,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.003336,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.003336,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.003336,0.0,0.0,0.0,0.0,0.00126,0.017147,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.024597,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Creating training and test sets
Splitting data into k training and k test sets

In [5]:
# Manual k-fold split: builds k (train, test) pairs in x_train / y_train /
# x_test / y_test from the shuffled data.
if k < 2:
    raise ValueError("k-fold cross validation needs k >= 2, got k = %d" % k)

# One-hot encode the labels (two classes).
# NOTE(review): `input` shadows the Python builtin of the same name; the name is
# kept because it is part of this notebook's global namespace. It is an alias of
# `my_maps` (same object), so the in-place shuffle of `my_maps` reorders it too.
input = my_maps
target = tf.keras.utils.to_categorical(labels, 2, dtype='int64')

# Shuffle every per-sample array in place. The generator is re-seeded with the
# same value before each shuffle so that each array receives the same sequence
# of random draws and the samples stay aligned across arrays.
# NOTE(review): the seed itself is drawn at random, so the split (and the
# StratifiedKFold that reuses `seed`) differs between kernel runs; pin it to a
# constant for reproducible results.
seed = np.random.randint(100)
for per_sample_array in (target, my_maps, raw_x, tree_x, labels):
    np.random.seed(seed)
    np.random.shuffle(per_sample_array)


# Cut the shuffled data into k consecutive groups of k_size samples; the last
# group also takes the remainder when len(input) is not a multiple of k.
groups_input = []
groups_target = []
k_size = len(input)//k
start, end = 0, k_size
for i in range(k):
    stop = None if i == k-1 else end
    groups_input.append(input[start:stop])
    groups_target.append(target[start:stop])
    start += k_size
    end += k_size

# For each fold i (from k-1 down to 0), test on group i and train on all of the
# other groups. The training groups are visited as i-1, i-2, ... (wrapping
# around), which reproduces the previous hard-coded order for k == 5 while
# staying correct for any k >= 2: no group is repeated, and the test group is
# never part of the training data.
x_train = []
y_train = []
x_test = []
y_test = []
for i in range(k-1, -1, -1):
    train_groups = [(i - offset) % k for offset in range(1, k)]
    x_train.append(np.concatenate([groups_input[j] for j in train_groups]))
    y_train.append(np.concatenate([groups_target[j] for j in train_groups]))

    x_test.append(groups_input[i])
    y_test.append(groups_target[i])

# Model

### Training model
The data is log-transformed and then scaled with a MinMax transformation fitted on the training fold. The model is a CNN built from residual identity blocks with skip connections, borrowed from the classic ResNet model, followed by a fully connected neural network that makes the phenotype prediction. The model dimensions are printed below.

In [6]:
# Stratified k-fold training and evaluation of the ResNet model.
# data_lst collects one [auc_roc, auc_pr, f1, mcc] row per fold.
data_lst = []

# One-hot encode the integer labels and record the 2-D shape of one sample.
n_values = np.max(labels) + 1
labels_oh = np.eye(n_values)[labels]
tree_row = my_maps.shape[1]
tree_col = my_maps.shape[2]

# Per-class weight len(labels) / (2 * class_count), indexed by class id. It only
# depends on the full label vector, so it is computed once instead of per fold.
class_ids, class_counts = np.unique(labels, return_counts=True)
c_prob = np.zeros(len(class_ids))
c_prob[class_ids.astype(int)] = float(len(labels)) / (2.0 * class_counts)

# `seed` comes from the shuffling cell above, so the folds follow that seed.
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
fold = 0
for train_index, test_index in skf.split(my_maps, labels):
    train_x, test_x = my_maps[train_index,:,:], my_maps[test_index,:,:]
    train_y, test_y = labels_oh[train_index,:], labels_oh[test_index,:]

    # Log-transform; the +1 keeps zero abundances at zero.
    train_x = np.log(train_x + 1)
    test_x = np.log(test_x + 1)

    # Weight of each training sample, looked up from its class.
    # NOTE(review): these weights are not passed to model.train() below, which
    # is also called with use_weights = False -- they currently have no effect.
    train_weights = c_prob[np.argmax(train_y, 1)]

    # Fit the scaler on the flattened training fold only, then apply it to both
    # folds; clipping keeps test values inside the [0, 1] range seen in training.
    scaler = MinMaxScaler().fit(train_x.reshape(-1, tree_row * tree_col))
    train_x = np.clip(scaler.transform(train_x.reshape(-1, tree_row * tree_col)), 0, 1).reshape(-1, tree_row, tree_col)
    test_x = np.clip(scaler.transform(test_x.reshape(-1, tree_row * tree_col)), 0, 1).reshape(-1, tree_row, tree_col)

    # A fresh model per fold, so no weights carry over between folds.
    model = ResNet(height = train_x.shape[1], width = train_x.shape[2], channels = 1, classes = 2)
    model.init_model()
    # NOTE(review): the held-out fold is handed to train(); if train() uses it
    # for validation or early stopping, the metrics below are optimistic --
    # TODO confirm against models.PopPhy2.ResNet.train.
    model.train(train_x, train_y, test_x, test_y, dataset, use_weights = False)
    y_pred = model.predict(test_x)
    auc_roc, auc_pr, f1, mcc = model.evaluate(test_y, y_pred)
    data_lst.append([auc_roc, auc_pr, f1, mcc])
    #model.model.save_weights(path + "/model_weights.h5")
    print(test_y)
    print(y_pred)

    fold += 1

# Show the architecture once: every fold builds the model from the same height
# and width. summary() prints the table itself and returns None, so it is not
# wrapped in print().
model.model.summary()
        
        
        
        

Metal device set to: Apple M1 Pro
Epoch 1/65
Epoch 2/65
Epoch 3/65
Epoch 4/65
Epoch 5/65
Epoch 6/65
Epoch 7/65
Epoch 8/65
Epoch 9/65
Epoch 10/65
Epoch 11/65
Epoch 12/65
Epoch 13/65
Epoch 14/65
Epoch 15/65
Epoch 16/65
Epoch 17/65
Epoch 18/65
Epoch 19/65
Epoch 20/65
Epoch 21/65
Epoch 22/65
Epoch 23/65
Epoch 24/65
Epoch 25/65
Epoch 26/65
Epoch 27/65
Epoch 28/65
Epoch 29/65
Epoch 30/65
Epoch 31/65
Epoch 32/65
Epoch 33/65
Epoch 34/65
Epoch 35/65
Epoch 36/65
Epoch 37/65
Epoch 38/65
Epoch 39/65
Epoch 40/65
Epoch 41/65
Epoch 42/65
Epoch 43/65
Epoch 44/65
Epoch 45/65
Epoch 46/65
Epoch 47/65
Epoch 48/65
Epoch 49/65
Epoch 50/65
Epoch 51/65
Epoch 52/65
Epoch 53/65
Epoch 54/65
Epoch 55/65
Epoch 56/65
Epoch 57/65
Epoch 58/65
Epoch 59/65
Epoch 60/65
Epoch 61/65
Epoch 62/65
Epoch 63/65
Epoch 64/65
Epoch 65/65
auc_roc: 0.6973140495867769
auc_pr: 0.6931659060934845
f1_score: 0.6204851752021563
mcc: 0.29888128883203074
[[0. 1.]
 [1. 0.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [1. 

# Displaying Accuracy Metrics and Saving Metrics

Option to save the results of all k folds and the weights of the last model into the same directory as the data.

In [7]:
# Arrange the collected per-fold metrics as a table with one row per metric
# and one column per fold (folds are labelled "1" .. "k").
metric_names = ['auc(roc)', 'auc(pr)', 'f1', 'mcc']
col = list(map(str, range(1, k + 1)))
results_df = pd.DataFrame(data_lst, columns=metric_names).T
results_df.columns = col

#results_df.to_csv(path + "/results.csv")
results_df

Unnamed: 0,1,2,3,4,5
auc(roc),0.697314,0.648244,0.521447,0.69354,0.66615
auc(pr),0.693166,0.6487,0.540659,0.672859,0.649952
f1,0.620485,0.631606,0.511972,0.454052,0.472341
mcc,0.298881,0.280056,0.092727,0.18653,0.072041


# Saving Model Weights

Option to save the weights of the last model of the k-fold run into the same directory as the data.

In [8]:
#model.model.save_weights(path + "/model_weights.h5")

## 