In [1]:
import sys
import os
from os.path import abspath
import numpy as np
import pandas as pd
from utils.generate_network import generate_network
from utils.prepare_data import prepare_data
from utils.popphy_io import save_params, load_params
from utils.popphy_io import get_stat, get_stat_dict
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from models.PopPhy import PopPhyCNN
import warnings
from datetime import datetime
import json
import warnings
warnings.filterwarnings("ignore")

import tensorflow as tf
#from models.PopPhy2 import ResNet
from models.PopPhy2 import ResNet

# Data Preparation

### Reading Configuration
Configure which dataset to read, the minimum relative-abundance threshold for keeping an OTU (at least one individual sample must reach the threshold), and the number of folds k for k-fold cross-validation.

In [2]:
# Run configuration (edit these to change the experiment).
# k         -> number of cross-validation folds
# threshold -> relative-abundance cut-off used when filtering OTUs
# dataset   -> name of the sub-directory under ../data that is read
k = 5
threshold = 0
dataset = 'SKG-gender'

### Reduce Features
Reduce the number of OTU features by filtering out OTUs for which no individual sample has a relative abundance of at least the set threshold.

In [3]:
# Load the abundance table: the first column is used as the row index and the
# file is read without a header row.
path = "../data/" + dataset
data = pd.read_csv(path + '/abundance.tsv', index_col=0, sep='\t', header=None)

# Rows whose value is below `threshold` in every column are filtered out.
low_abundance = (data < threshold).all(axis=1)
to_drop = data.loc[low_abundance]  # kept so the removed rows can be inspected

# Filter with the boolean mask rather than `data.drop(to_drop.index)`: the index
# labels are not unique (the same taxonomy string can appear on several rows),
# and a label-based drop would also remove rows that pass the threshold but
# share their label with a filtered row.
data = data.loc[~low_abundance]

data

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,54,55,56,57,58,59,60,61,62,63
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Bacteroidales|f__Bacteroidaceae|g__Bacteroides|s__Bacteroides_acidifaciens,0.515464,6.118415,2.770585,0.000000,7.597435,4.111809,7.151454,2.182638,8.290361,12.971185,...,1.560158,1.123329,6.644094,3.491985,3.254792,3.368070,12.787513,7.915906,1.973158,1.963207
k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacterales|f__Enterobacteriaceae|g__unclassified|s__unclassified_unclassified,0.000000,0.000000,0.233040,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,1.226549,0.000000,11.924920,0.000000,0.017392,0.013520,0.000000,0.000000
k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_murinus,4.329897,2.217066,8.234076,3.771930,4.970400,3.011275,5.789574,4.183847,0.238001,0.679028,...,3.192493,2.626374,15.851847,4.063979,6.629393,10.571551,4.000174,4.873927,1.402706,20.356582
k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_johnsonii,0.220913,1.160093,2.382185,6.871345,1.541687,0.364594,0.190030,0.142944,2.062674,1.314529,...,4.888679,12.281465,2.992088,2.397430,1.717252,6.145251,5.048046,16.656527,3.348307,3.282027
k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Bacteroidales|f__Rikenellaceae|g__Alistipes|s__Alistipes_unclassified,0.677467,0.910888,0.310720,5.453216,0.641342,0.492877,0.620764,0.000000,0.515668,0.348220,...,2.695575,2.048889,0.625367,0.459007,3.985623,2.086914,0.400017,0.189279,0.509824,0.303830
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
k__Bacteria|p__Firmicutes|c__Clostridia|o__Lachnospirales|f__Lachnospiraceae|g__Lachnospiraceae NK4A136 group|s__Lachnospiraceae NK4A136 group_unclassified,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.139288,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridia UCG-014|f__unclassified|g__unclassified|s__unclassified_unclassified,0.000000,0.042966,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridia UCG-014|f__unclassified|g__unclassified|s__unclassified_unclassified,0.000000,0.000000,0.000000,0.000000,0.185002,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
k__Bacteria|p__Firmicutes|c__Clostridia|o__Lachnospirales|f__Lachnospiraceae|g__unclassified|s__unclassified_unclassified,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


### Create 2d Matrix Representing OTU Data
The PopPhy-CNN algorithm (Dai et al., 2019) builds a phylogenetic tree from the OTUs and populates it with the OTU abundances. This tree structure is then converted to a 2D matrix by pushing all parent nodes to the left and placing their child nodes, left to right, in the same order as their parents.

In [4]:
# Build the tree-structured 2-D maps (and the accompanying raw/tree feature
# tables, labels and graph) from the filtered abundance table.
(
    my_maps,
    raw_x,
    tree_x,
    raw_features,
    tree_features,
    labels,
    label_set,
    g,
    feature_df,
) = prepare_data(path, data)

# Preview the first entry of `my_maps` as a table.
pd.DataFrame(my_maps[0])

(520, 63)
There are 520 raw features...
Building tree structure...
Tree file not found...
Constructing tree..
Pruning Tree...
Populating trees...
There are 275 tree features...


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,447,448,449,450,451,452,453,454,455,456
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.014884,0.357206,0.046124,0.001916,0.078691,0.498821,0.002358,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.014884,0.100648,0.03183,0.0,0.224727,0.046124,0.0,0.001916,0.078691,0.465959,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.014884,0.006189,0.094459,0.03183,0.0,0.0,0.0,0.224727,0.046124,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.014884,0.006189,0.094459,0.000295,0.031536,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.014884,0.006189,0.094459,0.000295,0.031536,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.014884,0.0,0.006189,0.0,0.0,0.0,0.0,0.094459,0.000295,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.004568,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.005158,0.001474,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Creating training and test sets
Splitting data into k training and k test sets

In [5]:
# One-hot encode the labels.
# `input` is an alias of `my_maps` (not a copy), so the in-place shuffle below
# also reorders `my_maps` itself.
# NOTE(review): `input` shadows the Python builtin of the same name; the name is
# kept unchanged in case other cells refer to it.
input = my_maps
target = tf.keras.utils.to_categorical(labels, 2, dtype='int64')

# Shuffle the samples, the one-hot targets and the raw labels in place.
# NumPy's global RNG is re-seeded with the same value before every shuffle so
# that the three sequences are intended to receive the same permutation and
# stay aligned.
seed = np.random.randint(100)
for sequence in (input, target, labels):
    np.random.seed(seed)
    np.random.shuffle(sequence)

if k < 2:
    raise ValueError("k must be at least 2 for k-fold cross validation")

# Cut the shuffled data into k contiguous groups of `k_size` samples; the last
# group additionally receives the remainder when len(input) is not divisible
# by k.
groups_input = []
groups_target = []
k_size = len(input) // k
for i in range(k):
    start = i * k_size
    end = None if i == k - 1 else start + k_size
    groups_input.append(input[start:end])
    groups_target.append(target[start:end])

# For every fold i (visited from k-1 down to 0) the test set is group i and the
# training set is the concatenation of the other k-1 groups.
# The previous version hard-coded exactly four training groups (i-1 ... i-4),
# which is only correct for k == 5; the modular indexing below reproduces that
# same group order for k == 5 and works for any k >= 2.
x_train = []
y_train = []
x_test = []
y_test = []
for i in range(k - 1, -1, -1):
    train_groups = [(i - offset) % k for offset in range(1, k)]
    x_train.append(np.concatenate([groups_input[j] for j in train_groups]))
    y_train.append(np.concatenate([groups_target[j] for j in train_groups]))

    x_test.append(groups_input[i])
    y_test.append(groups_target[i])

# Model

### Training model
Uses a ConvNet that employs residual identity blocks with skip connections, borrowed from the classic ResNet model.

In [6]:
# Stratified k-fold training and evaluation.
# `data_lst` collects one [auc_roc, auc_pr, f1, mcc] row per fold.
data_lst = []

# One-hot encode the labels (they are used as integer indices into the identity
# matrix) and record the dimensions of a single 2-D map.
n_values = np.max(labels) + 1
labels_oh = np.eye(n_values)[labels]
tree_row = my_maps.shape[1]
tree_col = my_maps.shape[2]
flat_dim = tree_row * tree_col

skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
for train_index, test_index in skf.split(my_maps, labels):
    train_x, test_x = my_maps[train_index, :, :], my_maps[test_index, :, :]
    train_y, test_y = labels_oh[train_index, :], labels_oh[test_index, :]

    # log(x + 1) transform of the map values.
    train_x = np.log(train_x + 1)
    test_x = np.log(test_x + 1)

    # Min-max scale every cell of the flattened map. The scaler is fitted on the
    # training fold only; results are clipped to [0, 1] so test values outside
    # the range seen in training cannot leave that interval.
    scaler = MinMaxScaler().fit(train_x.reshape(-1, flat_dim))
    train_x = np.clip(scaler.transform(train_x.reshape(-1, flat_dim)), 0, 1).reshape(-1, tree_row, tree_col)
    test_x = np.clip(scaler.transform(test_x.reshape(-1, flat_dim)), 0, 1).reshape(-1, tree_row, tree_col)

    # The per-class sample weights that used to be computed here were never
    # passed to the model (training runs with use_weights = False), so that
    # dead computation has been removed.

    # The number of classes follows the width of the one-hot labels instead of
    # being hard-coded to 2.
    model = ResNet(height = tree_row, width = tree_col, channels = 1, classes = int(n_values))
    model.init_model()
    # NOTE(review): the held-out fold is handed to train(); if train() uses it
    # for validation-based decisions (early stopping, checkpoint selection) the
    # metrics below are optimistic -- confirm in models.PopPhy2.
    model.train(train_x, train_y, test_x, test_y, dataset, use_weights = False)
    y_pred = model.predict(test_x)
    auc_roc, auc_pr, f1, mcc = model.evaluate(test_y, y_pred)
    data_lst.append([auc_roc, auc_pr, f1, mcc])
    #model.model.save_weights(path + "/model_weights.h5")
    print(test_y)
    print(y_pred)
    # summary() is expected to print the table itself and return None
    # (presumably `model.model` is a Keras model); wrapping it in print()
    # emitted a stray "None" line.
    model.model.summary()
        
        
        
        

Metal device set to: Apple M1 Pro
Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80
Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80
Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80
auc_roc: 0.9722

# Displaying Accuracy Metrics and Saving Metrics

Option to save the results of all k folds and the weights of the last model into the same directory as the data.

In [7]:
# Assemble the per-fold metrics into a table with one row per metric and one
# column per fold; the folds are labelled "1" .. "k".
metric_names = ['auc(roc)', 'auc(pr)', 'f1', 'mcc']
col = list(map(str, range(1, k + 1)))

per_fold = pd.DataFrame(data_lst, columns=metric_names)
results_df = per_fold.T
results_df.columns = col

#results_df.to_csv(path + "/results.csv")
results_df

Unnamed: 0,1,2,3,4,5
auc(roc),0.972222,0.975,0.875,0.96875,0.90625
auc(pr),0.969444,0.976389,0.903398,0.968056,0.922743
f1,0.521978,0.924009,0.695971,0.913165,0.568765
mcc,0.365148,0.853913,0.414758,0.816497,0.408248


# Saving Model Weights

Option to save the weights of the last model of the k-fold run into the same directory as the data.

In [8]:
#model.model.save_weights(path + "/model_weights.h5")

## 