In [1]:
# import VAE from other script
import os
from MicrobeVAE import VariationalAutoencoder

# import necessary packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# disable eager execution so TensorFlow runs in graph mode; this has to happen
# before any model is built. Presumably required by the MicrobeVAE
# implementation -- TODO confirm against MicrobeVAE.py
from tensorflow.python.framework.ops import disable_eager_execution
disable_eager_execution()

In [2]:
# Load the fracking metadata and species-abundance tables and merge them by sample.
#
# The data directory is configurable through the FRACKING_DATA_DIR environment
# variable so the notebook is not tied to a single machine; when the variable is
# unset the original location is used.
DATA_DIR = os.environ.get(
    "FRACKING_DATA_DIR",
    r"C:\Users\Peter\Desktop\Classes\Summer 2021\CSI Research",
)

# metadata: one row per sample, indexed by sample ID
combined_metadata = pd.read_csv(
    os.path.join(DATA_DIR, "edited_mt_combined_metadata.txt"), sep="\t"
).set_index("SampleID")

# species table: index on its "SampleID" column, then transpose so that
# samples become rows and species become columns
species_table = pd.read_csv(
    os.path.join(DATA_DIR, "norm_filtered_species_table.txt"), sep="\t"
).set_index("SampleID").transpose()

# merge species data (left) with metadata (right) on the shared index.
# pd.merge defaults to an inner join, so samples missing from either table are dropped.
merged_data = pd.merge(species_table, combined_metadata, left_index=True, right_index=True)

# fail early with a clear message instead of an opaque error further down
if merged_data.empty:
    raise ValueError(
        "No sample IDs are shared between the species table and the metadata; "
        "check the SampleID columns of both files."
    )

merged_data.head()

Unnamed: 0,d__Archaea;p__Candidatus_Korarchaeota;g__Candidatus_Korarchaeum;s__Candidatus_Korarchaeum_cryptofilum,d__Archaea;p__Candidatus_Lokiarchaeota;g__Candidatus_Prometheoarchaeum;s__Candidatus_Prometheoarchaeum_syntrophicum,d__Archaea;p__Candidatus_Micrarchaeota;g__Candidatus_Mancarchaeum;s__Candidatus_Mancarchaeum_acidiphilum,d__Archaea;p__Candidatus_Micrarchaeota;g__Candidatus_Micrarchaeum;s__Candidatus_Micrarchaeum_sp.,d__Archaea;p__Candidatus_Thermoplasmatota;c__Thermoplasmata;o__Methanomassiliicoccales;f__Candidatus_Methanomethylophilaceae;g__Candidatus_Methanomethylophilus;s__Candidatus_Methanomethylophilus_alvus,d__Archaea;p__Candidatus_Thermoplasmatota;c__Thermoplasmata;o__Methanomassiliicoccales;f__Methanomassiliicoccaceae;g__Candidatus_Methanoplasma;s__Candidatus_Methanoplasma_termitum,d__Archaea;p__Candidatus_Thermoplasmatota;c__Thermoplasmata;o__Methanomassiliicoccales;f__Methanomassiliicoccaceae;g__Methanomassiliicoccus;s__Candidatus_Methanomassiliicoccus_intestinalis,d__Archaea;p__Candidatus_Thermoplasmatota;c__Thermoplasmata;o__Thermoplasmatales;f__Cuniculiplasmataceae;g__Cuniculiplasma;s__Cuniculiplasma_divulgatum,d__Archaea;p__Candidatus_Thermoplasmatota;c__Thermoplasmata;o__Thermoplasmatales;f__Ferroplasmaceae;g__Ferroplasma;s__Ferroplasma_acidarmanus,d__Archaea;p__Candidatus_Thermoplasmatota;c__Thermoplasmata;o__Thermoplasmatales;f__Ferroplasmaceae;g__Ferroplasma;s__Ferroplasma_acidiphilum,...,Lat,Long,Date,HF_Status,Year,R1File,Matrix,HF_Status_Matrix,Year_Matrix,name
AB1_S65_L006,0.850561,109.297,21.264,28.0685,155.227,57.4129,253.467,56.1371,0.0,0.0,...,,,,HF+,Y2019,/home/see/Wright_Labs/May_2021/Fracking_data_s...,Sediment,Sediment_HF+,Sediment_Y2019,Alex.Branch.Sediment.1
AB3_S66_L006,2.26684,121.049,35.3627,23.1217,178.627,63.9248,301.489,68.0051,0.0,0.0,...,,,,HF+,Y2019,/home/see/Wright_Labs/May_2021/Fracking_data_s...,Sediment,Sediment_HF+,Sediment_Y2019,Alex.Branch.Sediment.3
BCD1_S67_L006,2.96983,223.727,22.7687,1.97989,116.813,22.2738,85.1352,7.42458,0.0,0.0,...,,,,HF+,Y2019,/home/see/Wright_Labs/May_2021/Fracking_data_s...,Sediment,Sediment_HF+,Sediment_Y2019,Black.Creek.Downstream.Sediment.1
BCD2_S68_L006,2.2124,110.62,57.9649,12.8319,135.841,146.461,336.285,39.3807,0.0,0.0,...,,,,HF+,Y2019,/home/see/Wright_Labs/May_2021/Fracking_data_s...,Sediment,Sediment_HF+,Sediment_Y2019,Black.Creek.Downstream.Sediment.2
BCU1_S69_L006,7.24569,227.204,17.0791,13.4563,64.6937,63.141,534.629,86.4308,0.0,0.0,...,,,,HF-,Y2019,/home/see/Wright_Labs/May_2021/Fracking_data_s...,Sediment,Sediment_HF-,Sediment_Y2019,Black.Creek.Upstream.Sediment.1


In [3]:
# X (input): the merge places the metadata columns after the species columns,
# so slicing off the trailing metadata columns leaves only the species abundances
N_METADATA_COLUMNS = 17
X = merged_data.iloc[:, :-N_METADATA_COLUMNS]

# y (output): HF status encoded as 1.0 for "HF+" and 0.0 for everything else.
# float32 keeps the true values and the predictions in the same data type.
hf_positive = merged_data.loc[:, "HF_Status"] == "HF+"
y = hf_positive.to_numpy().astype('float32')

print(X.shape, y.shape)

(99, 8218) (99,)


In [4]:
# split X and y into train (80%) and test (20%) sets.
# A fixed seed makes the split -- and everything trained or evaluated on it --
# reproducible across kernel restarts; without it each run gets a different test set.
SEED = 42
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=SEED)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(79, 8218) (20, 8218) (79,) (20,)


In [5]:
# build the VAE: the encoder narrows from the input width through the hidden
# sizes to a 256-dimensional latent space, and the decoder mirrors those sizes
# back out to the input width
n_features = X_train.shape[1]
hidden_sizes = [2048, 1024, 512]

vae = VariationalAutoencoder(
    input_dim = (n_features,),
    encoder_layer_size = list(hidden_sizes),
    decoder_layer_size = hidden_sizes[::-1] + [n_features],
    z_dim = 256)

In [6]:
# encoder summary: print the encoder's layers, output shapes and parameter counts
vae.encoder.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input (InputLayer)      [(None, 8218)]       0                                            
__________________________________________________________________________________________________
encoder_dense_0 (Dense)         (None, 2048)         16832512    encoder_input[0][0]              
__________________________________________________________________________________________________
leaky_re_lu (LeakyReLU)         (None, 2048)         0           encoder_dense_0[0][0]            
__________________________________________________________________________________________________
encoder_dense_1 (Dense)         (None, 1024)         2098176     leaky_re_lu[0][0]                
____________________________________________________________________________________________

In [7]:
# decoder summary: print the decoder's layers, output shapes and parameter counts
vae.decoder.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
decoder_input (InputLayer)   [(None, 256)]             0         
_________________________________________________________________
decoder_dense_0 (Dense)      (None, 512)               131584    
_________________________________________________________________
leaky_re_lu_3 (LeakyReLU)    (None, 512)               0         
_________________________________________________________________
decoder_dense_1 (Dense)      (None, 1024)              525312    
_________________________________________________________________
leaky_re_lu_4 (LeakyReLU)    (None, 1024)              0         
_________________________________________________________________
decoder_dense_2 (Dense)      (None, 2048)              2099200   
_________________________________________________________________
leaky_re_lu_5 (LeakyReLU)    (None, 2048)              0   

In [8]:
## Hyperparameters used to COMPILE and TRAIN the VAE
# learning rate handed to vae.compile
lr = 0.0005
# handed to vae.compile; presumably the weight on the reconstruction loss
# relative to the KL term -- TODO confirm in MicrobeVAE
r_loss_factor = 10000
# number of training epochs handed to vae.train
epochs = 1000
# mini-batch size handed to vae.train
batch_size = 16

In [9]:
# compile the VAE with the learning rate and reconstruction-loss factor defined above
vae.compile(learning_rate = lr, r_loss_factor = r_loss_factor)

In [10]:
# train the VAE as an autoencoder: X_train is passed as both the input and the
# reconstruction target (the HF labels in y_train are not used here)
vae.train(x_train = X_train, y_train = X_train,
         epochs = epochs,
         batch_size = batch_size)

Train on 66 samples, validate on 13 samples
Epoch 1/1000
Epoch 2/1000



Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
Epoch 73/1000
Epoch 74/1000

KeyboardInterrupt: 