In [1]:
## IMPORT NECESSARY PACKAGES
from NeuralNetConstructor import ANN

import numpy as np
from keras.utils import np_utils
from numpy.random import randn
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.python.keras import regularizers
from tensorflow.keras.constraints import max_norm
import tensorflow.keras.backend as K

from sklearn.model_selection import train_test_split

import tensorflow as tf
physical_devices = tf.config.list_physical_devices('GPU') 
for device in physical_devices:
    tf.config.experimental.set_memory_growth(device, True)

In [2]:
## DATA PREPROCESSING
data = pd.read_csv(r"C:\Users\Peter\Desktop\Classes\Summer 2021\CSI Research\walnutnn.txt")
data = data.set_index("SampleID")
data.shape

(121, 6165)

In [3]:
X = data.iloc[:, 1:]
y = data.iloc[:, 0]

In [4]:
# remove samples w/ fewer than 100 reads
row_drop = []

for i in range(len(X)):
    row_sum = X.iloc[i].sum()
    if row_sum < 100:
        row_drop.append(i)

X.drop(labels = row_drop, axis = 0, inplace = True)

# remove OTUs with fewer than 10 reads
col_drop = []

for i in range(X.shape[1]):
    col_sum = X.iloc[:, i].sum()
    if col_sum < 10:
        col_drop.append(i)
        
X.drop((X.columns[i] for i in col_drop), axis = 1, inplace = True)

# remove OTUs present in fewer than 1% of samples
otu_col_drop = []

for j in range(X.shape[1]):
    non_zero_reads = 0
    for i in range(X.shape[0]):
        if X.iloc[i, j] > 0:
            non_zero_reads += 1
    presence_pct = non_zero_reads / X.shape[0]
    if presence_pct < .01:
        otu_col_drop.append(i)

X.drop((X.columns[i] for i in otu_col_drop), axis = 1, inplace = True)
X = np.array(X)

X.shape

(121, 5950)

In [18]:
# label -> one-hot encoding
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
enc = LabelEncoder()
y = enc.fit_transform(y)

# convert integers to dummy variables (i.e. one hot encoded)
y = np_utils.to_categorical(y)

ValueError: y should be a 1d array, got an array of shape (121, 4) instead.

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2)
y_train.shape

(96, 4)

In [13]:
# initialize artificial neural network
walnut_net = ANN(input_dim = X_train.shape[1], #input dimension
                   neurons = [1000, 1000, 500, 250], #number of neurons in each hidden layer
                   activation = "relu", #activation for each layer
                   output_activation = "softmax", #activation for output
                   output_neurons = 4, #number of neurons in output (1 for binary classification)
                   batch_size = 32, 
                   epochs = 1000,
                   use_batch_norm = True,
                   use_dropout = True,
                   dropout_rate = .5)

In [14]:
walnut_net.model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 5950)]            0         
_________________________________________________________________
batch_normalization_5 (Batch (None, 5950)              23800     
_________________________________________________________________
layer_0 (Dense)              (None, 1000)              5951000   
_________________________________________________________________
dropout_4 (Dropout)          (None, 1000)              0         
_________________________________________________________________
batch_normalization_6 (Batch (None, 1000)              4000      
_________________________________________________________________
layer_1 (Dense)              (None, 1000)              1001000   
_________________________________________________________________
dropout_5 (Dropout)          (None, 1000)              0   

In [15]:
walnut_net.compile()

In [16]:
walnut_net.train(X_train, y_train)

Train Loss: 20.5612 
 Train AUC: 0.9152 
 Train Accuracy: 86.4583%


In [17]:
walnut_net.test(X_test, y_test)

Test Loss: 134.3229 
 Test AUC: 0.4733 
 Test Accuracy: 20.0000%
