In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import tensorflow as tf
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler


#os datasets escolhidos foram THROMBIN_training/test
#os datasets foram carregados a partir da módulos Pandas e convertidos em Dataframe

path_train  = "./THROMBIN_training_disguised.csv"
path_test = "./THROMBIN_test_disguised.csv"

dataset_training = pd.read_csv(path_train)
dataset_test = pd.read_csv(path_test)


#foram selecionadas as colunas em comum dos datasets de training e test de forma a que ambos contivessem o mesmo tipo de informação

common_cols = list(set(dataset_training.columns) & set(dataset_test.columns))
dataset_training_eq, dataset_test_eq = dataset_training[common_cols], dataset_test[common_cols]

dataset_training_eq.set_index('MOLECULE', inplace=True)
dataset_test_eq.set_index('MOLECULE', inplace=True)



In [2]:
# --> Pré-processamento dos dados

# --> Remoção dos valores omissos

imp = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')
imp.fit(dataset_training_eq)
training = pd.DataFrame(imp.transform(dataset_training_eq), columns = dataset_training_eq.columns, index = dataset_training_eq.index)
print(training.head)



<bound method NDFrame.head of           D_3775  D_6908  D_1108  D_7420  D_1931  D_3564  D_4429  D_8277  \
MOLECULE                                                                   
M_1          0.0     0.0     5.0     0.0     0.0     0.0     0.0     0.0   
M_2          0.0     0.0     7.0     0.0     0.0     0.0     0.0     0.0   
M_3          0.0     0.0     3.0     0.0     0.0     0.0     0.0     0.0   
M_5          0.0     0.0     5.0     0.0     0.0     0.0     0.0     0.0   
M_6          0.0     0.0     5.0     0.0     0.0     0.0     0.0     0.0   
M_8          0.0     0.0     3.0     0.0     0.0     0.0     0.0     0.0   
M_9          0.0     0.0     4.0     0.0     0.0     0.0     0.0     0.0   
M_10         0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
M_11         0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
M_14         0.0     0.0     6.0     0.0     0.0     0.0     0.0     0.0   
M_343        0.0     0.0     3.0     0.0     0.0     0.0  

In [3]:
# --> Holdout 
# divisão da amostra em dados de treino e dados de validação
# 20% dos dados foram para o teste, e 80% para o treino


training = training.values #transformação do dataframe num array

n = training.shape[0]
train_start = 0
train_end = int(np.floor(0.8*n))
test_start = train_end + 1
test_end = n
data_train = training[np.arange(train_start, train_end),:]
data_test = training[np.arange(test_start, test_end),:]


In [4]:
# -->Escalonamento de features 
# de acordo com o minimo e máximo absoluto

scaler = MinMaxScaler().fit(data_train)
data_train = scaler.transform(data_train)
data_test = scaler.transform(data_test)
print(data_train)
print(data_test)

[[0.         0.         0.27777778 ... 0.         0.         0.        ]
 [0.         0.         0.38888889 ... 0.         0.         0.        ]
 [0.         0.         0.16666667 ... 0.         0.         0.        ]
 ...
 [0.         0.         0.16666667 ... 0.         0.         0.        ]
 [0.         0.         0.33333333 ... 0.         0.         0.        ]
 [0.         0.         0.16666667 ... 0.         0.         0.        ]]
[[0.         0.         0.05555556 ... 0.         0.         0.        ]
 [0.         0.         0.05555556 ... 0.         0.         0.        ]
 [0.         0.         0.05555556 ... 0.         0.         0.        ]
 ...
 [0.         0.         0.27777778 ... 0.         0.         0.        ]
 [0.         0.         0.27777778 ... 0.         0.         0.        ]
 [0.         0.         0.11111111 ... 0.         0.         0.        ]]


In [5]:
# construção da variavel X e variavel Y
# coluna 1 --> valores de 'Act'
# coluna 2:__ --> todas as colunas do dataset excepto de 'Act' 

x_train = data_train[:,2:]
y_train = data_train[:,1]
x_test = data_test[:,2:]
y_test = data_test[:,1]


In [9]:
# --> Neural Network 


n_features = x_train.shape[1]

#número de neurónios
#o número de neurónios foram calculados a partir de números multiplos de 2
#o número inicial de neurónios (n_neurons_1) é superior ao número de colunas do dataset
n_neurons_1 = 4096
n_neurons_2 = 2048
n_neurons_3 = 1024
n_neurons_4 = 512
n_neurons_5 = 256
n_neurons_6 = 128
n_neurons_7 = 64


#inicialização
net = tf.InteractiveSession()

#Criação das placeholders (variavel x, variavel y)
x = tf.placeholder(dtype = tf.float32, shape=[None, n_features])
y = tf.placeholder(dtype = tf.float32, shape=[None])

#inicializadores
sigma = 1
weight_initializer = tf.variance_scaling_initializer(mode='fan_avg', distribution='uniform', scale=sigma)
bias_initializer = tf.zeros_initializer()

# Hidden weights
W_hidden_1 = tf.Variable(weight_initializer([n_features, n_neurons_1]))
bias_hidden_1 = tf.Variable(bias_initializer([n_neurons_1]))
W_hidden_2 = tf.Variable(weight_initializer([n_neurons_1, n_neurons_2]))
bias_hidden_2 = tf.Variable(bias_initializer([n_neurons_2]))
W_hidden_3 = tf.Variable(weight_initializer([n_neurons_2, n_neurons_3]))
bias_hidden_3 = tf.Variable(bias_initializer([n_neurons_3]))
W_hidden_4 = tf.Variable(weight_initializer([n_neurons_3, n_neurons_4]))
bias_hidden_4 = tf.Variable(bias_initializer([n_neurons_4]))
W_hidden_5 = tf.Variable(weight_initializer([n_neurons_4, n_neurons_5]))
bias_hidden_5 = tf.Variable(bias_initializer([n_neurons_5]))
W_hidden_6 = tf.Variable(weight_initializer([n_neurons_5, n_neurons_6]))
bias_hidden_6 = tf.Variable(bias_initializer([n_neurons_6]))
W_hidden_7 = tf.Variable(weight_initializer([n_neurons_6, n_neurons_7]))
bias_hidden_7 = tf.Variable(bias_initializer([n_neurons_7]))


# Output weights
W_out = tf.Variable(weight_initializer([n_neurons_7, 1]))
bias_out = tf.Variable(bias_initializer([1]))


# Hidden layer
hidden_1 = tf.nn.relu(tf.add(tf.matmul(x, W_hidden_1), bias_hidden_1))
hidden_2 = tf.nn.relu(tf.add(tf.matmul(hidden_1, W_hidden_2), bias_hidden_2))
hidden_3 = tf.nn.relu(tf.add(tf.matmul(hidden_2, W_hidden_3), bias_hidden_3))
hidden_4 = tf.nn.relu(tf.add(tf.matmul(hidden_3, W_hidden_4), bias_hidden_4))
hidden_5 = tf.nn.relu(tf.add(tf.matmul(hidden_4, W_hidden_5), bias_hidden_5))
hidden_6 = tf.nn.relu(tf.add(tf.matmul(hidden_5, W_hidden_6), bias_hidden_6))
hidden_7 = tf.nn.relu(tf.add(tf.matmul(hidden_6, W_hidden_7), bias_hidden_7))



# Output layer
out = tf.transpose(tf.add(tf.matmul(hidden_7, W_out), bias_out))

# Cálculo da função de custo
#Calcula a medida de desvio entre as previsões da rede e as observações
#para problemas de regressão é usual o cálculo da mean squared error (MSE)
mse = tf.reduce_mean(tf.squared_difference(out, y))

# Optimização
#surge na necessidade de adaptar o peso da rede e as variaveis bias durante o treino da rede
#utiliza os gradientes do peso da rede e das variaveis bias de forma a minimizar o valor da função de custo
opt = tf.train.AdamOptimizer().minimize(mse)

# após a definição das variaveis, dos placeholders, inicializadores e otimização da rede, é necessário treinar a rede
# para isso foi realizado o treino através de minibatch
#o dataset treino é dividido pelo número de batch definido e estes dados vão "alimentando" a rede sequencialmente

# Init
net.run(tf.global_variables_initializer())

#o número de amostras de batch é cerca de 1/4 do n_neurons_1
# Fit neural net
batch_size = 1024
mse_train = []
mse_test = []


# Run
epochs = 10
for e in range(epochs):
    # Shuffle training data
    shuffle_indices = np.random.permutation(np.arange(len(y_train)))
    x_train = x_train[shuffle_indices]
    y_train = y_train[shuffle_indices]

    # Treino Minibatch 
    for i in range(0, len(y_train) // batch_size):
        start = i * batch_size
        batch_x = x_train[start:start + batch_size]
        batch_y = y_train[start:start + batch_size]

        # otimização do batch
        net.run(opt, feed_dict={x: batch_x, y: batch_y})
        
        # Progresso
        # MSE para treino e test
        mse_train.append(net.run(mse, feed_dict={x: x_train, y: y_train}))
        mse_test.append(net.run(mse, feed_dict={x: x_test, y: y_test}))
        print('MSE Train: ', mse_train[-1])
        print('MSE Test: ', mse_test[-1])
        # Previsão
        pred = net.run(out, feed_dict={x: x_test})

#com base todas as previsões do minibatch, é escolhido a melhor previsão com base no MSE, isto é, quanto mais perto de zero melhor é a previsão
mse_final = net.run(mse, feed_dict={x:x_test, y:y_test})
print('MSE final -->',mse_final)
net.close()

MSE Train:  0.008789875
MSE Test:  0.011115465
MSE Train:  0.050459225
MSE Test:  0.05941154
MSE Train:  0.10735489
MSE Test:  0.12874766
MSE Train:  0.00012129815
MSE Test:  0.0011516084
MSE Train:  0.006155524
MSE Test:  0.008423004
MSE Train:  0.0018749144
MSE Test:  0.0031599146
MSE Train:  0.0011701662
MSE Test:  0.0022924596
MSE Train:  0.00055856205
MSE Test:  0.0016660908
MSE Train:  1.9576682e-05
MSE Test:  0.0010032573
MSE Train:  0.00038173472
MSE Test:  0.0013491831
MSE Train:  1.246201e-05
MSE Test:  0.0009979954
MSE Train:  0.00019759467
MSE Test:  0.0012480618
MSE Train:  4.2230688e-05
MSE Test:  0.0010544837
MSE Train:  5.8029185e-05
MSE Test:  0.00102637
MSE Train:  2.7818804e-05
MSE Test:  0.0010019131
MSE Train:  6.3694635e-05
MSE Test:  0.0010964437
MSE Train:  2.234705e-06
MSE Test:  0.000995408
MSE Train:  5.8813042e-05
MSE Test:  0.0010253453
MSE Train:  1.9505638e-05
MSE Test:  0.0010316463
MSE Train:  2.2066877e-05
MSE Test:  0.0010352867
MSE Train:  4.9592498e