In [0]:
# Mount Google Drive and make its "My Drive" folder the working directory,
# so that relative file paths used later resolve inside it.
import os

from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# os.chdir with an absolute path instead of the bare `cd drive/'My Drive'`:
# the bare form is not Python (it only works through IPython automagic) and,
# being relative, fails when the cell is re-run after the directory changed.
os.chdir(os.path.join(DRIVE_MOUNT_POINT, 'My Drive'))

In [0]:
import pandas as pd
# Read the feature tables ("dense") and the label tables for both splits.
# Paths are relative to the current working directory.
X_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in ("tox21_dense_train.csv", "tox21_dense_test.csv")
)
Y_train, Y_test = (
    pd.read_csv(csv_path)
    for csv_path in ("tox21_labels_train.csv", "tox21_labels_test.csv")
)

In [0]:
# The first column of each table contains names, not data: remove it
# (by label) from the feature frames and from the label frames.
X_train = X_train.drop(columns=[X_train.columns[0]])
X_test = X_test.drop(columns=[X_test.columns[0]])

Y_train = Y_train.drop(columns=[Y_train.columns[0]])
Y_test = Y_test.drop(columns=[Y_test.columns[0]])

In [0]:
# Replace every missing label (NaN) with 0 in both label tables.
# NOTE(review): presumably NaN marks a label that was not measured; filling
# with 0 then counts it as a negative - confirm this is intended.
Y_train, Y_test = Y_train.fillna(0), Y_test.fillna(0)

In [0]:
# Check that all features are numeric: show the distinct column dtypes found
# across the train and test feature frames.
# pd.concat is used because Series.append was removed in pandas 2.0.
set(pd.concat([X_train.dtypes, X_test.dtypes]))

In [0]:
# Standardize every feature to zero mean and unit standard deviation.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# NOTE(review): the scaling statistics are estimated on the train and test
# rows together (unchanged from the original); fit on X_train alone if the
# test split must not influence preprocessing.
scaler.fit(pd.concat([X_train, X_test]).values)

# Re-wrap the scaled arrays as DataFrames so the column names are kept.
X_train = pd.DataFrame(scaler.transform(X_train.values), columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test.values), columns=X_test.columns)

**Use only one of the following methods to reduce the number of features**

In [0]:
# Correlation analysis: for every pair of highly correlated features,
# drop one of the two from both the train and the test frame.
import numpy as np

CORRELATION_THRESHOLD = 0.90

# Absolute pairwise correlations, computed on the training features.
corr_matrix = X_train.corr().abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so each
# pair is inspected once. The built-in bool is used because the np.bool
# alias was removed in NumPy 1.24.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# A column is dropped when it correlates above the threshold with any
# column that precedes it.
to_drop = [
    column
    for column in upper.columns
    if (upper[column] > CORRELATION_THRESHOLD).any()
]
print(len(to_drop))

# DataFrame.drop returns a new frame, so the result has to be assigned;
# the original discarded it and no feature was actually removed.
# The labels are passed directly rather than as a sub-DataFrame.
X_train = X_train.drop(columns=to_drop)
X_test = X_test.drop(columns=to_drop)

In [0]:
# Kernel PCA (RBF kernel) feature reduction down to `features_limit` components.
from sklearn.decomposition import KernelPCA

features_limit = 100
# Generic names for the projected components: col0 ... col99.
columns = ['col' + str(x) for x in range(features_limit)]
PCA_transformer = KernelPCA(n_components=features_limit, kernel='rbf', n_jobs=-1)

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# NOTE(review): the projection is fitted on the train and test rows together
# (unchanged from the original); fit on X_train alone if the test split must
# not influence feature extraction.
PCA_transformer.fit(pd.concat([X_train, X_test]).values)

X_train = pd.DataFrame(PCA_transformer.transform(X_train.values), columns=columns)
X_test = pd.DataFrame(PCA_transformer.transform(X_test.values), columns=columns)

In [0]:
from keras.callbacks import EarlyStopping
import keras.optimizers
from keras.layers import Input, Dense
from keras.models import Model

# AUTOENCODER feature reduction: train a network to reconstruct its input
# through a narrow bottleneck, then use the bottleneck activations as the
# new feature set.

# Stop once val_loss has not improved by at least 0.001 for 15 epochs and
# roll back to the best weights seen.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=15, min_delta=0.001, restore_best_weights=True)

n_features = X_train.shape[1]
encoding_dim1 = int(n_features / 2)
encoding_dim2 = 100
# Generic names for the encoded features: col0 ... col99.
columns = ['col' + str(x) for x in range(encoding_dim2)]

input_layer = Input(shape=(n_features,))
# Encoder: input -> half-width hidden layer -> bottleneck.
encoded1 = Dense(encoding_dim1, activation='relu')(input_layer)
bottleneck = Dense(encoding_dim2, activation='relu')(encoded1)
# Decoder: bottleneck -> half-width hidden layer -> reconstruction.
decoded1 = Dense(encoding_dim1, activation='relu')(bottleneck)
# Linear output: the inputs are standardized (zero mean, so they contain
# negative values), which a sigmoid bounded to (0, 1) could never reconstruct.
output_layer = Dense(n_features, activation='linear')(decoded1)

# Full model: maps an input to its reconstruction.
autoencoder = Model(input_layer, output_layer)
# Encoder-only model: maps an input to its bottleneck representation.
# It shares its layers (and therefore its weights) with `autoencoder`.
encoder = Model(input_layer, bottleneck)

# MSE reconstruction loss. No 'accuracy' metric: it is a classification
# metric and carries no meaning for real-valued reconstruction.
autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.summary()

# Stack train and test rows once; the array is both the input and the target.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# NOTE(review): Keras takes the validation_split fraction from the END of the
# array, before shuffling, so the validation rows come from the tail of this
# stack (the test rows are placed last) - confirm that is acceptable.
all_features = pd.concat([X_train, X_test]).values

# use_multiprocessing is omitted: it only applies to generator/Sequence
# inputs and is not accepted by newer Keras versions.
autoencoder.fit(
    all_features,
    all_features,
    validation_split=0.1,
    epochs=300,
    batch_size=256,
    verbose=True,
    callbacks=[es],
)

# Replace the feature frames with their encoded representation.
X_train = pd.DataFrame(encoder.predict(X_train.values), columns=columns)
X_test = pd.DataFrame(encoder.predict(X_test.values), columns=columns)

In [0]:
# Print the final feature frames, then the label frames, in that order.
for frame in (X_train, X_test, Y_train, Y_test):
    print(frame)