# Toxicity - Matteo Mistri and Daniele Papetti
**Tox21 dataset analysis and prediction**

**Dataset input and preprocessing**

In [18]:
import pandas as pd
X_train = pd.read_csv("./dataset/tox21_dense_train.csv")
X_test = pd.read_csv("./dataset/tox21_dense_test.csv")
Y_train = pd.read_csv("./dataset/tox21_labels_train.csv")
Y_test = pd.read_csv("./dataset/tox21_labels_test.csv")

We decide to drop the names of the samples because we are sure that it is an irrelevant feature for the further operations.
We substitute the NaN values in the labels with 0 value since we assume that if the test was not performed, the doctors would have thought that the molecule would have not been involved in that biological pathway. 

In [19]:
# drop first column that contains names
X_train = X_train.drop(X_train.columns[[0]], axis=1)
X_test = X_test.drop(X_test.columns[[0]], axis=1)

Y_train = Y_train.drop(Y_train.columns[[0]], axis=1)
Y_test = Y_test.drop(Y_test.columns[[0]], axis=1)

In [20]:
# transform NaN in 0 in the labels
Y_train = Y_train.fillna(0)
Y_test = Y_test.fillna(0)

In [21]:
# TODO DATA EXPLORATION
# distribuzione etichette
# correlazione tra le etichette
# analisi missing values nelle feature (X_train.isnull().any().any())
    # drop o imputazione dei sample con missing
# ricerca e gestione outliers (3 * Q3/4-q1/4)
# Cercare feature fortemente correlate a labels

In [22]:
# check if all features are numeric
set(X_train.dtypes.append(X_test.dtypes))

{dtype('int64'), dtype('float64')}

In [23]:
# normalize features in 0 mean and 1 std
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train.append(X_test).values)
X_train = pd.DataFrame(scaler.transform(X_train.values), columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test.values), columns=X_test.columns)

**Use only one of the following method to reduce number of features**

In [None]:
# correlation analysis and drop correlated features
import numpy as np
# Create correlation matrix
corr_matrix = X_train.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

# Find index of feature columns with correlation greater than 0.90
to_drop = [column for column in upper.columns if any(upper[column] > 0.90)]
print(len(to_drop))

# Drop the features
X_train.drop(X_train[to_drop], axis=1)
X_test.drop(X_test[to_drop], axis=1)

In [None]:
# PCA features reduction
from sklearn.decomposition import PCA

features_limit = 100
columns = ['col' + str(x) for x in range(features_limit)]
PCA_transformer = PCA(n_components = features_limit)
PCA_transformer.fit(X_train.append(X_test).values)
X_train = pd.DataFrame(PCA_transformer.transform(X_train.values), columns = columns)
X_test = pd.DataFrame(PCA_transformer.transform(X_test.values), columns = columns)

In [25]:
from keras.callbacks import EarlyStopping
import keras.optimizers
from keras.layers import Input, Dense
from keras.models import Model

# AUTOENCODER
# Define early stopping
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=15, min_delta=0.001 ,restore_best_weights=True)

encoding_dim1 = int(X_train.shape[1] / 2)
encoding_dim2 = int(encoding_dim1 / 2)
encoding_dim3 = 100
columns = ['col' + str(x) for x in range(encoding_dim3)]

input_layer = Input(shape=(X_train.shape[1],))
# "encoded" is the encoded representation of the input
encoded1 = Dense(encoding_dim1, activation='relu')(input_layer)
encoded2 = Dense(encoding_dim2, activation='relu')(encoded1)
bottleneck = Dense(encoding_dim3, activation='relu')(encoded2)
# "decoded" is the lossy reconstruction of the input
decoded2 = Dense(encoding_dim2, activation='relu')(bottleneck)
decoded1 = Dense(encoding_dim1, activation='relu')(decoded2)
output_layer = Dense(X_train.shape[1], activation='linear')(decoded1)
# this model maps an input to its reconstruction
autoencoder = Model(input_layer, output_layer)
# this model maps an input to its encoded representation
encoder = Model(input_layer, bottleneck)

# compile the model
autoencoder.compile(optimizer='adam', loss='mse', metrics = ['accuracy'])
autoencoder.summary()
# fit the autoencoder
autoencoder.fit(X_train.append(X_test).values, X_train.append(X_test).values, validation_split = 0.1, epochs=300, batch_size=256, verbose=True, callbacks = [es], use_multiprocessing = True)
# extract representation
X_train = pd.DataFrame(encoder.predict(X_train.values), columns = columns)
X_test = pd.DataFrame(encoder.predict(X_test.values), columns = columns)

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 801)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 400)               320800    
_________________________________________________________________
dense_10 (Dense)             (None, 200)               80200     
_________________________________________________________________
dense_11 (Dense)             (None, 100)               20100     
_________________________________________________________________
dense_12 (Dense)             (None, 200)               20200     
_________________________________________________________________
dense_13 (Dense)             (None, 400)               80400     
_________________________________________________________________
dense_14 (Dense)             (None, 801)               3212

KeyboardInterrupt: 

In [None]:
# TODO
# controllare funzione di attivazione autoencoder e relativa loss
# data visualization in 2/3D per vedere se clusterizza (a occhio)

In [24]:
# Test print
print(X_train)
print(X_test)

print(Y_train)
print(Y_test)

             AW   AWeight      Arto   BertzCT      Chi0      Chi1     Chi10  \
0      4.261302 -0.308802  0.836605  1.490057  1.142090  1.303726  1.189859   
1      0.717300  1.605286  0.272748  1.359206  0.883107  0.896950  1.727203   
2     -0.099952 -0.302049  0.724850  1.519900  4.104884  3.969533  4.004521   
3      5.733907 -0.355441  0.089875  1.917044  4.626837  4.364390  1.567584   
4      0.317784 -0.333493  0.572456  2.360100  7.019719  7.080712  3.758391   
5     -0.361586 -0.289809 -0.057439  0.142524  0.390553  0.390465 -0.284488   
6     -0.361586  3.033148  0.790887  0.036925 -0.651358 -0.620422 -0.587887   
7     -0.361586  0.582188 -0.692413 -0.555347 -0.173379 -0.260756 -0.632970   
8      0.224833 -0.299939 -0.057439  0.723317  0.778343  0.897896  0.573314   
9      0.521380 -0.129844  0.211790  1.189330  1.465600  1.576172  1.015618   
10     0.521380 -0.157490 -0.860047  0.477685  1.628913  1.584686 -0.099281   
11     0.205286 -0.139341  0.618174  0.822029  0.801

**Classification of the records**

In [None]:
# TODO
# creazione della DNN cin keras
    # esplorazione preliminare dei parametri (con tutte le features)
# confronto con la medesima DNN delle 3 feature extraction
# AutoML (maybe AutoKeras) su DNN o RandomForest
