# Toxicity - Matteo Mistri and Daniele Papetti
**Tox21 dataset analysis and prediction**

Please note that this notebook is designed to run on Google Colab with Google Drive as the storage backend, so you may need to change the input and output directories to run it locally. If you have suggestions for removing this limitation, please write to us. Any suggestion will be appreciated :)

**Dataset input and preprocessing**

In [0]:
# Mount Google Drive at /content/drive so the dataset files can be read from it
from google.colab import drive
drive.mount('/content/drive')

In [0]:
cd drive/'My Drive'

In [0]:
import pandas as pd

# Read, in this order, the train/test feature tables and the train/test label tables
X_train, X_test, Y_train, Y_test = (
  pd.read_csv(csv_path)
  for csv_path in (
    "./AML_project/tox21_dense_train.csv",
    "./AML_project/tox21_dense_test.csv",
    "./AML_project/tox21_labels_train.csv",
    "./AML_project/tox21_labels_test.csv",
  )
)

We drop the sample names because they are only identifiers and carry no information that is useful for the following steps.
We replace the NaN values in the labels with 0, since we assume that, when a test was not performed, it is because the molecule was not expected to be involved in that biological pathway.

In [0]:
# The first column of each table holds the sample names: remove it from all four frames
X_train = X_train.drop(columns = X_train.columns[[0]])
X_test = X_test.drop(columns = X_test.columns[[0]])

Y_train = Y_train.drop(columns = Y_train.columns[[0]])
Y_test = Y_test.drop(columns = Y_test.columns[[0]])

In [0]:
# Missing labels (NaN) are replaced with 0 in both label tables
Y_train, Y_test = (labels.fillna(0) for labels in (Y_train, Y_test))

In [0]:
# label distribution
from collections import Counter
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# One count plot per label column, saved both as PNG and as PDF
for label in Y_train.columns:
  # textual summary of how many times each value occurs for this label
  print("{}: {}".format(label, Counter(Y_train[label])))

  plt.figure(figsize = (8, 6))
  sns.countplot(x = label, data = Y_train)
  # the title shows the label name with dots turned into spaces
  plt.title(label.replace('.', ' '), fontsize = 18)
  plt.xlabel("Values", fontsize = 16)
  plt.ylabel("Number of instances", fontsize = 16)
  plt.xticks(fontsize = 12)
  plt.yticks(fontsize = 12)

  # the file name uses the label name with the dots removed
  stem = label.replace('.', '')
  for target in ("./AML_project/images/png/hist-{}.png",
                 "./AML_project/images/pdf/hist-{}.pdf"):
    plt.savefig(target.format(stem))
  plt.close()

In [0]:
# Outlier search and removal.
# A value is flagged when it lies more than 3 * (Q3 - Q1) away from the mean of its column.
from collections import Counter

# row labels of every flagged value: a row is listed once per column in which it is flagged
outliers = []
# dropping a row as soon as a single feature flags it would discard the whole dataset,
# so we count how many times each row is flagged instead
for column in X_train.columns:
  values = X_train[column]
  center = values.mean()
  spread = 3 * (values.quantile(3 / 4) - values.quantile(1 / 4))

  too_high = (values > center + spread).values
  too_low = (values < center - spread).values
  outliers += values.index[too_high].tolist()
  outliers += values.index[too_low].tolist()

# a row is dropped when it is flagged in more than 200 features
# (the original note describes this as "more than 1/4 of the features")
out_counter = Counter(outliers)
toDrop = [row for row, hits in out_counter.items() if hits > 200]

# remove the selected rows from features and labels, then renumber the rows of both
for frame in (X_train, Y_train):
  frame.drop(toDrop, inplace = True)
for frame in (X_train, Y_train):
  frame.reset_index(drop = True, inplace = True)

In [0]:
# correlation between labels
# Absolute pairwise correlation of the label columns, computed over the train and
# test rows together. pd.concat is used because DataFrame.append was removed in pandas 2.0.
corr_matrix = pd.concat([Y_train, Y_test]).corr().abs()

plt.figure(figsize = (8, 6))
# Known matplotlib issue: the full label names get cut off, so numeric ticks are used.
# The variant with the real label names was:
# sns.heatmap(corr_matrix, fmt = '.2f', annot = True, vmin=0, vmax=1,
#             xticklabels = ['\n'.join(x.split('.')) for x in corr_matrix],
#             yticklabels = ['\n'.join(x.split('.')) for x in corr_matrix])
# one numeric tick per label instead of a hard-coded 12
n_labels = len(corr_matrix.columns)
sns.heatmap(corr_matrix, fmt = '.2f', annot = True, vmin = 0, vmax = 1,
            xticklabels = range(n_labels), yticklabels = range(n_labels))
plt.title("Labels correlation matrix heatmap", fontsize = 18)
plt.tight_layout()
plt.savefig("./AML_project/images/png/labels_corr_matrix_heatmap.png")
plt.savefig("./AML_project/images/pdf/labels_corr_matrix_heatmap.pdf")
plt.close()

In [0]:
# correlation between features and labels
# Stack train and test rows with a fresh 0..n-1 index, then put features and labels
# side by side. The previous append + merge-on-index kept the duplicated row labels
# of train and test, so the merge was many-to-many and paired train features with
# test labels (and vice versa).
all_features = pd.concat([X_train, X_test], ignore_index = True)
all_labels = pd.concat([Y_train, Y_test], ignore_index = True)
corr_matrix = pd.concat([all_features, all_labels], axis = 1).corr().abs()
# Print most correlated feature for each class
s = set()
for label in Y_train.columns:
  # extract correlation column of the considered class
  corr_col = corr_matrix[label]
  # remove the elements whose index is a label
  cleaned_col = corr_col.drop(Y_train.columns, axis = 0)
  # sort the result (NaN correlations end up last)
  sorted_col = cleaned_col.sort_values(ascending = False)
  # extract the most correlated feature for the considered class
  s.add(sorted_col.index[0])
  # .iloc[0] is an explicit positional lookup; sorted_col[0] on a string-labelled Series is deprecated
  print("Class {} is mosty correlated with feature {} with a value of {}".format(label, sorted_col.index[0], round(sorted_col.iloc[0], 3)))
print("unique features: {}".format(len(s)))

In [None]:
# For every column of corr_matrix, count how many OTHER columns correlate with it above 0.90.
# The previous version counted every entry > 0.90 and then subtracted 1 for the
# self-correlation. A column whose self-correlation is NaN (presumably a zero-variance
# column, for which the correlation is undefined) never contributed that self-match,
# which is where the -1 values came from. Skipping the diagonal explicitly avoids it.
high_corr_features_count = {
  k: sum(1 for other, x in corr_matrix[k].items() if other != k and x > 0.90)
  for k in corr_matrix.columns
}
print(high_corr_features_count)
# list the columns whose self-correlation is undefined (the ones that used to show up as -1)
for k in corr_matrix.columns:
  if pd.isnull(corr_matrix.at[k, k]):
    print(k)
c = Counter(high_corr_features_count.values())
print(c)

# plt.figure (lower case) creates the pyplot-managed figure; plt.Figure only builds
# an unmanaged Figure object, so the requested size was silently ignored
plt.figure(figsize = (8, 6))
plt.yscale('log')
plt.ylim(bottom = min(c.values()) - 0.2, top = max(c.values()) + 50)
for p, v in c.items():
  plt.scatter(p, v)
plt.xlabel("Number of highly correlated features/labels", fontsize = 16)
plt.ylabel("Number of features/labels", fontsize = 16)
plt.xticks(fontsize = 14)
plt.yticks(fontsize = 14)
plt.title("Distribution of correlated features/labels", fontsize = 18)
plt.tight_layout()
plt.savefig("./AML_project/images/png/distribution_high_corr.png")
plt.savefig("./AML_project/images/pdf/distribution_high_corr.pdf")

In [0]:
# Report, first for the train and then for the test features, whether any value is missing
for frame in (X_train, X_test):
  print(frame.isnull().any().any())

In [0]:
# check if all features are numeric: collect the distinct column dtypes of both feature tables
# (pd.concat is used because Series.append was removed in pandas 2.0)
set(pd.concat([X_train.dtypes, X_test.dtypes]))

In [0]:
# normalize features to 0 mean and 1 std
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
# NOTE(review): the scaler is fitted on train and test rows together, so test-set
# statistics enter the preprocessing; kept as in the original, confirm this is intended.
scaler.fit(pd.concat([X_train, X_test]).values)
# rebuild the frames from the scaled arrays, keeping the original column names
X_train = pd.DataFrame(scaler.transform(X_train.values), columns = X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test.values), columns = X_test.columns)

**Use only one of the following methods to reduce the number of features**

In [0]:
# correlation analysis and drop correlated features
# create the absolute correlation matrix of the training features
import numpy as np
corr_matrix = X_train.corr().abs()

# keep only the strict upper triangle, so every pair of features is considered once
# (the builtin bool is used because the np.bool alias was removed in NumPy 1.24)
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k = 1).astype(bool))

# features whose correlation with an earlier feature is greater than 0.90
to_drop = [column for column in upper.columns if any(upper[column] > 0.90)]

# drop the features; DataFrame.drop returns a new frame, so the result must be
# assigned back (previously it was discarded and no feature was actually removed)
X_train = X_train.drop(columns = to_drop)
X_test = X_test.drop(columns = to_drop)
X_train.reset_index(drop = True, inplace = True)
X_test.reset_index(drop = True, inplace = True)

In [0]:
# PCA features reduction
from sklearn.decomposition import PCA

# number of principal components to keep, and the names given to the new columns
features_limit = 100
columns = ['col' + str(x) for x in range(features_limit)]
PCA_transformer = PCA(n_components = features_limit)
# the projection is fitted on train and test rows together
# (pd.concat is used because DataFrame.append was removed in pandas 2.0)
PCA_transformer.fit(pd.concat([X_train, X_test]).values)
X_train = pd.DataFrame(PCA_transformer.transform(X_train.values), columns = columns)
X_test = pd.DataFrame(PCA_transformer.transform(X_test.values), columns = columns)

In [0]:
from keras.callbacks import EarlyStopping
import keras.optimizers
from keras.layers import Input, Dense
from keras.models import Model

# AUTOENCODER
# Define early stopping on the validation loss
es = EarlyStopping(monitor = 'val_loss', mode = 'min', 
                   verbose = 1, patience = 15, min_delta = 0.001,
                   restore_best_weights = True)

# layer widths: half of the input, a quarter of the input, then a 100-unit bottleneck
encoding_dim1 = int(X_train.shape[1] / 2)
encoding_dim2 = int(encoding_dim1 / 2)
encoding_dim3 = 100
# names of the columns of the encoded representation
columns = ['col' + str(x) for x in range(encoding_dim3)]

input_layer = Input(shape = (X_train.shape[1],))
# "encoded" is the encoded representation of the input
encoded1 = Dense(encoding_dim1, activation = 'relu')(input_layer)
encoded2 = Dense(encoding_dim2, activation = 'relu')(encoded1)
bottleneck = Dense(encoding_dim3, activation = 'relu')(encoded2)
# "decoded" is the lossy reconstruction of the input
decoded2 = Dense(encoding_dim2, activation = 'relu')(bottleneck)
decoded1 = Dense(encoding_dim1, activation = 'relu')(decoded2)
output_layer = Dense(X_train.shape[1], activation = 'linear')(decoded1)
# this model maps an input to its reconstruction
autoencoder = Model(input_layer, output_layer)
# this model maps an input to its encoded representation
encoder = Model(input_layer, bottleneck)

# compile the model
autoencoder.compile(optimizer = 'adam', loss = 'mse')
autoencoder.summary()
# fit the autoencoder on train and test rows together, using the input as its own target.
# The stacked matrix is built once with pd.concat (DataFrame.append was removed in pandas 2.0).
all_features = pd.concat([X_train, X_test]).values
autoencoder.fit(all_features, all_features, 
                validation_split = 0.1, epochs = 300, batch_size = 256, 
                verbose = True, callbacks = [es], use_multiprocessing = True)
# extract representation
X_train = pd.DataFrame(encoder.predict(X_train.values), columns = columns)
X_test = pd.DataFrame(encoder.predict(X_test.values), columns = columns)

**Classification of the records**

In [0]:
# TODO
# compare the 3 feature extraction methods using the same DNN
# AutoML (maybe AutoKeras) on DNN or RandomForest


In [0]:
# evaluate auc for a given model
from sklearn.metrics import roc_auc_score
import numpy as np

def evaluate_performance(model, test_features, test_label):
  """Print and return the ROC AUC of `model` for every label, plus their mean.

  Args:
    model: object whose `predict(test_features)` returns one score per label for every row.
    test_features: input handed to `model.predict`.
    test_label: DataFrame of true labels; its columns name the labels.

  Returns:
    A tuple `(auc, mean)`: `auc` maps each label name to its ROC AUC and
    `mean` is the average of those values.
  """
  # give the predicted scores the same column names as the true labels
  scores = pd.DataFrame(data = model.predict(test_features), columns = test_label.columns)
  auc = {
    name: roc_auc_score(y_true = test_label[name], y_score = scores[name])
    for name in test_label.columns
  }

  for name, value in auc.items():
    print("{}: {}".format(name, value))

  mean_auc = np.mean(list(auc.values()))
  print("\nmean: {}".format(mean_auc))
  return (auc, mean_auc)

In [0]:
# confusion matrix as a dictionary
# Predict and evaluate performances
def get_confusion_matrix(model, test_features, test_label):
  """Count true/false positives and negatives of `model` for every label.

  Args:
    model: object whose `predict(test_features)` returns, for every sample,
      one score per label.
    test_features: input handed to `model.predict`.
    test_label: DataFrame of true labels, one column per label.

  Returns:
    A dict keyed by the 1-based position of the label. Every value is a dict
    with the counts 't1' (predicted 1, true 1), 't0' (predicted 0, true 0),
    'f1' (predicted 1, true 0) and 'f0' (predicted 0, true 1).
    Each per-label dict is also printed.
  """
  predictions = model.predict(test_features)
  # one entry per label column instead of a hard-coded 12
  n_labels = len(test_label.columns)
  d = {k: {'t1': 0, 't0': 0, 'f1': 0, 'f0': 0} for k in range(1, n_labels + 1)}
  for preds, trues in zip(predictions, test_label.itertuples(index = False)):
    for k, (p, t) in enumerate(zip(preds, trues), start = 1):
      # scores are rounded to one decimal before the 0.5 threshold, as before
      p = 1 if round(p, 1) > 0.5 else 0
      # 't'/'f' says whether the prediction matches the truth, the digit is the
      # predicted class. The previous code tested `p == 0` for both 'f0' and 'f1',
      # so false positives were never counted and 'f1' just mirrored 'f0'.
      outcome = ('t' if p == t else 'f') + str(p)
      d[k][outcome] += 1
  for k in d:
      print(d[k])
  return d
#tmp = [get_class(x) for x in y_val]
#print(classification_report(tmp, predictions))

In [0]:
### CLASSIC NN
from keras.models import Sequential
from sklearn.metrics import roc_auc_score
from keras.callbacks import EarlyStopping, Callback
from keras.layers import Dense, BatchNormalization, Dropout, Activation
from keras import regularizers
import pandas as pd
import tensorflow as tf
import keras.backend.tensorflow_backend as tfb
from sklearn.model_selection import train_test_split, KFold

# Weight of the positive class, passed as `pos_weight` to the weighted cross entropy loss below
POS_WEIGHT = 20

# custom loss (a weighted binary crossentropy is not provided by keras)
def weighted_binary_crossentropy(target, output):
  """Binary cross entropy in which the positive class is weighted by POS_WEIGHT.

  Args:
    target: tensor of true labels.
    output: tensor of predicted probabilities (the network ends with a sigmoid).

  Returns:
    The weighted cross entropy averaged over the last axis.
  """
  # clip the probabilities away from 0 and 1, then turn them back into logits
  eps = tfb._to_tensor(tfb.epsilon(), output.dtype.base_dtype)
  probabilities = tf.clip_by_value(output, eps, 1 - eps)
  logits = tf.log(probabilities / (1 - probabilities))
  # weighted loss for every label
  per_label_loss = tf.nn.weighted_cross_entropy_with_logits(targets = target,
                                                            logits = logits,
                                                            pos_weight = POS_WEIGHT)
  return tf.reduce_mean(per_label_loss, axis = -1)

# Early stopping on the validation loss (minimised), with a patience of 10 epochs,
# a minimum improvement of 0.001 and restoring of the best weights
es = EarlyStopping(
  monitor = 'val_loss',
  mode = 'min',
  min_delta = 0.001,
  patience = 10,
  restore_best_weights = True,
  verbose = 1,
)

class roc_callback(Callback):
  """Callback that prints the mean ROC AUC over all labels after every epoch.

  The score is computed both on the training data and on the validation data
  given to the constructor.
  """

  def __init__(self, training_data, validation_data):
    """Store the data used for the per-epoch evaluation.

    Args:
      training_data: pair (features, labels); labels is a DataFrame with one column per label.
      validation_data: pair (features, labels) with the same label columns.
    """
    # let the base Callback initialise its own attributes first
    super(roc_callback, self).__init__()
    self.x = training_data[0]
    self.y = training_data[1]
    self.x_val = validation_data[0]
    self.y_val = validation_data[1]

  # The hooks below do nothing. `logs` defaults to None instead of a shared mutable dict().
  def on_train_begin(self, logs = None):
    return

  def on_train_end(self, logs = None):
    return

  def on_epoch_begin(self, epoch, logs = None):
    return

  def on_epoch_end(self, epoch, logs = None):
    """Print the average ROC AUC on training and validation data."""
    y_pred = pd.DataFrame(data = self.model.predict(self.x), columns = self.y.columns)
    y_pred_val = pd.DataFrame(data = self.model.predict(self.x_val), columns = self.y.columns)
    average_roc = 0.0
    average_roc_val = 0.0
    for i in self.y.columns:
      average_roc += roc_auc_score(self.y[i], y_pred[i])
      average_roc_val += roc_auc_score(self.y_val[i], y_pred_val[i])

    # average over the actual number of labels instead of a hard-coded 12
    n_labels = len(self.y.columns)
    average_roc = average_roc / n_labels
    average_roc_val = average_roc_val / n_labels
    print('Average-roc-auc: {}   Average-roc-auc_val: {}'.format(round(average_roc, 4), 
                                                                  round(average_roc_val,4)))
    return

  # These two hooks used to be nested inside on_epoch_end after its `return`,
  # so they were never defined on the class; they now sit at class level.
  def on_batch_begin(self, batch, logs = None):
    return

  def on_batch_end(self, batch, logs = None):
    return

# 10-fold cross validation of the baseline network on the training data;
# after every fold the freshly trained network is scored on the test set
kfold = KFold(n_splits=10, shuffle=True, random_state=1)
average_roc = 0.0
for train_idx, val_idx in kfold.split(X_train, Y_train):
  # (features, labels) pairs of the current fold
  fold_train = (X_train.iloc[train_idx], Y_train.iloc[train_idx])
  fold_val = (X_train.iloc[val_idx], Y_train.iloc[val_idx])

  # NN: three Dense -> BatchNormalization -> ReLU -> Dropout stages, then a 12-unit sigmoid output
  NN = Sequential()
  for position, units in enumerate((512, 256, 128)):
    if position == 0:
      # only the first layer declares the input shape
      NN.add(Dense(units, input_shape=(X_train.shape[1],)))
    else:
      NN.add(Dense(units))
    NN.add(BatchNormalization())
    NN.add(Activation('relu'))
    NN.add(Dropout(rate=0.5))
  NN.add(Dense(12, activation = 'sigmoid'))

  # Compile model
  NN.compile(optimizer = 'adam', loss = weighted_binary_crossentropy)
  # Fit the network, printing the ROC AUC after every epoch and stopping early on the validation loss
  roc_monitor = roc_callback(training_data = fold_train, validation_data = fold_val)
  learning_process_NN = NN.fit(fold_train[0], fold_train[1],
                               validation_data = fold_val,
                               callbacks = [roc_monitor, es],
                               epochs = 300, batch_size = 128, verbose = True,
                               use_multiprocessing = True)
  # only the mean AUC of the fold is accumulated
  fold_auc, fold_mean = evaluate_performance(NN, X_test, Y_test)
  average_roc += fold_mean

print("\n\n\n Mean penformance on 10 fold cross validation of the baseline model is {}".format(average_roc/10))

In [0]:
import matplotlib.pyplot as plt
%matplotlib inline

def plot_history(network_history):
    """Draw the training and validation loss curves of a fit on a new figure.

    Args:
        network_history: object whose `history` mapping holds the
            'loss' and 'val_loss' sequences.
    """
    curves = network_history.history
    plt.figure()
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    # training curve first, validation curve second, matching the legend order
    for key in ('loss', 'val_loss'):
        plt.plot(curves[key])
    plt.legend(['Training', 'Validation'])

# Plot the loss curves stored in learning_process_NN (reassigned on every
# cross-validation fold, so this is the last fold) and save the figure as PDF
plot_history(learning_process_NN)
plt.savefig("learning_process_baseline.pdf")