# Set up

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Make numpy values easier to read.
np.set_printoptions(precision=3, suppress=True)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/lish-moa/test_features.csv
/kaggle/input/lish-moa/train_drug.csv
/kaggle/input/lish-moa/train_features.csv
/kaggle/input/lish-moa/train_targets_scored.csv
/kaggle/input/lish-moa/train_targets_nonscored.csv
/kaggle/input/lish-moa/sample_submission.csv


## Import tensorflow

In [2]:
import tensorflow as tf
from tensorflow import feature_column
from tensorflow.keras import layers
from tensorflow.keras import regularizers
from tensorflow.keras.layers.experimental import preprocessing
#from tensorboard.plugins.hparams import api as hp

### Plot libraries

In [3]:
!pip3 install -q git+https://github.com/tensorflow/docs

import tensorflow_docs as tfdocs
import tensorflow_docs.modeling
import tensorflow_docs.plots
from matplotlib import pyplot as plt

You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [4]:
import pathlib
import tempfile

# Data reading

### 1. Use Pandas to create a dataframe

#### Setting our batch size

In [5]:
BATCH_SIZE = 32

#### Create dataframe for features and targets

In [6]:
features = pd.read_csv("/kaggle/input/lish-moa/train_features.csv", nrows=20)
targets = pd.read_csv("/kaggle/input/lish-moa/train_targets_scored.csv", nrows=20)
test = pd.read_csv("/kaggle/input/lish-moa/test_features.csv", nrows=20)

cols_features = features.columns
cols_targets = targets.columns

num_features = len(cols_features)
num_targets = len(cols_targets)

print("Number of features:" , num_features)
print("Number of targets:" , num_targets)

Number of features: 876
Number of targets: 207


In [7]:
gcols = [g for g in features.columns if "g-" in g]
ccols = [c for c in features.columns if "c-" in c]

In [129]:
features.head()

Unnamed: 0,sig_id,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,...,c-90,c-91,c-92,c-93,c-94,c-95,c-96,c-97,c-98,c-99
0,id_000644bb2,trt_cp,24,D1,1.062,0.5577,-0.2479,-0.6208,-0.1944,-1.012,...,0.2862,0.2584,0.8076,0.5523,-0.1912,0.6584,-0.3981,0.2139,0.3801,0.4176
1,id_000779bfc,trt_cp,72,D1,0.0743,0.4087,0.2991,0.0604,1.019,0.5207,...,-0.4265,0.7543,0.4708,0.023,0.2957,0.4899,0.1522,0.1241,0.6077,0.7371
2,id_000a6266a,trt_cp,48,D1,0.628,0.5817,1.554,-0.0764,-0.0323,1.239,...,-0.725,-0.6297,0.6103,0.0223,-1.324,-0.3174,-0.6417,-0.2187,-1.408,0.6931
3,id_0015fd391,trt_cp,48,D1,-0.5138,-0.2491,-0.2656,0.5288,4.062,-0.8095,...,-2.099,-0.6441,-5.63,-1.378,-0.8632,-1.288,-1.621,-0.8784,-0.3876,-0.8154
4,id_001626bd3,trt_cp,72,D2,-0.3254,-0.4009,0.97,0.6919,1.418,-0.8244,...,0.0042,0.0048,0.667,1.069,0.5523,-0.3031,0.1094,0.2885,-0.3786,0.7125


In [130]:
targets.head()

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_000644bb2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,id_000779bfc,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,id_000a6266a,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,id_0015fd391,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,id_001626bd3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 2. Building data pipelins

In [8]:
features_types = [str(), str(), str(), str()] + [float()]*(num_features-4)
targets_types = [str()] + [float()]*(num_targets-1)

features = tf.data.experimental.CsvDataset("/kaggle/input/lish-moa/train_features.csv",
                                           record_defaults=features_types,
                                           #select_cols
                                           header=True)

targets = tf.data.experimental.CsvDataset("/kaggle/input/lish-moa/train_targets_scored.csv",
                                          record_defaults=targets_types,
                                          header=True)

test = tf.data.experimental.CsvDataset("/kaggle/input/lish-moa/test_features.csv",
                                           record_defaults=features_types,
                                           #select_cols=(gcols + ccols),
                                           header=True)

dataset = tf.data.Dataset.zip((features, targets))

In [9]:
# split dataset into train and val
dataset_size = dataset.reduce(0, lambda x, _: x + 1).numpy()

train_size = int(0.7*dataset_size)
val_size = dataset_size - train_size

train = dataset.take(train_size)
val = dataset.skip(train_size)
val = dataset.take(val_size)

train_size = train.reduce(0, lambda x, _: x + 1).numpy()
val_size = val.reduce(0, lambda x, _: x + 1).numpy()

print("Full dataset size:", dataset_size)
print("Train dataset size:", train_size)
print("Val dataset size:", val_size)

Full dataset size: 23814
Train dataset size: 16669
Val dataset size: 7145


In [10]:
def _preprocess_line(features, targets):
    # Pack the result into a dictionary
    features = dict(zip(cols_features, features))
    features.pop('sig_id')

    targets = tf.stack(targets[1:])
    
    return features, targets

train = train.map(_preprocess_line)
train = train.batch(BATCH_SIZE)

val = val.map(_preprocess_line)
val = val.batch(BATCH_SIZE)

In [11]:
for feature_batch, label_batch in train.take(1):
    print('First 5 features:', list(feature_batch.keys())[:5])
    print('A batch of cp_types:', feature_batch['cp_type'].numpy())
    print('A batch of cp_times:', feature_batch['cp_time'].numpy())
    print('A batch of targets:', label_batch.numpy() )

First 5 features: ['cp_type', 'cp_time', 'cp_dose', 'g-0', 'g-1']
A batch of cp_types: [b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp'
 b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp'
 b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp'
 b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp' b'ctl_vehicle' b'trt_cp'
 b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp']
A batch of cp_times: [b'24' b'72' b'48' b'48' b'72' b'24' b'24' b'48' b'48' b'48' b'72' b'48'
 b'48' b'48' b'72' b'48' b'48' b'24' b'72' b'48' b'48' b'48' b'72' b'72'
 b'72' b'48' b'72' b'48' b'48' b'72' b'72' b'48']
A batch of targets: [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


# Feature Engineering

### 1. Normalization of numerical columns

Prepare encoded features array

In [12]:
feature_columns = []

In [13]:
for name in (gcols + ccols):
    feature_columns.append(feature_column.numeric_column(name))

### 2. Encoding of categorical columns

In [14]:
cp_time_type = feature_column.categorical_column_with_vocabulary_list(
      'cp_time', ['24', '48', '72'])
cp_type_type = feature_column.categorical_column_with_vocabulary_list(
      'cp_type', ['trt_cp', 'ctl_vehicle'])
cp_dose_type = feature_column.categorical_column_with_vocabulary_list(
      'cp_dose', ['D1', 'D2'])

cp_time_type_one_hot = feature_column.indicator_column(cp_time_type)
cp_type_type_one_hot = feature_column.indicator_column(cp_type_type)
cp_dose_type_one_hot = feature_column.indicator_column(cp_dose_type)

feature_columns.append(cp_time_type_one_hot)
feature_columns.append(cp_type_type_one_hot)
feature_columns.append(cp_dose_type_one_hot)

### 3. Model training

In [15]:
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

In [None]:
model = tf.keras.Sequential([
  feature_layer,
  layers.Dense(128, activation='relu'),
  layers.Dense(128, activation='relu'),
  layers.Dropout(.1),
  layers.Dense(206)
])

model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [145]:
model.fit(train,
          validation_data=val,
          epochs=10)

Epoch 1/10
    104/Unknown - 7s 67ms/step - loss: 0.1197 - accuracy: 0.0183

KeyboardInterrupt: 

In [None]:
loss, accuracy = model.evaluate(val)
print("Accuracy", accuracy)

In [None]:
model.summary()

# Baseline modelling

In [None]:
regularizer_histories = {}

In [None]:
def compile_and_fit(model, name, optimizer=None, max_epochs=10000):
    if optimizer is None:
        optimizer = get_optimizer()
    model.compile(optimizer=optimizer,loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), 
                  metrics=[tf.keras.losses.BinaryCrossentropy(from_logits=True, name='binary_crossentropy'),
                           'accuracy'])

    model.summary()

    history = model.fit(
        train_ds,
        steps_per_epoch = STEPS_PER_EPOCH,
        epochs=max_epochs,
        validation_data=validate_ds,
        callbacks=get_callbacks(name),
        verbose=0)
    return history

In [None]:
logdir = pathlib.Path(tempfile.mkdtemp())/"tensorboard_logs"

In [None]:
def get_callbacks(name):
    return [
        tfdocs.modeling.EpochDots(),
        tf.keras.callbacks.EarlyStopping(monitor='val_binary_crossentropy', patience=200),
        tf.keras.callbacks.TensorBoard(logdir/name),
    ]

### 1. L2 Regularization

In [None]:
l2_model = tf.keras.Sequential([
  feature_layer,
  layers.Dense(128, activation='relu',
               kernel_regularizer=regularizers.l2(0.00001),
               input_shape=(num_features,)),
  layers.Dense(128, activation='relu', 
               kernel_regularizer=regularizers.l2(0.00001)),
  layers.Dense(206)
])


l2_model.compile(optimizer='adam', 
                 loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                 metrics=[tf.keras.losses.BinaryCrossentropy(from_logits=True, name='binary_crossentropy'),'accuracy'])


In [None]:
regularizer_histories['l2'] = l2_model.fit(train,
                                           validation_data=val,
                                           callbacks=get_callbacks('l2'),
                                           epochs=10)

In [None]:
plotter = tfdocs.plots.HistoryPlotter(metric = 'accuracy', smoothing_std=10)
plt.figure(figsize=(10, 6))
plotter.plot(regularizer_histories)
plt.ylim([0.01, 0.4])

### 2. Drop out regularization

In [None]:
dp_model = tf.keras.Sequential([   
    feature_layer,
    layers.Dense(128, activation='relu'),
    layers.Dropout(.1),
    layers.Dense(128, activation='relu'),
    layers.Dropout(.1),
    layers.Dense(206)
])

dp_model.compile(optimizer='adam', 
                 loss=tf.keras.losses.BinaryCrossentropy(from_logits=True, name='binary_crossentropy'), 
                 metrics=[tf.keras.losses.BinaryCrossentropy(from_logits=True, name='binary_crossentropy'),'accuracy'])

In [None]:
regularizer_histories['dp'] = dp_model.fit(train,
                                           validation_data=val,
                                           callbacks=get_callbacks('dp'),
                                           epochs=10)

In [None]:
plt.figure(figsize=(10, 6))
plotter.plot(regularizer_histories)
plt.ylim([0.01, 0.7])

### 3. Batch Normalization

In [None]:
batch_model = tf.keras.Sequential([
    feature_layer,
    layers.Dense(128, activation='relu'),
    layers.BatchNormalization(),
    layers.Dense(128, activation='relu'),
    layers.BatchNormalization(),
    layers.Dense(206)
])

batch_model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [None]:
regularizer_histories['batch'] = batch_model.fit(train,
                                           validation_data=val,
                                           callbacks=get_callbacks('bacth'),
                                           epochs=10)

In [None]:
plt.figure(figsize=(10, 6))
plotter.plot(regularizer_histories)
plt.ylim([0.01, 0.7])

### 4. Weight initialization

### 5. Combination

In [None]:
all_model = tf.keras.Sequential([
    feature_layer,
    layers.Dense(128, activation='relu'),
    layers.BatchNormalization(),
    layers.Dense(128, activation='relu'), 
    layers.Dropout(.1),
    layers.Dense(206)
])

all_model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

regularizer_histories['all'] = all_model.fit(train,
                                           validation_data=val,
                                           callbacks=get_callbacks('all'),
                                           epochs=10)

In [None]:
plt.figure(figsize=(10, 6))
plotter.plot(regularizer_histories)
plt.ylim([0.01, 0.7])

# Model Variance & Bias Analysis

### 1. Plot the training and validation loss

In [None]:
plot_loss = tfdocs.plots.HistoryPlotter(metric = 'loss', smoothing_std=10)
plt.figure(figsize=(10, 6))
plot_loss.plot(regularizer_histories)
plt.ylim([0.01, 0.2])

### 2. Learning curve

# Hyperparameter tuning with Keras Tuner

In [154]:
!pip install -q -U keras-tuner
import kerastuner as kt
import IPython

You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [161]:
def model_builder(hp):
    hp_units = hp.Int('units', min_value = 32, max_value = 512, step = 32)

    hp_model = tf.keras.Sequential([
        feature_layer,
        layers.Dense(units = hp_units, activation='relu'),
        layers.Dense(206)
    ])

    hp_learning_rate = hp.Choice('learning_rate', values = [1e-2, 1e-3, 1e-4]) 

    hp_model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = hp_learning_rate),
                    loss = tf.keras.losses.BinaryCrossentropy(from_logits=True), 
                    metrics = ['accuracy'])
    return hp_model

In [165]:
tuner = kt.Hyperband(model_builder,
                     objective = 'val_accuracy', 
                     max_epochs = 10,
                     factor = 3,
                     directory = 'my_dir',
                     project_name = 'intro_to_kt')  

TypeError: __init__() got an unexpected keyword argument 'max_trials'

In [159]:
class ClearTrainingOutput(tf.keras.callbacks.Callback):
    def on_train_end(*args, **kwargs):
        IPython.display.clear_output(wait = True)

In [176]:
tuner.search(train, epochs = 2, validation_data = (val), callbacks = [ClearTrainingOutput()])

# Get the optimal hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials = 1)[0]

print(f"""
The hyperparameter search is complete. The optimal number of units in the first densely-connected
layer is {best_hps.get('units')} and the optimal learning rate for the optimizer
is {best_hps.get('learning_rate')}.
""")

Trial 10 Complete [00h 04m 39s]
val_accuracy: 0.17872637510299683

Best val_accuracy So Far: 0.33463960886001587
Total elapsed time: 00h 34m 00s

The hyperparameter search is complete. The optimal number of units in the first densely-connected
layer is 480 and the optimal learning rate for the optimizer
is 0.001.



In [177]:
hp_model = tuner.hypermodel.build(best_hps)
hp_model.fit(train, epochs = 10, validation_data = val)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f7fbf874690>