In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd

from  IPython import display
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

# Set the TensorFlow (v1-compat) logger threshold to ERROR to keep cell output quiet.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

In [2]:
# Number of examples per batch; passed as `batch_size` to df_to_dataset below.
BATCH_SIZE = 32

# Reading data with pandas and converting it to a tf.data Dataset

In [3]:
# Load the raw input table and the scored-target table from disk.
features = pd.read_csv("data/train_features.csv")
targets = pd.read_csv("data/train_targets_scored.csv")

# Column indexes of both tables, and how many columns each one has.
# Note: both tables still carry the "sig_id" column here, so it is
# included in each count.
cols_features, cols_targets = features.columns, targets.columns
num_features, num_targets = len(cols_features), len(cols_targets)

print("Number of features:", num_features)
print("Number of targets:", num_targets)

Number of features: 876
Number of targets: 207


In [4]:
# add label to dataset: every row gets its full multi-label target vector
# stored as one list in a single "label" column.
df = features.copy()

# Rebind rather than `targets.pop("sig_id")`: pop mutates the frame in place, so
# re-running this cell raised KeyError. `errors="ignore"` makes the step
# idempotent while `targets` still ends up without the "sig_id" column.
targets = targets.drop(columns="sig_id", errors="ignore")

# NOTE(review): labels are matched to feature rows via the default index, i.e.
# this assumes both CSVs list the same sig_ids in the same order — confirm.
df["label"] = pd.Series(targets.values.tolist())

# split data (fixed seed so the train/val partition is the same on every run)
SPLIT_SEED = 42
train, val = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)

print(len(train), 'train examples')
print(len(val), 'val examples')

19051 train examples
4763 val examples


In [5]:
# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    """Build a batched ``tf.data.Dataset`` of ``(features, labels)`` pairs.

    Args:
        dataframe: pandas DataFrame with a 'sig_id' column (discarded), a
            'label' column whose entries are equal-length sequences (one
            multi-label target vector per row), and the input columns.
        shuffle: if True, shuffle the examples before batching.
        batch_size: number of examples per batch.

    Returns:
        A ``tf.data.Dataset`` whose elements are
        ``(dict mapping column name -> batch of values, batch of label vectors)``.
    """
    # Work on a copy so that popping columns never mutates the caller's frame.
    dataframe = dataframe.copy()
    dataframe.pop('sig_id')
    labels = dataframe.pop('label')

    # Convert all multi-label targets in one call instead of building one
    # tensor per row with tf.stack; the resulting (rows, targets) tensor is
    # the same one from_tensor_slices would assemble from the per-row list.
    labels = tf.constant(labels.tolist())

    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    if shuffle:
        # Size the buffer from the frame actually passed in (not the global
        # `features` table) so the whole dataset is shuffled uniformly.
        # shuffle() requires a positive buffer size, hence the floor of 1.
        ds = ds.shuffle(buffer_size=max(len(dataframe), 1))
    ds = ds.batch(batch_size)
    return ds

In [6]:
# Training batches are shuffled; validation batches are built without shuffling.
train_ds = df_to_dataset(train, shuffle=True, batch_size=BATCH_SIZE)
val_ds = df_to_dataset(val, shuffle=False, batch_size=BATCH_SIZE)

In [7]:
# Pull one batch from the training pipeline and print a few of its columns
# together with the matching label vectors.
for feature_batch, label_batch in train_ds.take(1):
    print('First 5 features:', list(feature_batch.keys())[:5])

    # (caption, column name) pairs, printed in this order
    preview = (
        ('A batch of cp_types:', 'cp_type'),
        ('A batch of cp_doses:', 'cp_dose'),
        ('A batch of cp_times:', 'cp_time'),
        ('A batch of g-0:', 'g-0'),
    )
    for caption, column in preview:
        print(caption, feature_batch[column].numpy())

    print('A batch of targets:', label_batch.numpy())

First 5 features: ['cp_type', 'cp_time', 'cp_dose', 'g-0', 'g-1']
A batch of cp_types: [b'trt_cp' b'trt_cp' b'trt_cp' b'ctl_vehicle' b'ctl_vehicle' b'trt_cp'
 b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp' b'ctl_vehicle'
 b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp'
 b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp'
 b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp' b'ctl_vehicle']
A batch of cp_doses: [b'D1' b'D2' b'D1' b'D1' b'D1' b'D1' b'D2' b'D2' b'D2' b'D1' b'D2' b'D1'
 b'D1' b'D1' b'D1' b'D2' b'D2' b'D1' b'D1' b'D1' b'D2' b'D2' b'D1' b'D2'
 b'D1' b'D1' b'D2' b'D1' b'D1' b'D1' b'D1' b'D2']
A batch of cp_times: [72 48 48 48 72 48 24 24 48 72 24 24 48 72 24 48 48 24 24 24 24 24 48 24
 72 48 72 24 72 48 48 48]
A batch of g-0: [-0.1264 -0.6446 -0.2149 -0.1447  0.5396  0.0439 -0.8981 -1.004   0.3372
  3.215  -0.0893 -0.4287 -0.0626 -0.005   0.0222 -0.0978 -0.5466  1.227
 -1.026  -0.0915 -1.415  -0.1153  0.934   0.3422 -0.6679 -0.524   1.4