In [1]:
import pandas as pd
import tensorflow as tf

# Load the dataset from "diabetes.csv", resolved relative to the working directory.
dataframe = pd.read_csv("diabetes.csv")
# Show (rows, columns) as the cell output to confirm what was loaded.
dataframe.shape

(768, 9)

In [2]:
# Preview the first rows to check the column names and value types.
dataframe.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# Hold out a random 20% of the rows for validation; the fixed random_state
# makes the sample repeatable across runs.
val_dataframe = dataframe.sample(frac=0.2, random_state=1337)

In [4]:
# Training rows are those whose index labels were not sampled into validation.
# NOTE(review): despite the name, this is a pandas DataFrame, not a tf.data.Dataset.
train_dataset = dataframe.drop(val_dataframe.index)

In [5]:
# Sanity-check the sizes of the two splits.
print(
    "Using %d samples for training and %d for validation"
    % (len(train_dataset), len(val_dataframe))
)

Using 614 samples for training and 154 for validation


In [6]:
def dataframe_to_dataset(dataframe, shuffle: bool = True, seed=None):
    """Convert a pandas DataFrame into a ``tf.data.Dataset`` of (features, label).

    The "Outcome" column is used as the label; every remaining column becomes
    an entry in the per-element feature dict, keyed by column name.

    Args:
        dataframe: Frame containing the feature columns and an "Outcome"
            column. It is copied first, so the caller's frame is not modified.
        shuffle: When True (the default, matching the previous behavior) the
            dataset is shuffled with a buffer covering every row. Pass False
            for data that does not need shuffling, e.g. a validation split.
        seed: Optional seed forwarded to ``Dataset.shuffle``; ``None`` keeps
            the previous unseeded behavior.

    Returns:
        A dataset whose elements are ``(feature_dict, label)`` pairs.

    Raises:
        KeyError: If ``dataframe`` has no "Outcome" column.
    """
    # Work on a copy: pop() below mutates the frame it is called on.
    features = dataframe.copy()
    labels = features.pop("Outcome")
    ds = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    # A zero-row frame would produce buffer_size=0, which shuffle() rejects,
    # so an empty dataset is returned unshuffled.
    if shuffle and len(features) > 0:
        ds = ds.shuffle(buffer_size=len(features), seed=seed)
    return ds

In [7]:
# Wrap the training rows as a tf.data.Dataset of (features, label) elements.
train_ds = dataframe_to_dataset(train_dataset)

In [8]:
# Wrap the validation rows as a tf.data.Dataset of (features, label) elements.
val_ds = dataframe_to_dataset(val_dataframe)

In [9]:
# Pull a single (features, label) element to inspect the dataset's structure.
for x,y in train_ds.take(1):
    print("Input:", x)
    print("Target:", y)

Input: {'Pregnancies': <tf.Tensor: shape=(), dtype=int64, numpy=6>, 'Glucose': <tf.Tensor: shape=(), dtype=int64, numpy=154>, 'BloodPressure': <tf.Tensor: shape=(), dtype=int64, numpy=78>, 'SkinThickness': <tf.Tensor: shape=(), dtype=int64, numpy=41>, 'Insulin': <tf.Tensor: shape=(), dtype=int64, numpy=140>, 'BMI': <tf.Tensor: shape=(), dtype=float64, numpy=46.1>, 'DiabetesPedigreeFunction': <tf.Tensor: shape=(), dtype=float64, numpy=0.571>, 'Age': <tf.Tensor: shape=(), dtype=int64, numpy=27>}
Target: tf.Tensor(0, shape=(), dtype=int64)


In [10]:
# Group the training elements into batches of 32.
# NOTE(review): this rebinds train_ds, so re-running this cell batches the
# already-batched dataset again; re-run the cell that creates train_ds first.
train_ds = train_ds.batch(32)

In [11]:
# Group the validation elements into batches of 32.
# NOTE(review): this rebinds val_ds, so re-running this cell batches the
# already-batched dataset again; re-run the cell that creates val_ds first.
val_ds = val_ds.batch(32)

In [12]:
from tensorflow.keras.layers.experimental.preprocessing import IntegerLookup
from tensorflow.keras.layers.experimental.preprocessing import Normalization
from tensorflow.keras.layers.experimental.preprocessing import StringLookup

In [13]:
def encode_numerical_feature(feature, name, dataset):
    """Normalize ``feature`` with a layer adapted on one column of ``dataset``.

    Args:
        feature: Value the adapted ``Normalization`` layer is called on
            (presumably a Keras input tensor; confirm against the caller).
        name: Key selecting the column from each element's feature mapping.
        dataset: Dataset whose elements are ``(features, label)`` pairs.

    Returns:
        The result of calling the adapted normalization layer on ``feature``.
    """
    # Keep only the requested column, then append a trailing axis to every
    # element before the data is handed to adapt().
    column_ds = dataset.map(lambda features, _label: features[name]).map(
        lambda values: tf.expand_dims(values, -1)
    )

    layer = Normalization()
    layer.adapt(column_ds)
    return layer(feature)

In [14]:
def encode_categorical_feature(feature, name, dataset: tf.data.Dataset, is_string):
    """Encode ``feature`` with a lookup layer adapted on one column of ``dataset``.

    Args:
        feature: Value the adapted lookup layer is called on (presumably a
            Keras input tensor; confirm against the caller).
        name: Key selecting the column from each element's feature mapping.
        dataset: Dataset whose elements are ``(features, label)`` pairs.
        is_string: Truthy to use ``StringLookup``; otherwise ``IntegerLookup``
            is used.

    Returns:
        The result of calling the adapted lookup layer on ``feature``.
    """
    # Pick the lookup layer that matches the raw value type.
    if is_string:
        layer_cls = StringLookup
    else:
        layer_cls = IntegerLookup
    # NOTE(review): "binary" appears to be the older spelling of this output
    # mode (newer Keras releases name it "multi_hot"); confirm against the
    # installed TensorFlow version.
    encoder = layer_cls(output_mode="binary")

    # Keep only the requested column, then append a trailing axis to every
    # element before the data is handed to adapt().
    column_ds = dataset.map(lambda features, _label: features[name]).map(
        lambda values: tf.expand_dims(values, -1)
    )

    encoder.adapt(column_ds)
    return encoder(feature)