In [1]:
import pandas as pd
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
# Read the Pima Indians diabetes CSV (path is relative to this notebook) into a DataFrame.
csv_path = '../sample_data/pima-indians-diabetes.csv'
diabetes = pd.read_csv(csv_path)

In [4]:
# Peek at the first five rows to check that the file parsed as expected.
diabetes.head(n=5)

Unnamed: 0,Number_pregnant,Glucose_concentration,Blood_pressure,Triceps,Insulin,BMI,Pedigree,Age,Class
0,6,0.743719,0.590164,0.353535,0.0,0.500745,0.234415,50,1
1,1,0.427136,0.540984,0.292929,0.0,0.396423,0.116567,31,0
2,8,0.919598,0.52459,0.0,0.0,0.347243,0.253629,32,1
3,1,0.447236,0.540984,0.232323,0.111111,0.418778,0.038002,21,0
4,0,0.688442,0.327869,0.353535,0.198582,0.642325,0.943638,33,1


In [5]:
# (number of rows, number of columns) of the loaded frame.
diabetes.shape

(768, 9)

In [None]:
# Draw one regression panel per predictor, each plotted against the `Class` target.
# `Age` is not included in this list.
pairplot_predictors = [
    'Number_pregnant',
    'Glucose_concentration',
    'Blood_pressure',
    'Triceps',
    'Insulin',
    'BMI',
    'Pedigree',
]
sns.pairplot(
    diabetes,
    x_vars=['Class'],
    y_vars=pairplot_predictors,
    kind='reg',
    height=7.5,
)

In [None]:
# List the column labels; the next cell picks the ones to normalize from these.
diabetes.columns

In [None]:
# Columns that get min-max scaled. `Age` and the `Class` target are not listed.
columns_to_normalize = [
    'Number_pregnant',
    'Glucose_concentration',
    'Blood_pressure',
    'Triceps',
    'Insulin',
    'BMI',
    'Pedigree',
]

In [None]:
def _min_max_scale(column):
    """Linearly rescale a Series so its minimum maps to 0 and its maximum to 1."""
    lowest = column.min()
    highest = column.max()
    return (column - lowest) / (highest - lowest)


# Overwrite the selected columns in place with their rescaled values.
diabetes[columns_to_normalize] = diabetes[columns_to_normalize].apply(_min_max_scale)

In [None]:
# Show the first five rows again, now that the selected columns have been rescaled.
diabetes.head(n=5)

In [None]:
# Declare one numeric feature column per continuous predictor; each key is a
# column label of the `diabetes` frame.
make_numeric = tf.feature_column.numeric_column

num_preg = make_numeric(key='Number_pregnant')
plasma_gluc = make_numeric(key='Glucose_concentration')
dias_press = make_numeric(key='Blood_pressure')
tricep = make_numeric(key='Triceps')
insulin = make_numeric(key='Insulin')
bmi = make_numeric(key='BMI')
diabetes_pedigree = make_numeric(key='Pedigree')
age = make_numeric(key='Age')

In [None]:
# assigned_group = tf.feature_column.categorical_column_with_vocabulary_list(
#     'Group',
#     ['A', 'B', 'C', 'D']
# )

A hash bucket is a good option when we don't know all the categories in advance, or don't want to write them out by hand:

In [None]:
# Hash the values of the 'Group' feature into 4 buckets instead of listing a vocabulary.
# NOTE(review): the `diabetes.head()` output near the top of this notebook shows no
# 'Group' column — confirm the CSV provides one before feeding this column to an estimator.
assigned_group = tf.feature_column.categorical_column_with_hash_bucket(
    key='Group', hash_bucket_size=4
)

Feature engineering: convert a numerical column to a categorical column:

Let's visualize the distribution of ages:

In [None]:
# Histogram of ages (20 bins), used to eyeball sensible bucket boundaries.
age_values = diabetes['Age']
age_values.hist(bins=20)

In [None]:
# Turn the numeric `age` column into a bucketized one, with edges every ten years.
age_boundaries = list(range(20, 90, 10))  # 20, 30, ..., 80
age_bucket = tf.feature_column.bucketized_column(
    source_column=age,
    boundaries=age_boundaries,
)

In [None]:
# Feature columns for the linear model.
# `assigned_group` is deliberately left out: it is keyed on a 'Group' feature, but the
# frame displayed at the top of this notebook has shape (768, 9) and no 'Group' column,
# so the estimator would fail on the missing feature during training.
# Age enters only through its bucketized form (`age_bucket`), not the raw `age` column.
feat_cols = [
    num_preg,
    plasma_gluc,
    dias_press,
    tricep,
    insulin,
    bmi,
    diabetes_pedigree,
    age_bucket
]

Let's separate the labels (the `Class` column) from the features:

In [None]:
# Feature matrix: every column except the `Class` target.
x_data = diabetes.drop(columns=['Class'])
x_data.head(n=5)

In [None]:
# Target vector: the `Class` column on its own.
# (The variable keeps its original spelling because later cells refer to it by this name.)
lables = diabetes.loc[:, 'Class']
lables.head(n=5)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out a test set. A fixed `random_state` makes the shuffle — and therefore every
# metric computed from this split — reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    x_data,
    lables,
    random_state=42
)

In [None]:
# Print the shape of each piece of the split on one line.
split_parts = (X_train, X_test, y_train, y_test)
print(*(part.shape for part in split_parts))

In [None]:
# Training input function: shuffled batches of 10 drawn from the training split,
# cycling through the data for up to 1000 epochs.
train_batching = {'batch_size': 10, 'num_epochs': 1000, 'shuffle': True}
input_func = tf.estimator.inputs.pandas_input_fn(x=X_train, y=y_train, **train_batching)

In [None]:
# Binary linear classifier over the feature columns assembled in `feat_cols`.
model = tf.estimator.LinearClassifier(n_classes=2, feature_columns=feat_cols)

In [None]:
# Fit the linear model for 1000 training steps.
model.train(steps=1000, input_fn=input_func)

In [None]:
# Evaluation input function: a single ordered pass over the held-out split.
eval_batching = {'batch_size': 10, 'num_epochs': 1, 'shuffle': False}
eval_input_func = tf.estimator.inputs.pandas_input_fn(x=X_test, y=y_test, **eval_batching)

In [None]:
# Score the linear model on the held-out split and keep the returned metrics.
results = model.evaluate(input_fn=eval_input_func)

In [None]:
# Display the value returned by `model.evaluate` in the previous cell.
results

In [None]:
# Prediction input function: features only (no `y`), one ordered pass over the test split.
predict_batching = {'batch_size': 10, 'num_epochs': 1, 'shuffle': False}
pred_input_func = tf.estimator.inputs.pandas_input_fn(x=X_test, **predict_batching)

In [None]:
# `model.predict` is consumed lazily; collect everything it yields into a list.
predictions = [prediction for prediction in model.predict(input_fn=pred_input_func)]

In [None]:
# Tabulate the collected predictions and show the first five rows.
predictions_frame = pd.DataFrame(predictions)
predictions_frame.head(n=5)

Using a Dense Neural Network instead of a `LinearClassifier`:

In [None]:
# Wrap the hashed `assigned_group` column in a 4-dimensional embedding column.
# NOTE(review): `assigned_group` is keyed on 'Group', which does not appear in the
# `diabetes.head()` output near the top of this notebook — confirm the data provides it.
embedded_group_col = tf.feature_column.embedding_column(
    categorical_column=assigned_group,
    dimension=4,
)

In [None]:
# Feature columns for the dense network.
# The embedded 'Group' column is deliberately left out: the frame displayed at the top of
# this notebook has shape (768, 9) and no 'Group' column, so the estimator would fail on
# the missing feature during training. If such a column is added to the data, include it
# here through `tf.feature_column.embedding_column` rather than as the raw hashed column.
dnn_feat_cols = [
    num_preg,
    plasma_gluc,
    dias_press,
    tricep,
    insulin,
    bmi,
    diabetes_pedigree,
    age_bucket
]

In [None]:
# Training input function for the DNN: shuffled batches of 10 from the training split,
# cycling through the data for up to 1000 epochs.
input_func = tf.estimator.inputs.pandas_input_fn(
    x=X_train,
    y=y_train,
    shuffle=True,
    num_epochs=1000,
    batch_size=10,
)

In [None]:
# Binary classifier with three hidden layers of 10 neurons each.
hidden_layer_sizes = [10] * 3
dnn_model = tf.estimator.DNNClassifier(
    n_classes=2,
    feature_columns=dnn_feat_cols,
    hidden_units=hidden_layer_sizes,
)

In [None]:
# Fit the dense network for 1000 training steps.
dnn_model.train(steps=1000, input_fn=input_func)

In [None]:
# Evaluation input function for the DNN: a single ordered pass over the held-out split.
eval_input_func = tf.estimator.inputs.pandas_input_fn(
    x=X_test,
    y=y_test,
    shuffle=False,
    num_epochs=1,
    batch_size=10,
)

In [None]:
# Score the dense network on the held-out split; the cell displays the returned metrics.
dnn_model.evaluate(input_fn=eval_input_func)