In [22]:
# %tensorflow_version 2.x
from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import clear_output
from six.moves import urllib

import tensorflow.compat.v2.feature_column as fc
# import tensorflow.feature_column as fc
# import tensorflow.keras.utils.

import tensorflow as tf

In [23]:
# Training split: read the features, then split the 'survived' column off as the label series.
dftrain = pd.read_csv('https://storage.googleapis.com/tf-datasets/titanic/train.csv')
y_train = dftrain.pop('survived')

# Evaluation split, handled the same way.
dfeval = pd.read_csv('https://storage.googleapis.com/tf-datasets/titanic/eval.csv')
y_eval = dfeval.pop('survived')

In [24]:
# Names of the dataframe columns, grouped by how they are encoded for the model.
categorical_columns = [
    'sex', 'n_siblings_spouses', 'parch', 'class',
    'deck', 'embark_town', 'alone',
]
numerical_columns = ['age', 'fare']

feature_columns = []

# Categorical features: the vocabulary is every distinct value seen in the training data.
for feature_name in categorical_columns:
    vocabulary = dftrain[feature_name].unique()
    feature_columns.append(
        tf.feature_column.categorical_column_with_vocabulary_list(
            key=feature_name, vocabulary_list=vocabulary))

# Numeric features are declared as float32 columns.
for feature_name in numerical_columns:
    feature_columns.append(
        tf.feature_column.numeric_column(key=feature_name, dtype=tf.float32))

feature_columns

[VocabularyListCategoricalColumn(key='sex', vocabulary_list=('male', 'female'), dtype=tf.string, default_value=-1, num_oov_buckets=0),
 VocabularyListCategoricalColumn(key='n_siblings_spouses', vocabulary_list=(1, 0, 3, 4, 2, 5, 8), dtype=tf.int64, default_value=-1, num_oov_buckets=0),
 VocabularyListCategoricalColumn(key='parch', vocabulary_list=(0, 1, 2, 5, 3, 4), dtype=tf.int64, default_value=-1, num_oov_buckets=0),
 VocabularyListCategoricalColumn(key='class', vocabulary_list=('Third', 'First', 'Second'), dtype=tf.string, default_value=-1, num_oov_buckets=0),
 VocabularyListCategoricalColumn(key='deck', vocabulary_list=('unknown', 'C', 'G', 'A', 'B', 'D', 'F', 'E'), dtype=tf.string, default_value=-1, num_oov_buckets=0),
 VocabularyListCategoricalColumn(key='embark_town', vocabulary_list=('Southampton', 'Cherbourg', 'Queenstown', 'unknown'), dtype=tf.string, default_value=-1, num_oov_buckets=0),
 VocabularyListCategoricalColumn(key='alone', vocabulary_list=('n', 'y'), dtype=tf.strin

## Training Process
- Could do all data at once, but that breaks for large datasets
- Instead, do batches. In this case, batches of 32. Batches of 1 would be slower
- Epochs: How many times the model will see the same data
    - First time: kinda works, not a great fit
    - Second time: improved fit
    - Multiple times: best fit
    - Overfitting: if a model is shown the same data too many times, it can essentially memorize the training data and then generalize poorly to data it has not seen

### Input Function
- Input Function: function that decides how we divide the training data and send it to the training function

In [25]:
def make_input_fn(data_df, label_df, num_epochs=10, shuffle=True, batch_size=32, shuffle_buffer_size=1000):
    """Build an input function that produces batched (features, labels) data.

    Args:
        data_df: Feature table. It is converted with ``dict(data_df)``, so each
            column becomes one named entry of the features dictionary.
        label_df: Labels, sliced in step with the rows of ``data_df``.
        num_epochs: How many times the batched dataset is repeated.
        shuffle: If true, the examples are shuffled before batching.
        batch_size: Number of examples per batch.
        shuffle_buffer_size: Buffer size handed to ``Dataset.shuffle``. The
            default of 1000 is the value that used to be hard-coded here.

    Returns:
        A zero-argument function. Nothing is built until it is called; each
        call creates and returns a new ``tf.data.Dataset``.
    """
    def input_function(): # inner function to return
        # Pair the features dict with the labels, one dataset element per row.
        ds = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
        if shuffle:
            ds = ds.shuffle(shuffle_buffer_size) # randomize order of data
        # Split into batches, then repeat the batched data for the requested number of epochs.
        return ds.batch(batch_size).repeat(num_epochs)
    return input_function

# make_input_fn returns a function, not a dataset: the dataset is only built when that function is called.
# The functions themselves are what get handed to the estimator's train/evaluate/predict methods.
train_input_fn = make_input_fn(dftrain, y_train) # training input function, using the defaults: 10 epochs, shuffled, batches of 32
eval_input_fn = make_input_fn(dfeval, y_eval, num_epochs=1, shuffle=False) # evaluation input function: a single pass in the original row order

# Creating the Model

In [26]:
# Linear classifier over the feature columns declared earlier in the notebook.
linear_est = tf.estimator.LinearClassifier(feature_columns=feature_columns)

# Fit on the training input function, then score on the evaluation one.
linear_est.train(input_fn=train_input_fn)
result = linear_est.evaluate(input_fn=eval_input_fn)
# clear_output()  # uncomment to wipe the training log from the cell output
result['accuracy']

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\miraf\\AppData\\Local\\Temp\\tmpxjx7cwd_', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorf

0.75

In [27]:
# Show every metric in the dict returned by linear_est.evaluate, not just the accuracy
result

{'accuracy': 0.75,
 'accuracy_baseline': 0.625,
 'auc': 0.8249159,
 'auc_precision_recall': 0.78838,
 'average_loss': 0.5348224,
 'label/mean': 0.375,
 'loss': 0.5323756,
 'precision': 0.6386555,
 'prediction/mean': 0.49597764,
 'recall': 0.7676768,
 'global_step': 200}

- So far, we have created and trained the model.

### Using the model
- The estimator is designed to predict on large batches of data, not on single data points.

In [28]:
# NOTE: this rebinds `result` - it previously held the evaluation-metrics dict and now holds the predictions.
# list() collects everything predict() produces so the predictions can be indexed below.
result = list(linear_est.predict(eval_input_fn))
result

INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\miraf\AppData\Local\Temp\tmpxjx7cwd_\model.ckpt-200
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


[{'logits': array([-1.7670368], dtype=float32),
  'logistic': array([0.14591122], dtype=float32),
  'probabilities': array([0.8540888 , 0.14591123], dtype=float32),
  'class_ids': array([0], dtype=int64),
  'classes': array([b'0'], dtype=object),
  'all_class_ids': array([0, 1]),
  'all_classes': array([b'0', b'1'], dtype=object)},
 {'logits': array([0.6961032], dtype=float32),
  'logistic': array([0.66732323], dtype=float32),
  'probabilities': array([0.33267674, 0.66732323], dtype=float32),
  'class_ids': array([1], dtype=int64),
  'classes': array([b'1'], dtype=object),
  'all_class_ids': array([0, 1]),
  'all_classes': array([b'0', b'1'], dtype=object)},
 {'logits': array([2.2766676], dtype=float32),
  'logistic': array([0.90692616], dtype=float32),
  'probabilities': array([0.09307386, 0.90692616], dtype=float32),
  'class_ids': array([1], dtype=int64),
  'classes': array([b'1'], dtype=object),
  'all_class_ids': array([0, 1]),
  'all_classes': array([b'0', b'1'], dtype=object)},


- we get a dictionary of prediction info for each data point
- need to look at each dict to figure out what the prediction was
- the probabilities value is what we want

In [29]:
# eval_input_fn was built with shuffle=False, so result[0] presumably lines up with the first row of dfeval - TODO confirm
result[0]
# dictionary for 1 prediction

{'logits': array([-1.7670368], dtype=float32),
 'logistic': array([0.14591122], dtype=float32),
 'probabilities': array([0.8540888 , 0.14591123], dtype=float32),
 'class_ids': array([0], dtype=int64),
 'classes': array([b'0'], dtype=object),
 'all_class_ids': array([0, 1]),
 'all_classes': array([b'0', b'1'], dtype=object)}

- The predicted probability of not surviving is 0.854; the predicted probability of surviving is 0.146

In [31]:
# 'probabilities' holds one value per class id (0 and 1); index 1 is class 1, i.e. survived == 1
result[0]['probabilities'][1] # survival chance prediction

0.14591123

Does a survival chance of about 15% (0.146) make sense?

In [32]:
# Feature values of the evaluation passenger with index label 0, to sanity-check the predicted probability
dfeval.loc[0]

sex                          male
age                          35.0
n_siblings_spouses              0
parch                           0
fare                         8.05
class                       Third
deck                      unknown
embark_town           Southampton
alone                           y
Name: 0, dtype: object

A survival chance of about 15% (0.146) for a 35-year-old, third-class male travelling alone (no siblings or spouse aboard) sounds right
- we can check some more people

In [33]:
# Actual label for the passenger with index label 0 (the 'survived' column popped from dfeval)
y_eval.loc[0] # 0 -> yes, this passenger did not survive

0