In [10]:
import tensorflow as tf
import pandas as pd


In [11]:
#Step 1. Get the Data : Import and parse the data sets
#Step 2. Feature Engineering : Create feature columns to describe the data
#Step 3. Select ML Model : Select the type of model
#Step 4. Train your model : Train the model
#Step 5. Evaluate the model : Evaluate the model's effectiveness
#Step 6. Roll out and let the model make predictions: Let the trained model make predictions

In [12]:
# Define the training inputs
def get_train_inputs(batch_size, mnist_data):
    """Return the input function to get the training data.
    Args:
        batch_size (int): Batch size of training iterator that is returned
                          by the input function.
        mnist_data (Object): Object holding the loaded mnist data.
    Returns:
        (Input function, IteratorInitializerHook):
            - Function that returns (features, labels) when called.
            - Hook to initialise input iterator.
    """
    iterator_initializer_hook = IteratorInitializerHook()

    def train_inputs():
        """Returns training set as Operations.
        Returns:
            (features, labels) Operations that iterate over the dataset
            on every evaluation
        """
        with tf.name_scope('Training_data'):
            # Get Mnist data
            images = mnist_data.train.images.reshape([-1, 28, 28, 1])
            labels = mnist_data.train.labels
            # Define placeholders
            images_placeholder = tf.placeholder(
                images.dtype, images.shape)
            labels_placeholder = tf.placeholder(
                labels.dtype, labels.shape)
            # Build dataset iterator
            dataset = tf.contrib.data.Dataset.from_tensor_slices(
                (images_placeholder, labels_placeholder))
            dataset = dataset.repeat(None)  # Infinite iterations
            dataset = dataset.shuffle(buffer_size=10000)
            dataset = dataset.batch(batch_size)
            iterator = dataset.make_initializable_iterator()
            next_example, next_label = iterator.get_next()
            # Set runhook to initialize iterator
            iterator_initializer_hook.iterator_initializer_func = \
                lambda sess: sess.run(
                    iterator.initializer,
                    feed_dict={images_placeholder: images,
                               labels_placeholder: labels})
            # Return batched (features, labels)
            return next_example, next_label

    # Return function and hook
    return train_inputs, iterator_initializer_hook

In [138]:
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

In [139]:
print(train_df.columns.values)

['1' '0' '3' 'Braund, Mr. Owen Harris' 'male' '22' '1.1' '0.1' 'A/5 21171'
 '7.25' 'Unnamed: 10' 'S']


In [140]:
dataset = tf.data.TextLineDataset('./train.csv')


In [141]:
_CSV_COLUMNS = [
    'PassengerId','Survived','Pclass','Name','Sex','Age','SibSp','Parch','Ticket','Fare','Cabin','Embarked'
]

In [142]:
print(dataset)

<TextLineDataset shapes: (), types: tf.string>


In [237]:
def input_fn(data_file, num_epochs, shuffle, batch_size,test):
  """Generate an input function for the Estimator."""
  assert tf.gfile.Exists(data_file), (
      '%s not found. Please make sure you have either run data_download.py or '
      'set both arguments --train_data and --test_data.' % data_file)

  def parse_csv_test(value):
    print('Parsing', data_file)
    columns = tf.decode_csv(value, record_defaults= [ [''],['Pclass'],[''],[''],[-1.0],[-1],[-1],[''],[0.0],[''],['']])
    features = dict(zip(_CSV_COLUMNS, columns))
    print(str(features.keys))
    return features

    
  def parse_csv(value):
    print('Parsing', data_file)
    columns = tf.decode_csv(value, record_defaults= [ [''],[0],['Pclass'],[''],[''],[-1.0],[-1],[-1],[''],[0.0],[''],['']])
    features = dict(zip(_CSV_COLUMNS, columns))
    print(str(features.keys))
    labels = features.pop('Survived')
    return features, labels

  # Extract lines from input files using the Dataset API.
  dataset = tf.data.TextLineDataset(data_file).skip(1)

  if test==False:
    dataset = dataset.map(parse_csv)
  else:
    dataset = dataset.map(parse_csv_test)
    

  # We call repeat after shuffling, rather than before, to prevent separate
  # epochs from blending together.
  dataset = dataset.repeat(num_epochs)
  dataset = dataset.batch(batch_size)

  iterator = dataset.make_one_shot_iterator()
  features, labels = iterator.get_next()
  return features, labels

In [238]:
#Questions :
#1. Why do you need epochs when you are reading a file ?
#2. Why do you need batch size when reading the file ?


In [239]:
features, labels = input_fn('./train.csv', 10, False, 10, False)

('Parsing', './train.csv')
<built-in method keys of dict object at 0x1813a7c910>


In [240]:
classifier = tf.estimator.LinearClassifier(
    feature_columns=[
    tf.feature_column.categorical_column_with_hash_bucket('Pclass',hash_bucket_size=50 ),
    tf.feature_column.categorical_column_with_hash_bucket('Sex', hash_bucket_size=2),
    tf.feature_column.numeric_column('Age'),
    tf.feature_column.numeric_column('SibSp'),
    tf.feature_column.numeric_column('Parch'),
    tf.feature_column.numeric_column('Fare'),
    tf.feature_column.categorical_column_with_hash_bucket('Cabin', hash_bucket_size=4),
    tf.feature_column.categorical_column_with_hash_bucket('Embarked', hash_bucket_size=3)
])

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x181265a050>, '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_num_ps_replicas': 0, '_tf_random_seed': None, '_master': '', '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_model_dir': '/var/folders/yk/xmj889zj7q353d_cn3k5kqrr9h03ws/T/tmp7A_Phw', '_save_summary_steps': 100}


In [243]:
classifier.train(input_fn=lambda: input_fn('./train.csv', 10, False, 10, False), steps=10000)

('Parsing', './train.csv')
<built-in method keys of dict object at 0x1812075e88>
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from /var/folders/yk/xmj889zj7q353d_cn3k5kqrr9h03ws/T/tmp7A_Phw/model.ckpt-891
INFO:tensorflow:Saving checkpoints for 892 into /var/folders/yk/xmj889zj7q353d_cn3k5kqrr9h03ws/T/tmp7A_Phw/model.ckpt.
INFO:tensorflow:loss = 2.2212, step = 892
INFO:tensorflow:global_step/sec: 126.836
INFO:tensorflow:loss = 5.61128, step = 992 (0.789 sec)
INFO:tensorflow:global_step/sec: 313.214
INFO:tensorflow:loss = 5.922, step = 1092 (0.319 sec)
INFO:tensorflow:global_step/sec: 316.948
INFO:tensorflow:loss = 3.21539, step = 1192 (0.315 sec)
INFO:tensorflow:global_step/sec: 314.835
INFO:tensorflow:loss = 4.75093, step = 1292 (0.318 sec)
INFO:tensorflow:global_step/sec: 319.577
INFO:tensorflow:loss = 6.59031, step = 1392 (0.313 sec)
INFO:tensorflow:global_step/sec: 313.121
INFO:tensorflow:loss = 7.64318, step = 1492 (0.320 sec)
INFO:tensorflow:glo

<tensorflow.python.estimator.canned.linear.LinearClassifier at 0x1812ee4050>

In [246]:
results = classifier.evaluate(input_fn=lambda: input_fn('./test.csv', 10, False, 10, True))

('Parsing', './test.csv')
<built-in method keys of dict object at 0x1812a177f8>


ValueError: too many values to unpack

In [247]:
pred_iter = classifier.predict(input_fn=lambda: input_fn("./test.csv", 1, False, 1))
for pred in pred_iter:
    print(pred['classes'])


TypeError: input_fn() takes exactly 5 arguments (4 given)