In [None]:
## Imports

from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import clear_output
from six.moves import urllib

import tensorflow.compat.v2.feature_column as fc
import sklearn 
import tensorflow as tf

In [None]:
# Load the Titanic dataset

TRAIN_CSV_URL = 'https://storage.googleapis.com/tf-datasets/titanic/train.csv'
EVAL_CSV_URL = 'https://storage.googleapis.com/tf-datasets/titanic/eval.csv'

# The data is split across two files so that we can check how the model
# performs on data it has not seen before; it can't just memorize answers.
dftrain = pd.read_csv(TRAIN_CSV_URL)  # training data
dfeval = pd.read_csv(EVAL_CSV_URL)  # testing (evaluation) data

In [None]:
# 'survived' is our output (the label we want to predict), so we take it
# out of both DataFrames and keep it separate from the input features.
# NOTE: pop() removes the column from the DataFrame in place, so re-running
# this cell without reloading the data raises a KeyError.
LABEL_COLUMN = 'survived'
y_train = dftrain.pop(LABEL_COLUMN)
y_eval = dfeval.pop(LABEL_COLUMN)

## **Training and Testing Datasets**

Above we loaded two datasets, one for training and one for testing. 

* We usually train the model on significantly more data than we use to test it 
* The goal is to make predictions on NEW data, so we shouldn't test the model on data it has already seen 

**Categorical vs Numeric Data and Feature Columns**

* *Categorical data* - Data that falls into categories rather than being numeric. Example: male vs. female
  * We have to convert this to numeric data somehow, e.g. we can represent female by 0 and male by 1
* *Numeric data* - number-based data (integers or decimals), e.g. age and fare
* **Feature columns** - what we feed to our model to make predictions

In [None]:
# The two kinds of input columns in the dataset
CATEGORICAL_COLUMNS = ['sex', 'n_siblings_spouses', 'parch', 'class',
                       'deck', 'embark_town', 'alone']
NUMERIC_COLUMNS = ['age', 'fare']

# One categorical feature column per categorical input; its vocabulary is
# every unique value that appears in that column of the training data.
categorical_feature_columns = [
    tf.feature_column.categorical_column_with_vocabulary_list(
        column_name, dftrain[column_name].unique())
    for column_name in CATEGORICAL_COLUMNS
]

# One float32 numeric feature column per numeric input.
numeric_feature_columns = [
    tf.feature_column.numeric_column(column_name, dtype=tf.float32)
    for column_name in NUMERIC_COLUMNS
]

# Categorical columns first, then numeric ones.
feature_columns = categorical_feature_columns + numeric_feature_columns

feature_columns

[VocabularyListCategoricalColumn(key='sex', vocabulary_list=('male', 'female'), dtype=tf.string, default_value=-1, num_oov_buckets=0),
 VocabularyListCategoricalColumn(key='n_siblings_spouses', vocabulary_list=(1, 0, 3, 4, 2, 5, 8), dtype=tf.int64, default_value=-1, num_oov_buckets=0),
 VocabularyListCategoricalColumn(key='parch', vocabulary_list=(0, 1, 2, 5, 3, 4), dtype=tf.int64, default_value=-1, num_oov_buckets=0),
 VocabularyListCategoricalColumn(key='class', vocabulary_list=('Third', 'First', 'Second'), dtype=tf.string, default_value=-1, num_oov_buckets=0),
 VocabularyListCategoricalColumn(key='deck', vocabulary_list=('unknown', 'C', 'G', 'A', 'B', 'D', 'F', 'E'), dtype=tf.string, default_value=-1, num_oov_buckets=0),
 VocabularyListCategoricalColumn(key='embark_town', vocabulary_list=('Southampton', 'Cherbourg', 'Queenstown', 'unknown'), dtype=tf.string, default_value=-1, num_oov_buckets=0),
 VocabularyListCategoricalColumn(key='alone', vocabulary_list=('n', 'y'), dtype=tf.strin

## **Training Process**

Sometimes we have a lot of data, and we don't have enough RAM to process it. We load the data in batches in order to overcome this. 

We give 32 entries at once (faster than 1 at a time). We feed batches multiple times according to number of **epochs**

An **epoch** is one full pass of the training data through the model. With shuffling turned on, each pass feeds the data in a different order, so the model 'sees' the data in a different way
  * Too many epochs can cause the model to overfit, so to prevent this we start with a lower number of epochs and build up to more

To feed our data to the model, we need to create an **input function**

## Input Functions

An input function defines how our data is broken into batches and epochs to feed to our model. You will likely never have to write one from scratch. 

In [None]:
def make_input_fn(data_df, label_df, num_epochs=10, shuffle=True,
                  batch_size=32):
  """Create an input function that turns a DataFrame into a tf.data.Dataset.

  Args:
    data_df: DataFrame holding the input features; it is converted to a
      dict of columns before being sliced.
    label_df: the labels that go with the rows of `data_df`.
    num_epochs: how many times the batched dataset is repeated.
    shuffle: whether to shuffle the elements (buffer size 1000) before
      batching.
    batch_size: number of elements in each batch.

  Returns:
    A function taking no arguments that builds and returns the dataset.
  """
  def input_function():
    features = dict(data_df)
    # Pair the dict of feature columns with the labels, one slice per row.
    dataset = tf.data.Dataset.from_tensor_slices((features, label_df))
    if shuffle:
      dataset = dataset.shuffle(1000)
    # Split into batches first, then repeat the batches once per epoch.
    batched = dataset.batch(batch_size)
    return batched.repeat(num_epochs)
  # Return the function object itself, not the result of calling it.
  return input_function

# Training input: uses the defaults (10 epochs, shuffled, batches of 32).
train_input_fn = make_input_fn(data_df=dftrain, label_df=y_train)
# Evaluation input: a single pass over the data in its original order.
eval_input_fn = make_input_fn(
    data_df=dfeval, label_df=y_eval, num_epochs=1, shuffle=False)

## **Creating the Model**



In [None]:
# Create the model: a linear classifier estimator from the TensorFlow
# library, built on the feature columns defined earlier.
linear_est = tf.estimator.LinearClassifier(feature_columns=feature_columns)

# Train the model, then evaluate it on the evaluation data.
linear_est.train(input_fn=train_input_fn)
result = linear_est.evaluate(input_fn=eval_input_fn)

clear_output()  # clear the console output produced while training

# Notice that if we run this multiple times, the accuracy can change.
# The training data is shuffled on every run, which leads to new results.
result

{'accuracy': 0.77272725,
 'accuracy_baseline': 0.625,
 'auc': 0.833517,
 'auc_precision_recall': 0.792598,
 'average_loss': 0.48287693,
 'global_step': 200,
 'label/mean': 0.375,
 'loss': 0.47366652,
 'precision': 0.71910113,
 'prediction/mean': 0.337282,
 'recall': 0.64646465}

TensorFlow is much better at working with large data sets than with just one point. In this case, that means a set of people rather than one passenger. 

In [None]:
# Let's get the predictions and look at one person
predictions = linear_est.predict(input_fn=eval_input_fn)
result = list(predictions)  # loop through and collect every prediction
clear_output()
result[0]  # the first prediction in the list

{'all_class_ids': array([0, 1], dtype=int32),
 'all_classes': array([b'0', b'1'], dtype=object),
 'class_ids': array([0]),
 'classes': array([b'0'], dtype=object),
 'logistic': array([0.06220323], dtype=float32),
 'logits': array([-2.713126], dtype=float32),
 'probabilities': array([0.9377968 , 0.06220326], dtype=float32)}

Let's look closer at `'probabilities': array([0.9377968 , 0.06220326], dtype=float32)`. This is saying there's a 93.8% chance they don't survive, and a 6.2% chance they do. If there were more classes, the model would split the probability across all of them.  

So, if we wanted to get the specific chance that they survive, we would just do 

`result[0]['probabilities'][1]`

In [None]:
# Look at the passenger behind the first prediction, together with their
# actual label, to see whether the prediction makes sense.
passenger_idx = 0

print(dfeval.loc[passenger_idx])
print(y_eval.loc[passenger_idx])

sex                          male
age                          35.0
n_siblings_spouses              0
parch                           0
fare                         8.05
class                       Third
deck                      unknown
embark_town           Southampton
alone                           y
Name: 0, dtype: object
0


We see this person was male, in third class, and travelling alone with no siblings or spouse aboard, so the model gave them a low chance of surviving. This makes sense, and they didn't survive.

For others, it may not be as accurate. 