<a href="https://colab.research.google.com/github/namanmeena/learning_opencv2/blob/master/Titanic_TensorFlow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Titanic Survival Prediction using TensorFlow

In [29]:
%%bash

token='' # Put your token
token_dir="/root/.kaggle"
competition="titanic"
dataset_path="kaggle_datasets"
competition_dataset_path="$dataset_path/$competition"

# Removing existing dir
rm -rf $token_dir
rm -rf $competition_dataset_path

# Creating directories
mkdir -p $token_dir
mkdir -p $competition_dataset_path

# Placing kaggle token
echo $token > $token_dir/kaggle.json

# Giving permission to token
chmod 600 $token_dir/kaggle.json

# Downloading dataset using kaggle
kaggle competitions download -c $competition -p $competition_dataset_path

401 - Unauthorized


In [5]:
_data_root = "kaggle_datasets/titanic"
_data_filepath = f"{_data_root}/train.csv"

In [8]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers
import pandas as pd

In [9]:
tf.__version__

'2.3.0'

In [10]:
df = pd.read_csv(_data_filepath)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [11]:
len(df)

891

In [12]:
df = df.dropna(subset=['Embarked'])
len(df)

889

In [13]:
numeric_features = []
categorical_features = []

In [14]:
for i in ["Survived", "Pclass", "Sex", "Age", "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked"]:
    column = list(df[i].unique())
    if len(column) > 10:
        print(f"{i} - {len(column)}")
    else:
        print(f"{i} - {len(column)} - {column}")
    

Survived - 2 - [0, 1]
Pclass - 3 - [3, 1, 2]
Sex - 2 - ['male', 'female']
Age - 89
SibSp - 7 - [1, 0, 3, 4, 2, 5, 8]
Parch - 7 - [0, 1, 2, 5, 3, 4, 6]
Ticket - 680
Fare - 247
Cabin - 147
Embarked - 3 - ['S', 'C', 'Q']


In [15]:
numeric_features.extend(["Age", "SibSp", "Parch", ""])

#### For more feature columns information, read the [tensorflow-docs](https://www.tensorflow.org/tutorials/structured_data/feature_columns)

In [16]:
def get_categorical_col(col, unique_values):
    return tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(col, unique_values)
    )

def get_embedding_col(col, unique_values, dim):
    return tf.feature_column.embedding_column(
        tf.feature_column.categorical_column_with_vocabulary_list(col, unique_values),
        dimension = dim
    )


In [17]:
feature_column = []

In [18]:
for i in ["Pclass", "Sex", "Embarked"]:
    feature_column.append(get_categorical_col(i, df[i].unique()))

In [19]:
for i in ["Parch", "SibSp"]:
    feature_column.append(get_embedding_col(i, df[i].unique(), 3))

In [20]:
cols_to_select = ["Pclass", "Sex", "Embarked", "Parch", "SibSp", "target"]

In [21]:
feature_column

[IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='Pclass', vocabulary_list=(3, 1, 2), dtype=tf.int64, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='Sex', vocabulary_list=('male', 'female'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='Embarked', vocabulary_list=('S', 'C', 'Q'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 EmbeddingColumn(categorical_column=VocabularyListCategoricalColumn(key='Parch', vocabulary_list=(0, 1, 2, 5, 3, 4, 6), dtype=tf.int64, default_value=-1, num_oov_buckets=0), dimension=3, combiner='mean', initializer=<tensorflow.python.ops.init_ops.TruncatedNormal object at 0x7ff9247d8080>, ckpt_to_load_from=None, tensor_name_in_ckpt=None, max_norm=None, trainable=True, use_safe_embedding_lookup=True),
 EmbeddingColumn(categorical_column=VocabularyListCategoricalColumn(key='SibSp', vocabu

In [22]:
feature_layer = tf.keras.layers.DenseFeatures(feature_column)

In [23]:
df['target'] = df['Survived']

In [24]:
train, test = train_test_split(df[cols_to_select], test_size=0.2)
train, val = train_test_split(train, test_size=0.2)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

568 train examples
143 validation examples
178 test examples


In [25]:
# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, shuffle=True, batch_size=64):
    dataframe = dataframe.copy()
    labels = dataframe.pop('target')
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    return ds

In [26]:
batch_size = 5 # A small batch sized is used for demonstration purposes
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

In [27]:
model = tf.keras.Sequential([
    feature_layer,
    layers.Dense(128, activation='relu'),
    layers.Dropout(.5),
    layers.Dense(256, activation='relu'),
    layers.Dropout(.3),
    layers.Dense(2, activation='softmax')
])

model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(),
              metrics=['accuracy'])

model.fit(train_ds,
          validation_data=val_ds,
          epochs=50)

Epoch 1/50
Consider rewriting this model with the Functional API.
Consider rewriting this model with the Functional API.
Consider rewriting this model with the Functional API.
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7ff9105066d8>

In [28]:
loss, accuracy = model.evaluate(test_ds)
print("Accuracy", accuracy)

Accuracy 0.7415730357170105
