In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are read from the "../data/" directory in this notebook (the stock Kaggle location is "../input/").
# For example, listing that directory with `os.listdir` (by clicking run or pressing Shift+Enter) will show the files in the input directory

import os


# Any results you write to the current directory are saved as output.

In [2]:
# Directory holding the competition CSV files, relative to the notebook.
input_dir = '../data/'

# Read the training set and show ten randomly chosen rows as a quick look at
# the columns and their typical values.
train_csv = pd.read_csv(os.path.join(input_dir, 'train.csv'))
train_csv.sample(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
307,308,1,1,"Penasco y Castellana, Mrs. Victor de Satode (M...",female,17.0,1,0,PC 17758,108.9,C65,C
510,511,1,3,"Daly, Mr. Eugene Patrick",male,29.0,0,0,382651,7.75,,Q
200,201,0,3,"Vande Walle, Mr. Nestor Cyriel",male,28.0,0,0,345770,9.5,,S
533,534,1,3,"Peter, Mrs. Catherine (Catherine Rizk)",female,,0,2,2668,22.3583,,C
649,650,1,3,"Stanley, Miss. Amy Zillah Elsie",female,23.0,0,0,CA. 2314,7.55,,S
623,624,0,3,"Hansen, Mr. Henry Damsgaard",male,21.0,0,0,350029,7.8542,,S
279,280,1,3,"Abbott, Mrs. Stanton (Rosa Hunt)",female,35.0,1,1,C.A. 2673,20.25,,S
824,825,0,3,"Panula, Master. Urho Abraham",male,2.0,4,1,3101295,39.6875,,S
656,657,0,3,"Radeff, Mr. Alexander",male,,0,0,349223,7.8958,,S
402,403,0,3,"Jussila, Miss. Mari Aina",female,21.0,1,0,4137,9.825,,S


In [3]:
# Number of distinct values in every column of the training set.
# dropna=False counts NaN as a value of its own, which matches the previous
# len(Series.unique()) behaviour (e.g. Embarked reports its 3 ports + NaN = 4).
# Using nunique() also avoids creating an empty, dtype-less pd.Series() and
# growing it one element at a time.
unique_count_df = train_csv.nunique(dropna=False)
unique_count_df

PassengerId    891
Survived         2
Pclass           3
Name           891
Sex              2
Age             89
SibSp            7
Parch            7
Ticket         681
Fare           248
Cabin          148
Embarked         4
dtype: int64

In [4]:
import tensorflow as tf

In [5]:
def split_data(df: pd.DataFrame, train_frac: float = 0.8, random_state=None):
    """Shuffle ``df`` and split it into a training part and a validation part.

    Args:
        df: Frame to split. It is not modified.
        train_frac: Fraction of the rows that goes to the training part. The
            resulting row count is truncated with ``int()``. Defaults to 0.8.
        random_state: Forwarded to ``DataFrame.sample``. With the default
            ``None`` every call shuffles differently; pass an int to get the
            same split on every run.

    Returns:
        A ``(train_data, val_data)`` tuple. Both parts are slices of one
        shuffled frame whose index was reset, so the training part is labelled
        ``0..k-1`` and the validation part continues from ``k``.

    Raises:
        ValueError: If ``train_frac`` is not within ``[0, 1]``.
    """
    if not 0.0 <= train_frac <= 1.0:
        raise ValueError('train_frac must be between 0 and 1, got %r' % (train_frac,))

    # frac=1 draws every row exactly once, i.e. a full shuffle.
    df_shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    num_rows_train_data = int(len(df_shuffled) * train_frac)

    train_data = df_shuffled.iloc[:num_rows_train_data]
    val_data = df_shuffled.iloc[num_rows_train_data:]

    return (train_data, val_data)

def input_fn(df: pd.DataFrame, labels, batch_size: int, num_epochs: int):
    """Build the shuffled, repeated and batched dataset used for training.

    Args:
        df: Feature frame; each column becomes one entry of the feature dict.
        labels: Targets sliced alongside the features, or ``None`` to emit
            features only.
        batch_size: Number of examples per batch.
        num_epochs: Passed to ``Dataset.repeat`` as ``count``.

    Returns:
        A ``tf.data.Dataset`` that is shuffled (buffer of 10000), repeated and
        then batched, in that order.
    """
    columns = df.to_dict(orient='series')
    slices = columns if labels is None else (columns, labels)

    dataset = tf.data.Dataset.from_tensor_slices(slices)
    dataset = dataset.shuffle(buffer_size=10000)
    dataset = dataset.repeat(count=num_epochs)
    return dataset.batch(batch_size)

def eval_input_fn(df: pd.DataFrame, labels):
    """Build the dataset used for evaluation and prediction.

    Unlike ``input_fn`` there is no shuffle and no repeat call: rows are
    emitted once, in frame order, in batches of 128.

    Args:
        df: Feature frame; each column becomes one entry of the feature dict.
        labels: Targets sliced alongside the features, or ``None`` to emit
            features only (prediction).

    Returns:
        A batched ``tf.data.Dataset``.
    """
    columns = df.to_dict(orient='series')
    if labels is not None:
        slices = (columns, labels)
    else:
        slices = columns
    return tf.data.Dataset.from_tensor_slices(slices).batch(128)

def embedding_dimension(unique_count):
    """Return the embedding width for a vocabulary of ``unique_count`` entries.

    The width is half the vocabulary size (floor division), capped at 50.
    """
    half = unique_count // 2
    return half if half < 50 else 50

def define_feature_columns(df: pd.DataFrame, numeric_columns, categorical_columns):
    """Create one TensorFlow feature column per model input found in ``df``.

    Columns of ``df`` that appear in neither list are skipped.

    Args:
        df: Preprocessed feature frame; its column order drives the result order.
        numeric_columns: Names fed as numeric columns. Each also gets a
            companion numeric column named ``<name>IsNull``.
        categorical_columns: Names fed as vocabulary-based categorical columns.
            The vocabulary is the sorted set of values in ``df`` plus ``''``.

    Returns:
        Dict mapping input name to its feature column. Categorical inputs use
        an indicator column when ``embedding_dimension`` gives 2 or less, and
        an embedding column of that width otherwise.
    """
    columns_by_name = {}
    for name in df.columns:
        if name in categorical_columns:
            # '' is always part of the vocabulary, even if absent from df.
            vocabulary = sorted({*df[name].unique(), ''})
            categorical = tf.feature_column.categorical_column_with_vocabulary_list(
                key=name, vocabulary_list=vocabulary)
            width = embedding_dimension(len(vocabulary))
            if width > 2:
                columns_by_name[name] = tf.feature_column.embedding_column(categorical, width)
            else:
                columns_by_name[name] = tf.feature_column.indicator_column(categorical)
        elif name in numeric_columns:
            null_flag_name = name + 'IsNull'
            columns_by_name[name] = tf.feature_column.numeric_column(key=name)
            columns_by_name[null_flag_name] = tf.feature_column.numeric_column(key=null_flag_name)
    return columns_by_name

def feature_preprocess(df: pd.DataFrame, numeric_columns, categorical_columns, col_mean, col_stddev):
    """Turn a raw passenger frame into model-ready features.

    Args:
        df: Raw frame. Must contain ``PassengerId`` and every listed column.
        numeric_columns: Columns to standardize as ``(x - mean) / stddev``.
        categorical_columns: Columns to convert to strings.
        col_mean: Per-column means, indexable by column name.
        col_stddev: Per-column standard deviations, indexable by column name.

    Returns:
        A new frame on ``df``'s index containing ``PassengerId``; for each
        numeric column the standardized value (0 where the input was missing)
        followed by an int8 ``<name>IsNull`` flag; and for each categorical
        column its string form (``''`` where the input was missing).
    """
    processed = pd.DataFrame()
    processed['PassengerId'] = df['PassengerId']

    for col_name in numeric_columns:
        is_missing = df[col_name].isnull()

        # A constant column has stddev 0 and a single-row sample has stddev
        # NaN. Dividing by either would yield NaN/inf features that the IsNull
        # flag does not mark, so fall back to a scale of 1 (centering only).
        scale = col_stddev[col_name]
        if pd.isnull(scale) or scale == 0:
            scale = 1.0

        standardized = (df[col_name].astype(float) - col_mean[col_name]) / scale
        # Missing inputs become 0; the flag column carries the fact that they
        # were missing.
        processed[col_name] = standardized.mask(is_missing, 0.0)
        processed[col_name + 'IsNull'] = is_missing.astype(np.int8)

    for col_name in categorical_columns:
        # astype(str) renders missing values as text such as 'nan', so they
        # are blanked afterwards using the mask from the raw column.
        as_text = df[col_name].astype(str)
        processed[col_name] = as_text.mask(df[col_name].isnull(), '')

    return processed

In [6]:
input_dir = '../data/'

# Re-read both CSVs and derive Deck as the first character of the Cabin code
# (rows without a cabin keep a missing Deck).
train_csv = pd.read_csv(input_dir + 'train.csv').assign(
    Deck=lambda frame: frame['Cabin'].str[0])
test_csv = pd.read_csv(input_dir + 'test.csv').assign(
    Deck=lambda frame: frame['Cabin'].str[0])

# Hold out part of the labelled data for validation and report both sizes.
train_data, val_data = split_data(train_csv)
print(train_data.shape, val_data.shape)

(712, 13) (179, 13)


In [7]:
# Columns that are standardized and given a companion "<name>IsNull" flag.
numeric_columns = ['Age', 'SibSp', 'Parch', 'Fare']
# Columns that are converted to strings and fed through a vocabulary lookup.
categorical_columns = ['Pclass', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Deck']

# Standardization statistics are taken from the training split only; the same
# values are reused for the validation and test frames.
col_mean = train_data[numeric_columns].mean()
col_stddev = train_data[numeric_columns].std()

def feature_preprocess_1(df: pd.DataFrame):
    """Apply ``feature_preprocess`` to ``df`` using this notebook's settings.

    The column lists and the mean/stddev statistics are read from the
    module-level variables at call time.
    """
    shared_settings = dict(
        numeric_columns=numeric_columns,
        categorical_columns=categorical_columns,
        col_mean=col_mean,
        col_stddev=col_stddev,
    )
    return feature_preprocess(df, **shared_settings)

# Preprocess both splits with the same (training-derived) statistics.
train_features, val_features = map(feature_preprocess_1, (train_data, val_data))

print(train_features.head())

# Vocabularies for the categorical inputs are taken from the training features.
feature_columns = define_feature_columns(
    train_features, numeric_columns=numeric_columns, categorical_columns=categorical_columns)

# Binary classifier: three hidden layers (64, 32, 16 units) with 50% dropout.
classifier = tf.estimator.DNNClassifier(
    feature_columns=[*feature_columns.values()],
    hidden_units=[64, 32, 16],
    n_classes=2,
    dropout=0.5)

   PassengerId       Age  AgeIsNull     SibSp  SibSpIsNull     Parch  \
0           83  0.000000          1 -0.466103            0 -0.476682   
1          204  1.063727          0 -0.466103            0 -0.476682   
2          775  1.647992          0  0.404934            0  3.225839   
3          738  0.341988          0 -0.466103            0 -0.476682   
4          414  0.000000          1 -0.466103            0 -0.476682   

   ParchIsNull      Fare  FareIsNull Pclass     Sex    Ticket Cabin Embarked  \
0            0 -0.483357           0      3  female    330932              Q   
1            0 -0.494085           0      3    male      2628              C   
2            0 -0.193224           0      2  female     29105              S   
3            0  9.139273           0      1    male  PC 17755  B101        C   
4            0 -0.631880           0      2    male    239853              S   

  Deck  
0       
1       
2       
3    B  
4       
INFO:tensorflow:Using default co

In [8]:
# Train on the training split: batches of 64, data repeated for 500 epochs.
# The lambda defers dataset construction until the estimator calls it.
# train() is the last expression, so its return value is shown as the cell output.
classifier.train(input_fn=lambda: input_fn(train_features, train_data['Survived'], batch_size=64, num_epochs=500))

Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.
INFO:tensorflow:Calling model_fn.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


Instructions for updating:
Use `tf.cast` instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into C:\Users\HPE\AppData\Local\Temp\tmpdlymsfc4\model.ckpt.
INFO:tensorflow:loss = 45.10874, step = 1
INFO:tensorflow:global_step/sec: 299.168
INFO:tensorflow:loss = 2.2042937, step = 101 (0.335 sec)
INFO:tensorflow:global_step/sec: 593.298
INFO:tensorflow:loss = 4.243692, step = 201 (0.169 sec)
INFO:tensorflow:global_step/sec: 600.404
INFO:tensorflow:loss = 0.4422749, step = 301 (0.167 sec)
INFO:tensorflow:global_step/sec: 604.02
INFO:tensorflow:loss = 0.60483193, step = 401 (0.166 sec)
INFO:tensorflow:global_step/sec: 596.831
INFO:tensorflow:loss = 0.6782328, step = 501 (0.168 s

INFO:tensorflow:global_step/sec: 544.414
INFO:tensorflow:loss = 2.2616625, step = 4301 (0.185 sec)
INFO:tensorflow:global_step/sec: 600.401
INFO:tensorflow:loss = 0.63568264, step = 4401 (0.167 sec)
INFO:tensorflow:global_step/sec: 600.408
INFO:tensorflow:loss = 1.1261826, step = 4501 (0.167 sec)
INFO:tensorflow:global_step/sec: 600.403
INFO:tensorflow:loss = 8.074184e-05, step = 4601 (0.167 sec)
INFO:tensorflow:global_step/sec: 661.598
INFO:tensorflow:loss = 7.696128e-07, step = 4701 (0.150 sec)
INFO:tensorflow:global_step/sec: 582.955
INFO:tensorflow:loss = 0.007832676, step = 4801 (0.174 sec)
INFO:tensorflow:global_step/sec: 681.22
INFO:tensorflow:loss = 0.6298303, step = 4901 (0.145 sec)
INFO:tensorflow:global_step/sec: 579.001
INFO:tensorflow:loss = 0.18783337, step = 5001 (0.174 sec)
INFO:tensorflow:global_step/sec: 631.931
INFO:tensorflow:loss = 1.0813136, step = 5101 (0.158 sec)
INFO:tensorflow:global_step/sec: 621.337
INFO:tensorflow:loss = 1.4457684, step = 5201 (0.160 sec)
I

<tensorflow_estimator.python.estimator.canned.dnn.DNNClassifier at 0x1f0b6f7ae48>

In [9]:
# Score the trained model on the held-out validation split (single pass, no
# shuffling); the returned metrics dict is shown as the cell output.
classifier.evaluate(input_fn=lambda: eval_input_fn(val_features, val_data['Survived']))

INFO:tensorflow:Calling model_fn.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-09-24T08:38:53Z
INFO:tensorflow:Graph was finalized.
Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from C:\Users\HPE\AppData\Local\Temp\tmpdlymsfc4\model.ckpt-5563
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-09-24-08:38:54
INFO:tensorflow:Saving dict for global step 5563: accuracy = 0.7821229, accuracy_baseline = 0.59217876, auc = 0.79329276, auc_precision_recall = 0.79888445, average_loss = 5.063408, global_step = 5563, label/mean = 0.40782124, loss = 453.175, precision = 0.7297297, prediction/mean = 0.41330647, recall = 0.739726
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 5563: C:\Users\HPE\AppData\Local\Temp\tmpd

{'accuracy': 0.7821229,
 'accuracy_baseline': 0.59217876,
 'auc': 0.79329276,
 'auc_precision_recall': 0.79888445,
 'average_loss': 5.063408,
 'label/mean': 0.40782124,
 'loss': 453.175,
 'precision': 0.7297297,
 'prediction/mean': 0.41330647,
 'recall': 0.739726,
 'global_step': 5563}

In [10]:
# Preprocess the unlabeled test set exactly like the training data.
submit_data = feature_preprocess_1(test_csv)

# predict() is consumed into a list right away so the results can be indexed
# and reused by later cells.
predictions = list(
    classifier.predict(input_fn=lambda: eval_input_fn(submit_data, None)))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\HPE\AppData\Local\Temp\tmpdlymsfc4\model.ckpt-5563
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [13]:
# Keep only the first entry of each prediction's 'class_ids' value.
pred = [prediction['class_ids'][0] for prediction in predictions]

In [14]:
# Build the submission file: test-set passenger ids paired with predicted labels.
# The test file is read from the same input_dir as everywhere else in the notebook.
X_tmp = pd.read_csv(input_dir + 'test.csv')

# Predictions were generated from test.csv without shuffling, so assigning the
# list positionally lines each label up with its passenger. pandas raises a
# ValueError here if the number of predictions does not match the row count.
X_tmp['Survived'] = pred
sub = X_tmp[['PassengerId', 'Survived']]
sub.to_csv('submissionDNNC.csv', index = False)

# Preview goes last: only a cell's final expression is displayed, so the
# earlier mid-cell `sub.head()` never showed anything.
sub.head()