Welcome to the year 2912, where your data science skills are needed to solve a cosmic mystery. We've received a transmission from four lightyears away and things aren't looking good.

The Spaceship Titanic was an interstellar passenger liner launched a month ago. With almost 13,000 passengers on board, the vessel set out on its maiden voyage transporting emigrants from our solar system to three newly habitable exoplanets orbiting nearby stars.

While rounding Alpha Centauri en route to its first destination—the torrid 55 Cancri E—the unwary Spaceship Titanic collided with a spacetime anomaly hidden within a dust cloud. Sadly, it met a similar fate as its namesake from 1000 years before. Though the ship stayed intact, almost half of the passengers were transported to an alternate dimension!



To help rescue crews and retrieve the lost passengers, you are challenged to predict which passengers were transported by the anomaly using records recovered from the spaceship’s damaged computer system.

Help save them and change history!

# Imports

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout, Normalization
from tensorflow.keras.models import Sequential, load_model
from tensorflow.data import Dataset
from tensorflow import feature_column
import matplotlib.pyplot as plt
# pip install seaborn
import seaborn as sns
import re
import math
from sklearn.model_selection import train_test_split, StratifiedKFold

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
%config InlineBackend.figure_format = 'retina'
pd.set_option("display.precision", 2)
BASE_DIR = '/kaggle/input/spaceship-titanic'
BASE_DIR_OUTPUT = '/kaggle/working/'

# Facets

In [None]:
# For facets
from IPython.core.display import display, HTML
import base64
!pip install facets-overview
from facets_overview.feature_statistics_generator import FeatureStatisticsGenerator

def displayFacetStatistics(df):
    fsg = FeatureStatisticsGenerator()
    dataframes = [
        {'table': df, 'name': 'trainData'}]
    spaceshipProto = fsg.ProtoFromDataFrames(dataframes)
    protostr = base64.b64encode(spaceshipProto.SerializeToString()).decode("utf-8")

    HTML_TEMPLATE = """<script src="https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js"></script>
            <link rel="import" href="https://raw.githubusercontent.com/PAIR-code/facets/1.0.0/facets-dist/facets-jupyter.html">
            <facets-overview id="elem"></facets-overview>
            <script>
              document.querySelector("#elem").protoInput = "{protostr}";
            </script>"""
    html = HTML_TEMPLATE.format(protostr=protostr)
    display(HTML(html))
    
def displayFacetsDive(df):
    SAMPLE_SIZE = len(df.index) - 1
    df.dropna(how="any", axis=0, inplace=True)
    train_dive = df.to_json(orient='records')
    HTML_TEMPLATE = """<script src="https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js"></script>
            <link rel="import" href="https://raw.githubusercontent.com/PAIR-code/facets/1.0.0/facets-dist/facets-jupyter.html">
            <facets-dive id="elem" height="600"></facets-dive>
            <script>
              var data = {jsonstr};
              document.querySelector("#elem").data = data;
            </script>"""
    html = HTML_TEMPLATE.format(jsonstr=train_dive)
    display(HTML(html))

# Functions

In [132]:
ALL_COLUMNS = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck','Destination', 'HomePlanet', 'Cabin','CryoSleep', 'Transported']
LABEL_COLUMN = ['Transported']
CATEGORICAL_COLUMNS = ['Destination', 'HomePlanet', 'Cabin']
BOOLEAN_COLUMNS = ['CryoSleep']
DROP_COLUMNS = ['Name' , 'VIP']
NUMERICAL_COLUMNS = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
AMENITY_COLUMNS = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']    

def createModel(feature_layer):
    class myModel(tf.keras.Model):
        def __init__(self):
            super(myModel, self).__init__()
            self.feature_layer = feature_layer
            self.normalization = Normalization(mean=0, variance=1)
            self.dense2 = Dense(16, activation='relu')         
            self.dense3 = Dense(1, activation='sigmoid')
            self.dropout1 = Dropout(0.4)

        def call(self, inputs, training=False):
            print(inputs)
            x = self.feature_layer(inputs)
            x = self.normalization(x)
            #x = self.dense1(x)            
            x = self.dense2(x)   
            if(training==False):
                x = self.dropout1(x, training=training)
            return self.dense3(x)

    model = myModel()       

    model.compile(optimizer=tf.keras.optimizers.Adam(1e-3), 
                  loss=tf.keras.losses.BinaryCrossentropy(), 
                  metrics=['accuracy'])
    
    return model

def train_model(model, train_ds):
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='accuracy', patience=10)  
    history = model.fit(train_ds, epochs= 100, callbacks=[early_stopping])
    model.save(os.path.join(BASE_DIR_OUTPUT, 'spaceship_model'), save_format='tf')

def make_predictions(ds):
    
    model = load_model(os.path.join(BASE_DIR_OUTPUT, 'spaceship_model'))
    predictions = model.predict(ds)
    predictions = np.round(predictions)
    predictions = [True if pred == 1 else False for pred in predictions]
    
    passenger_id = np.concatenate(np.array([x['PassengerId'].numpy() for x in ds]), axis=0)
    passenger_id = np.array([x.decode("utf-8") for x in passenger_id])
    passenger_id = np.expand_dims(passenger_id, axis=1)
    preds = np.array(predictions, ndmin=2)
    
    preds = np.concatenate((passenger_id, preds.T), axis=1)
    df = pd.DataFrame(preds)
    predictions_file = os.path.join(BASE_DIR_OUTPUT, 'predictions.csv')
    
    if(os.path.exists(predictions_file)):
        os.remove(predictions_file)
            
    df.to_csv(path_or_buf=predictions_file, header=['PassengerId', 'Transported'], index=False)
        
def df_to_dataset(dataframe, shuffle=True, batch_size=32, test=False):
    dataframe = dataframe.copy()    
    if(not test):
        labels = dataframe.pop(LABEL_COLUMN[0])
        ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    else:
        ds = tf.data.Dataset.from_tensor_slices((dict(dataframe)))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    return ds     

def fill_na(df):
    values = {col: df[col].value_counts().idxmax() for col in df.columns}
    df.fillna(values, inplace=True)
    return df

def createFeatureColumns():
    feature_columns = []
    # numeric cols
    for col in ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']:
        feature_columns.append(feature_column.numeric_column(col))

    # bucketized cols
    age = feature_column.numeric_column('Age')
    age_buckets = feature_column.bucketized_column(age, boundaries=[0, 20, 40])
    feature_columns.append(age_buckets)

    # indicator_columns
    indicator_column_names = ['HomePlanet', 'Destination']
    for col_name in indicator_column_names:
        categorical_column = feature_column.categorical_column_with_vocabulary_list(
            col_name, train_df[col_name].unique())
        indicator_column = feature_column.indicator_column(categorical_column)
        feature_columns.append(indicator_column)

    # embedding columns
    cabin = feature_column.categorical_column_with_vocabulary_list(
        'Cabin', train_df.Cabin.unique())
    cabin_embedding = feature_column.embedding_column(cabin, dimension=8)
    feature_columns.append(cabin_embedding)
    return feature_columns

# Dataset

In [125]:
import warnings
warnings.filterwarnings("ignore")
BASE_DIR = '/kaggle/input/spaceship-titanic'
train_df = pd.read_csv(os.path.join(BASE_DIR, 'train.csv'), index_col=0)
test_df = pd.read_csv(os.path.join(BASE_DIR, 'test.csv'), index_col=0)
train_df = train_df.reset_index()
test_df = test_df.reset_index()
display(train_df)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


# Data Analysis

In [None]:
df = train_df.drop('Transported', axis=1)
displayFacetStatistics(df)

#displayFacetsDive(df)

# Preprocess Data

In [136]:
BASE_DIR = '/kaggle/input/spaceship-titanic'
train_df = pd.read_csv(os.path.join(BASE_DIR, 'train.csv'), index_col=0)
test_df = pd.read_csv(os.path.join(BASE_DIR, 'test.csv'), index_col=0)
train_df = train_df.reset_index()
test_df = test_df.reset_index()
# with tf.dataset
batch_size = 100
#train_df = fill_na(train_df)
#test_df = fill_na(test_df)
train_df[BOOLEAN_COLUMNS] = [1 if x == True else 0 for x in train_df[BOOLEAN_COLUMNS]]
train_df.drop(columns=DROP_COLUMNS, inplace=True)
values={'Cabin': '', 'Destination': '', 'HomePlanet': ''}
train_df = train_df.fillna(values)
display(train_df.info())
test_df = test_df.fillna(method='ffill')

train_ds = df_to_dataset(train_df, batch_size=batch_size)
#test_ds = df_to_dataset(test_df, test=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8693 non-null   object 
 2   CryoSleep     8693 non-null   int64  
 3   Cabin         8693 non-null   object 
 4   Destination   8693 non-null   object 
 5   Age           8514 non-null   float64
 6   RoomService   8512 non-null   float64
 7   FoodCourt     8510 non-null   float64
 8   ShoppingMall  8485 non-null   float64
 9   Spa           8510 non-null   float64
 10  VRDeck        8505 non-null   float64
 11  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), int64(1), object(4)
memory usage: 755.7+ KB


None

In [None]:
print(next(iter(train_ds))[0])

# Create feature columns

In [None]:
feature_columns = createFeatureColumns()
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

# Train Model

In [None]:
model = createModel(feature_layer)
train_model(model, train_ds)

In [None]:
make_predictions(test_ds)

In [None]:
features = feature_layer(next(iter(train_ds))[0])
display(pd.DataFrame(features.numpy()))
display(pd.DataFrame(layer(features.numpy()).numpy()))

In [None]:
os.remove(os.path.join(BASE_DIR_OUTPUT, 'predictions.csv'))

In [None]:
import keras
from matplotlib import pyplot as plt

plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
def pandas_to_numpy(data):
    '''Convert a pandas DataFrame into a Numpy array'''
  # Drop empty rows.
    data = data.dropna(how="any", axis=0)
  # Separate DataFrame into two Numpy arrays.
    labels = np.array(data['Transported'])
    features = data.drop('Transported', axis=1)
    features = {name:np.array(value) for name, value in features.items()}
  
    return features, labels

#@title Visualize Binary Confusion Matrix and Compute Evaluation Metrics Per Subgroup
CATEGORY  =  "HomePlanet" #@param {type:"string"}
SUBGROUP =  1 #@param {type:"int"}

# Labels for annotating axes in plot.
classes = ['Transported', 'Not Transported']

# Given define subgroup, generate predictions and obtain its corresponding 
# ground truth.
subgroup_filter  = train_df.loc[train_df[CATEGORY] == SUBGROUP]
features, labels = pandas_to_numpy(subgroup_filter)

subgroup_results = model.evaluate(x=features, y=labels, verbose=0)
display(subgroup_filter)
confusion_matrix = np.array([[subgroup_results[1], subgroup_results[4]], 
                             [subgroup_results[2], subgroup_results[3]]])

subgroup_performance_metrics = {
    'ACCURACY': subgroup_results[5],
    'PRECISION': subgroup_results[6], 
    'RECALL': subgroup_results[7],
    'AUC': subgroup_results[8]
}
performance_df = pd.DataFrame(subgroup_performance_metrics, index=[SUBGROUP])
pd.options.display.float_format = '{:,.4f}'.format

plot_confusion_matrix(confusion_matrix, classes, SUBGROUP);
performance_df

def plot_confusion_matrix(
    confusion_matrix, class_names, subgroup, figsize = (8,6)):
 
    df_cm = pd.DataFrame(
        confusion_matrix, index=class_names, columns=class_names, 
    )

    rcParams.update({
      'font.family':'sans-serif',
      'font.sans-serif':['Liberation Sans'],
    })
  
    sns.set_context("notebook", font_scale=1.25)

    fig = plt.figure(figsize=figsize)

    plt.title('Confusion Matrix for Performance Across ' + subgroup)

    # Combine the instance (numercial value) with its description
    strings = np.asarray([['True Positives', 'False Negatives'],
                          ['False Positives', 'True Negatives']])
    labels = (np.asarray(
        ["{0:g}\n{1}".format(value, string) for string, value in zip(
            strings.flatten(), confusion_matrix.flatten())])).reshape(2, 2)

    heatmap = sns.heatmap(df_cm, annot=labels, fmt="", 
        linewidths=2.0, cmap=sns.color_palette("GnBu_d"));
    heatmap.yaxis.set_ticklabels(
        heatmap.yaxis.get_ticklabels(), rotation=0, ha='right')
    heatmap.xaxis.set_ticklabels(
        heatmap.xaxis.get_ticklabels(), rotation=45, ha='right')
    plt.ylabel('References')
    plt.xlabel('Predictions')
    return fig

def createTrainEvalSets(df):
    df1 = df[np.mod(np.abs(df['Name'].apply(lambda item: hash(item.split(" ")[1]))), 4) < 3]
    df2 = df[np.mod(np.abs(df['Name'].apply(lambda item: hash(item.split(" ")[1]))), 4) >= 3]
    return (df1, df2)
    
def displayRowsWithNulls(df, columns):
    display(df[columns][df[columns].isnull().any(axis=1)])



In [124]:
dataset = tf.data.Dataset.from_tensor_slices([8, 3, 0, 8, 2, 1, True])
for elem in dataset:
    print(elem.numpy())

8
3
0
8
2
1
1
