Welcome to the year 2912, where your data science skills are needed to solve a cosmic mystery. We've received a transmission from four lightyears away and things aren't looking good.

The Spaceship Titanic was an interstellar passenger liner launched a month ago. With almost 13,000 passengers on board, the vessel set out on its maiden voyage transporting emigrants from our solar system to three newly habitable exoplanets orbiting nearby stars.

While rounding Alpha Centauri en route to its first destination—the torrid 55 Cancri E—the unwary Spaceship Titanic collided with a spacetime anomaly hidden within a dust cloud. Sadly, it met a fate similar to that of its namesake from 1000 years before. Though the ship stayed intact, almost half of the passengers were transported to an alternate dimension!



To help rescue crews and retrieve the lost passengers, you are challenged to predict which passengers were transported by the anomaly using records recovered from the spaceship’s damaged computer system.

Help save them and change history!

In [9]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential, load_model
from tensorflow.data import Dataset
import matplotlib.pyplot as plt
# pip install seaborn
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, Normalizer
import re
import math

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
%config InlineBackend.figure_format = 'retina'
pd.set_option("display.precision", 2)

# For facets
from IPython.core.display import display, HTML
import base64
!pip install facets-overview
from facets_overview.feature_statistics_generator import FeatureStatisticsGenerator

# Directory the raw competition CSVs are read from.
BASE_DIR = '/kaggle/input/spaceship-titanic'
# Directory the preprocessed CSVs, the saved model and predictions are written to.
BASE_DIR_OUTPUT = '/kaggle/working/'

/kaggle/input/spaceship-titanic/sample_submission.csv
/kaggle/input/spaceship-titanic/train.csv
/kaggle/input/spaceship-titanic/test.csv
[0m

# Functions

In [109]:
#@title Define Function to Visualize Binary Confusion Matrix
def plot_confusion_matrix(
    confusion_matrix, class_names, subgroup, figsize = (8,6)):
    """Plot a 2x2 binary confusion matrix as an annotated Seaborn heat map.

    Args:
        confusion_matrix: 2x2 array-like laid out as ``[[TP, FN], [FP, TN]]``
            (the order in which the cell captions below are applied).
        class_names: Two labels used for both the row and the column ticks.
        subgroup: Name or value of the subgroup shown in the title. It is
            formatted as text, so non-string values such as ints are accepted.
        figsize: Figure size passed to ``plt.figure``.

    Returns:
        The Matplotlib figure holding the heat map.
    """
    # Normalise to an ndarray so ``flatten`` below also works for nested lists.
    confusion_matrix = np.asarray(confusion_matrix)

    # Seaborn draws heat maps most conveniently from a labelled DataFrame.
    df_cm = pd.DataFrame(
        confusion_matrix, index=class_names, columns=class_names,
    )

    # Use the rcParams exposed by pyplot: the bare name ``rcParams`` is not
    # imported anywhere in the notebook. Note that this changes the font
    # settings globally, not just for this figure.
    plt.rcParams.update({
        'font.family': 'sans-serif',
        'font.sans-serif': ['Liberation Sans'],
    })

    sns.set_context("notebook", font_scale=1.25)

    fig = plt.figure(figsize=figsize)

    # f-string formatting instead of ``+`` so an int subgroup does not raise.
    plt.title(f'Confusion Matrix for Performance Across {subgroup}')

    # Combine each cell's numerical value with its description.
    strings = np.asarray([['True Positives', 'False Negatives'],
                          ['False Positives', 'True Negatives']])
    labels = np.asarray(
        ["{0:g}\n{1}".format(value, string) for string, value in zip(
            strings.flatten(), confusion_matrix.flatten())]).reshape(2, 2)

    heatmap = sns.heatmap(df_cm, annot=labels, fmt="",
        linewidths=2.0, cmap=sns.color_palette("GnBu_d"))
    heatmap.yaxis.set_ticklabels(
        heatmap.yaxis.get_ticklabels(), rotation=0, ha='right')
    heatmap.xaxis.set_ticklabels(
        heatmap.xaxis.get_ticklabels(), rotation=45, ha='right')
    plt.ylabel('References')
    plt.xlabel('Predictions')
    return fig

def process_drop_columns(df, columns):
    """Return a new frame without the listed columns.

    Names in ``columns`` that are not present in ``df`` are skipped rather
    than raising; ``df`` itself is left untouched.
    """
    return df.drop(columns=columns, axis=1, errors='ignore')

def proces_boolean(df, columns):
    """Replace each listed column with 0/1 flags, in place.

    A cell becomes 1 only when its value is the object ``True``; every other
    value (``False``, missing values, strings, ...) becomes 0.

    Returns:
        The same ``df`` object that was passed in.
    """
    for name in columns:
        flags = [int(value is True) for value in df[name]]
        df[name] = flags
    return df

def process_categorical(df, columns):
    """Label-encode the listed columns of ``df`` in place.

    Each column gets its own freshly fitted ``LabelEncoder``; the column's
    values are replaced by the integer codes that encoder assigns. The
    fitted encoders are not kept, so the mapping cannot be inverted later.

    Args:
        df: DataFrame to modify.
        columns: Names of the columns to encode.

    Returns:
        The same ``df`` object that was passed in.
    """
    for col in columns:
        # ``fit_transform`` fits and encodes in one pass; a separate ``fit``
        # beforehand only repeats the work.
        df[col] = LabelEncoder().fit_transform(df[col])
    return df

def create_hash(df):
    """Add a ``Hash`` column derived from the ``Surname`` column, in place.

    Each surname is split into a tuple of its characters and passed to the
    built-in ``hash``. Python salts string hashing per interpreter process
    unless ``PYTHONHASHSEED`` is fixed, so values are only comparable within
    one session.

    Returns:
        The same ``df`` object that was passed in.
    """
    def _hash_chars(surname):
        return hash(tuple(surname))

    df['Hash'] = df['Surname'].apply(_hash_chars)
    return df

def processCabin(df):
    """Remove the middle numeric segment from every ``Cabin`` value.

    A value such as ``'B/0/P'`` becomes ``'B/P'``. Missing cabins are left
    as they are.

    Args:
        df: DataFrame with a ``Cabin`` column; modified in place.

    Returns:
        The same ``df`` object that was passed in.
    """
    # Assign the result back instead of calling ``replace(inplace=True)`` on
    # the column selection: the chained in-place form operates on an
    # intermediate object and is not guaranteed to update ``df``.
    # The pattern is a raw string so ``\d`` is a regex escape, not an invalid
    # Python string escape.
    df['Cabin'] = df['Cabin'].replace(r'/\d+/', '/', regex=True)
    return df

def processName(df):
    """Add a ``Surname`` column holding the second word of ``Name``.

    Names are split on single spaces and the token at position 1 is taken.
    Rows whose name is missing, is not a string, or has no second token get
    NaN instead of raising.

    Args:
        df: DataFrame with a ``Name`` column; modified in place.

    Returns:
        The same ``df`` object that was passed in.
    """
    def _second_token(name):
        # Missing names arrive as float NaN, which has no ``split``.
        if not isinstance(name, str):
            return np.nan
        parts = name.split(" ")
        return parts[1] if len(parts) > 1 else np.nan

    df['Surname'] = df['Name'].apply(_second_token)
    return df

def createTrainEvalSets(df):
    """Split ``df`` into a training and an evaluation frame by surname.

    Every row is assigned a bucket in ``0..3`` from a checksum of the
    surname (second space-separated token of ``Name``). Buckets 0-2 form the
    first frame and bucket 3 the second, so rows sharing a surname always
    land in the same frame.

    The bucket uses ``zlib.crc32`` rather than the built-in ``hash``: string
    hashing is salted per interpreter process, which made the split differ
    from one session to the next.

    Rows whose name is missing or not a string go to the first (training)
    frame; a name without a second token is bucketed on its only token.

    Args:
        df: DataFrame with a ``Name`` column.

    Returns:
        Tuple ``(train_df, eval_df)`` of row subsets of ``df``.
    """
    import zlib  # local import keeps this function self-contained

    def _bucket(name):
        if not isinstance(name, str):
            return 0
        parts = name.split(" ")
        surname = parts[1] if len(parts) > 1 else parts[0]
        return zlib.crc32(surname.encode("utf-8")) % 4

    # Compute the bucket once and reuse it for both masks.
    buckets = df['Name'].apply(_bucket)
    return (df[buckets < 3], df[buckets >= 3])
# train_df = processCabin(train_df)

#slower
def fillAmenityAmount2(df, columns):    
    """Call ``fillna(9, inplace=True)`` on each column of ``df``'s transpose.

    The loop iterates over ``df.T.columns`` (that is, the row labels of
    ``df``) and builds a new transpose via ``df.T`` on every access.

    NOTE(review): whether these in-place fills ever reach ``df`` depends on
    whether ``df.T`` shares memory with ``df`` - verify before relying on
    this function. The ``columns`` argument is not used.

    Returns:
        The same ``df`` object that was passed in.
    """
    for col in df.T.columns:
        df.T[col].fillna(9, inplace=True)
    return df  
    
    
def displayRowsWithNulls(df, columns):
    """Display the rows of ``df[columns]`` that contain at least one null."""
    subset = df[columns]
    has_null = subset.isnull().any(axis=1)
    display(subset[has_null])

def process_normalize_columns(df, columns):
    """Min-max scale the listed columns of ``df`` to the range [0, 1], in place.

    Each column is rescaled with its own minimum and maximum. Missing values
    are ignored when computing the extremes and stay missing. A column whose
    values are all equal is mapped to 0.0 instead of being turned into NaN
    by a division by zero.

    Args:
        df: DataFrame to modify.
        columns: Names of the numeric columns to scale.

    Returns:
        The same ``df`` object that was passed in.
    """
    for col in columns:
        lowest = df[col].min()
        span = df[col].max() - lowest
        if span == 0:
            # Constant column: subtracting the minimum already gives zeros
            # and keeps any missing entries missing.
            df[col] = (df[col] - lowest).astype(float)
        else:
            df[col] = (df[col] - lowest) / span
    return df

def fillAmenityAmount(df, columns, regressor, n_features=4):
    """Fill one missing amenity value per incomplete row with a prediction.

    For every row that has a null in any of ``columns``, the non-null values
    of that row (in column order) are fed to ``regressor.predict`` as a
    single sample. The first element of the prediction is written into the
    first missing column of the row. Any further missing columns in the same
    row are left untouched.

    Args:
        df: DataFrame to modify in place.
        columns: Columns that together form the amenity features.
        regressor: Object with a ``predict`` method accepting an array of
            shape ``(1, n_features)``.
        n_features: Number of inputs the regressor expects. Rows with fewer
            known values are right-padded with zeros up to this width.

    Returns:
        The same ``df`` object that was passed in.
    """
    incomplete = df.loc[df[columns].isnull().any(axis=1), columns]
    for index, row in incomplete.iterrows():
        # Work on a plain NumPy vector: n-dimensional indexing on a pandas
        # Series is not supported by current pandas.
        known = row[row.notna()].to_numpy(dtype=float)

        # Pad up to the full input width, not by a single zero, so rows with
        # several missing values still match the regressor's input shape.
        shortfall = n_features - len(known)
        if shortfall > 0:
            known = np.append(known, np.zeros(shortfall))

        prediction = regressor.predict(known[np.newaxis, :])
        target_column = row.index[row.isna()][0]

        # Store a scalar rather than a one-element array.
        df.loc[index, target_column] = float(np.asarray(prediction).ravel()[0])

    return df

def fillHomePlanet(df, regressor):
    """Fill missing ``HomePlanet`` values from the row's ``Cabin`` value.

    For each row with a null ``HomePlanet``, the ``Cabin`` value is passed to
    ``regressor.predict`` as a one-element array, and the index of the
    largest entry in the first prediction row is stored as the home planet.

    Args:
        df: DataFrame with ``Cabin`` and ``HomePlanet`` columns; modified in
            place.
        regressor: Object whose ``predict`` returns one row of scores per
            sample.

    Returns:
        The same ``df`` object that was passed in.
    """
    missing_rows = df.loc[df['HomePlanet'].isna(), ['Cabin', 'HomePlanet']]
    for row_label in missing_rows.index:
        cabin_value = np.array(missing_rows.loc[row_label, 'Cabin'])
        class_scores = regressor.predict(cabin_value[..., np.newaxis])[0]
        df.loc[row_label, 'HomePlanet'] = np.argmax(class_scores)

    return df

def displayFacetStatistics(df):
    """Render the Facets Overview widget for ``df`` in the notebook output.

    The frame is summarised by ``FeatureStatisticsGenerator`` under the name
    ``trainData``; the resulting proto is serialised, base64-encoded and
    inlined into an HTML snippet that loads the Facets web component.
    """
    stats_proto = FeatureStatisticsGenerator().ProtoFromDataFrames(
        [{'table': df, 'name': 'trainData'}])
    encoded_proto = base64.b64encode(
        stats_proto.SerializeToString()).decode("utf-8")

    page_template = """<script src="https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js"></script>
            <link rel="import" href="https://raw.githubusercontent.com/PAIR-code/facets/1.0.0/facets-dist/facets-jupyter.html">
            <facets-overview id="elem"></facets-overview>
            <script>
              document.querySelector("#elem").protoInput = "{protostr}";
            </script>"""
    display(HTML(page_template.format(protostr=encoded_proto)))
    
def displayFacetsDive(df):
    """Render the Facets Dive widget for the complete rows of ``df``.

    Rows containing any missing value are left out of the visualisation.
    They are dropped from a copy, so the caller's frame is not modified.
    The remaining rows are serialised as JSON records and inlined into an
    HTML snippet that loads the Facets web component.

    Args:
        df: DataFrame to visualise.
    """
    records_json = df.dropna(how="any", axis=0).to_json(orient='records')
    HTML_TEMPLATE = """<script src="https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js"></script>
            <link rel="import" href="https://raw.githubusercontent.com/PAIR-code/facets/1.0.0/facets-dist/facets-jupyter.html">
            <facets-dive id="elem" height="600"></facets-dive>
            <script>
              var data = {jsonstr};
              document.querySelector("#elem").data = data;
            </script>"""
    html = HTML_TEMPLATE.format(jsonstr=records_json)
    display(HTML(html))
    
def createRegressor(data, features, label):
    """Train a small Keras regressor that predicts ``label`` from ``features``.

    Rows with a missing value in any of the feature or label columns are
    excluded from training. The exclusion happens on a filtered copy, so the
    caller's ``data`` keeps all of its rows.

    Args:
        data: DataFrame holding the feature and label columns.
        features: List of feature column names.
        label: List with the label column name(s).

    Returns:
        The fitted ``tf.keras.Model``.
    """
    class myRegressor(tf.keras.Model):
        def __init__(self):
            super().__init__()
            self.dense1 = Dense(1, activation='relu')
            self.dense3 = Dense(1)
            self.dropout1 = Dropout(0.4)

        def call(self, inputs, training=False):
            x = self.dense1(inputs)
            # Forward the ``training`` flag unconditionally: the layer drops
            # units only while training and is a pass-through at inference.
            # The former ``training == False`` guard meant it never ran in
            # training mode.
            x = self.dropout1(x, training=training)
            return self.dense3(x)

    myRegressorModel = myRegressor()

    # Accuracy carries no information for a continuous target, so stop on
    # the training loss instead.
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=10)

    myRegressorModel.compile(optimizer=tf.keras.optimizers.Adam(),
                             loss=tf.keras.losses.MeanSquaredError(),
                             metrics=['mae'])

    # Filter into a new frame; an in-place dropna here would silently delete
    # rows from the frame the caller goes on to use.
    clean = data.dropna(subset=list(features) + list(label))
    inputs = clean[features].to_numpy()
    targets = clean[label].to_numpy()
    print('fitting regressor------')
    myRegressorModel.fit(inputs, targets, epochs=100, callbacks=[early_stopping])

    return myRegressorModel

def createCategoricalRegressor(data, features, label):
    """Train a small Keras classifier that predicts ``label`` from ``features``.

    The output layer has one unit per distinct value found in the first
    label column of ``data`` as passed in. Rows with a missing value in any
    of the feature or label columns are excluded from training. The
    exclusion happens on a filtered copy, so the caller's ``data`` keeps all
    of its rows.

    Args:
        data: DataFrame holding the feature and label columns.
        features: List of feature column names.
        label: List with the label column name; values are used as sparse
            integer class ids by the loss.

    Returns:
        The fitted ``tf.keras.Model``.
    """
    n_classes = len(data[label[0]].unique())

    class myCategoricalRegressor(tf.keras.Model):
        def __init__(self):
            super().__init__()
            self.dense1 = Dense(128, activation='relu')
            self.dense3 = Dense(n_classes, activation='softmax')
            self.dropout1 = Dropout(0.4)

        def call(self, inputs, training=False):
            x = self.dense1(inputs)
            # Forward the ``training`` flag unconditionally: the layer drops
            # units only while training and is a pass-through at inference.
            # The former ``training == False`` guard meant it never ran in
            # training mode.
            x = self.dropout1(x, training=training)
            return self.dense3(x)

    myCategoricalRegressorModel = myCategoricalRegressor()

    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='accuracy', patience=10)

    myCategoricalRegressorModel.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
                                        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                                        metrics=['accuracy'])

    # Filter into a new frame; an in-place dropna here would silently delete
    # rows from the frame the caller goes on to use.
    clean = data.dropna(subset=list(features) + list(label))
    inputs = clean[features].to_numpy()
    targets = clean[label].to_numpy()
    print('fitting categorical regressor------')
    myCategoricalRegressorModel.fit(inputs, targets, epochs=100, callbacks=[early_stopping])
    return myCategoricalRegressorModel

def train_model(features, label, group='', subgroup=''):
    """Train the binary classifier on the preprocessed training CSV and save it.

    The data is read from ``train.csv`` inside ``BASE_DIR_OUTPUT``. When both
    ``group`` and ``subgroup`` are given, only rows where column ``group``
    equals ``subgroup`` are used. The fitted model is saved under
    ``spaceship_model`` inside ``BASE_DIR_OUTPUT``.

    Args:
        features: List of feature column names, in the order fed to the model.
        label: List with the label column name.
        group: Optional column name to filter on ('' disables filtering).
        subgroup: Value of ``group`` to keep ('' disables filtering).

    Returns:
        The Keras ``History`` object produced by ``fit`` (previously the
        function returned nothing, so existing callers are unaffected).
    """
    class myModel(tf.keras.Model):
        def __init__(self):
            super().__init__()
            self.dense1 = Dense(32, activation='relu')
            self.dense2 = Dense(16, activation='relu')
            self.dense3 = Dense(1, activation='sigmoid')
            self.dropout1 = Dropout(0.4)

        def call(self, inputs, training=False):
            x = self.dense1(inputs)
            # Forward the ``training`` flag unconditionally: the layer drops
            # units only while training and is a pass-through at inference.
            # The former ``training == False`` guard meant it never ran in
            # training mode.
            x = self.dropout1(x, training=training)
            x = self.dense2(x)
            return self.dense3(x)

    model = myModel()
    data = load_data(BASE_DIR_OUTPUT, 'train.csv')
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='accuracy', patience=10)

    model.compile(optimizer=tf.keras.optimizers.Adam(1e-3), loss=tf.keras.losses.BinaryCrossentropy(), metrics=['accuracy'])

    # Compare by value: ``is not ''`` tests object identity, which is not a
    # reliable way to detect an empty string.
    if group != '' and subgroup != '':
        data = data[data[group] == subgroup]

    inputs = data[features].to_numpy()
    targets = data[label].to_numpy()
    display(inputs)

    history = model.fit(inputs, targets, epochs=200, callbacks=[early_stopping])
    model.save(os.path.join(BASE_DIR_OUTPUT, 'spaceship_model'), save_format='tf')
    return history

def make_predictions(feature_columns=None):
    """Predict on the preprocessed test CSV and write the submission file.

    Reads ``test.csv`` and the saved ``spaceship_model`` from
    ``BASE_DIR_OUTPUT`` and writes ``predictions.csv`` there with the columns
    ``PassengerId`` (the frame's index) and ``Transported`` (True where the
    rounded model output equals 1). An existing file is overwritten.

    Args:
        feature_columns: Column names to feed to the model, in the order the
            model was trained on. With the default ``None`` every column of
            the test file is fed in file order, as before.
            NOTE(review): the file order generally differs from the feature
            list given to ``train_model``; pass that same list here so the
            inputs line up.
    """
    df = load_data(BASE_DIR_OUTPUT, 'test.csv')
    model = load_model(os.path.join(BASE_DIR_OUTPUT, 'spaceship_model'))

    inputs = df if feature_columns is None else df[list(feature_columns)]
    scores = model.predict(inputs.to_numpy())
    transported = np.round(scores).reshape(-1) == 1

    # Build the submission directly from typed columns instead of
    # concatenating ids and booleans into a single string array.
    submission = pd.DataFrame({
        'PassengerId': df.index,
        'Transported': transported,
    })
    predictions_file = os.path.join(BASE_DIR_OUTPUT, 'predictions.csv')

    # ``to_csv`` opens the target for writing and replaces any earlier file,
    # so no explicit removal is needed.
    submission.to_csv(path_or_buf=predictions_file, index=False)
    
def run_preprocess_pipeline(input_name, out_name, test=False):
    """Preprocess one raw competition CSV and write the result to ``out_name``.

    Steps, in order: simplify cabins, min-max scale the numeric columns,
    impute amenity amounts with a regressor, label-encode the categorical
    columns, impute the home planet with a classifier, forward-fill whatever
    is still missing, convert booleans to 0/1, and drop unused columns.

    The column lists (``numerical_columns``, ``amenityColumns``,
    ``categorical_columns``, ``boolean_columns``, ``label_column``,
    ``drop_columns``) are read from module-level variables and must be
    defined before this function is called.

    Args:
        input_name: File name of the raw CSV inside ``BASE_DIR``.
        out_name: Path the processed CSV is written to.
        test: True for the test set, which has no label column to convert.

    Returns:
        The processed DataFrame.
    """
    df = load_data(BASE_DIR, input_name)

    df = processCabin(df)
    df = process_normalize_columns(df, numerical_columns)
    regressor = createRegressor(df, ['FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'], ['RoomService'])
    df = fillAmenityAmount(df, amenityColumns, regressor)
    df = process_categorical(df, categorical_columns)
    regressor = createCategoricalRegressor(df, ['Cabin'], ['HomePlanet'])
    df = fillHomePlanet(df, regressor)

    # ``fillna(method='ffill')`` is deprecated in pandas; ``ffill`` is the
    # direct equivalent.
    df.ffill(inplace=True)

    df = proces_boolean(df, boolean_columns)
    if not test:
        df = proces_boolean(df, label_column)
    for col in numerical_columns:
        df[col] = pd.to_numeric(df[col])

    df = process_drop_columns(df, drop_columns)
    df.to_csv(out_name)
    return df
    
def load_data(root_dir, filename):
    """Read ``filename`` from ``root_dir`` as a comma-separated CSV.

    The first column of the file becomes the index of the returned frame.
    """
    csv_path = os.path.join(root_dir, filename)
    return pd.read_csv(csv_path, index_col=0, sep=',')


def run_pipeline(in_name, out_name, test=False, force=False):
    """Run the preprocessing pipeline unless its output already exists.

    Nothing happens when ``out_name`` exists and ``force`` does not equal
    ``True``; otherwise ``run_preprocess_pipeline`` is invoked with the
    given arguments.
    """
    already_built = os.path.exists(out_name)
    if already_built and force != True:
        return
    run_preprocess_pipeline(in_name, out_name, test)
               
    
    

# Dataset

In [31]:
import warnings
# Silence every warning for the rest of the session; deprecation notices from
# the libraries used below are hidden as well.
warnings.filterwarnings("ignore")
BASE_DIR = '/kaggle/input/spaceship-titanic'
# Load the raw train and test CSVs, using the first column as the index.
train_df = pd.read_csv(os.path.join(BASE_DIR, 'train.csv'), index_col=0)
test_df = pd.read_csv(os.path.join(BASE_DIR, 'test.csv'), index_col=0)
display(train_df)

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


# Data Analysis

In [49]:
# Explore the training features (label column removed) with Facets Dive.
df = train_df.drop('Transported', axis=1)
#displayFacetStatistics(df)
# Show the number of rows before visualising.
display(len(df.index))
displayFacetsDive(df)

8693

# Preprocess Data

In [110]:
# Column configuration. Apart from `features`, these lists are read as
# module-level variables by run_preprocess_pipeline.
# `features` is the ordered list of model inputs passed to train_model below.
features=['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck','Destination', 'HomePlanet', 'Cabin','CryoSleep']
label_column = ['Transported']
categorical_columns = ['Destination', 'HomePlanet', 'Cabin']
boolean_columns = ['CryoSleep']
drop_columns=['Name' ,'Hash', 'VIP']
numerical_columns=['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
amenityColumns=['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']    

# Raw inputs live under BASE_DIR; processed outputs go to BASE_DIR_OUTPUT.
train_file = os.path.join(BASE_DIR, 'train.csv')
train_file_processed = os.path.join(BASE_DIR_OUTPUT, 'train.csv')
test_file = os.path.join(BASE_DIR, 'test.csv')
test_file_processed = os.path.join(BASE_DIR_OUTPUT, 'test.csv')

# With force=False an existing processed file is reused, not rebuilt.
run_pipeline(train_file, train_file_processed, test=False, force=False)
run_pipeline(test_file, test_file_processed, test=True, force=False)

train_model(features, ['Transported'])
#make_predictions()          

array([[4.93670886e-01, 0.00000000e+00, 0.00000000e+00, ...,
        1.00000000e+00, 2.00000000e+00, 0.00000000e+00],
       [3.03797468e-01, 7.60801284e-03, 3.01881729e-04, ...,
        0.00000000e+00, 1.10000000e+01, 0.00000000e+00],
       [7.34177215e-01, 3.00132617e-03, 1.19947674e-01, ...,
        1.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       ...,
       [3.29113924e-01, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 1.30000000e+01, 0.00000000e+00],
       [4.05063291e-01, 0.00000000e+00, 3.51859927e-02, ...,
        1.00000000e+00, 9.00000000e+00, 0.00000000e+00],
       [5.56962025e-01, 8.79458365e-03, 1.57246839e-01, ...,
        1.00000000e+00, 9.00000000e+00, 0.00000000e+00]])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [None]:
import keras
from matplotlib import pyplot as plt

# Plot training and validation accuracy per epoch.
# NOTE(review): `history` is not assigned at notebook top level in the cells
# shown here; it must be bound to the History returned by Keras `fit` before
# this cell can run.
plt.plot(history.history['accuracy'])
# NOTE(review): the 'val_accuracy' key presumably exists only when `fit` was
# given validation data - confirm against the training call.
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [74]:
# Show the feature columns of the raw training rows whose HomePlanet is 'Earth'.
df = train_df[features]
display(train_df[features][train_df[features]['HomePlanet'] == 'Earth'])

Unnamed: 0_level_0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Destination,HomePlanet,Cabin,CryoSleep
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0002_01,24.0,109.0,9.0,25.0,549.0,44.0,TRAPPIST-1e,Earth,F/0/S,False
0004_01,16.0,303.0,70.0,151.0,565.0,2.0,TRAPPIST-1e,Earth,F/1/S,False
0005_01,44.0,0.0,483.0,0.0,291.0,0.0,PSO J318.5-22,Earth,F/0/P,False
0006_01,26.0,42.0,1539.0,3.0,0.0,0.0,TRAPPIST-1e,Earth,F/2/S,False
0006_02,28.0,0.0,0.0,0.0,0.0,,TRAPPIST-1e,Earth,G/0/S,True
...,...,...,...,...,...,...,...,...,...,...
9270_01,33.0,0.0,0.0,0.0,0.0,0.0,55 Cancri e,Earth,G/1497/S,True
9272_01,26.0,240.0,242.0,510.0,0.0,0.0,TRAPPIST-1e,Earth,G/1507/P,False
9272_02,21.0,86.0,3.0,149.0,208.0,329.0,TRAPPIST-1e,Earth,F/1894/P,False
9278_01,18.0,0.0,0.0,0.0,0.0,0.0,PSO J318.5-22,Earth,G/1499/S,True


In [75]:
def pandas_to_numpy(data):
    """Split a DataFrame into per-column feature arrays and a label array.

    Rows containing any missing value are discarded first. The
    ``Transported`` column becomes the label array; every other column
    becomes one entry of the returned feature dictionary.

    Returns:
        Tuple ``(features, labels)`` where ``features`` maps each column
        name to a NumPy array and ``labels`` is a NumPy array.
    """
    complete_rows = data.dropna(how="any", axis=0)
    labels = np.array(complete_rows['Transported'])

    feature_frame = complete_rows.drop('Transported', axis=1)
    features = {}
    for column_name, column_values in feature_frame.items():
        features[column_name] = np.array(column_values)

    return features, labels

#@title Visualize Binary Confusion Matrix and Compute Evaluation Metrics Per Subgroup
CATEGORY  =  "HomePlanet" #@param {type:"string"}
SUBGROUP =  1 #@param {type:"int"}

# Labels for annotating axes in plot.
classes = ['Transported', 'Not Transported']

# Given define subgroup, generate predictions and obtain its corresponding 
# ground truth.
# NOTE(review): this filters the raw `train_df`, where the displayed
# HomePlanet values are strings such as 'Earth'; an integer SUBGROUP
# presumably matches no rows there - confirm which frame is intended.
subgroup_filter  = train_df.loc[train_df[CATEGORY] == SUBGROUP]
# Rebinds the module-level name `features` (the column list defined earlier)
# to the dictionary of arrays returned here.
features, labels = pandas_to_numpy(subgroup_filter)

# NOTE(review): `model` is not assigned at notebook top level in the cells
# shown here (the recorded output is a NameError); bind a trained model first.
subgroup_results = model.evaluate(x=features, y=labels, verbose=0)
display(subgroup_filter)
# NOTE(review): the indices assume `evaluate` returns, after the loss,
# TP, FP, TN, FN, accuracy, precision, recall and AUC in that order -
# verify against the metrics the model was compiled with.
confusion_matrix = np.array([[subgroup_results[1], subgroup_results[4]], 
                             [subgroup_results[2], subgroup_results[3]]])

subgroup_performance_metrics = {
    'ACCURACY': subgroup_results[5],
    'PRECISION': subgroup_results[6], 
    'RECALL': subgroup_results[7],
    'AUC': subgroup_results[8]
}
performance_df = pd.DataFrame(subgroup_performance_metrics, index=[SUBGROUP])
# Show floats with four decimals from here on (global pandas option).
pd.options.display.float_format = '{:,.4f}'.format

plot_confusion_matrix(confusion_matrix, classes, SUBGROUP);
performance_df

NameError: name 'model' is not defined