In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.data import Dataset
import matplotlib.pyplot as plt
# pip install seaborn
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, Normalizer
import re
import math

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
%config InlineBackend.figure_format = 'retina'
pd.set_option("display.precision", 2)

# For facets
from IPython.core.display import display, HTML
import base64
!pip install facets-overview
from facets_overview.feature_statistics_generator import FeatureStatisticsGenerator

BASE_DIR = '/kaggle/input/spaceship-titanic'


/kaggle/input/spaceship-titanic/sample_submission.csv
/kaggle/input/spaceship-titanic/train.csv
/kaggle/input/spaceship-titanic/test.csv
Collecting facets-overview
  Downloading facets_overview-1.0.0-py2.py3-none-any.whl (24 kB)
Installing collected packages: facets-overview
Successfully installed facets-overview-1.0.0
[0m

In [28]:
#@title Define Function to Visualize Binary Confusion Matrix
def plot_confusion_matrix(
    confusion_matrix, class_names, subgroup, figsize = (8,6)):
  # We're taking our calculated binary confusion matrix that's already in the 
  # form of an array and turning it into a pandas DataFrame because it's a lot 
  # easier to work with a pandas DataFrame when visualizing a heat map in 
  # Seaborn.
    df_cm = pd.DataFrame(
        confusion_matrix, index=class_names, columns=class_names, 
    )

    rcParams.update({
      'font.family':'sans-serif',
      'font.sans-serif':['Liberation Sans'],
    })
  
    sns.set_context("notebook", font_scale=1.25)

    fig = plt.figure(figsize=figsize)

    plt.title('Confusion Matrix for Performance Across ' + subgroup)

    # Combine the instance (numercial value) with its description
    strings = np.asarray([['True Positives', 'False Negatives'],
                          ['False Positives', 'True Negatives']])
    labels = (np.asarray(
        ["{0:g}\n{1}".format(value, string) for string, value in zip(
            strings.flatten(), confusion_matrix.flatten())])).reshape(2, 2)

    heatmap = sns.heatmap(df_cm, annot=labels, fmt="", 
        linewidths=2.0, cmap=sns.color_palette("GnBu_d"));
    heatmap.yaxis.set_ticklabels(
        heatmap.yaxis.get_ticklabels(), rotation=0, ha='right')
    heatmap.xaxis.set_ticklabels(
        heatmap.xaxis.get_ticklabels(), rotation=45, ha='right')
    plt.ylabel('References')
    plt.xlabel('Predictions')
    return fig

def process_drop_columns(df, columns):    
    new_df = df.drop(columns=columns, axis=1, errors='ignore')
    return new_df

def proces_boolean(df, columns):
    for col in columns:
        df[col] = [1 if x is True else 0 for x in df[col]]
    return df

def process_categorical(df, columns):         
    le = LabelEncoder()
    label_object = {}
    for col in columns:
        labelencoder = LabelEncoder()
        labelencoder.fit(df[col])
        df[col] = labelencoder.fit_transform(df[col])
        label_object[col] = labelencoder
    return df

def create_hash(df):
    df['Hash'] = df['Surname'].apply(lambda x: hash(tuple(x)))
    return df

def processCabin(df):
    df['Cabin'].replace('\/\d+\/', '/', regex=True, inplace=True)
    return df

def processName(df):
    df['Surname'] = df['Name'].apply(lambda item: item.split(" ")[1])
    return df

def createTrainEvalSets(df):
    df1 = df[np.mod(np.abs(df['Name'].apply(lambda item: hash(item.split(" ")[1]))), 4) < 3]
    df2 = df[np.mod(np.abs(df['Name'].apply(lambda item: hash(item.split(" ")[1]))), 4) >= 3]
    return (df1, df2)
# train_df = processCabin(train_df)

#slower
def fillAmenityAmount2(df, columns):    
    for col in df.T.columns:
        df.T[col].fillna(9, inplace=True)
    return df  
    
    
def displayRowsWithNulls(df, columns):
    display(df[columns][df[columns].isnull().any(axis=1)])

def process_normalize_columns(df, columns):
    for col in columns:
        df[col]=(df[col]-df[col].min())/(df[col].max()-df[col].min())
    return df

def fillAmenityAmount(df, columns, regressor):   
    df1=df[df[columns].isnull().any(axis=1)][columns]    
    for index, row in df1.iterrows():              
        features = df1.loc[index, ~pd.isna(row)]
        
        if(len(features) < 4):
            arr = features.to_numpy()
            arr = np.append(arr, 0)
            features = pd.Series(arr).to_numpy()
        features = np.array(features[..., np.newaxis])     
        
        prediction = regressor.predict(features.T)
        col_name = df1.columns[pd.isna(row)][0]
        
        df.loc[index, col_name] = prediction[0]   
    
    return df

def fillHomePlanet(df, regressor):    
    df1=df[df['HomePlanet'].isna()][['Cabin', 'HomePlanet']]
    for index, row in df1.iterrows():   
        
        features = np.array(df1.loc[index, 'Cabin']) 
        features = features[..., np.newaxis]
        
        prediction = regressor.predict(features)
        
        df.loc[index, 'HomePlanet'] = np.argmax(prediction[0])   
    
    return df

def displayFacetStatistics(df):
    fsg = FeatureStatisticsGenerator()
    dataframes = [
        {'table': df, 'name': 'trainData'}]
    spaceshipProto = fsg.ProtoFromDataFrames(dataframes)
    protostr = base64.b64encode(spaceshipProto.SerializeToString()).decode("utf-8")


    HTML_TEMPLATE = """<script src="https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js"></script>
            <link rel="import" href="https://raw.githubusercontent.com/PAIR-code/facets/1.0.0/facets-dist/facets-jupyter.html">
            <facets-overview id="elem"></facets-overview>
            <script>
              document.querySelector("#elem").protoInput = "{protostr}";
            </script>"""
    html = HTML_TEMPLATE.format(protostr=protostr)
    display(HTML(html))
    
def displayFacetsDive(df):
    SAMPLE_SIZE = 5000 #@param
    df.dropna(how="any", axis=0, inplace=True)
    train_dive = train_df.sample(SAMPLE_SIZE).to_json(orient='records')
    HTML_TEMPLATE = """<script src="https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js"></script>
            <link rel="import" href="https://raw.githubusercontent.com/PAIR-code/facets/1.0.0/facets-dist/facets-jupyter.html">
            <facets-dive id="elem" height="600"></facets-dive>
            <script>
              var data = {jsonstr};
              document.querySelector("#elem").data = data;
            </script>"""
    html = HTML_TEMPLATE.format(jsonstr=train_dive)
    display(HTML(html))
    
def createRegressor(data, features, label):    
    class myRegressor(tf.keras.Model):
        def __init__(self):
            super().__init__()        
            self.dense1 = Dense(1, activation='relu')  
            #self.dense2 = Dense(2, activation='relu')         
            self.dense3 = Dense(1)
            self.dropout1 = Dropout(0.4)

        def call(self, inputs, training=False):
            x = self.dense1(inputs)
            if(training==False):
                x = self.dropout1(x, training=training)
            #x = self.dense2(x)
            return self.dense3(x)
        
    myRegressorModel = myRegressor()
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='accuracy', patience=10)

    reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='loss', 
                                                     factor=0.001,
                                                     patience=5, 
                                                     min_lr=1e-10)

    myRegressorModel.compile(optimizer=tf.keras.optimizers.Adam(), 
                             loss=tf.keras.losses.MeanSquaredError(), 
                             metrics=['accuracy'])

    
    data.dropna(inplace=True)
    features = data[features]
    labels = data[label]
    print('fitting regressor------')
    history = myRegressorModel.fit(features, labels, epochs=100, callbacks=[early_stopping])
    
    return myRegressorModel

def createCategoricalRegressor(data, features, label):
    class myCategoricalRegressor(tf.keras.Model):
        def __init__(self):
            super().__init__()        
            self.dense1 = Dense(128, activation='relu')  
            self.dense2 = Dense(64, activation='relu')         
            self.dense3 = Dense(len(data[label[0]].unique()), activation='softmax')
            self.dropout1 = Dropout(0.4)

        def call(self, inputs, training=False):
            x = self.dense1(inputs)
            if(training==False):
                x = self.dropout1(x, training=training)
            #x = self.dense2(x)
            return self.dense3(x)


    myCategoricalRegressorModel = myCategoricalRegressor()

    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='accuracy', patience=10)

    reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='loss', 
                                                     factor=0.001,
                                                     patience=5, 
                                                     min_lr=1e-10)
    
    myCategoricalRegressorModel.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3), 
                                        loss=tf.keras.losses.SparseCategoricalCrossentropy(), 
                                        metrics=['accuracy'])

    
    data.dropna(inplace=True)
    features = data[features].to_numpy()
    label = data[label].to_numpy()
    print('fitting categorical regressor------')
    history = myCategoricalRegressorModel.fit(features, label, epochs= 100, callbacks=[early_stopping])
    return myCategoricalRegressorModel

In [29]:
features=['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck','Destination', 'HomePlanet', 'Cabin','CryoSleep', 'VIP']
label_column = ['Transported']
categorical_columns = ['Destination', 'HomePlanet', 'Cabin']
boolean_columns = ['CryoSleep', 'VIP']
drop_columns=['Name' ,'Hash']
numerical_columns=['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
amenityColumns=['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

def pipeline(df, test=False):
    df = processCabin(df)
    df = process_normalize_columns(df, numerical_columns)
    regressor = createRegressor(df, ['FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'], ['RoomService'])
    df = fillAmenityAmount(df, amenityColumns, regressor)
    df = process_categorical(df, categorical_columns)
    regressor = createCategoricalRegressor(df, ['Cabin'], ['HomePlanet'])
    df = fillHomePlanet(df, regressor)

    df.fillna(method='ffill', inplace=True)

    
    df = proces_boolean(df, boolean_columns)
    if(not test):
        df = proces_boolean(df, label_column)
    for col in numerical_columns:
        df[col] = pd.to_numeric(df[col])   

    df = process_drop_columns(df, drop_columns) 
    
    return df

### Display dataset

In [20]:
import warnings
warnings.filterwarnings("ignore")
BASE_DIR = '/kaggle/input/spaceship-titanic'
train_df = pd.read_csv(os.path.join(BASE_DIR, 'train.csv'), index_col=0, na_values="?")
test_df = pd.read_csv(os.path.join(BASE_DIR, 'test.csv'), index_col=0)
display(train_df)

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


### Preprocess Data

In [30]:
import warnings
warnings.filterwarnings("ignore")

train_df = pd.read_csv(os.path.join(BASE_DIR, 'train.csv'), index_col=0)
test_df = pd.read_csv(os.path.join(BASE_DIR, 'test.csv'), index_col=0)

train_df = pipeline(train_df)
test_df = pipeline(test_df, True)

fitting regressor------
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
fitting categorical regressor------
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
fitting regressor------
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
fitting categorical regressor------
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100


In [None]:
class myModel(tf.keras.Model):
    def __init__(self):
        super().__init__()        
        self.dense1 = Dense(32, activation='relu')  
        self.dense2 = Dense(16, activation='relu')         
        self.dense3 = Dense(1, activation='sigmoid')
        self.dropout1 = Dropout(0.4)
        
    def call(self, inputs, training=False):
        x = self.dense1(inputs)
        if(training==False):
            x = self.dropout1(x, training=training)
        x = self.dense2(x)        
        return self.dense3(x)
    
model = myModel()

early_stopping = tf.keras.callbacks.EarlyStopping(monitor='accuracy', patience=50)

model.compile(optimizer=tf.keras.optimizers.Adam(1e-3), loss=tf.keras.losses.BinaryCrossentropy(), metrics=['accuracy'])
display(train_df.info())
features = train_df[train_df.columns[:-1]].to_numpy()
labels = train_df['Transported'].to_numpy()

history = model.fit(features, labels, epochs= 1000, callbacks=[early_stopping])
print(history)

<class 'pandas.core.frame.DataFrame'>
Index: 6606 entries, 0001_01 to 9280_02
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    6606 non-null   int64  
 1   CryoSleep     6606 non-null   int64  
 2   Cabin         6606 non-null   int64  
 3   Destination   6606 non-null   int64  
 4   Age           6606 non-null   float64
 5   VIP           6606 non-null   int64  
 6   RoomService   6606 non-null   float64
 7   FoodCourt     6606 non-null   float64
 8   ShoppingMall  6606 non-null   float64
 9   Spa           6606 non-null   float64
 10  VRDeck        6606 non-null   float64
 11  Transported   6606 non-null   int64  
dtypes: float64(6), int64(6)
memory usage: 670.9+ KB


None

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

In [None]:
# pd.crosstab(train_df['Cabin'], train_df['Transported']).plot.bar()
df = processCabin(train_df)
pd.crosstab(df['HomePlanet'], df['Cabin']).plot.bar()
pd.crosstab(df['Cabin'], df['HomePlanet']).plot.bar()

In [None]:
predictions = model.predict(test_df.to_numpy())
predictions = np.round(predictions)
predictions = [True if pred == 1 else False for pred in predictions]

In [None]:
passenger_id = np.array(test_df.index, ndmin=2)
preds = np.array(predictions, ndmin=2)
preds = np.concatenate((passenger_id.T, preds.T), axis=1)
df = pd.DataFrame(preds)
display(df)

df.to_csv(path_or_buf="/kaggle/working/predictions.csv", header=['PassengerId', 'Transported'], index=False)

In [None]:
os.remove("/kaggle/working/predictions.csv")

In [None]:
import keras
from matplotlib import pyplot as plt

plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
def fillHomePlanet(df):
    
    df1=df[df['HomePlanet'].isna()][['Cabin', 'HomePlanet']]
    for index, row in df1.iterrows():   
        
        features = np.array(df1.loc[index, 'Cabin']) 
        features = features[..., np.newaxis]
        
        prediction = myEstimatorModel2.predict(features)
        print(features , np.argmax(prediction[0]))
        
        df.loc[index, 'HomePlanet'] = np.argmax(prediction[0])   
    
    return df

df = process_categorical(train_df, ['Cabin'])
df = fillHomePlanet(train_df)
display(df)

In [None]:
df1 = pd.DataFrame({'key1': [1,2,3,4], 'key2':[5,6,7,8]})
df2 = pd.DataFrame({'key1': ['a','b','c', 'd'], 'key2':['e','f', 'g', 'h']})

print([df1, df2])


In [57]:
def pandas_to_numpy(data):
    '''Convert a pandas DataFrame into a Numpy array'''
  # Drop empty rows.
    data = data.dropna(how="any", axis=0)
  # Separate DataFrame into two Numpy arrays.
    labels = np.array(data['Transported'])
    features = data.drop('Transported', axis=1)
    features = {name:np.array(value) for name, value in features.items()}
  
    return features, labels

#@title Visualize Binary Confusion Matrix and Compute Evaluation Metrics Per Subgroup
CATEGORY  =  "HomePlanet" #@param {type:"string"}
SUBGROUP =  1 #@param {type:"int"}

# Labels for annotating axes in plot.
classes = ['Transported', 'Not Transported']

# Given define subgroup, generate predictions and obtain its corresponding 
# ground truth.
subgroup_filter  = train_df.loc[train_df[CATEGORY] == SUBGROUP]
features, labels = pandas_to_numpy(subgroup_filter)

subgroup_results = model.evaluate(x=features, y=labels, verbose=0)
display(subgroup_filter)
confusion_matrix = np.array([[subgroup_results[1], subgroup_results[4]], 
                             [subgroup_results[2], subgroup_results[3]]])

subgroup_performance_metrics = {
    'ACCURACY': subgroup_results[5],
    'PRECISION': subgroup_results[6], 
    'RECALL': subgroup_results[7],
    'AUC': subgroup_results[8]
}
performance_df = pd.DataFrame(subgroup_performance_metrics, index=[SUBGROUP])
pd.options.display.float_format = '{:,.4f}'.format

plot_confusion_matrix(confusion_matrix, classes, SUBGROUP);
performance_df

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type dict).