# Import

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# Define base directory

In [2]:
# Make the parent directory the working directory so that the relative
# "data/..." and "./model/..." paths used by later cells resolve against it.
# NOTE(review): the path is relative to the *current* directory, so re-running
# this cell without restarting the kernel moves up one more level each time.
os.chdir("../")

# File List

In [3]:
# Show the entries of the data directory under a short heading.
print("Data List", os.listdir("data"), sep="\n")

Data List
['description', 'sample_submission.csv', 'test.csv', 'train.csv']


# Load dataset

In [4]:
# Read the three CSV files in one pass; paths are relative to the current
# working directory.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/train.csv",
        "data/test.csv",
        "data/sample_submission.csv",
    )
)

# Report (rows, columns) of every frame as a quick sanity check.
print(f"train shape:{train.shape}")
print(f"test shape:{test.shape}")
print(f"sample_submission shape:{sample_submission.shape}")

train shape:(8693, 14)
test shape:(4277, 13)
sample_submission shape:(4277, 2)


# Split into features and target

In [5]:
# Keep the label column as a separate float-valued Series (multiplying by 1.0
# makes it numeric), then leave only the feature columns in `train`.
train_ans = train["Transported"].mul(1.0)
train = train.drop(columns=["Transported"])

# Preprocess

## Define data types

In [6]:
# Remember the dtypes pandas inferred on load so the report below can show
# what the explicit cast changed.
old_dtypes = train.dtypes

# Explicit dtype for every feature column: string-like columns become
# categories, flag columns use the nullable "boolean" dtype, and numeric
# columns are floats.
dtype_dict = {
    "PassengerId": "object",
    "HomePlanet": "category",
    "CryoSleep": "boolean",
    "Cabin": "category",
    "Destination": "category",
    "Age": "float",
    "VIP": "boolean",
    "RoomService": "float",
    "FoodCourt": "float",
    "ShoppingMall": "float",
    "Spa": "float",
    "VRDeck": "float",
    "Name": "category",
}
train = train.astype(dtype_dict)
new_dtypes = train.dtypes

# One line per column: old dtype --> new dtype.
print("===============Changed=================")
for (_index, _old), _new in zip(old_dtypes.items(), new_dtypes):
    print(f"column:<{_index}>  {_old}    -->    {_new}")

column:<PassengerId>  object    -->    object
column:<HomePlanet>  object    -->    category
column:<CryoSleep>  object    -->    boolean
column:<Cabin>  object    -->    category
column:<Destination>  object    -->    category
column:<Age>  float64    -->    float64
column:<VIP>  object    -->    boolean
column:<RoomService>  float64    -->    float64
column:<FoodCourt>  float64    -->    float64
column:<ShoppingMall>  float64    -->    float64
column:<Spa>  float64    -->    float64
column:<VRDeck>  float64    -->    float64
column:<Name>  object    -->    category


## Drop unused features

In [7]:
# These two columns are not used as model inputs.
train = train.drop(columns=["Name", "PassengerId"])

## Split merged features

In [8]:
# Cabin holds three "/"-separated parts; expand them into separate columns.
# A plain "/" is used as the separator: the previous "\/" is an invalid escape
# sequence in a normal string literal and only worked by accident, while a
# single-character pattern is split on literally, giving the same result.
sub_df = train["Cabin"].str.split("/", expand=True)
sub_df.columns = [
    "Cabin_A",
    "Cabin_B",
    "Cabin_C"
]
# The first and last parts are treated as categories, the middle one as a number.
sub_df = sub_df.astype({
    "Cabin_A": "category",
    "Cabin_B": "float",
    "Cabin_C": "category"
})
# Append the new columns and drop the merged original.
train = pd.concat([train, sub_df], axis=1)
train = train.drop("Cabin", axis=1)

## Null imputation (zero-fill plus `_NULL` indicator columns)

In [9]:
# Columns that contain at least one missing value ...
_null_counts = train.isnull().sum()
_with_nulls = set(_null_counts[_null_counts > 0].index)

# ... restricted to the float and nullable-boolean columns (categorical
# columns are not part of this selection).
_dtypes = train.dtypes
_float_or_bool = set(_dtypes[(_dtypes == "float") | (_dtypes == "boolean")].index)

target_features = _with_nulls & _float_or_bool
target_features

{'Age',
 'Cabin_B',
 'CryoSleep',
 'FoodCourt',
 'RoomService',
 'ShoppingMall',
 'Spa',
 'VIP',
 'VRDeck'}

In [10]:
# Iterate in sorted order: `target_features` is a set, and the iteration order
# of a set of strings can differ between interpreter runs, which would change
# the position of the appended *_NULL columns (and so the model's input layout).
for column in sorted(target_features):
    null_column_name = column + "_NULL"
    # Indicator column: 1.0 where the original value was missing, else 0.0.
    train[null_column_name] = train[column].isna() * 1.0
    # Convert to a float-valued column first (this is what turns the boolean
    # columns numeric), then zero-fill. Filling after the conversion avoids
    # pushing the float 0.0 into a nullable-boolean column.
    train[column] = (train[column] * 1.0).fillna(0.0)

## One-hot-encoding

In [11]:
# One-hot encode every categorical column, replacing it with one indicator
# column per category ("<column>_<category>"), and keep the fitted encoder of
# each column so the same mapping can be reused later.
encoders = {}
for column in train.columns:
    # Only categorical features
    if train[column].dtype != "category":
        continue
    # OneHotEncoder expects a 2-D array of shape (n_samples, 1).
    arr_data = np.array(train[column].values).reshape(-1, 1)
    encoder = OneHotEncoder().fit(arr_data)
    sub_df = pd.DataFrame(
        encoder.transform(arr_data).toarray(),
        columns=[f"{column}_{_category}" for _category in encoder.categories_[0]],
        # Reuse train's index so the concat below lines rows up one-to-one
        # even if train does not carry a default 0..n-1 index.
        index=train.index,
    )
    train = train.drop(column, axis=1)
    train = pd.concat([train, sub_df], axis=1)
    encoders[column] = encoder

# Make sure the output directory exists before writing the pickle.
os.makedirs("./model", exist_ok=True)
with open("./model/encoders.pkl", "wb") as f:
    pickle.dump(encoders, f)

## Normalization

In [12]:
def norm(srs):
    """Min-max scale a Series into the [0, 1] range.

    Parameters
    ----------
    srs : pandas.Series
        Numeric values to rescale.

    Returns
    -------
    pandas.Series
        ``(srs - min) / (max - min)``. A constant series (``max == min``) has
        no spread to divide by, so it is mapped to zeros instead of the NaNs
        that the 0/0 division would produce.
    """
    lowest = srs.min()
    spread = srs.max() - lowest
    # `pd.notna` guards the comparison for empty / all-missing input, where
    # the spread itself is missing and the plain formula is kept.
    if pd.notna(spread) and spread == 0:
        return srs - lowest
    return (srs - lowest) / spread

# Rescale every column of `train` with `norm`, one column at a time.
for _column in train.columns.tolist():
    _scaled = norm(train[_column])
    train[_column] = _scaled

## Stack Layers

In [13]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, BatchNormalization, Input, Dropout
from tensorflow.keras.models import Model

In [14]:
# Fully connected binary classifier: three hidden blocks followed by a single
# sigmoid output unit.
# `shape` must be a tuple; `(n)` without the trailing comma is just an int.
inputs = Input(shape=(train.shape[1],))

# Each hidden block is Dense(relu) -> BatchNormalization -> Dropout(0.2).
layer = inputs
for units in (512, 512, 256):
    layer = Dense(units, activation="relu")(layer)
    layer = BatchNormalization()(layer)
    layer = Dropout(0.2)(layer)

layer = Dense(1, activation="sigmoid")(layer)
dl_model = Model(inputs=inputs, outputs=layer)
dl_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 38)]              0         
                                                                 
 dense (Dense)               (None, 512)               19968     
                                                                 
 batch_normalization (BatchN  (None, 512)              2048      
 ormalization)                                                   
                                                                 
 dropout (Dropout)           (None, 512)               0         
                                                                 
 dense_1 (Dense)             (None, 512)               262656    
                                                                 
 batch_normalization_1 (Batc  (None, 512)              2048      
 hNormalization)                                             

## Compile model

In [15]:
# Binary classification set-up: Adam optimizer, binary cross-entropy loss, and
# accuracy as the reported metric.
# Only the arguments that differ from the defaults are passed. The legacy
# `sample_weight_mode` / `target_tensors` keywords were dropped because they
# are not part of the current Keras `Model.compile` signature, and passing
# them as None had no effect anyway.
dl_model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

## Define callbacks

In [16]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
callbacks = [
    # Stop training once the validation loss has not improved for 3 epochs.
    EarlyStopping(
        monitor="val_loss",
        patience=3
    ),
    # Write the model to disk, keeping only the best one seen so far;
    # `verbose=1` logs each time a checkpoint is saved.
    ModelCheckpoint(
        filepath="./model/model.h5",
        verbose=1,
        save_best_only=True
    )
]

# Train

In [17]:
# Train on the preprocessed features. The last expression is the History
# object returned by `fit`, which the notebook displays.
dl_model.fit(
    x=train.astype("float").values,
    y=train_ans.astype("float").values,
    # Schedule
    epochs=100,
    initial_epoch=0,
    batch_size=128,
    steps_per_epoch=None,
    shuffle=True,
    # Validation: a 20% share of the data is held out by Keras
    validation_split=0.2,
    validation_steps=None,
    # No per-class / per-sample weighting
    class_weight=None,
    sample_weight=None,
    # Logging and callbacks (early stopping + checkpointing)
    verbose=1,
    callbacks=callbacks,
)

Epoch 1/100
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/10

<keras.callbacks.History at 0x253940be388>