# Import

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from tensorflow.keras.models import load_model

# Define base directory

In [2]:
os.chdir("../")

# File List

In [3]:
print("Data List")
print(os.listdir("data"))

Data List
['description', 'sample_submission.csv', 'test.csv', 'train.csv']


# Load dataset

In [23]:
test = pd.read_csv("data/test.csv")
print(f"test shape:{test.shape}")

test shape:(4277, 13)


In [25]:
submission = pd.read_csv("data/sample_submission.csv")
submission

Unnamed: 0,PassengerId,Transported
0,0013_01,False
1,0018_01,False
2,0019_01,False
3,0021_01,False
4,0023_01,False
...,...,...
4272,9266_02,False
4273,9269_01,False
4274,9271_01,False
4275,9273_01,False


# Load Model dataset

In [5]:
with open("./model/encoders.pkl", "rb") as f:
    encoders = pickle.load(f)
dl_model = load_model("./model/model.h5")

## Define data types

In [6]:
old_dtypes = test.dtypes

dtype_dict = {
    "PassengerId": "object",
    "HomePlanet": "category",
    "CryoSleep": "boolean",
    "Cabin": "category",
    "Destination": "category",
    "Age":"float",
    "VIP": "boolean",
    "RoomService": "float",
    "FoodCourt": "float",
    "ShoppingMall": "float",
    "Spa": "float",
    "VRDeck": "float",
    "Name": "category",    
}
test = test.astype(dtype_dict)
new_dtypes = test.dtypes
print("===============Changed=================")
for _index, _old, _new in zip(old_dtypes.index, old_dtypes, new_dtypes):
    print(f"column:<{_index}>  {_old}    -->    {_new}")

column:<PassengerId>  object    -->    object
column:<HomePlanet>  object    -->    category
column:<CryoSleep>  object    -->    boolean
column:<Cabin>  object    -->    category
column:<Destination>  object    -->    category
column:<Age>  float64    -->    float64
column:<VIP>  object    -->    boolean
column:<RoomService>  float64    -->    float64
column:<FoodCourt>  float64    -->    float64
column:<ShoppingMall>  float64    -->    float64
column:<Spa>  float64    -->    float64
column:<VRDeck>  float64    -->    float64
column:<Name>  object    -->    category


## Drop unused features

In [7]:
test = test.drop(["Name", "PassengerId"], axis=1)

## Split merged features

In [8]:
sub_df = test["Cabin"].str.split("\/", expand=True)
sub_df.columns = [
    "Cabin_A",
    "Cabin_B",
    "Cabin_C"
]
sub_df = sub_df.astype({
    "Cabin_A": "category",
    "Cabin_B": "float",
    "Cabin_C": "category"
})
test = pd.concat([test, sub_df], axis=1)
test = test.drop("Cabin", axis=1)

## Null padding

In [9]:
target_features = set(test.isnull().sum()[test.isnull().sum() > 0].index)
target_features &= set(test.dtypes[(test.dtypes == "float") | (test.dtypes == "boolean")].index)
target_features

{'Age',
 'Cabin_B',
 'CryoSleep',
 'FoodCourt',
 'RoomService',
 'ShoppingMall',
 'Spa',
 'VIP',
 'VRDeck'}

In [10]:
for column in target_features:
    null_colmun_name = column + "_NULL"
    test[null_colmun_name] = test[column].isna() * 1.0
    test[column] = test[column].fillna(0.0)
    # for boolean
    test[column] = test[column] * 1.0

## One-hot-encoding

In [11]:
# Only categorical features
for column in test.columns:
    if test[column].dtype != "category":
        continue
    arr_data = np.array(test[column].values).reshape(-1, 1)
    encoder = encoders[column]
    sub_df = pd.DataFrame(
        encoder.transform(arr_data).toarray(),
        columns = [f"{column}_{_category}" for _category in encoder.categories_[0]]
    )
    test = test.drop(column, axis=1)
    test = pd.concat([test, sub_df], axis=1)

## Normalization

In [12]:
def norm(srs):
    return (srs - srs.min()) / (srs.max() - srs.min())

for _column in test.columns:
    test[_column] = norm(test[_column])

In [13]:
test

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Cabin_B,Spa_NULL,...,Cabin_A_C,Cabin_A_D,Cabin_A_E,Cabin_A_F,Cabin_A_G,Cabin_A_T,Cabin_A_nan,Cabin_C_P,Cabin_C_S,Cabin_C_nan
0,1.0,0.341772,0.0,0.000000,0.000000,0.00000,0.000000,0.000000,0.001587,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.240506,0.0,0.000000,0.000356,0.00000,0.142260,0.000000,0.002116,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1.0,0.392405,0.0,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.481013,0.0,0.000000,0.263206,0.00000,0.009121,0.026266,0.000529,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.253165,0.0,0.000865,0.000000,0.07658,0.000000,0.000000,0.002646,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,1.0,0.430380,0.0,0.000000,0.000000,0.00000,0.000000,0.000000,0.791534,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4273,0.0,0.531646,0.0,0.000000,0.033514,0.00205,0.000504,0.006466,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4274,1.0,0.000000,0.0,0.000000,0.000000,0.00000,0.000000,0.000000,0.156614,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4275,0.0,0.000000,0.0,0.000000,0.106042,0.00000,0.000000,0.023482,0.157143,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [21]:
pred = dl_model.predict(test.astype("float"))
pred = pred > 0.5
pred = pred.flatten().tolist()



In [26]:
submission["Transported"] = pred
submission

Unnamed: 0,PassengerId,Transported
0,0013_01,False
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,False
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True


In [35]:
os.makedirs("./result", exist_ok=True)
submission.to_csv("./result/submission.csv", index=False)