# Import

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import tensorflow as tf
from tensorflow.keras.layers import Dense, BatchNormalization, Input, Dropout
from tensorflow.keras.models import Model, load_model
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import KFold, train_test_split


# Define base directory

In [2]:
os.chdir("../")

# File List

In [3]:
print("Data List")
print(os.listdir("data"))

Data List
['description', 'sample_submission.csv', 'test.csv', 'train.csv']


# Load dataset

In [4]:
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
sample_submission = pd.read_csv("data/sample_submission.csv")

print(f"train shape:{train.shape}")
print(f"test shape:{test.shape}")
print(f"sample_submission shape:{sample_submission.shape}")

train shape:(8693, 14)
test shape:(4277, 13)
sample_submission shape:(4277, 2)


# Split with features and answer

In [5]:
train_ans = train["Transported"] * 1.0
train = train.drop("Transported", axis=1)

# => Preprocess

## Define data types

In [6]:
old_dtypes = train.dtypes

dtype_dict = {
    "PassengerId": "object",
    "HomePlanet": "category",
    "CryoSleep": "boolean",
    "Cabin": "category",
    "Destination": "category",
    "Age":"float",
    "VIP": "boolean",
    "RoomService": "float",
    "FoodCourt": "float",
    "ShoppingMall": "float",
    "Spa": "float",
    "VRDeck": "float",
    "Name": "category",    
}
train = train.astype(dtype_dict)
new_dtypes = train.dtypes
print("===============Changed=================")
for _index, _old, _new in zip(old_dtypes.index, old_dtypes, new_dtypes):
    print(f"column:<{_index}>  {_old}    -->    {_new}")

column:<PassengerId>  object    -->    object
column:<HomePlanet>  object    -->    category
column:<CryoSleep>  object    -->    boolean
column:<Cabin>  object    -->    category
column:<Destination>  object    -->    category
column:<Age>  float64    -->    float64
column:<VIP>  object    -->    boolean
column:<RoomService>  float64    -->    float64
column:<FoodCourt>  float64    -->    float64
column:<ShoppingMall>  float64    -->    float64
column:<Spa>  float64    -->    float64
column:<VRDeck>  float64    -->    float64
column:<Name>  object    -->    category


## Drop unused features

In [7]:
train = train.drop(["Name", "PassengerId"], axis=1)

## Split merged features

In [8]:
sub_df = train["Cabin"].str.split("\/", expand=True)
sub_df.columns = [
    "Cabin_A",
    "Cabin_B",
    "Cabin_C"
]
sub_df = sub_df.astype({
    "Cabin_A": "category",
    "Cabin_B": "float",
    "Cabin_C": "category"
})
train = pd.concat([train, sub_df], axis=1)
train = train.drop("Cabin", axis=1)

## Null padding

In [9]:
target_features = set(train.isnull().sum()[train.isnull().sum() > 0].index)
target_features &= set(train.dtypes[(train.dtypes == "float") | (train.dtypes == "boolean")].index)
target_features

{'Age',
 'Cabin_B',
 'CryoSleep',
 'FoodCourt',
 'RoomService',
 'ShoppingMall',
 'Spa',
 'VIP',
 'VRDeck'}

In [10]:
for column in target_features:
    null_colmun_name = column + "_NULL"
    train[null_colmun_name] = train[column].isna() * 1.0
    train[column] = train[column].fillna(0.0)
    # for boolean
    train[column] = train[column] * 1.0

## One-hot-encoding

In [11]:
# Only categorical features
encoders = {}
for column in train.columns:
    if train[column].dtype != "category":
        continue
    arr_data = np.array(train[column].values).reshape(-1, 1)
    encoder = OneHotEncoder().fit(arr_data)
    sub_df = pd.DataFrame(
        encoder.transform(arr_data).toarray(),
        columns = [f"{column}_{_category}" for _category in encoder.categories_[0]]
    )
    train = train.drop(column, axis=1)
    train = pd.concat([train, sub_df], axis=1)
    encoders[column] = encoder
with open("./model/encoders.pkl", "wb") as f:
    pickle.dump(encoders, f)

## Normalization

In [12]:
def norm(srs):
    return (srs - srs.min()) / (srs.max() - srs.min())

for _column in train.columns:
    train[_column] = norm(train[_column])

## Function process

In [13]:
# class CustomModel(keras.Model):
#     def __init__(self, hidden_units):
#         super(CustomModel, self).__init__()
#         self.dense_layers = [keras.layers.Dense(u) for u in hidden_units]

#     def call(self, inputs):
#         x = inputs
#         for layer in self.dense_layers:
#             x = layer(x)
#         return x


In [77]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import tensorflow.keras.backend as K

def F1ScoreCustom(y_true, y_pred):
    y_pred = tf.cast((y_pred > 0.5), tf.float64) * 1.0
    return tf.constant(f1_score(y_true.numpy(), y_pred.numpy()), dtype=tf.float64)

class F1Score(tf.keras.metrics.Metric):
    def __init__(self, name='F1Score', **kwargs):
        super(F1Score, self).__init__(name=name, **kwargs)
        self.true_positives = self.add_weight(name='tp', initializer='zeros')
        self.true_negatives = self.add_weight(name='tn', initializer='zeros')
        self.false_positives = self.add_weight(name='fp', initializer='zeros')
        self.false_negatives = self.add_weight(name='fn', initializer='zeros')

    def update_state(self, y_true, y_pred, sample_weight=None):
        y_true = tf.cast(y_true, tf.bool)
        y_pred = tf.cast(y_pred > 0.5, tf.bool)
#         print(tf.reduce_sum(tf.cast(y_pred, self.dtype)))
        tp = tf.logical_and(tf.equal(y_true, True), tf.equal(y_pred, True))
        tp = tf.reduce_sum(tf.cast(tp, self.dtype))
        
        tn = tf.logical_and(tf.equal(y_true, True), tf.equal(y_pred, False))
        tn = tf.reduce_sum(tf.cast(tn, self.dtype))

        fp = tf.logical_and(tf.equal(y_true, False), tf.equal(y_pred, True))
        fp = tf.reduce_sum(tf.cast(fp, self.dtype))
        
        fn = tf.logical_and(tf.equal(y_true, False), tf.equal(y_pred, False))
        fn = tf.reduce_sum(tf.cast(fn, self.dtype))
        
#         if sample_weight is not None:
#             sample_weight = tf.cast(sample_weight, self.dtype)
#             sample_weight = tf.broadcast_to(sample_weight, values.shape)
#             values = tf.multiply(values, sample_weight)
        self.true_positives.assign_add(tf.cast(tp, self.dtype))
        self.true_negatives.assign_add(tf.cast(tn, self.dtype))
        self.false_positives.assign_add(tf.cast(fp, self.dtype))
        self.false_negatives.assign_add(tf.cast(fn, self.dtype))
    
    def result(self):
        self.recall = self.true_positives / (self.true_positives + self.true_negatives)
        self.precision = self.true_positives / (self.true_positives + self.false_positives)
        return tf.cast(2 * (self.recall * self.precision) / (self.recall + self.precision), self.dtype)
    
class F1Loss(tf.keras.losses.Loss):
    def __init__(self, **kwargs):
        super(F1Loss, self).__init__(**kwargs)
    def __call__(self, y_true, y_pred, sample_weight=None):
        y_pred_boolean = K.cast(y_pred > 0.5, K.floatx())
        tp = K.sum(K.cast(K.equal(y_true, 1) & K.equal(y_pred_boolean, 1), K.floatx()))
        tn = K.sum(K.cast(K.equal(y_true, 1) & K.equal(y_pred_boolean, 0), K.floatx()))
        fp = K.sum(K.cast(K.equal(y_true, 0) & K.equal(y_pred_boolean, 1), K.floatx()))
        fn = K.sum(K.cast(K.equal(y_true, 0) & K.equal(y_pred_boolean, 0), K.floatx()))
#         tp = tf.logical_and(tf.equal(y_true, True), tf.equal(y_pred, True))
#         tp = tf.reduce_sum(tf.cast(tp, tf.float64))
        
#         tn = tf.logical_and(tf.equal(y_true, True), tf.equal(y_pred, False))
#         tn = tf.reduce_sum(tf.cast(tn, tf.float64))

#         fp = tf.logical_and(tf.equal(y_true, False), tf.equal(y_pred, True))
#         fp = tf.reduce_sum(tf.cast(fp, tf.float64))
        
#         fn = tf.logical_and(tf.equal(y_true, False), tf.equal(y_pred, False))
#         fn = tf.reduce_sum(tf.cast(fn, tf.float64))
        
        recall = tp / (tp + tn)
        precision = tp / (tp + fp)
        tensor = K.cast_to_floatx(2 * (recall * precision) / (recall + precision))
#         return tensor
        tensor2 = K.mean(K.square(K.cast_to_floatx(y_true) - K.cast_to_floatx(y_pred)))
        print(tensor, tensor2)
        return tensor2

class MSELoss(tf.keras.losses.Loss):
    def __init__(self, **kwargs):
        super(MSELoss, self).__init__(**kwargs)
    def __call__(self, y_true, y_pred, sample_weight=None):
        return K.mean(K.square(K.cast_to_floatx(y_true) - K.cast_to_floatx(y_pred)))

In [78]:
def process(
    model_name, 
    train_feature,
    train_target, 
    val_valfeature, 
    val_target
):
    # Stack layers
    inputs = Input(shape=(train_feature.shape[1]))
    layer = Dense(256, activation="relu")(inputs)
    layer = BatchNormalization()(layer)
    layer = Dropout(0.3)(layer)
    layer = Dense(256, activation="relu")(layer)
    layer = BatchNormalization()(layer)
    layer = Dropout(0.3)(layer)
    layer = Dense(128, activation="relu")(layer)
    layer = BatchNormalization()(layer)
    layer = Dropout(0.3)(layer)
    layer = Dense(1, activation="sigmoid")(layer)
    dl_model = Model(inputs=inputs, outputs=layer)
    
    # Compile
    dl_model.compile(
            optimizer="adam",
            loss=F1Loss(),
            metrics=[
                F1Score(), 
#                 tf.keras.metrics.BinaryAccuracy(threshold=0.5), 
#                 tf.keras.metrics.CategoricalAccuracy()
            ],
            loss_weights=None,
            sample_weight_mode=None,
            weighted_metrics=None,
            target_tensors=None,
            run_eagerly=True
    )
    
    # Define callbacks
    callbacks = [
            EarlyStopping(
                monitor="val_loss",
                patience=3
            ),
            ModelCheckpoint(
                filepath=f"./model/model_{model_name}.h5",
                vervose=1,
                save_best_only=True,
                save_weight_only=True
            )
        ]
    # Training
    dl_model.fit(
        train_feature.astype("float").values,
        train_target.astype("float").values,
        batch_size=256,
        epochs=30,
        verbose=1,
        callbacks=callbacks,
        validation_data=(val_valfeature.astype("float64").values, val_target.astype("float64").values),
        shuffle=True,
        class_weight=None,
        sample_weight=None,
        initial_epoch=0,
        steps_per_epoch=None,
        validation_steps=None
    )
    dl_model.load_weights(f"./model/model_{model_name}.h5")
    acc = dl_model.evaluate(val_valfeature.astype("float64").values, val_target.astype("float64").values)
    
    return acc[1], f1_score(dl_model.predict(val_valfeature.astype("float64").values) > 0.5, val_target.astype("float64").values)

In [79]:
kf = KFold(n_splits=10, random_state=91, shuffle=True)
results = {}
for index, (train_index, test_index) in enumerate(kf.split(train)):

    train_feature = train.loc[train_index].reset_index(drop=True)
    train_target = train_ans.loc[train_index].reset_index(drop=True)
    
    val_feature = train.loc[test_index].reset_index(drop=True)
    val_target = train_ans.loc[test_index].reset_index(drop=True)
    
    print("feature shape:", train_feature.shape[1])
    acc, f1 = process(
        index,
        train_feature,
        train_target, 
        val_feature, 
        val_target
    )
    results[index] = (acc, f1)

feature shape: 38
Epoch 1/30
tf.Tensor(0.55172414, shape=(), dtype=float32) tf.Tensor(0.29800245, shape=(), dtype=float32)
 1/31 [..............................] - ETA: 3s - loss: 0.2980 - F1Score: 0.5517tf.Tensor(0.5525292, shape=(), dtype=float32) tf.Tensor(0.31337154, shape=(), dtype=float32)
 2/31 [>.............................] - ETA: 2s - loss: 0.3057 - F1Score: 0.5521tf.Tensor(0.6268657, shape=(), dtype=float32) tf.Tensor(0.26824185, shape=(), dtype=float32)
 3/31 [=>............................] - ETA: 2s - loss: 0.2932 - F1Score: 0.5776tf.Tensor(0.64197534, shape=(), dtype=float32) tf.Tensor(0.2381374, shape=(), dtype=float32)
 4/31 [==>...........................] - ETA: 1s - loss: 0.2794 - F1Score: 0.5928tf.Tensor(0.65637064, shape=(), dtype=float32) tf.Tensor(0.22405893, shape=(), dtype=float32)
 5/31 [===>..........................] - ETA: 1s - loss: 0.2684 - F1Score: 0.6056tf.Tensor(0.7035573, shape=(), dtype=float32) tf.Tensor(0.22852345, shape=(), dtype=float32)
 6/31 


KeyboardInterrupt



In [None]:
sorted(results.items(), key=lambda x:x[1], reverse=True)

In [None]:
# from lightgbm import LGBMClassifier
# import lazypredict
# from lazypredict.Supervised import LazyClassifier

In [None]:
# clf = LazyClassifier(verbose=0,
#                      ignore_warnings=True,
#                      custom_metric=None,
#                      predictions=False,
#                      random_state=12,
#                      classifiers='all')

# models, predictions = clf.fit(X_train , X_test , y_train , y_test)
# clear_output()