In [None]:
import random
import warnings

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub
import matplotlib.pyplot as plt
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py
import tokenization

In [None]:
# Fix every random seed up front for reproducibility.
SEED = 1002
def seed_everything(seed):
    """Seed the global random generators of Python, NumPy and TensorFlow.

    Args:
        seed: Integer seed applied to all three generators.
    """
    random.seed(seed)  # Python's own generator was previously left unseeded
    np.random.seed(seed)
    tf.random.set_seed(seed)

seed_everything(SEED)

In [None]:
# Load the Titanic train and test CSV files (paths are relative to the notebook).
train = pd.read_csv("../input/titanic/train.csv")
test = pd.read_csv("../input/titanic/test.csv")
# Unused leftover: this path points at a different dataset (nlp-getting-started).
#submission = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Column dtypes and non-null counts, to spot the columns with missing values.
train.info()

In [None]:
def impute_and_discretize(frame):
    """Fill missing Age/Fare with the frame's own median and convert both to int.

    The results are assigned back to the columns instead of calling
    ``fillna(..., inplace=True)`` on a column selection. That chained in-place
    form emits a FutureWarning in pandas 2.x and, under copy-on-write, leaves
    the frame unchanged, after which ``map(int)`` fails on the remaining NaNs.

    Args:
        frame: DataFrame with numeric "Age" and "Fare" columns. Both columns
            are overwritten on this frame.

    Returns:
        The same DataFrame object, for convenient re-assignment.
    """
    # Age is truncated by int(); Fare goes through Python's round() first.
    frame["Age"] = frame["Age"].fillna(frame["Age"].median()).map(int)
    frame["Fare"] = frame["Fare"].fillna(frame["Fare"].median()).map(round).map(int)
    return frame

train = impute_and_discretize(train)
test = impute_and_discretize(test)

In [None]:
def _blank_out_missing(column):
    """Replace the missing values of one column with a single space."""
    return column.fillna(" ")

# Drop exact duplicate rows (keeping the first), then blank out whatever is still missing.
train = train.drop_duplicates(keep='first').apply(_blank_out_missing)
test = test.apply(_blank_out_missing)
train.info()

In [None]:
# Same dtype / non-null overview for the test frame.
test.info()

In [None]:
# Distribution overview of the features, one panel per column.
# Each entry is (column, Axes method name, extra keyword arguments).
panels = [
    ("Pclass", "hist", {"bins": 5}),
    ("Sex", "hist", {"bins": 5}),
    ("SibSp", "boxplot", {}),
    ("Parch", "boxplot", {}),
    ("Age", "boxplot", {}),
    ("Fare", "boxplot", {}),
    ("Embarked", "hist", {"bins": 10}),
]
fig, axs = plt.subplots(3, 3, sharey=False, tight_layout=True, squeeze=False, figsize=(15,15))
for ax, (column, plot_method, plot_kwargs) in zip(axs.flat, panels):
    getattr(ax, plot_method)(train[column], **plot_kwargs)
    ax.set_title(column)  # label every panel so the figure can be read on its own
# The 3x3 grid has more cells than panels; hide the axes that stay empty.
for ax in axs.flat[len(panels):]:
    ax.set_visible(False)

In [None]:
def take_first_element(text):
    """Return the first element of ``text`` (the first character of a string).

    An empty input is returned unchanged instead of raising IndexError.
    """
    return text[0] if len(text) else text
# Try the helper on a single ticket (row label 5).
sample_ticket = train["Ticket"][5]
print(sample_ticket)
take_first_element(sample_ticket)

In [None]:
def sibSp_number(x):
    """Bucket a SibSp value: "a" when it is at most 2, "b" otherwise."""
    return "a" if x <= 2 else "b"
def parch_number(x):
    """Bucket a Parch value: "a" when it is below 1, "b" otherwise."""
    return "a" if x < 1 else "b"

In [None]:
# Replace the numeric SibSp column by its "a"/"b" bucket in both frames.
# Re-running this cell fails: the bucket letters cannot be compared with a number.
for frame in (train, test):
    frame["SibSp"] = frame["SibSp"].apply(sibSp_number)

In [None]:
# Replace the numeric Parch column by its "a"/"b" bucket in both frames.
# Re-running this cell fails: the bucket letters cannot be compared with a number.
for frame in (train, test):
    frame["Parch"] = frame["Parch"].apply(parch_number)

In [None]:
# Histogram of the bucketed SibSp values in the training frame.
plt.hist(train["SibSp"], bins=5)

In [None]:
# Histogram of the bucketed Parch values in the training frame.
plt.hist(train["Parch"], bins=5)

In [None]:
# Reduce Ticket and Cabin to their first character in both frames.
for frame in (train, test):
    for column in ('Ticket', 'Cabin'):
        frame[column] = frame[column].apply(take_first_element)

In [None]:
# Inspect the Ticket column after it has been cut down to its first character.
train["Ticket"]

In [None]:
# Side-by-side histograms of the shortened Ticket and Cabin columns.
fig, axs = plt.subplots(1, 2, sharey=False, tight_layout=True, squeeze=False, figsize=(10, 4))
for ax, column in zip(axs[0], ("Ticket", "Cabin")):
    ax.hist(train[column], bins=5)
    ax.set_title(column)  # label each panel so the figure can be read on its own

In [None]:
def _compose_passenger_text(frame):
    """Join the feature columns of ``frame`` into one space-separated string per row.

    Sex, Embarked and Name are concatenated as they are; the remaining
    columns are converted with ``str`` first. The column order is fixed.
    """
    pieces = [frame["Sex"], frame["Embarked"], frame["Name"]]
    pieces += [
        frame[column].map(str)
        for column in ("Age", "Pclass", "SibSp", "Parch", "Ticket", "Fare", "Cabin")
    ]
    text = pieces[0]
    for piece in pieces[1:]:
        text = text + " " + piece
    return text

train["new"] = _compose_passenger_text(train)

test["new"] = _compose_passenger_text(test)

In [None]:
def remove_punct(text):
    """Return ``text`` with every ``string.punctuation`` character deleted."""
    # A translate table that maps a code point to None deletes that character.
    punctuation_to_none = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(punctuation_to_none)

In [None]:
import string

In [None]:
def remove_digit(text):
    """Return ``text`` without the characters for which ``str.isdigit`` is true."""
    return ''.join(filter(lambda ch: not ch.isdigit(), text))

# Quick check: strip the digits, then collapse the leftover whitespace.
" ".join(remove_digit("112zobos12 12a323ss").split())

In [None]:
# train['new_clean'] = train['new'].apply(lambda x: remove_punct(x))
# train['new_clean'] = train['new_clean'].apply(lambda x: x.lower())
# # train['new_clean'] = train['new_clean'].apply(lambda m: remove_digit(m))
# # train['new_clean'] = train['new_clean'].apply(lambda m: " ".join(m.split()))

# test['new_clean'] = test['new'].apply(lambda x: remove_punct(x))
# test['new_clean'] = test['new_clean'].apply(lambda x: x.lower())
# # test['new_clean'] = test['new_clean'].apply(lambda m: remove_digit(m))
# # test['new_clean'] = test['new_clean'].apply(lambda m: " ".join(m.split()))

In [None]:
# train["new_clean"][20]

In [None]:
def bert_encode(texts, tokenizer, max_len=512):
    """Convert raw texts into the three fixed-length integer inputs of the model.

    Each text is tokenized, cut to ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` ... ``[SEP]`` and right-padded with id 0 up to ``max_len``.

    Args:
        texts: Iterable of strings.
        tokenizer: Object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: Length of every encoded row, special tokens included.
            Must be at least 2 so that ``[CLS]`` and ``[SEP]`` fit.

    Returns:
        Tuple ``(token_ids, masks, segment_ids)`` of NumPy arrays with one row
        per text. Masks are 1 on real tokens and 0 on padding; segment ids
        are all 0.

    Raises:
        ValueError: If ``max_len`` is smaller than 2. Previously this
            silently produced rows longer than ``max_len``.
    """
    if max_len < 2:
        raise ValueError(
            "max_len must be at least 2 to hold [CLS] and [SEP], got %r" % (max_len,)
        )

    all_tokens = []
    all_masks = []
    all_segments = []

    for text in texts:
        # Leave room for the two special tokens.
        pieces = tokenizer.tokenize(text)[:max_len - 2]
        input_sequence = ["[CLS]"] + pieces + ["[SEP]"]
        pad_len = max_len - len(input_sequence)

        token_ids = list(tokenizer.convert_tokens_to_ids(input_sequence))
        all_tokens.append(token_ids + [0] * pad_len)
        all_masks.append([1] * len(input_sequence) + [0] * pad_len)
        all_segments.append([0] * max_len)

    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

In [None]:
def build_model(bert_layer, max_len=512):
    """Assemble and compile a binary classifier on top of ``bert_layer``.

    Args:
        bert_layer: Callable layer that takes the list
            ``[input_word_ids, input_mask, segment_ids]`` and returns two
            outputs; the second one is used as the per-token sequence output.
        max_len: Length of each of the three integer input sequences.

    Returns:
        A compiled Keras ``Model`` with a single sigmoid output unit.
    """
    def _int_input(name):
        # All three inputs share the same shape and dtype; only the name differs.
        return Input(shape=(max_len,), dtype=tf.int32, name=name)

    input_word_ids = _int_input("input_word_ids")
    input_mask = _int_input("input_mask")
    segment_ids = _int_input("segment_ids")
    bert_inputs = [input_word_ids, input_mask, segment_ids]

    # The first output of the layer is discarded. From the second one, only the
    # vector at position 0 (where bert_encode puts [CLS]) feeds the classifier.
    _, sequence_output = bert_layer(bert_inputs)
    cls_vector = sequence_output[:, 0, :]
    #cls_vector = tf.keras.layers.Dropout(rate=0.5)(cls_vector)
    out = Dense(1, activation='sigmoid')(cls_vector)

    model = Model(inputs=bert_inputs, outputs=out)

    # Adam with a learning rate of 3e-5, binary cross-entropy loss, accuracy metric.
    model.compile(Adam(learning_rate=3e-5), loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [None]:
# Shortest and longest combined text, measured in characters.
text_lengths = train["new"].apply(len)
print(min(text_lengths))
max(text_lengths)

In [None]:
# Load the uncased BERT encoder (L-24, H-1024, A-16, version 2) from TF Hub.
# trainable=True is passed so the encoder weights are updated during model.fit.
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/2"
bert_layer = hub.KerasLayer(module_url, trainable=True)

In [None]:
# BERT has a fixed number of position embeddings, so inputs must never be longer than this.
BERT_MAX_SEQ_LEN = 512
# Sequence length = character length of the longest training text, capped at the BERT limit.
# NOTE(review): this is a character count, not a token count; it presumably
# over-estimates the number of tokens for these texts. Confirm if memory matters.
max_len = min(max(train["new"].apply(len)), BERT_MAX_SEQ_LEN)
# Vocabulary file shipped with the pre-trained BERT module, needed by the tokenizer.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()

# Flag stored with the module telling the tokenizer whether to lower-case the text.
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()

# Create the tokenizer.
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

# Encode the training and test texts; the labels come from the Survived column.
train_input = bert_encode(train.new.values, tokenizer, max_len=max_len)
test_input = bert_encode(test.new.values, tokenizer, max_len=max_len)
train_labels = train.Survived.values

In [None]:
# Build the classifier for the chosen sequence length and print its layer summary.
model = build_model(bert_layer, max_len=max_len)
model.summary()

In [None]:
#https://alexadam.ca/ml/2018/08/03/early-stopping.html
import keras.backend as K
from keras.callbacks import Callback
import numpy as np

class FixedEarlyStopping(Callback):
    """Stop training once every monitored quantity has stopped improving.

    After each epoch every monitored quantity is compared with its best value
    so far. An improvement in any one of them resets all wait counters.
    Training is stopped only when all counters have reached ``patience``.

    # Arguments
        monitors: list of quantities to be monitored.
        min_deltas: list of minimum changes in the monitored quantities
            to qualify as an improvement, i.e. an absolute
            change of less than min_delta, will count as no
            improvement. One entry per monitored quantity.
        patience: number of epochs with no improvement
            after which training will be stopped.
        verbose: verbosity mode. Values above 0 print the wait counters
            after each epoch without improvement and a message when
            training is stopped early.
        modes: list of {auto, min, max}, one per monitored quantity.
            In `min` mode the quantity counts as improved when it
            decreases; in `max` mode when it increases; in `auto`
            mode the direction is `max` if the quantity's name
            contains 'acc' and `min` otherwise.
        baselines: list of initial "best" values, one per monitored
            quantity. `None` starts from +inf (`min`) or -inf (`max`).

    # Raises
        ValueError: if `min_deltas`, `modes` or `baselines` does not have
            exactly one entry per monitored quantity.
    """

    def __init__(self,
                 monitors=['val_loss'],
                 min_deltas=[0],
                 patience=0,
                 verbose=0,
                 modes=['auto'],
                 baselines=[None]):
        super(FixedEarlyStopping, self).__init__()

        # Work on private copies: the defaults are shared list objects, and the
        # sign flip and mode fallback below would otherwise rewrite them (and
        # any list passed by the caller) in place.
        self.monitors = list(monitors)
        self.min_deltas = list(min_deltas)
        self.baselines = list(baselines)
        modes = list(modes)

        # A length mismatch used to surface later as an IndexError or as a
        # callback that never stops; fail early with a clear message instead.
        for arg_name, values in (('min_deltas', self.min_deltas),
                                 ('modes', modes),
                                 ('baselines', self.baselines)):
            if len(values) != len(self.monitors):
                raise ValueError(
                    '`%s` must have one entry per monitored quantity '
                    '(expected %d, got %d).'
                    % (arg_name, len(self.monitors), len(values)))

        self.patience = patience
        self.verbose = verbose
        self.wait = 0
        self.stopped_epoch = 0
        self.monitor_ops = []

        for i, mode in enumerate(modes):
            if mode not in ['auto', 'min', 'max']:
                warnings.warn('EarlyStopping mode %s is unknown, '
                              'fallback to auto mode.' % mode,
                              RuntimeWarning)
                mode = 'auto'

            if mode == 'min':
                self.monitor_ops.append(np.less)
            elif mode == 'max':
                self.monitor_ops.append(np.greater)
            elif 'acc' in self.monitors[i]:
                self.monitor_ops.append(np.greater)
            else:
                self.monitor_ops.append(np.less)

        # For "smaller is better" quantities the delta is stored negated so
        # that `current - min_delta` can be compared with the same expression.
        for i, monitor_op in enumerate(self.monitor_ops):
            if monitor_op != np.greater:
                self.min_deltas[i] *= -1

    def on_train_begin(self, logs=None):
        """Reset wait counters and best values so the instance can be re-used."""
        self.waits = [0] * len(self.baselines)
        self.stopped_epoch = 0
        self.bests = []

        for i, baseline in enumerate(self.baselines):
            if baseline is not None:
                self.bests.append(baseline)
            else:
                # np.inf instead of np.Inf: the capitalised alias was removed in NumPy 2.0.
                self.bests.append(np.inf if self.monitor_ops[i] == np.less else -np.inf)

    def on_epoch_end(self, epoch, logs=None):
        """Update the counters and request a stop when all of them reached `patience`."""
        logs = logs or {}
        any_improved = False
        for i, monitor in enumerate(self.monitors):
            current = logs.get(monitor)

            if current is None:
                warnings.warn(
                    'Early stopping conditioned on metric `%s` '
                    'which is not available. Available metrics are: %s' %
                    (monitor, ','.join(list(logs.keys()))), RuntimeWarning
                )
                return

            if self.monitor_ops[i](current - self.min_deltas[i], self.bests[i]):
                self.bests[i] = current
                self.waits[i] = 0
                any_improved = True
            else:
                self.waits[i] += 1

        if any_improved:
            # One improving quantity is enough to restart the patience window for all.
            self.waits = [0] * len(self.waits)
            return

        if all(wait >= self.patience for wait in self.waits):
            self.stopped_epoch = epoch
            self.model.stop_training = True

        if self.verbose > 0:
            print(self.waits)

    def on_train_end(self, logs=None):
        """Report the epoch at which training was stopped early, if verbose."""
        if self.stopped_epoch > 0 and self.verbose > 0:
            print('Epoch %05d: early stopping' % (self.stopped_epoch + 1))

In [None]:
# Early-stopping callback with all defaults: monitors 'val_loss' with patience 0.
# NOTE(review): despite the variable name this is not a ModelCheckpoint; it saves no weights.
checkpoint = FixedEarlyStopping()

In [None]:
# Alternative callbacks, left disabled:
#checkpoint = ModelCheckpoint('model.h5', monitor='val_accuracy', save_best_only=True, patience=5)
#checkpoint = tf.keras.callbacks.EarlyStopping(monitor='val_loss', restore_best_weights=True, patience=2, verbose=1, min_delta=0.001, mode='min')

# Fine-tune for at most 7 epochs with batches of 16, holding out 20% of the
# training data for validation; the `checkpoint` callback may end training earlier.
train_history = model.fit(
    train_input, train_labels,
    validation_split=0.2,
    epochs=7,
    callbacks=[checkpoint],
    batch_size=16
)


In [None]:
# Predict on the test set and turn each score into a 0/1 label at the 0.5 threshold.
test_predict = model.predict(test_input)
test_result = (np.asarray(test_predict).reshape(-1) >= 0.5).astype(int)

In [None]:
# Attach the predicted labels and write the two-column submission file.
test["Survived"] = test_result
csv_data = test.loc[:, ['PassengerId', 'Survived']]
csv_data.to_csv('bibibik5.csv', index=False)