In [None]:
import random

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py
import tokenization

In [None]:
# Setting a seed for reproducibility.
SEED = 1002


def seed_everything(seed):
    """Seed the Python, NumPy and TensorFlow global random number generators.

    Args:
        seed: Integer seed applied to all three generators.
    """
    random.seed(seed)  # Python's built-in generator was previously left unseeded
    np.random.seed(seed)
    tf.random.set_seed(seed)


seed_everything(SEED)

In [None]:
# Read the Titanic train and test splits into pandas DataFrames.
train = pd.read_csv("../input/titanic/train.csv")
test = pd.read_csv("../input/titanic/test.csv")
# Unused leftover that points at a different dataset's sample submission:
#submission = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Drop repeated training rows (keeping the first occurrence), then replace
# every missing value in both splits with an empty string, column by column.
train = train.drop_duplicates(keep='first').apply(lambda column: column.fillna(""))
test = test.apply(lambda column: column.fillna(""))
# Summarise column dtypes and non-null counts after cleaning.
train.info()

In [None]:
# Columns merged, in this order, into the single text field stored in "new".
TEXT_COLUMNS = ("Sex", "Embarked", "Name", "Age", "Pclass",
                "SibSp", "Parch", "Ticket", "Fare", "Cabin")


def build_text_column(frame, columns=TEXT_COLUMNS, sep=" "):
    """Join the given columns of each row into one separator-delimited string.

    Args:
        frame: DataFrame containing every column named in `columns`.
        columns: Column names to concatenate, in output order.
        sep: String placed between consecutive values.

    Returns:
        A Series indexed like `frame` holding one combined string per row.
    """
    # str() every value so numeric columns can be concatenated with text ones.
    parts = [frame[name].map(str) for name in columns]
    combined = parts[0]
    for part in parts[1:]:
        combined = combined + sep + part
    return combined


# One shared recipe for both splits, so their text layout cannot drift apart.
train["new"] = build_text_column(train)
test["new"] = build_text_column(test)

In [None]:
# Show the combined text of the row whose index label is 15.
train.loc[15, 'new']

In [None]:
def remove_punct(text):
    """Return `text` with every character of `string.punctuation` removed."""
    # Mapping a character to None tells str.translate() to delete it.
    deletions = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletions)

In [None]:
import string

In [None]:
def remove_digit(text):
    """Return `text` with every digit character dropped."""
    kept = filter(lambda ch: not ch.isdigit(), text)
    return ''.join(kept)

# Quick check: strip the digits, then collapse the whitespace left behind.
" ".join(remove_digit("112zobos12 12a323ss").split())

In [None]:
# train['new_clean'] = train['new'].apply(lambda x: remove_punct(x))
# train['new_clean'] = train['new_clean'].apply(lambda x: x.lower())
# # train['new_clean'] = train['new_clean'].apply(lambda m: remove_digit(m))
# # train['new_clean'] = train['new_clean'].apply(lambda m: " ".join(m.split()))

# test['new_clean'] = test['new'].apply(lambda x: remove_punct(x))
# test['new_clean'] = test['new_clean'].apply(lambda x: x.lower())
# # test['new_clean'] = test['new_clean'].apply(lambda m: remove_digit(m))
# # test['new_clean'] = test['new_clean'].apply(lambda m: " ".join(m.split()))

In [None]:
# train["new_clean"][20]

In [None]:
def bert_encode(texts, tokenizer, max_len=512):
    """Convert raw strings into fixed-length BERT input arrays.

    Each text is tokenized, truncated to leave room for the two special
    tokens, wrapped as ``[CLS] ... [SEP]`` and right-padded with zeros.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: Length of every encoded row, including ``[CLS]`` and
            ``[SEP]``; must be at least 2.

    Returns:
        A tuple ``(token_ids, masks, segment_ids)`` of integer arrays, each of
        shape ``(number_of_texts, max_len)``. Masks are 1 on real tokens and
        0 on padding; segment ids are all 0.

    Raises:
        ValueError: If ``max_len`` is smaller than 2, which leaves no room for
            the ``[CLS]`` and ``[SEP]`` tokens.
    """
    if max_len < 2:
        raise ValueError(
            "max_len must be at least 2 to hold [CLS] and [SEP], got %r" % (max_len,)
        )

    all_tokens = []
    all_masks = []
    all_segments = []

    for text in texts:
        # Reserve two positions for the special tokens added below.
        pieces = tokenizer.tokenize(text)[:max_len - 2]
        input_sequence = ["[CLS]"] + pieces + ["[SEP]"]
        pad_len = max_len - len(input_sequence)

        # Build a new list instead of extending the tokenizer's return value.
        token_ids = list(tokenizer.convert_tokens_to_ids(input_sequence))

        all_tokens.append(token_ids + [0] * pad_len)
        all_masks.append([1] * len(input_sequence) + [0] * pad_len)
        all_segments.append([0] * max_len)

    def as_matrix(rows):
        # The reshape keeps the (n, max_len) shape even when `texts` is empty.
        return np.array(rows, dtype=int).reshape(-1, max_len)

    return as_matrix(all_tokens), as_matrix(all_masks), as_matrix(all_segments)

In [None]:
def build_model(bert_layer, max_len=512, learning_rate=6e-6):
    """Build and compile a binary classifier on top of a BERT Keras layer.

    Args:
        bert_layer: Callable layer that accepts
            ``[input_word_ids, input_mask, segment_ids]`` and returns a pair
            whose second element is the per-token sequence output.
        max_len: Length of each of the three integer input sequences.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        A compiled Keras ``Model`` with one sigmoid output unit, trained with
        binary cross-entropy and reporting accuracy.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    # The layer yields (pooled_output, sequence_output). The pooled output is
    # discarded; the classifier reads the sequence output at position 0, which
    # is where the [CLS] token sits.
    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    clf_output = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(clf_output)

    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)

    # A small learning rate is the default because the BERT weights are fine-tuned.
    model.compile(
        Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [None]:
# Shortest and longest combined text, measured in characters.
text_lengths = train["new"].apply(len)
print(min(text_lengths))
max(text_lengths)

In [None]:
# Load the uncased BERT encoder (L-24 / H-1024 / A-16 variant) from TF Hub.
# trainable=True is passed so the layer's weights can be updated in training.
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/2"
bert_layer = hub.KerasLayer(module_url, trainable=True)

In [None]:
# BERT's position embeddings cover at most this many tokens per sequence.
BERT_MAX_SEQ_LEN = 512

# The sequence length is taken from the longest training text, counted in
# characters, and capped so the model never receives more positions than
# BERT supports. Longer texts are truncated inside bert_encode.
max_len = min(max(train["new"].apply(len)), BERT_MAX_SEQ_LEN)

# Vocabulary file shipped with the pre-trained BERT layer, used for tokenization.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()

# Whether the loaded layer expects lower-cased input (True for uncased models).
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()

# Create the tokenizer.
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

# Tokenize and encode the training and test texts; labels come from "Survived".
train_input = bert_encode(train["new"].values, tokenizer, max_len=max_len)
test_input = bert_encode(test["new"].values, tokenizer, max_len=max_len)
train_labels = train["Survived"].values

In [None]:
# Build the classifier with the same sequence length used to encode the inputs,
# then print its layer summary.
model = build_model(bert_layer, max_len=max_len)
model.summary()

In [None]:
# Save the model only when validation accuracy improves.
# `patience` is an EarlyStopping option, not a ModelCheckpoint one, so it is
# no longer passed here.
CHECKPOINT_PATH = 'model.h5'
checkpoint = ModelCheckpoint(CHECKPOINT_PATH, monitor='val_accuracy', save_best_only=True)

train_history = model.fit(
    train_input, train_labels,
    validation_split=0.2,
    epochs=5,
    callbacks=[checkpoint],
    batch_size=10
)

# Restore the best checkpoint so later predictions use the best-scoring
# weights rather than whatever the final epoch produced.
model.load_weights(CHECKPOINT_PATH)


In [None]:
# Predicted survival probabilities for the test set (one sigmoid unit per row).
test_predict = model.predict(test_input)
# Threshold at 0.5 in one vectorized step and flatten to a 1-D array of 0/1
# labels, instead of looping over one-element rows in Python.
test_result = (test_predict >= 0.5).astype(int).ravel()

In [None]:
# Attach the predicted labels to the test frame, then write the
# PassengerId/Survived pair to d.csv without the index column.
test["Survived"] = test_result
csv_data = test[['PassengerId', 'Survived']]
csv_data.to_csv('d.csv', index=False)