# BERT for Disaster Text Problem
_By Nick Brooks_

Piggybacking off of [xhlulu's work](https://www.kaggle.com/xhlulu/disaster-nlp-keras-bert-using-tfhub)

**References:** <br>
- Source for `bert_encode` function: https://www.kaggle.com/user123454321/bert-starter-inference
- All pre-trained BERT models from Tensorflow Hub: https://tfhub.dev/s?q=bert
- TF Hub documentation for the BERT model: https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1

In [None]:
# We will use the official tokenization script created by the Google team.
# wget stores it as tokenization.py in the working directory, which is what
# the later `import tokenization` picks up.
# NOTE(review): the URL tracks the `master` branch, so the download can break
# if the file is moved or changed upstream — consider pinning a commit.
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Dropout, LSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D, concatenate, SpatialDropout1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub
from tensorflow.keras import callbacks
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import re
import nltk
from scipy.sparse import hstack, csr_matrix

import tokenization

# Helper Functions

In [None]:
def bert_encode(texts, tokenizer, max_len=512):
    """Convert raw texts into fixed-length BERT input arrays.

    Every text is tokenized, cut down so the two special tokens still fit,
    wrapped as ``[CLS] ... [SEP]``, mapped to ids and right-padded with
    zeros up to ``max_len``.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: Length of every encoded row, special tokens included.

    Returns:
        A tuple ``(token_ids, masks, segment_ids)`` of NumPy arrays with one
        row per text. Masks are 1 over real tokens and 0 over padding;
        segment ids are all zero.
    """
    token_rows, mask_rows, segment_rows = [], [], []
    body_budget = max_len - 2  # two slots are reserved for [CLS] and [SEP]

    for raw in texts:
        pieces = ["[CLS]", *tokenizer.tokenize(raw)[:body_budget], "[SEP]"]
        n_real = len(pieces)
        n_pad = max_len - n_real

        token_rows.append(tokenizer.convert_tokens_to_ids(pieces) + [0] * n_pad)
        mask_rows.append([1] * n_real + [0] * n_pad)
        segment_rows.append([0] * max_len)

    return np.array(token_rows), np.array(mask_rows), np.array(segment_rows)

def text_processing(df):
    """Add hashtag/location/keyword text and word-count features to ``df``.

    The frame is modified in place and also returned. New columns:
    ``hashtags``, ``hash_loc_key`` and, for both ``hash_loc_key`` and
    ``text``, ``<col>_num_words``, ``<col>_num_unique_words`` and
    ``<col>_words_vs_unique``; ``text`` additionally gets
    ``text_vader_Compound``.

    Args:
        df: DataFrame with ``text``, ``keyword`` and ``location`` columns.

    Returns:
        The same DataFrame object, with the columns above added.

    Note:
        Reads the module-level ``SIA`` object and calls its
        ``polarity_scores`` method, so ``SIA`` must exist before this runs.
    """
    # "%20" is a URL-encoded space; regex=False keeps this a literal
    # substring swap regardless of the pandas version's default.
    df['keyword'] = df['keyword'].str.replace("%20", " ", regex=False)
    df['hashtags'] = df['text'].apply(lambda x: " ".join(re.findall(r"#(\w+)", x)))
    # astype(str) renders missing location/keyword values as the text "nan",
    # so the joined field never holds a real NaN and needs no fillna.
    df['hash_loc_key'] = (
        df[['hashtags', 'location', 'keyword']]
        .astype(str)
        .apply(" ".join, axis=1)
        .str.lower()
        .str.strip()
    )

    for col in ('hash_loc_key', 'text'):
        words = df[col].apply(str.split)
        n_words = words.apply(len)                        # number of words
        n_unique = words.apply(lambda ws: len(set(ws)))   # number of distinct words
        df[col + '_num_words'] = n_words
        df[col + '_num_unique_words'] = n_unique
        # Percentage of distinct words. A row without any words gives
        # 0/0 = NaN, which would otherwise flow into the numeric features
        # built from this column; report 0 for such rows instead.
        df[col + '_words_vs_unique'] = (n_unique / n_words * 100).fillna(0.0)
        if col == "text":
            df[col + "_vader_Compound"] = df[col].apply(
                lambda x: SIA.polarity_scores(x)['compound'])

    return df

def build_model(bert_layer, max_len=512, dropout=.2, num_features=None, learning_rate=1e-6):
    """Build and compile the BERT + numeric-features binary classifier.

    Three branches are concatenated ahead of a single sigmoid unit: the BERT
    pooled output through a small dense layer, the BERT sequence output
    through a bidirectional LSTM followed by average pooling, and the numeric
    features through a two-layer dense stack.

    Args:
        bert_layer: Callable layer that accepts
            ``[input_word_ids, input_mask, segment_ids]`` and returns
            ``(pooled_output, sequence_output)``.
        max_len: Length of each token / mask / segment input row.
        dropout: Rate shared by every dropout layer in the model.
        num_features: Width of the numeric input. When omitted it falls back
            to ``len(num_cols)``, read from the module-level ``num_cols``.
        learning_rate: Step size handed to the Adam optimizer.

    Returns:
        A compiled Keras ``Model`` whose inputs are ordered
        ``[input_word_ids, input_mask, segment_ids, numeric_inputs]`` and
        whose output is one sigmoid probability per row.
    """
    if num_features is None:
        # Backward-compatible default: the notebook defines num_cols globally.
        num_features = len(num_cols)

    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")
    numeric_inputs = Input(shape=(num_features,), dtype=tf.float32, name="numeric_inputs")

    # Bert Layer
    pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])

    # Sequence Output: per-token states -> BiLSTM -> mean over the sequence
    sequence_output = SpatialDropout1D(dropout)(sequence_output)
    sequence_output = Bidirectional(LSTM(128, return_sequences=True))(sequence_output)
    sequence_output = GlobalAveragePooling1D()(sequence_output)

    # Pooled Output
    pooled_output = Dense(36, activation='relu')(pooled_output)

    # Dense Inputs
    numeric_x = Dense(512, activation='relu')(numeric_inputs)
    numeric_x = Dropout(dropout)(numeric_x)
    numeric_x = Dense(64, activation='relu')(numeric_x)

    # Concatenate
    cat = concatenate([
        pooled_output,
        sequence_output,
        numeric_x
    ])
    cat = Dropout(dropout)(cat)

    # Output Layer
    out = Dense(1, activation='sigmoid')(cat)

    model = Model(inputs=[input_word_ids, input_mask, segment_ids, numeric_inputs], outputs=out)
    # `learning_rate` is the supported keyword; the old `lr` alias is
    # deprecated and rejected by newer Keras releases.
    # 'acc' is kept as the metric name because the plotting code reads
    # history.history['acc'] / ['val_acc'].
    model.compile(Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['acc'])

    return model

# Load and Preprocess

- Load BERT from the Tensorflow Hub
- Load CSV files containing training data
- Load tokenizer from the BERT layer
- Encode the text into tokens, masks, and segment flags

In [None]:
# Length of every encoded row; passed as max_len to bert_encode and
# build_model, and it includes the [CLS] and [SEP] tokens.
MAX_LEN = 36
# Batch size used for both model.fit and model.predict.
BATCH_SIZE = 36
# Number of epochs passed to model.fit.
EPOCHS = 12

In [None]:
%%time
# Load the pre-trained uncased BERT module (L-24 / H-1024 / A-16) from TF Hub
# as a Keras layer. trainable=True marks its weights as trainable.
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
bert_layer = hub.KerasLayer(module_url, trainable=True)

In [None]:
# Competition files: labelled tweets, unlabelled tweets and the submission template.
train = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")
submission = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")

# Character length of every tweet, train and test together.
all_text = np.concatenate([train["text"].values, test["text"].values])
length_info = list(map(len, all_text))
summary = "Train Sequence Length - Mean {:.1f} +/- {:.1f}, Max {:.1f}, Min {:.1f}"
print(summary.format(
    np.mean(length_info),
    np.std(length_info),
    np.max(length_info),
    np.min(length_info),
))

In [None]:
# Text Processing
# text_processing reads SIA as a global, so it has to exist first.
SIA = SentimentIntensityAnalyzer()
# text_processing mutates and returns its argument, so train_df / test_df
# are the same objects as train / test.
train_df = text_processing(train)
test_df = text_processing(test)

# TF-IDF over the combined hashtag / location / keyword text
count_vectorizer = TfidfVectorizer(
    analyzer="word",
    tokenizer=nltk.word_tokenize,
    preprocessor=None,
    stop_words='english',
    sublinear_tf=True,
    ngram_range=(1, 1),
    max_features=500)

# The vocabulary is learned from the training rows only.
hash_loc_tfidf = count_vectorizer.fit(train_df['hash_loc_key'])
# get_feature_names() was removed in scikit-learn 1.2. Use its replacement
# when present and fall back for older releases. The result is turned into a
# plain list so it can be concatenated with dense_vars below.
if hasattr(hash_loc_tfidf, "get_feature_names_out"):
    tfvocab = hash_loc_tfidf.get_feature_names_out().tolist()
else:
    tfvocab = list(hash_loc_tfidf.get_feature_names())
print("Number of TF-IDF Features: {}".format(len(tfvocab)))

train_tfidf = count_vectorizer.transform(train_df['hash_loc_key'])
test_tfidf = count_vectorizer.transform(test_df['hash_loc_key'])

# Sparse Stack Numerical and TFIDF
dense_vars = [
    'hash_loc_key_num_words',
    'hash_loc_key_num_unique_words',
    'hash_loc_key_words_vs_unique',
    'text_num_words',
    'text_num_unique_words',
    'text_words_vs_unique',
    'text_vader_Compound']

# Normalisation - Standard Scaler (fitted on train, applied to train and test)
for d_i in dense_vars:
    scaler = StandardScaler()
    scaler.fit(train_df[d_i].values.reshape(-1, 1))
    # Replace the whole column with a 1-D float array. Writing the (n, 1)
    # float result into an integer column through .loc[:, col] triggers
    # pandas' incompatible-dtype deprecation.
    train_df[d_i] = scaler.transform(train_df[d_i].values.reshape(-1, 1)).ravel()
    test_df[d_i] = scaler.transform(test_df[d_i].values.reshape(-1, 1)).ravel()

# Sparse Stack: scaled dense columns first, then the TF-IDF columns
train_num = hstack([csr_matrix(train_df.loc[:, dense_vars].values), train_tfidf]).toarray()
test_num = hstack([csr_matrix(test_df.loc[:, dense_vars].values), test_tfidf]).toarray()
# Column names in the same order as the stacked matrices; build_model uses
# len(num_cols) as the numeric input width.
num_cols = list(dense_vars) + tfvocab

In [None]:
# Bert Pre-Processing
# Build the tokenizer from the vocabulary file and casing flag stored on the
# loaded hub module.
bert_assets = bert_layer.resolved_object
vocab_file = bert_assets.vocab_file.asset_path.numpy()
do_lower_case = bert_assets.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

# Model inputs are ordered (token ids, masks, segment ids, numeric features).
train_ids, train_masks, train_segments = bert_encode(
    train.text.values, tokenizer, max_len=MAX_LEN)
train_input = (train_ids, train_masks, train_segments, train_num)

test_ids, test_masks, test_segments = bert_encode(
    test.text.values, tokenizer, max_len=MAX_LEN)
test_input = (test_ids, test_masks, test_segments, test_num)

train_labels = train.target.values

# Model: Build, Train, Predict, Submit

In [None]:
# Build the compiled classifier with inputs of length MAX_LEN, then print
# its layer summary.
model = build_model(bert_layer, max_len=MAX_LEN)
model.summary()

In [None]:
# All callbacks monitor validation loss.
# NOTE(review): `es` is constructed but not included in the callbacks list
# given to model.fit below, so early stopping is not active — confirm whether
# that is intentional.
es = callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0.0001,
    patience=4,
    verbose=1,
    mode='min',
    baseline=None,
    restore_best_weights=False,
)
rlr = callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=2,
    min_lr=1e-7,
    mode='min',
    verbose=1,
)
# Only the checkpoint with the best validation loss is kept on disk.
checkpoint_1 = callbacks.ModelCheckpoint('model_loss.h5', monitor='val_loss', save_best_only=True)
# checkpoint_2 = tf.keras.callbacks.ModelCheckpoint('model_acc.h5', monitor='val_acc', save_best_only=True)


history = model.fit(
    x=train_input,
    y=train_labels,
    validation_split=0.2,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[checkpoint_1, rlr],
)

# One panel per metric, showing the training and validation curves.
plot_metrics = ['loss', 'acc']
i = "None"
f, ax = plt.subplots(1, len(plot_metrics), figsize=[12, 4])
curves = history.history
for panel, metric in zip(ax, plot_metrics):
    panel.plot(curves[metric], label='Train ' + metric)
    panel.plot(curves['val_' + metric], label='Val ' + metric)
    panel.set_title("{} Fold Loss Curve - {}".format(i, metric))
    panel.legend()
plt.show()

In [None]:
# Reload the weights written by the ModelCheckpoint callback (the epoch with
# the best val_loss) before predicting on the test inputs.
model.load_weights('model_loss.h5')
test_pred = model.predict(test_input, batch_size=BATCH_SIZE)

In [None]:
# Round the sigmoid outputs to integer class labels, write the submission
# file without the index column, and preview the first rows.
submission['target'] = test_pred.round().astype(int)
submission.to_csv('submission.csv', index=False)
submission.head()