# Original notebook: https://www.kaggle.com/akensert/bert-base-tf2-0-now-huggingface-transformer (version 12)

In [None]:
import sys
#sys.path.insert(0, "../input/transformers/")

In [None]:
#!pip install transformers

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
#import tensorflow_hub as hub
import tensorflow as tf
#import bert_tokenization as tokenization
import tensorflow.keras.backend as K
import os
from scipy.stats import spearmanr
from math import floor, ceil
from transformers import *
import seaborn as sns

# Notebook display settings: print floats without scientific notation and
# show up to 500 DataFrame columns.
np.set_printoptions(suppress=True)
pd.set_option('display.max_columns', 500)
print(tf.__version__)

# Seed every random number generator the notebook relies on. TensorFlow keeps
# its own global seed (used for weight initialisation and dropout), which
# seeding `random` and NumPy alone does not cover.
import random
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

#### 1. Read data and tokenizer

Load the tokenizer and the data, and define the maximum sequence length used for the input to BERT (the model's limit is usually 512 tokens).

In [None]:
# Directory holding the competition CSV files.
PATH = '../input/google-quest-challenge/'

# Earlier TF-Hub based tokenizer, kept for reference:
# BERT_PATH = '../input/bert-base-from-tfhub/bert_en_uncased_L-12_H-768_A-12'
# tokenizer = tokenization.FullTokenizer(BERT_PATH+'/assets/vocab.txt', True)

# Directory holding the HuggingFace bert-base-uncased vocabulary and weights.
BERT_PATH = '../input/bert-base-uncased-huggingface-transformer/'
tokenizer = BertTokenizer.from_pretrained(
    BERT_PATH + 'bert-base-uncased-vocab.txt')

# Every question / answer sequence is truncated or padded to this length.
MAX_SEQUENCE_LENGTH = 384

# Read train, test and the sample submission in one pass.
df_train, df_test, df_sub = (
    pd.read_csv(PATH + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)
print('train shape =', df_train.shape)
print('test shape =', df_test.shape)

# Columns are selected by position: everything from column 11 onward is used
# as a target, columns 1, 2 and 5 are the text inputs.
# NOTE(review): this relies on the column order of train.csv -- confirm if the
# file layout ever changes.
output_categories = df_train.columns[11:].tolist()
input_categories = df_train.columns[[1, 2, 5]].tolist()
print('\noutput categories:\n\t', output_categories)
print('\ninput categories:\n\t', input_categories)

In [None]:
def _convert_to_transformer_inputs(title, question, answer, tokenizer, max_sequence_length):
    """Encode one sample as fixed-length ids, masks and segments.

    The title and question body are joined with a single space and encoded as
    one sequence; the answer is encoded as a second, independent sequence.
    Both are limited to ``max_sequence_length`` via the tokenizer and then
    right-padded to exactly that length.

    Returns a list of six Python lists, in this order:
    question ids, question masks, question segments,
    answer ids, answer masks, answer segments.
    """

    def return_id(str1, str2, truncation_strategy, length):
        # Let the tokenizer add special tokens and truncate to `length`.
        encoded = tokenizer.encode_plus(
            str1, str2,
            add_special_tokens=True,
            max_length=length,
            truncation_strategy=truncation_strategy)

        token_ids = encoded["input_ids"]
        segment_ids = encoded["token_type_ids"]

        n_tokens = len(token_ids)
        n_pad = length - n_tokens

        # Real tokens get mask 1; padding gets mask 0 and segment 0.
        padded_ids = token_ids + [tokenizer.pad_token_id] * n_pad
        padded_masks = [1] * n_tokens + [0] * n_pad
        padded_segments = segment_ids + [0] * n_pad

        return [padded_ids, padded_masks, padded_segments]

    question_part = return_id(
        title + ' ' + question, None, 'longest_first', max_sequence_length)
    answer_part = return_id(
        answer, None, 'longest_first', max_sequence_length)

    return [*question_part, *answer_part]

def compute_input_arrays(df, columns, tokenizer, max_sequence_length):
    """Tokenise every row of ``df`` and stack the results into int32 arrays.

    ``df[columns]`` must expose ``question_title``, ``question_body`` and
    ``answer``; those three fields are read from each row by name.

    Returns a list of six arrays (one row per sample), ordered as
    question ids, question masks, question segments,
    answer ids, answer masks, answer segments.
    """
    encoded_rows = [
        _convert_to_transformer_inputs(
            row.question_title, row.question_body, row.answer,
            tokenizer, max_sequence_length)
        for _, row in tqdm(df[columns].iterrows())
    ]

    # Each encoded row holds six lists; gather slot k of every row into array k.
    return [
        np.asarray([encoded[slot] for encoded in encoded_rows], dtype=np.int32)
        for slot in range(6)
    ]

def compute_output_arrays(df, columns):
    """Return the ``columns`` of ``df`` as a NumPy array."""
    selected = df[columns]
    return np.asarray(selected)

In [None]:
# Targets for the training set.
outputs = compute_output_arrays(df_train, output_categories)

# Tokenised BERT inputs, built the same way for the train and test frames.
inputs, test_inputs = (
    compute_input_arrays(frame, input_categories, tokenizer, MAX_SEQUENCE_LENGTH)
    for frame in (df_train, df_test)
)

# Additional features

In [None]:
# Stack train and test so every engineered feature is computed the same way
# for both. ignore_index=True gives each row a unique label: without it the
# test rows reuse the labels 0..len(df_test)-1 of the train rows, and any
# label-based write (e.g. concat_df[col][i] = ...) would hit two rows at once.
concat_df = pd.concat([df_train, df_test], ignore_index=True)

# 1 when the question and the answer were written by the same user name.
concat_df['question_user==answer_user'] = (
    concat_df['question_user_name'] == concat_df['answer_user_name']
).astype('int')

# (feature prefix, source text column) pairs shared by the features below.
_text_columns = (
    ('title', 'question_title'),
    ('question', 'question_body'),
    ('answer', 'answer'),
)

# Character length of each text field.
for _prefix, _column in _text_columns:
    concat_df[f'{_prefix}_length'] = concat_df[_column].apply(len)

# Token lists obtained by splitting on single spaces.
for _prefix, _column in _text_columns:
    concat_df[f'{_prefix}_split'] = concat_df[_column].str.split(' ')

# Number of space-separated tokens in each text field.
for _prefix, _column in _text_columns:
    concat_df[f'{_prefix}_split_length'] = concat_df[f'{_prefix}_split'].apply(len)

In [None]:
def num_unique_words(words):
    """Return the number of distinct tokens in ``words``.

    ``words`` is an iterable of hashable tokens (here: the list produced by
    splitting a text on spaces). A plain set is used instead of building a
    pandas Series for every row.
    """
    return len(set(words))


# Distinct-token counts for title, question body and answer.
for _prefix in ('title', 'question', 'answer'):
    concat_df[f'{_prefix}_num_unique_words'] = (
        concat_df[f'{_prefix}_split'].apply(num_unique_words))

In [None]:
# Word-overlap features between title, question body and answer.
#
# For each row three counts are computed:
#   * title tokens that also occur in the question body,
#   * title tokens that also occur in the answer,
#   * question tokens that also occur in the answer.
# Each count is stored together with a 0/1 "is there any overlap" flag.
#
# The counts are collected positionally and written back as whole columns, so
# the result does not depend on concat_df's index labels (which repeat when
# train and test are concatenated without ignore_index). The question/answer
# count is computed once per row, not once per title token.
title_question_counts = []
title_answer_counts = []
question_answer_counts = []

_token_rows = zip(concat_df['title_split'],
                  concat_df['question_split'],
                  concat_df['answer_split'])
for title_row, question_row, answer_row in tqdm(_token_rows, total=len(concat_df), position=0):
    # Sets make each membership test O(1) instead of scanning the token list.
    question_vocab = set(question_row)
    answer_vocab = set(answer_row)

    # title - question & title - answer
    title_question_counts.append(sum(word in question_vocab for word in title_row))
    title_answer_counts.append(sum(word in answer_vocab for word in title_row))
    # question - answer
    question_answer_counts.append(sum(word in answer_vocab for word in question_row))

# (flag column, count column, per-row counts)
_overlap_features = (
    ('title_word_is_in_question', 'title_word_num_appearance_in_question', title_question_counts),
    ('title_word_is_in_answer', 'title_word_num_appearance_in_answer', title_answer_counts),
    ('question_word_is_in_answer', 'question_word_num_appearance_in_answer', question_answer_counts),
)
for _flag_col, _count_col, _counts in _overlap_features:
    _counts = np.asarray(_counts, dtype=np.int64)
    concat_df[_flag_col] = (_counts > 0).astype(np.int64)
    concat_df[_count_col] = _counts

In [None]:
# stopwords
#!pip install nltk
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords

# NLTK's English stop-word list, as a set for fast membership tests.
eng_stopwords = set(stopwords.words('english'))


def num_stopwords(x):
    """Count whitespace-separated, lower-cased tokens of ``x`` that are English stop words."""
    return sum(1 for token in str(x).lower().split() if token in eng_stopwords)


# Stop-word counts for title, question body and answer.
for _feature, _text_col in (
    ('title_num_stopwords', 'question_title'),
    ('question_num_stopwords', 'question_body'),
    ('answer_num_stopwords', 'answer'),
):
    concat_df[_feature] = concat_df[_text_col].apply(num_stopwords)

In [None]:
# punctuations
import string


def num_punctuations(x):
    """Count the characters of ``x`` that appear in ``string.punctuation``."""
    return sum(1 for char in str(x) if char in string.punctuation)


# Punctuation counts for title, question body and answer.
for _feature, _text_col in (
    ('title_num_punctuations', 'question_title'),
    ('question_num_punctuations', 'question_body'),
    ('answer_num_punctuations', 'answer'),
):
    concat_df[_feature] = concat_df[_text_col].apply(num_punctuations)

In [None]:
# upper case words
def num_uppers(x):
    """Count whitespace-separated tokens of ``x`` for which ``str.isupper`` is true."""
    return sum(1 for token in str(x).split() if token.isupper())


# Upper-case token counts for title, question body and answer.
for _feature, _text_col in (
    ('title_num_uppers', 'question_title'),
    ('question_num_uppers', 'question_body'),
    ('answer_num_uppers', 'answer'),
):
    concat_df[_feature] = concat_df[_text_col].apply(num_uppers)

In [None]:
# titles
def num_titles(x):
    """Count the '. '-separated segments of ``x`` for which ``str.istitle`` is true."""
    return sum(1 for segment in str(x).split('. ') if segment.istitle())


# Title-cased segment counts for title, question body and answer.
for _feature, _text_col in (
    ('title_num_titles', 'question_title'),
    ('question_num_titles', 'question_body'),
    ('answer_num_titles', 'answer'),
):
    concat_df[_feature] = concat_df[_text_col].apply(num_titles)

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Count / length features that get rescaled before being used as model input.
numeric_cols = [
    'title_length',
    'question_length',
    'answer_length',
    'title_word_num_appearance_in_question',
    'title_word_num_appearance_in_answer',
    'question_word_num_appearance_in_answer',
    'title_num_stopwords',
    'question_num_stopwords',
    'answer_num_stopwords',
    'title_num_punctuations',
    'question_num_punctuations',
    'answer_num_punctuations',
    'title_num_uppers',
    'question_num_uppers',
    'answer_num_uppers',
    'title_num_titles',
    'question_num_titles',
    'answer_num_titles',
    'title_num_unique_words',
    'question_num_unique_words',
    'answer_num_unique_words',
]

# Despite its name this is a min-max scaler mapping each column to [-1, 1].
# It is fitted on train and test rows together, since concat_df holds both.
std_scaler = MinMaxScaler(feature_range=(-1, 1))
scaled_values = std_scaler.fit_transform(concat_df[numeric_cols])
concat_df[numeric_cols] = scaled_values

In [None]:
# sentiment analysis
from textblob import TextBlob


def pol(x):
    """Return the TextBlob polarity score of the text ``x``."""
    return TextBlob(x).sentiment.polarity


def sub(x):
    """Return the TextBlob subjectivity score of the text ``x``."""
    return TextBlob(x).sentiment.subjectivity


# Analyse every text once and read both scores from the same result, instead
# of building a second TextBlob (and re-running the analysis) for the
# subjectivity column. Values are assigned positionally, row by row.
for _prefix, _text_col in (
    ('title', 'question_title'),
    ('question', 'question_body'),
    ('answer', 'answer'),
):
    _sentiments = [TextBlob(text).sentiment for text in concat_df[_text_col]]
    concat_df[f'{_prefix}_polarity'] = [s.polarity for s in _sentiments]
    concat_df[f'{_prefix}_subjectivity'] = [s.subjectivity for s in _sentiments]

sentiment_features = ['title_polarity', 'title_subjectivity', 'question_polarity', 'question_subjectivity', 'answer_polarity', 'answer_subjectivity']

In [None]:
# One-hot encode the 'category' and 'host' columns. This runs on the combined
# train+test frame, so both splits end up with the same dummy columns.
concat_dummies_df = pd.get_dummies(concat_df, columns=['category', 'host'])
print(concat_dummies_df.shape)

In [None]:
# Collect the one-hot columns: every column whose name contains 'host' or
# 'category' (substring match, not prefix match).
cat_cols = [
    col for col in concat_dummies_df.columns
    if any(tag in col for tag in ('host', 'category'))
]
print(len(cat_cols))

In [None]:
# Undo the train/test stacking: the first len(df_train) rows are the training
# rows, the remaining len(df_test) rows are the test rows.
n_train_rows = len(df_train)
train_dummies_df = concat_dummies_df.iloc[:n_train_rows]
test_dummies_df = concat_dummies_df.iloc[n_train_rows:]
print(train_dummies_df.shape, test_dummies_df.shape)

In [None]:
# Full list of hand-crafted feature columns, in alphabetical order:
# binary flags, one-hot category/host columns, sentiment scores and the
# scaled numeric counts.
additional_features = sorted(
    [
        'question_user==answer_user',
        'title_word_is_in_question',
        'title_word_is_in_answer',
        'question_word_is_in_answer',
    ]
    + cat_cols
    + sentiment_features
    + numeric_cols
)
print(len(additional_features))

# Bert & DNN

In [None]:
def compute_spearmanr_ignore_nan(trues, preds):
    """Return the mean column-wise Spearman correlation, skipping NaN columns.

    ``trues`` and ``preds`` are 2-D arrays with one column per target. The
    correlation is computed per column pair and averaged with ``np.nanmean``,
    so columns whose correlation is NaN do not affect the mean.
    """
    column_pairs = zip(np.transpose(trues), np.transpose(preds))
    per_column_rho = [
        spearmanr(true_col, pred_col).correlation
        for true_col, pred_col in column_pairs
    ]
    return np.nanmean(per_column_rho)

In [None]:
class CustomCallback(tf.keras.callbacks.Callback):
    """Track validation Spearman rho and collect test predictions per epoch.

    Args:
        valid_data: ``(inputs, targets)`` pair used for validation scoring.
        test_data: model inputs for the test set.
        batch_size: batch size used for the ``predict`` calls made here.
        fold: fold identifier. When not ``None``, the weights are saved after
            every epoch as ``bert-base-{fold}-{epoch}.h5py``.

    Attributes:
        valid_predictions: one validation prediction array per finished epoch.
        test_predictions: one test prediction array per epoch with index > 1.
    """

    def __init__(self, valid_data, test_data, batch_size=16, fold=None):
        # Let the Keras base class set up its own state.
        super().__init__()

        self.valid_inputs = valid_data[0]
        self.valid_outputs = valid_data[1]
        self.test_inputs = test_data

        self.batch_size = batch_size
        self.fold = fold

        # Defined here as well so the attributes exist even before training.
        self.valid_predictions = []
        self.test_predictions = []

    def on_train_begin(self, logs=None):
        """Reset the collected predictions at the start of each training run."""
        self.valid_predictions = []
        self.test_predictions = []

    def on_epoch_end(self, epoch, logs=None):
        """Score the validation set, optionally checkpoint, and predict on test."""
        self.valid_predictions.append(
            self.model.predict(self.valid_inputs, batch_size=self.batch_size))

        # rho of the running average over all epochs so far ...
        rho_val = compute_spearmanr_ignore_nan(
            self.valid_outputs, np.average(self.valid_predictions, axis=0))

        # ... and rho of the current epoch's predictions alone.
        rho_val2 = compute_spearmanr_ignore_nan(self.valid_outputs, self.valid_predictions[-1])

        print("\nvalidation rho: %.4f" % rho_val)

        print('\nvalidation rho2:', rho_val2)

        if self.fold is not None:
            # Use the fold given to this callback, not whatever global `fold`
            # happens to be defined by the surrounding notebook.
            self.model.save_weights(f'bert-base-{self.fold}-{epoch}.h5py')

        # Epochs 0 and 1 are skipped: test predictions are only collected from
        # the third epoch (index 2) onward.
        if epoch > 1:
            self.test_predictions.append(
                self.model.predict(self.test_inputs, batch_size=self.batch_size)
            )

In [None]:
# Disabled code: the cell below is wrapped in a string literal, so nothing in
# it is executed. It builds a single-input-triple BERT model whose output is
# the raw `bert_model(...)` result, with hidden states turned off.
'''q_id = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)

q_mask = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)

q_atn = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)

config = BertConfig() # print(config) to see settings
config.output_hidden_states = False # Set to True to obtain hidden states
# caution: when using e.g. XLNet, XLNetConfig() will automatically use xlnet-large config

# normally ".from_pretrained('bert-base-uncased')", but because of no internet, the 
# pretrained model has been downloaded manually and uploaded to kaggle. 
bert_model = TFBertModel.from_pretrained(BERT_PATH+'bert-base-uncased-tf_model.h5', config=config)

# if config.output_hidden_states = True, obtain hidden states via bert_model(...)[-1]
out = bert_model(q_id, attention_mask=q_mask, token_type_ids=q_atn)

#hmm = tf.keras.layers.GlobalAveragePooling1D()(out[0])

model = tf.keras.models.Model(inputs=[q_id, q_mask, q_atn], outputs=out)'''

In [None]:
# Cross-validated fine-tuning of a two-tower BERT model.
#
# The data is split into 5 folds grouped by question body, so rows sharing the
# same question body always land in the same fold.
gkf = GroupKFold(n_splits=5).split(X=df_train.question_body, groups=df_train.question_body)

valid_preds = []   # final validation predictions, one array per trained fold
test_preds = []    # final test predictions, one array per trained fold
batch_size = 6
histories = []     # the CustomCallback of every trained fold

# The test-set feature matrix does not depend on the fold, so build it once.
# Casting to float32 keeps the matrix numeric even when the one-hot columns
# are stored as booleans.
test_add_features = test_dummies_df[additional_features].values.astype(np.float32)

for fold, (train_idx, valid_idx) in enumerate(gkf):
    # Only folds 0, 2 and 4 are trained; the others are skipped.
    if fold not in (0, 2, 4):
        continue

    train_inputs = [arr[train_idx] for arr in inputs]
    train_outputs = outputs[train_idx]

    valid_inputs = [arr[valid_idx] for arr in inputs]
    valid_outputs = outputs[valid_idx]

    # additional features
    train_add_features = (
        train_dummies_df.iloc[train_idx][additional_features].values.astype(np.float32))
    val_add_features = (
        train_dummies_df.iloc[valid_idx][additional_features].values.astype(np.float32))

    # model: start every fold from a fresh Keras session
    K.clear_session()

    # bert layer inputs: token ids, attention masks and segment ids for the
    # question sequence (q_*) and the answer sequence (a_*)
    q_id = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)
    a_id = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)

    q_mask = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)
    a_mask = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)

    q_atn = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)
    a_atn = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)

    # additional features inputs
    # NOTE: add_f_in is listed as a model input but is not connected to the
    # output below (the Dense layer that consumed it is commented out), so the
    # hand-crafted features currently have no effect on the predictions.
    add_f_in = tf.keras.layers.Input(shape=(train_add_features.shape[1],))

    config = BertConfig() # print(config) to see settings
    config.output_hidden_states = True # Set to True to obtain hidden states
    # caution: when using e.g. XLNet, XLNetConfig() will automatically use xlnet-large config

    # normally ".from_pretrained('bert-base-uncased')", but because of no internet, the
    # pretrained model has been downloaded manually and uploaded to kaggle.
    bert_model = TFBertModel.from_pretrained(BERT_PATH+'bert-base-uncased-tf_model.h5', config=config)

    # if config.output_hidden_states = True, obtain hidden states via bert_model(...)[-1]
    # The same bert_model instance encodes both the question and the answer.
    q_hidden = bert_model(q_id, attention_mask=q_mask, token_type_ids=q_atn)[-1]
    a_hidden = bert_model(a_id, attention_mask=a_mask, token_type_ids=a_atn)[-1]

    # Average the last four hidden-state tensors, then mean-pool over tokens.
    q = tf.keras.layers.Average()([q_hidden[-i-1] for i in range(4)])
    a = tf.keras.layers.Average()([a_hidden[-i-1] for i in range(4)])

    q = tf.keras.layers.GlobalAveragePooling1D()(q)
    a = tf.keras.layers.GlobalAveragePooling1D()(a)

    #x = tf.keras.layers.Dense(8, activation='relu')(add_f_in)

    x = tf.keras.layers.Concatenate()([q, a])

    x = tf.keras.layers.Dropout(0.2)(x)

    #x = tf.keras.layers.Dense(128, activation='relu')(x)

    #x = tf.keras.layers.Dropout(0.25)(x)

    # One sigmoid output per target column.
    x = tf.keras.layers.Dense(30, activation='sigmoid')(x)

    model = tf.keras.models.Model(inputs=[q_id, q_mask, q_atn, a_id, a_mask, a_atn, add_f_in], outputs=x)

    # fold=None: the callback does not write per-epoch checkpoints.
    custom_callback = CustomCallback(
        valid_data=(valid_inputs + [val_add_features], valid_outputs),
        test_data=test_inputs + [test_add_features],
        batch_size=batch_size,
        fold=None
    )

    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
    model.compile(loss='binary_crossentropy', optimizer=optimizer)
    # summary() prints the table itself and returns None, so it is called
    # directly rather than through display().
    model.summary()

    # train
    history = model.fit(
        train_inputs + [train_add_features],
        train_outputs,
        epochs=3,
        batch_size=batch_size,
        validation_data=(valid_inputs + [val_add_features], valid_outputs),
        callbacks=[custom_callback],
    )

    model.save_weights('model{}.h5'.format(fold))

    # Training vs. validation loss per epoch.
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.show()
    plt.clf()

    valid_preds.append(model.predict(valid_inputs + [val_add_features]))
    test_preds.append(model.predict(test_inputs + [test_add_features]))

    rho_val = compute_spearmanr_ignore_nan(valid_outputs, valid_preds[-1])
    print('validation score = ', rho_val)

    # Keep the callback: it holds the per-epoch test predictions of this fold.
    histories.append(custom_callback)

#### 6. Process and submit test predictions

Average fold predictions, then save as `submission.csv`

In [None]:
# For every trained fold, average the test predictions its callback collected
# across epochs; then average those per-fold results across folds.
per_fold_predictions = [
    np.average(callback.test_predictions, axis=0) for callback in histories
]
test_predictions = np.mean(per_fold_predictions, axis=0)

# Column 0 of the sample submission is kept; all remaining columns are
# overwritten with the averaged predictions.
df_sub.iloc[:, 1:] = test_predictions

df_sub.to_csv('submission.csv', index=False)