In [1]:
# Use the official tokenization script created by the Google team. Ensure GPU & Internet is turned on for this kernel
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py
    
#!wget https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

Import all the necessary libraries

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import re
import math
import gc
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub
    
import tokenization
print("Import libraries complete...")

Import libraries complete...


> Define Helper Functions

In [3]:
def memory_usage_mb(df, *args, **kwargs):
    """Dataframe memory usage in MB. """
    return df.memory_usage(*args, **kwargs).sum() / 1024**2



def reduce_memory_usage(df, deep=True, verbose=True):
    # All types that we want to change for "lighter" ones.
    # int8 and float16 are not include because we cannot reduce
    # those data types.
    # float32 is not include because float16 has too low precision.
    numeric2reduce = ["int16", "int32", "int64", "float64"]
    start_mem = 0
    if verbose:
        start_mem = memory_usage_mb(df, deep=deep)

    for col, col_type in df.dtypes.iteritems():
        best_type = None
        if col_type in numeric2reduce:
            downcast = "integer" if "int" in str(col_type) else "float"
            df[col] = pd.to_numeric(df[col], downcast=downcast)
            best_type = df[col].dtype.name
        # Log the conversion performed.
        if verbose and best_type is not None and best_type != str(col_type):
            print(f"Column '{col}' converted from {col_type} to {best_type}")
    
    if verbose:
        end_mem = memory_usage_mb(df, deep=deep)
        diff_mem = start_mem - end_mem
        percent_mem = 100 * diff_mem / start_mem
        print(f"Memory usage decreased from"
              f" {start_mem:.2f}MB to {end_mem:.2f}MB"
              f" ({diff_mem:.2f}MB, {percent_mem:.2f}% reduction)")
        
    return df



def bert_encode(texts, tokenizer, max_len=512):
    all_tokens = []
    all_masks = []
    all_segments = []
    
    for text in texts:
        text = tokenizer.tokenize(text)
            
        text = text[:max_len-2]
        input_sequence = ["[CLS]"] + text + ["[SEP]"]
        pad_len = max_len - len(input_sequence)
        
        tokens = tokenizer.convert_tokens_to_ids(input_sequence)
        tokens += [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_len
        
        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)
    
    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)



def build_model(bert_layer, max_len=512):
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    clf_output = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(clf_output)
    
    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    model.compile(Adam(lr=2e-6), loss='binary_crossentropy', metrics=['accuracy'])
    
    return model


print("Defined support functions...")

Defined support functions...


In [4]:
train_df = pd.read_csv("../input/nlp-getting-started/train.csv")
test_df  = pd.read_csv("../input/nlp-getting-started/test.csv")
sub_df   = pd.read_csv("../input/nlp-getting-started/sample_submission.csv")


print("Train : ",train_df.shape)         # -- (7613, 5)
print("Test  : ", test_df.shape)         # -- (3263, 5)
print(train_df.target.unique())

test_df['target']=-1

# We create a full dataset with train and test values

df_full = pd.concat([train_df, test_df], sort=True)
print(df_full.shape)                     # -- (10876, 5)
print("Data load complete...")

Train :  (7613, 5)
Test  :  (3263, 4)
[1 0]
(10876, 5)
Data load complete...


In [5]:
# Determine percentage of missing values

print('id       : ' , round(len(df_full.loc[df_full['id'].isna()])/len(df_full) *100, 2), "% Missing values")
print('keyword  : ' , round(len(df_full.loc[df_full['keyword'].isna()])/len(df_full) *100, 2), "% Missing values ; ", round(len(df_full.loc[df_full['keyword'].isna()])), "Missing records")
print('location : ' , round(len(df_full.loc[df_full['location'].isna()])/len(df_full) *100, 2), "% Missing values ; ", round(len(df_full.loc[df_full['location'].isna()])), "Missing records")
print('text     : ' , round(len(df_full.loc[df_full['text'].isna()])/len(df_full) *100, 2), "% Missing values")

id       :  0.0 % Missing values
keyword  :  0.8 % Missing values ;  87 Missing records
location :  33.45 % Missing values ;  3638 Missing records
text     :  0.0 % Missing values


In [6]:
# Convert to lowercase
df_full['keyword'] = df_full['keyword'].str.lower()
df_full['location'] = df_full['location'].str.lower()
df_full['text'] = df_full['text'].str.lower()

print("Converted data to lowercase...")

Converted data to lowercase...


In [7]:
# Basic Data Cleanup 

df_full['keyword'] = df_full['keyword'].str.replace('%20',' ')
df_full['location'] = df_full['location'].str.replace('?','')
df_full['location'] = df_full['location'].str.replace('(','')
df_full['location'] = df_full['location'].str.replace(')','')
df_full['location'] = df_full['location'].str.replace('\x89û¢','')

#df_full.loc[df_full['loc2'].notnull()]

#Replace irrelevant words in location with 'unknown' 
trash_list = ['your','all around','world wide','trash', 'void', 'they', 'them','nowhere', ' bae', 'webcam'
                  'every', 'eatin','there','imagine','@','who','you', 'universe','club', 'peach','surv'] 

pattern = '|'.join(trash_list)     # joining list for comparision
df_full['loc1']=df_full['location'].str.contains(pattern)
df_full[df_full['loc1']==True]
df_full.loc[df_full['loc1']==True,'location'] = 'unknown'

del df_full['loc1']

print("Basic data cleanup complete...")

Basic data cleanup complete...


In [8]:
# Put 'unknown' for missing values
df_full.fillna(value='unknown', inplace=True)

df_full.loc[df_full['location'].str.contains('\*'),'location'] ='unknown'
df_full.loc[df_full['location'].str.contains('don\'t'),'location'] ='unknown'

print("Filled missing values to 'unknown'...")

Filled missing values to 'unknown'...


We check the number of unique values in those keyword and location fields:
* keyword  :  222 records
* location : 4522 records

In [9]:
keyword_list = df_full.keyword.unique()             # -- 222 records
location_list = df_full.location.unique()           # -- 4198 records

print(len(keyword_list),"unique keyword values")             # -- 222 unique keyword values
print(len(location_list),"unique location values")           # -- 4198 unique location values

222 unique keyword values
4198 unique location values


Reduce memory usage

In [10]:
train_df = df_full[df_full['target']!=-1]
test_df = df_full[df_full['target']==-1]

print(train_df.shape)
print(test_df.shape)
print(df_full.shape)

del df_full
del test_df['target']
reduce_memory_usage(train_df)
reduce_memory_usage(test_df)
gc.collect()


(7613, 5)
(3263, 5)
(10876, 5)
Column 'id' converted from int64 to int16
Column 'target' converted from int64 to int8
Memory usage decreased from 2.39MB to 2.30MB (0.09MB, 3.94% reduction)
Column 'id' converted from int64 to int16
Memory usage decreased from 1.01MB to 0.99MB (0.02MB, 1.86% reduction)


22

**Load and Preprocess**
* Load BERT from the Tensorflow Hub
* Load tokenizer from the bert layer
* Encode the text into tokens, masks, and segment flags

In [11]:
%%time
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
bert_layer = hub.KerasLayer(module_url, trainable=True)

CPU times: user 1min 32s, sys: 9.72 s, total: 1min 41s
Wall time: 1min 45s


In [12]:
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [13]:
train_input = bert_encode(train_df.text.values, tokenizer, max_len=160)
test_input = bert_encode(test_df.text.values, tokenizer, max_len=160)
train_labels = train_df.target.values

In [14]:
model = build_model(bert_layer, max_len=160)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 160)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 160)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 160)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 1024), (None 335141889   input_word_ids[0][0]             
                                                                 input_mask[0][0]             

In [15]:
train_history = model.fit(
    train_input, train_labels,
    validation_split=0.2,
    epochs=3,
    batch_size=16
)

model.save('bert_model.h5')

Train on 6090 samples, validate on 1523 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [16]:
test_pred = model.predict(test_input)

In [17]:
sub_df['target'] = test_pred.round().astype(int)
sub_df.to_csv('submission_bert.csv', index=False)