In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as up
import pandas as pd
from tensorflow.keras import layers
import bert
import random

In [2]:
import numpy as np
import pandas as pd
import pickle as cPickle
from collections import defaultdict
import re

from bs4 import BeautifulSoup

import sys
import os
import tensorflow as tf
from tensorflow import keras

from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, LSTM, GRU, Bidirectional, TimeDistributed
from keras.models import Model

from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializers

In [3]:
df_merge_quality = pd.read_csv('US_patent_abstract_5000_2015_with_title_1.csv')
df_merge_quality.shape

(5000, 30)

In [4]:
df = df_merge_quality[['claims_text', 'quality_rank']]

In [5]:
df.quality_rank.value_counts()

0    2896
1    2104
Name: quality_rank, dtype: int64

In [6]:
2896/5000

0.5792

In [7]:
df.shape

(5000, 2)

### Prep for Data

In [8]:
def preprocess_text(sen):
    # Removing html tags
    sentence = remove_tags(sen)

    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence

In [9]:
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    return TAG_RE.sub('', text)

In [10]:
claims = []
sentences = list(df['claims_text'])
for sen in sentences:
    claims.append(preprocess_text(str(sen)))

In [11]:
claims[0]

' device comprising memory to store instructions where the device is first device and processor to execute the instructions to receive first message and second message from second device where the first message and the second message differ and where the first message includes first header and first event data and the second message includes second header and second event data identify based on the first header and the second header that the first message and the second messages are problem reports process in response to identifying the first message and the second messages as the problem reports the first event data to determine that the first event data in the first message is associated with particular reconfiguration information of plurality of reconfiguration information corresponding with the second device process in response to identifying the first message and the second messages as the problem reports the second event data to determine that the second event data in the second 

In [12]:
len(claims[0])

8033

In [13]:
print(df.columns.values)

['claims_text' 'quality_rank']


In [14]:
df.quality_rank.unique()

array([0, 1])

In [15]:
# Prepare for data label
data_labels = df.quality_rank.values

In [16]:
data_labels

array([0, 1, 0, ..., 0, 1, 0])

In [17]:
data_labels.shape

(5000,)

In [18]:
sum(data_labels[:4000])

1680

In [19]:
sum(data_labels[4000:])

424

In [20]:
1680 + 424

2104

In [21]:
# BERT tokenizer
BertTokenizer = bert.bert_tokenization.FullTokenizer
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False)
vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = BertTokenizer(vocabulary_file, to_lower_case)

In [22]:
def encode_sentence(sent, max_seq_length):
    if len(sent) <= max_seq_length:
        return ["[CLS]"] + tokenizer.tokenize(sent) + ["[SEP]"]
    else: # BERT limited to 512 tokens
        return ["[CLS]"] + tokenizer.tokenize(sent)[:max_seq_length] + ["[SEP]"]

In [23]:
# BERT takes maximum 512 sequence

MAX_SEQ_LEN=400

tokenized_claims = [encode_sentence(sentence, MAX_SEQ_LEN) for sentence in claims]

In [24]:
tokenized_claims[:2]

[['[CLS]',
  'device',
  'comprising',
  'memory',
  'to',
  'store',
  'instructions',
  'where',
  'the',
  'device',
  'is',
  'first',
  'device',
  'and',
  'processor',
  'to',
  'execute',
  'the',
  'instructions',
  'to',
  'receive',
  'first',
  'message',
  'and',
  'second',
  'message',
  'from',
  'second',
  'device',
  'where',
  'the',
  'first',
  'message',
  'and',
  'the',
  'second',
  'message',
  'differ',
  'and',
  'where',
  'the',
  'first',
  'message',
  'includes',
  'first',
  'header',
  'and',
  'first',
  'event',
  'data',
  'and',
  'the',
  'second',
  'message',
  'includes',
  'second',
  'header',
  'and',
  'second',
  'event',
  'data',
  'identify',
  'based',
  'on',
  'the',
  'first',
  'header',
  'and',
  'the',
  'second',
  'header',
  'that',
  'the',
  'first',
  'message',
  'and',
  'the',
  'second',
  'messages',
  'are',
  'problem',
  'reports',
  'process',
  'in',
  'response',
  'to',
  'identifying',
  'the',
  'first',
  

In [25]:
len(tokenized_claims)

5000

In [26]:
len(tokenized_claims[2])

402

In [27]:
# Prepre the 3 inputs required by BERT deriving from the original data

def get_ids(tokens):
    return tokenizer.convert_tokens_to_ids(tokens)

def get_mask(tokens):
    return np.char.not_equal(tokens, "[PAD]").astype(int)  # if not equal [PAD] then assign 1. 

def get_segments(tokens):
    seg_ids = []
    current_seg_id = 0
    for tok in tokens:
        seg_ids.append(current_seg_id)
        if tok == "[SEP]":
            current_seg_id = 1-current_seg_id # turns 1 into 0 and vice versa
    return seg_ids

Below to create padded batches (so there could have different sentence length between batches, but will be same sentence length within batch) this is to save processing memory.

In [28]:
data_with_len = [[sent, data_labels[i], len(sent)]
                 for i, sent in enumerate(tokenized_claims)]

In [29]:
# split to train and test

data_with_len_train = data_with_len[:4000]
data_with_len_test = data_with_len[4000:]

In [30]:
data_with_len_train[0]

[['[CLS]',
  'device',
  'comprising',
  'memory',
  'to',
  'store',
  'instructions',
  'where',
  'the',
  'device',
  'is',
  'first',
  'device',
  'and',
  'processor',
  'to',
  'execute',
  'the',
  'instructions',
  'to',
  'receive',
  'first',
  'message',
  'and',
  'second',
  'message',
  'from',
  'second',
  'device',
  'where',
  'the',
  'first',
  'message',
  'and',
  'the',
  'second',
  'message',
  'differ',
  'and',
  'where',
  'the',
  'first',
  'message',
  'includes',
  'first',
  'header',
  'and',
  'first',
  'event',
  'data',
  'and',
  'the',
  'second',
  'message',
  'includes',
  'second',
  'header',
  'and',
  'second',
  'event',
  'data',
  'identify',
  'based',
  'on',
  'the',
  'first',
  'header',
  'and',
  'the',
  'second',
  'header',
  'that',
  'the',
  'first',
  'message',
  'and',
  'the',
  'second',
  'messages',
  'are',
  'problem',
  'reports',
  'process',
  'in',
  'response',
  'to',
  'identifying',
  'the',
  'first',
  

In [31]:
np.array(data_with_len_train).shape

(4000, 3)

In [32]:
# check the sentence length
np.array(data_with_len_train)[:, 2]

array([402, 352, 402, ..., 402, 402, 402], dtype=object)

In [33]:
np.unique(np.array(data_with_len_train)[:, 2], return_counts = True)
# majority are in 512 and above

(array([22, 47, 50, 69, 84, 89, 95, 99, 100, 108, 112, 116, 117, 118, 131,
        132, 136, 137, 141, 142, 143, 144, 147, 148, 150, 155, 159, 160,
        162, 164, 165, 168, 173, 174, 175, 176, 178, 180, 181, 182, 184,
        185, 186, 187, 188, 191, 193, 194, 195, 196, 197, 198, 199, 201,
        202, 203, 205, 206, 207, 208, 209, 212, 213, 215, 216, 217, 219,
        220, 222, 224, 225, 226, 227, 228, 229, 230, 231, 233, 234, 235,
        237, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250,
        251, 254, 255, 256, 259, 260, 261, 262, 264, 265, 267, 268, 269,
        270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 283, 284,
        286, 287, 288, 290, 291, 292, 293, 295, 296, 297, 298, 299, 300,
        301, 302, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314,
        316, 318, 320, 321, 322, 323, 324, 326, 327, 328, 329, 331, 332,
        333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345,
        346, 347, 348, 349, 350, 351, 352, 353, 3

In [34]:
def to_batch(data_with_len, BATCH_SIZE):
    data_with_len = sorted(data_with_len, key=lambda x: x[2])
    sorted_all = [([get_ids(sent_lab[0]),
                get_mask(sent_lab[0]),
                get_segments(sent_lab[0])],
               sent_lab[1])
              for sent_lab in data_with_len] 
    
    train_dataset = tf.data.Dataset.from_generator(lambda: sorted_all,
                                             output_types=(tf.int32, tf.int32))
    

    train_batched = train_dataset.padded_batch(BATCH_SIZE, # this is the pre-processed input to feed into BERT embedding layer and DCNN model
                                       padded_shapes=((3, None), ()),
                                       padding_values=(0, 0))
    return train_batched

In [35]:
train_batched = to_batch(data_with_len_train, 32)

In [36]:
next(iter(train_batched))

(<tf.Tensor: shape=(32, 3, 147), dtype=int32, numpy=
 array([[[  101,  2054,  2003, ...,     0,     0,     0],
         [    1,     1,     1, ...,     0,     0,     0],
         [    0,     0,     0, ...,     0,     0,     0]],
 
        [[  101,  2019,  7275, ...,     0,     0,     0],
         [    1,     1,     1, ...,     0,     0,     0],
         [    0,     0,     0, ...,     0,     0,     0]],
 
        [[  101,  1045,  4366, ...,     0,     0,     0],
         [    1,     1,     1, ...,     0,     0,     0],
         [    0,     0,     0, ...,     0,     0,     0]],
 
        ...,
 
        [[  101,  2054,  2003, ...,     0,     0,     0],
         [    1,     1,     1, ...,     0,     0,     0],
         [    0,     0,     0, ...,     0,     0,     0]],
 
        [[  101,  1996, 11028, ...,  8945,  4948,   102],
         [    1,     1,     1, ...,     1,     1,     1],
         [    0,     0,     0, ...,     0,     0,     0]],
 
        [[  101,  1996, 11028, ...,  2003, 2547

In [37]:
# prepare the batches for evaluation set:
test_batched = to_batch(data_with_len_test, 32)

In [38]:
next(iter(test_batched))

(<tf.Tensor: shape=(32, 3, 256), dtype=int32, numpy=
 array([[[  101, 23767,  2278, ...,     0,     0,     0],
         [    1,     1,     1, ...,     0,     0,     0],
         [    0,     0,     0, ...,     0,     0,     0]],
 
        [[  101,  1996, 11028, ...,     0,     0,     0],
         [    1,     1,     1, ...,     0,     0,     0],
         [    0,     0,     0, ...,     0,     0,     0]],
 
        [[  101,  4118,  2005, ...,     0,     0,     0],
         [    1,     1,     1, ...,     0,     0,     0],
         [    0,     0,     0, ...,     0,     0,     0]],
 
        ...,
 
        [[  101,  1996, 11028, ...,     0,     0,     0],
         [    1,     1,     1, ...,     0,     0,     0],
         [    0,     0,     0, ...,     0,     0,     0]],
 
        [[  101,  2054,  2003, ...,     0,     0,     0],
         [    1,     1,     1, ...,     0,     0,     0],
         [    0,     0,     0, ...,     0,     0,     0]],
 
        [[  101,  1996, 11028, ...,  1996,  638

In [39]:
# double check the label distribution
# Validation set
count = 0
sum_positive = 0
for element in test_batched:
    count += len(element[1])
    sum_positive += sum(element[1])

print(f'Sample Count:{count}, Positive Label Count:{sum_positive}, Positive Class Ratio:{sum_positive/count}, Negative Class Ratio = {1-sum_positive/count}')


Sample Count:1000, Positive Label Count:424, Positive Class Ratio:0.424, Negative Class Ratio = 0.5760000000000001


In [40]:
# double check the label distribution
# Train set
count = 0
sum_positive = 0
for element in train_batched:
    count += len(element[1])
    sum_positive += sum(element[1])

print(f'Sample Count:{count}, Positive Label Count:{sum_positive}, Positive Class Ratio:{sum_positive/count}, Negative Class Ratio = {1-sum_positive/count}')


Sample Count:4000, Positive Label Count:1680, Positive Class Ratio:0.42, Negative Class Ratio = 0.5800000000000001


### Model Building

In [41]:
class DCNNBERTEmbedding(tf.keras.Model):
    
    def __init__(self,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.1,
                 name="dcnn"):
        super(DCNNBERTEmbedding, self).__init__(name=name)
        
        self.bert_layer = hub.KerasLayer(
            "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
            trainable=True)

        self.bigram = layers.Conv1D(filters=nb_filters,
                                    kernel_size=5,
                                    padding="valid",
                                    activation="relu")
        self.trigram = layers.Conv1D(filters=nb_filters,
                                     kernel_size=50,
                                     padding="valid",
                                     activation="relu")
        self.fourgram = layers.Conv1D(filters=nb_filters,
                                      kernel_size=100,
                                      padding="valid",
                                      activation="relu")
        self.pool = layers.GlobalMaxPool1D()
        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if nb_classes == 2:
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=nb_classes,
                                           activation="softmax")
    
    def embed_with_bert(self, all_tokens):
        _, embs = self.bert_layer([all_tokens[:, 0, :],
                                   all_tokens[:, 1, :],
                                   all_tokens[:, 2, :]])
        return embs

    def call(self, inputs, training):
        x = self.embed_with_bert(inputs)

        x_1 = self.bigram(x)
        x_1 = self.pool(x_1)
        x_2 = self.trigram(x)
        x_2 = self.pool(x_2)
        x_3 = self.fourgram(x)
        x_3 = self.pool(x_3)
        
        merged = tf.concat([x_1, x_2, x_3], axis=-1) # (batch_size, 3 * nb_filters)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training)
        output = self.last_dense(merged)
        
        return output

### Training

In [42]:
NB_FILTERS = 80
FFN_UNITS = 120
NB_CLASSES = 2

DROPOUT_RATE = 0.2

BATCH_SIZE = 32
NB_EPOCHS = 5

In [43]:
Dcnn = DCNNBERTEmbedding(nb_filters=NB_FILTERS,
                         FFN_units=FFN_UNITS,
                         nb_classes=NB_CLASSES,
                         dropout_rate=DROPOUT_RATE)

In [44]:
if NB_CLASSES == 2:
    Dcnn.compile(loss="binary_crossentropy",
                 optimizer="adam",
                 metrics=["accuracy"])
else:
    Dcnn.compile(loss="sparse_categorical_crossentropy",
                 optimizer="adam",
                 metrics=["sparse_categorical_accuracy"])

In [45]:
checkpoint_path = "ckpt_bert_embedding_fine_tune/"

ckpt = tf.train.Checkpoint(Dcnn=Dcnn)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=2)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored!!")

In [46]:
class MyCustomCallback(tf.keras.callbacks.Callback):

    def on_epoch_end(self, epoch, logs=None):
        ckpt_manager.save()
        print("Checkpoint saved at {}.".format(checkpoint_path))

In [47]:
# fitting time
Dcnn.fit(train_batched,
         validation_data = (test_batched),
         epochs=NB_EPOCHS,
         callbacks=[MyCustomCallback()])

Epoch 1/5
    125/Unknown - 4127s 33s/step - loss: 0.9938 - accuracy: 0.5422Checkpoint saved at ckpt_bert_embedding_fine_tune/.
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f411fa3ed90>

In [39]:
# to train more => re-start from the 4th epoch 
# fitting time
Dcnn.fit(train_batched,
         validation_data = (test_batched),
         epochs=2,
         callbacks=[MyCustomCallback()])

Epoch 1/2
    125/Unknown - 3842s 31s/step - loss: 0.2873 - accuracy: 0.8870Checkpoint saved at ckpt_bert_embedding/.
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7fa65754c710>

In [47]:
# re-start the training again, from epoch 6
Dcnn.fit(train_batched,
         validation_data = (test_batched),
         epochs=2,
         callbacks=[MyCustomCallback()])

Epoch 1/2
    125/Unknown - 3744s 30s/step - loss: 0.3938 - accuracy: 0.8142Checkpoint saved at ckpt_bert_embedding/.
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7fb96cd61f10>

### Evaluation

In [48]:
results = Dcnn.evaluate(test_batched)
print(results)

[0.9070994853973389, 0.6129999756813049]
