In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as up
import pandas as pd
from tensorflow.keras import layers
import bert
import random

In [2]:
import numpy as np
import pandas as pd
import pickle as cPickle
from collections import defaultdict
import re

from bs4 import BeautifulSoup

import sys
import os
import tensorflow as tf
from tensorflow import keras

from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, LSTM, GRU, Bidirectional, TimeDistributed
from keras.models import Model

from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializers

# string processing
import re
from keras.utils.np_utils import to_categorical
from sklearn import feature_extraction, model_selection, naive_bayes, pipeline, manifold, preprocessing, feature_selection, metrics


In [3]:
# to navigate to the data location
import os

# get current directory 
path = os.getcwd() 

# parent directory
parent = os.path.dirname(path)

df_merge_quality = pd.read_csv(parent + '/data/US_patent_abstract_5000_2015_with_title_1_5y.csv')
df_merge_quality.shape

(5000, 30)

In [4]:
df = df_merge_quality[['claims_text', 'quality_rank']]
df

Unnamed: 0,claims_text,quality_rank
0,What is claimed is: \n \n 1. An invi...,0
1,What is claimed is: \n \n 1. An impl...,0
2,What is claimed is: \n \n 1. A spear...,1
3,1. A computer-implemented method for the autom...,1
4,What is claimed is: \n \n 1. A semic...,0
...,...,...
4995,What is claimed is: \n \n 1. A compu...,1
4996,What is claimed is: \n \n 1. A displ...,1
4997,What is claimed is: \n \n 1. A compo...,1
4998,The invention claimed is: \n \n 1. A...,1


### Prep for Data

In [5]:
def preprocess_text(sen):
    # Removing html tags
    sentence = remove_tags(sen)

    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence

In [6]:
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    return TAG_RE.sub('', text)

In [7]:
claims = []
sentences = list(df['claims_text'])
for sen in sentences:
    claims.append(preprocess_text(str(sen)))

In [8]:
claims[0]

'What is claimed is An invitation information push method comprising after receiving an invitation request sent by microblog user server sending invitation information to clients corresponding to number of invited users carried in the invitation request wherein the invited users are users who have not registered microblog and the number of the invited users is greater than or equal to and upon receiving the invitation information each client creating an invitation information guide to guide the user who has not registered microblog to register microblog wherein the server sending the invitation information to the clients corresponding to the invited users comprises determining by the server whether one or more of the invited users carried in the invitation request are in restricted list and if none of the invited users carried in the invitation request is in the restricted list sending the invitation information to the clients corresponding to the invited users carried in the invitatio

In [9]:
data_labels = df.quality_rank.values
data_labels

array([0, 0, 1, ..., 1, 1, 1])

In [10]:
# Prepare for data label
data_labels = to_categorical(df.quality_rank.values)
data_labels

array([[1., 0.],
       [1., 0.],
       [0., 1.],
       ...,
       [0., 1.],
       [0., 1.],
       [0., 1.]], dtype=float32)

In [11]:
# BERT tokenizer
BertTokenizer = bert.bert_tokenization.FullTokenizer
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False)
vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = BertTokenizer(vocabulary_file, to_lower_case)

In [12]:
def encode_sentence(sent, max_seq_length):
    if len(sent) <= max_seq_length:
        return ["[CLS]"] + tokenizer.tokenize(sent) + ["[SEP]"]
    else: # BERT limited to 512 tokens
        return ["[CLS]"] + tokenizer.tokenize(sent)[:max_seq_length] + ["[SEP]"]

In [13]:
# BERT takes maximum 512 sequence

MAX_SEQ_LEN=510

tokenized_claims = [encode_sentence(sentence, MAX_SEQ_LEN) for sentence in claims]

In [14]:
tokenized_claims[:2]

[['[CLS]',
  'what',
  'is',
  'claimed',
  'is',
  'an',
  'invitation',
  'information',
  'push',
  'method',
  'comprising',
  'after',
  'receiving',
  'an',
  'invitation',
  'request',
  'sent',
  'by',
  'micro',
  '##bl',
  '##og',
  'user',
  'server',
  'sending',
  'invitation',
  'information',
  'to',
  'clients',
  'corresponding',
  'to',
  'number',
  'of',
  'invited',
  'users',
  'carried',
  'in',
  'the',
  'invitation',
  'request',
  'wherein',
  'the',
  'invited',
  'users',
  'are',
  'users',
  'who',
  'have',
  'not',
  'registered',
  'micro',
  '##bl',
  '##og',
  'and',
  'the',
  'number',
  'of',
  'the',
  'invited',
  'users',
  'is',
  'greater',
  'than',
  'or',
  'equal',
  'to',
  'and',
  'upon',
  'receiving',
  'the',
  'invitation',
  'information',
  'each',
  'client',
  'creating',
  'an',
  'invitation',
  'information',
  'guide',
  'to',
  'guide',
  'the',
  'user',
  'who',
  'has',
  'not',
  'registered',
  'micro',
  '##bl',
  '#

In [15]:
np.array(tokenized_claims).shape

(5000,)

In [16]:
# Prepre the 3 inputs required by BERT deriving from the original data

def get_ids(tokens):
    return tokenizer.convert_tokens_to_ids(tokens)

def get_mask(tokens):
    return np.char.not_equal(tokens, "[PAD]").astype(int)  # if not equal [PAD] then assign 1. 

def get_segments(tokens):
    seg_ids = []
    current_seg_id = 0
    for tok in tokens:
        seg_ids.append(current_seg_id)
        if tok == "[SEP]":
            current_seg_id = 1-current_seg_id # turns 1 into 0 and vice versa
    return seg_ids

Below to create padded batches (so there could have different sentence length between batches, but will be same sentence length within batch) this is to save processing memory.

In [17]:
data_with_len = [[sent, data_labels[i], len(sent)]
                 for i, sent in enumerate(tokenized_claims)]

In [18]:
# split to train and test

data_with_len_train = data_with_len[:4000]
data_with_len_test = data_with_len[4000:]

In [19]:
data_with_len_train[0]

[['[CLS]',
  'what',
  'is',
  'claimed',
  'is',
  'an',
  'invitation',
  'information',
  'push',
  'method',
  'comprising',
  'after',
  'receiving',
  'an',
  'invitation',
  'request',
  'sent',
  'by',
  'micro',
  '##bl',
  '##og',
  'user',
  'server',
  'sending',
  'invitation',
  'information',
  'to',
  'clients',
  'corresponding',
  'to',
  'number',
  'of',
  'invited',
  'users',
  'carried',
  'in',
  'the',
  'invitation',
  'request',
  'wherein',
  'the',
  'invited',
  'users',
  'are',
  'users',
  'who',
  'have',
  'not',
  'registered',
  'micro',
  '##bl',
  '##og',
  'and',
  'the',
  'number',
  'of',
  'the',
  'invited',
  'users',
  'is',
  'greater',
  'than',
  'or',
  'equal',
  'to',
  'and',
  'upon',
  'receiving',
  'the',
  'invitation',
  'information',
  'each',
  'client',
  'creating',
  'an',
  'invitation',
  'information',
  'guide',
  'to',
  'guide',
  'the',
  'user',
  'who',
  'has',
  'not',
  'registered',
  'micro',
  '##bl',
  '#

In [20]:
np.array(data_with_len_train).shape

(4000, 3)

In [21]:
np.unique(np.array(data_with_len_train)[:, 2], return_counts = True)
# majority are in 512 and above

(array([28, 39, 49, 59, 72, 83, 89, 101, 108, 112, 113, 120, 123, 128, 134,
        142, 145, 147, 149, 150, 151, 152, 154, 155, 157, 158, 159, 163,
        164, 165, 166, 167, 169, 176, 177, 178, 181, 182, 183, 187, 189,
        191, 192, 193, 194, 200, 201, 203, 204, 205, 206, 208, 209, 210,
        213, 215, 216, 217, 218, 219, 220, 222, 223, 224, 227, 228, 229,
        231, 232, 233, 235, 236, 237, 238, 239, 241, 244, 245, 246, 247,
        249, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 262, 263,
        264, 267, 268, 269, 271, 272, 273, 275, 276, 277, 278, 279, 280,
        281, 282, 283, 284, 286, 287, 288, 291, 292, 293, 295, 296, 297,
        298, 299, 300, 301, 302, 303, 304, 305, 306, 308, 309, 311, 313,
        315, 316, 317, 318, 319, 320, 321, 322, 323, 326, 327, 328, 329,
        330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342,
        343, 344, 345, 346, 348, 349, 350, 352, 355, 356, 357, 358, 359,
        360, 361, 362, 363, 364, 365, 366, 367, 

In [22]:
def to_batch(data_with_len, BATCH_SIZE):
    data_with_len = sorted(data_with_len, key=lambda x: x[2])
    sorted_all = [([get_ids(sent_lab[0]),
                get_mask(sent_lab[0]),
                get_segments(sent_lab[0])],
               sent_lab[1])
              for sent_lab in data_with_len] 
    
    train_dataset = tf.data.Dataset.from_generator(lambda: sorted_all,
                                             output_types=(tf.int32, tf.int32))
    

    train_batched = train_dataset.padded_batch(BATCH_SIZE, # this is the pre-processed input to feed into BERT embedding layer and DCNN model
                                       padded_shapes=((3, None), [2]), # oh, here is the key, need to change from () to [2] now that I pass in the one-hot label!
                                       padding_values=(0, 0)) # drop_remainder = True (optional)
    return train_batched

In [23]:
# prepare the batches for train set:
train_batched = to_batch(data_with_len_train, 32)

In [24]:
train_batched # note: if above use drop_remainder = True, then => shapes: ((32, 3, None), (32, 2)), types: (tf.int32, tf.int32)>  because it would only be "None" if any single item is inconsistent in shape with the rest

<PaddedBatchDataset shapes: ((None, 3, None), (None, 2)), types: (tf.int32, tf.int32)>

In [25]:
next(iter(train_batched))


(<tf.Tensor: shape=(32, 3, 158), dtype=int32, numpy=
 array([[[ 101, 2054, 2003, ...,    0,    0,    0],
         [   1,    1,    1, ...,    0,    0,    0],
         [   0,    0,    0, ...,    0,    0,    0]],
 
        [[ 101, 1045, 4366, ...,    0,    0,    0],
         [   1,    1,    1, ...,    0,    0,    0],
         [   0,    0,    0, ...,    0,    0,    0]],
 
        [[ 101, 6064, 9570, ...,    0,    0,    0],
         [   1,    1,    1, ...,    0,    0,    0],
         [   0,    0,    0, ...,    0,    0,    0]],
 
        ...,
 
        [[ 101, 2019, 7275, ...,    0,    0,    0],
         [   1,    1,    1, ...,    0,    0,    0],
         [   0,    0,    0, ...,    0,    0,    0]],
 
        [[ 101, 2054, 2003, ..., 5051,  102,    0],
         [   1,    1,    1, ...,    1,    1,    0],
         [   0,    0,    0, ...,    0,    0,    0]],
 
        [[ 101, 2054, 2003, ..., 4984, 2604,  102],
         [   1,    1,    1, ...,    1,    1,    1],
         [   0,    0,    0, ..., 

In [26]:
# prepare the batches for evaluation set:
test_batched = to_batch(data_with_len_test, 32)

In [27]:
next(iter(test_batched))

(<tf.Tensor: shape=(32, 3, 233), dtype=int32, numpy=
 array([[[  101, 17782,  2594, ...,     0,     0,     0],
         [    1,     1,     1, ...,     0,     0,     0],
         [    0,     0,     0, ...,     0,     0,     0]],
 
        [[  101,  1996, 11028, ...,     0,     0,     0],
         [    1,     1,     1, ...,     0,     0,     0],
         [    0,     0,     0, ...,     0,     0,     0]],
 
        [[  101,  2054,  2003, ...,     0,     0,     0],
         [    1,     1,     1, ...,     0,     0,     0],
         [    0,     0,     0, ...,     0,     0,     0]],
 
        ...,
 
        [[  101,  4118,  2005, ...,     0,     0,     0],
         [    1,     1,     1, ...,     0,     0,     0],
         [    0,     0,     0, ...,     0,     0,     0]],
 
        [[  101,  1996, 11028, ...,   102,     0,     0],
         [    1,     1,     1, ...,     1,     0,     0],
         [    0,     0,     0, ...,     0,     0,     0]],
 
        [[  101,  2054,  2003, ..., 17225, 1361

In [28]:
# double check the label distribution
# Validation set
count = 0
sum_positive = 0
for element in test_batched:
    count += len(element[1])
    sum_positive += sum(element[1])

print(f'Sample Count:{count}, Positive Label Count:{sum_positive}, Positive Class Ratio:{sum_positive/count}, Negative Class Ratio = {1-sum_positive/count}')


Sample Count:1000, Positive Label Count:[595 405], Positive Class Ratio:[0.595 0.405], Negative Class Ratio = [0.405 0.595]


In [29]:
# double check the label distribution
# Train set
count = 0
sum_positive = 0
for element in train_batched:
    count += len(element[1])
    sum_positive += sum(element[1])

print(f'Sample Count:{count}, Positive Label Count:{sum_positive}, Positive Class Ratio:{sum_positive/count}, Negative Class Ratio = {1-sum_positive/count}')


Sample Count:4000, Positive Label Count:[2397 1603], Positive Class Ratio:[0.59925 0.40075], Negative Class Ratio = [0.40075 0.59925]


### Model Building

In [30]:
class DCNNBERTEmbedding(tf.keras.Model):
    
    def __init__(self,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.1,
                 name="dcnn"):
        super(DCNNBERTEmbedding, self).__init__(name=name)
        
        self.bert_layer = hub.KerasLayer(
            "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
            trainable=False)

        self.bigram = layers.Conv1D(filters=nb_filters,
                                    kernel_size=5,
                                    padding="valid",
                                    activation="relu")
        self.trigram = layers.Conv1D(filters=nb_filters,
                                     kernel_size=50,
                                     padding="valid",
                                     activation="relu")
        self.fourgram = layers.Conv1D(filters=nb_filters,
                                      kernel_size=100,
                                      padding="valid",
                                      activation="relu")
        self.pool = layers.GlobalMaxPool1D()
        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if nb_classes == 2:
            self.last_dense = layers.Dense(units=2,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=nb_classes,
                                           activation="softmax")
    
    def embed_with_bert(self, all_tokens):
        _, embs = self.bert_layer([all_tokens[:, 0, :],
                                   all_tokens[:, 1, :],
                                   all_tokens[:, 2, :]])
        return embs

    def call(self, inputs, training):
        x = self.embed_with_bert(inputs)

        x_1 = self.bigram(x)
        x_1 = self.pool(x_1)
        x_2 = self.trigram(x)
        x_2 = self.pool(x_2)
        x_3 = self.fourgram(x)
        x_3 = self.pool(x_3)
        
        merged = tf.concat([x_1, x_2, x_3], axis=-1) # (batch_size, 3 * nb_filters)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training)
        output = self.last_dense(merged)
        
        return output

### Training

In [31]:
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = 2

DROPOUT_RATE = 0.2

BATCH_SIZE = 32
NB_EPOCHS = 5

In [32]:
Dcnn = DCNNBERTEmbedding(nb_filters=NB_FILTERS,
                         FFN_units=FFN_UNITS,
                         nb_classes=NB_CLASSES,
                         dropout_rate=DROPOUT_RATE)

In [33]:
if NB_CLASSES == 2:
    Dcnn.compile(loss="binary_crossentropy",
                 optimizer="adam",
                 metrics=["accuracy"])
else:
    Dcnn.compile(loss="sparse_categorical_crossentropy",
                 optimizer="adam",
                 metrics=["sparse_categorical_accuracy"])

In [34]:
checkpoint_path = "ckpt_BERT_CNN_US_5years/"

ckpt = tf.train.Checkpoint(Dcnn=Dcnn)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=2)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored!!")

Latest checkpoint restored!!


In [35]:
class MyCustomCallback(tf.keras.callbacks.Callback):

    def on_epoch_end(self, epoch, logs=None):
        ckpt_manager.save()
        print("Checkpoint saved at {}.".format(checkpoint_path))

In [None]:
# fitting time
Dcnn.fit(train_batched,
         validation_data = (test_batched),
         epochs=NB_EPOCHS,
         callbacks=[MyCustomCallback()])

Epoch 1/5
    125/Unknown - 2258s 18s/step - loss: 1.1705 - accuracy: 0.5658Checkpoint saved at ckpt_BERT_CNN_US_5years/.
Epoch 2/5
Epoch 3/5

In [38]:
# disconnected, re-start from epoch = 3
# fitting time
Dcnn.fit(train_batched,
         validation_data = (test_batched),
         epochs=3,
         callbacks=[MyCustomCallback()])

Epoch 1/3
    125/Unknown - 2310s 18s/step - loss: 0.6320 - accuracy: 0.6565Checkpoint saved at ckpt_BERT_CNN_US_5years/.
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f91afdae490>

In [36]:
# to restore checkpoint 3

checkpoint_path = "ckpt_BERT_CNN_US_5years/"

ckpt = tf.train.Checkpoint(Dcnn=Dcnn)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=2)

if ckpt_manager.latest_checkpoint:
    ckpt.restore('ckpt_BERT_CNN_US_5years/ckpt-3')
    print("Latest checkpoint restored!!")

Latest checkpoint restored!!


In [37]:
# double check the label distribution
# Validation set
count = 0
sum_positive = 0
for element in test_batched:
    count += len(element[1])
    sum_positive += sum(element[1])

print(f'Sample Count:{count}, Positive Label Count:{sum_positive}, Positive Class Ratio:{sum_positive/count}, Negative Class Ratio = {1-sum_positive/count}')


Sample Count:1000, Positive Label Count:[595 405], Positive Class Ratio:[0.595 0.405], Negative Class Ratio = [0.405 0.595]


In [38]:
# modify from original to_batch in order to not sort to different order because for ensemble we need to preserve the same order among all models
def prediction_to_batch(data_with_len, BATCH_SIZE):
    # data_with_len = sorted(data_with_len, key=lambda x: x[2])
    sorted_all = [([get_ids(sent_lab[0]),
                get_mask(sent_lab[0]),
                get_segments(sent_lab[0])],
               sent_lab[1])
              for sent_lab in data_with_len] 
    
    train_dataset = tf.data.Dataset.from_generator(lambda: sorted_all,
                                             output_types=(tf.int32, tf.int32))
    

    train_batched = train_dataset.padded_batch(BATCH_SIZE, # this is the pre-processed input to feed into BERT embedding layer and DCNN model
                                       padded_shapes=((3, 512), [2]), # make from None to 512 fixed dimension
                                       padding_values=(0, 0)) # drop_remainder = True (optional)
    return train_batched

In [39]:
# prepare the batches for evaluation set:
test_batched_prediction = prediction_to_batch(data_with_len_test, 32)

In [40]:
next(iter(test_batched_prediction))

(<tf.Tensor: shape=(32, 3, 512), dtype=int32, numpy=
 array([[[  101,  2054,  2003, ...,  1997,  1996,   102],
         [    1,     1,     1, ...,     1,     1,     1],
         [    0,     0,     0, ...,     0,     0,     0]],
 
        [[  101,  2054,  2003, ...,  2011,  1996,   102],
         [    1,     1,     1, ...,     1,     1,     1],
         [    0,     0,     0, ...,     0,     0,     0]],
 
        [[  101,  2057,  4366, ..., 17435, 19251,   102],
         [    1,     1,     1, ...,     1,     1,     1],
         [    0,     0,     0, ...,     0,     0,     0]],
 
        ...,
 
        [[  101,  2054,  2003, ...,     0,     0,     0],
         [    1,     1,     1, ...,     0,     0,     0],
         [    0,     0,     0, ...,     0,     0,     0]],
 
        [[  101,  2054,  2003, ..., 23820,  1996,   102],
         [    1,     1,     1, ...,     1,     1,     1],
         [    0,     0,     0, ...,     0,     0,     0]],
 
        [[  101,  2054,  2003, ...,  5080, 1063

In [41]:
pred_test = Dcnn.predict(test_batched_prediction)
pred_test

array([[0.71562785, 0.30147487],
       [0.6611862 , 0.3688208 ],
       [0.41553974, 0.5637231 ],
       ...,
       [0.4392148 , 0.54302084],
       [0.39951873, 0.5568684 ],
       [0.31424057, 0.6504709 ]], dtype=float32)

In [42]:
pred_test.shape

(1000, 2)

In [43]:
np.savetxt('Predict_Output/BERT_CNN_5yr_claims_dev_prob.csv', pred_test)

In [44]:
predicted = [np.argmax(pred) for pred in 
             pred_test]

In [45]:
y_test_binary = df['quality_rank'][4000:].values


In [46]:
## Accuracy, Precision, Recall
accuracy = metrics.accuracy_score(y_test_binary, predicted)
auc = metrics.roc_auc_score(y_test_binary, predicted)  # predicted_prob), check doc, seems the second argument required to be shape (n_samples,) for binary case 
                            #multi_class="ovr") # check documentation and seems "ovr" not good for only binary target class
print("Accuracy:",  round(accuracy,3))
print("Auc:", round(auc,3))
print("Detail:")
print(metrics.classification_report(y_test_binary, predicted))



Accuracy: 0.604
Auc: 0.573
Detail:
              precision    recall  f1-score   support

           0       0.65      0.73      0.69       595
           1       0.51      0.41      0.46       405

    accuracy                           0.60      1000
   macro avg       0.58      0.57      0.57      1000
weighted avg       0.59      0.60      0.59      1000

