In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as up
import pandas as pd
from tensorflow.keras import layers
import bert
import random

In [2]:
import numpy as np
import pandas as pd
import pickle as cPickle
from collections import defaultdict
import re

from bs4 import BeautifulSoup

import sys
import os
import tensorflow as tf
from tensorflow import keras

from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, LSTM, GRU, Bidirectional, TimeDistributed
from keras.models import Model

from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializers

# string processing
import re
from keras.utils.np_utils import to_categorical
from sklearn import feature_extraction, model_selection, naive_bayes, pipeline, manifold, preprocessing, feature_selection, metrics


In [3]:
# to navigate to the data location
import os

# get current directory 
path = os.getcwd() 

# parent directory
parent = os.path.dirname(path)

df_merge_quality = pd.read_csv(parent + '/data/US_patent_abstract_5000_2015_with_title_1_5y.csv')
df_merge_quality.shape

(5000, 30)

In [4]:
df = df_merge_quality[['text', 'quality_rank']]
df

Unnamed: 0,text,quality_rank
0,Invitation information push method and system....,0
1,Coronal angulating connector. A connector is p...,0
2,Spearfishing apparatus. A device for spearfish...,1
3,Systems and methods for prioritizing media fil...,1
4,Semiconductor integrated circuit. A semiconduc...,0
...,...,...
4995,Cross-platform cloud-based map creation. Metho...,1
4996,Display substrate. A display substrate include...,1
4997,Aminoquinazoline derivatives and their salts a...,1
4998,Method and device for displaying information i...,1


### Prep for Data

In [5]:
def preprocess_text(sen):
    # Removing html tags
    sentence = remove_tags(sen)

    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence

In [6]:
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    return TAG_RE.sub('', text)

In [7]:
claims = []
sentences = list(df['text'])
for sen in sentences:
    claims.append(preprocess_text(str(sen)))

In [8]:
claims[0]

'Invitation information push method and system An invitation information push method includes after receiving an invitation request sent by microblog user server sending invitation information to number of clients corresponding to invited users carried in the invitation request wherein the invited users are users who have not registered microblog and the number of the invited users is greater than or equal to Each client upon receiving the invitation information creating an invitation information guide to guide the users who have not registered the microblog to register the microblog The method further comprises when predetermined time is reached server actively sending invitation information to at least one client corresponding to at least one user who has not registered the microblog '

In [9]:
# Prepare for data label
data_labels = to_categorical(df.quality_rank.values)
data_labels

array([[1., 0.],
       [1., 0.],
       [0., 1.],
       ...,
       [0., 1.],
       [0., 1.],
       [0., 1.]], dtype=float32)

In [10]:
# BERT tokenizer
BertTokenizer = bert.bert_tokenization.FullTokenizer
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False)
vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = BertTokenizer(vocabulary_file, to_lower_case)

In [11]:
def encode_sentence(sent, max_seq_length):
    if len(sent) <= max_seq_length:
        return ["[CLS]"] + tokenizer.tokenize(sent) + ["[SEP]"]
    else: # BERT limited to 512 tokens
        return ["[CLS]"] + tokenizer.tokenize(sent)[:max_seq_length] + ["[SEP]"]

In [12]:
# BERT takes maximum 512 sequence

MAX_SEQ_LEN=510

tokenized_claims = [encode_sentence(sentence, MAX_SEQ_LEN) for sentence in claims]

In [13]:
tokenized_claims[:2]

[['[CLS]',
  'invitation',
  'information',
  'push',
  'method',
  'and',
  'system',
  'an',
  'invitation',
  'information',
  'push',
  'method',
  'includes',
  'after',
  'receiving',
  'an',
  'invitation',
  'request',
  'sent',
  'by',
  'micro',
  '##bl',
  '##og',
  'user',
  'server',
  'sending',
  'invitation',
  'information',
  'to',
  'number',
  'of',
  'clients',
  'corresponding',
  'to',
  'invited',
  'users',
  'carried',
  'in',
  'the',
  'invitation',
  'request',
  'wherein',
  'the',
  'invited',
  'users',
  'are',
  'users',
  'who',
  'have',
  'not',
  'registered',
  'micro',
  '##bl',
  '##og',
  'and',
  'the',
  'number',
  'of',
  'the',
  'invited',
  'users',
  'is',
  'greater',
  'than',
  'or',
  'equal',
  'to',
  'each',
  'client',
  'upon',
  'receiving',
  'the',
  'invitation',
  'information',
  'creating',
  'an',
  'invitation',
  'information',
  'guide',
  'to',
  'guide',
  'the',
  'users',
  'who',
  'have',
  'not',
  'registered

In [14]:
np.array(tokenized_claims).shape

(5000,)

In [15]:
# Prepre the 3 inputs required by BERT deriving from the original data

def get_ids(tokens):
    return tokenizer.convert_tokens_to_ids(tokens)

def get_mask(tokens):
    return np.char.not_equal(tokens, "[PAD]").astype(int)  # if not equal [PAD] then assign 1. 

def get_segments(tokens):
    seg_ids = []
    current_seg_id = 0
    for tok in tokens:
        seg_ids.append(current_seg_id)
        if tok == "[SEP]":
            current_seg_id = 1-current_seg_id # turns 1 into 0 and vice versa
    return seg_ids

Below to create padded batches (so there could have different sentence length between batches, but will be same sentence length within batch) this is to save processing memory.

In [16]:
data_with_len = [[sent, data_labels[i], len(sent)]
                 for i, sent in enumerate(tokenized_claims)]

In [17]:
# split to train and test

data_with_len_train = data_with_len[:4000]
data_with_len_test = data_with_len[4000:]

In [18]:
data_with_len_train[0]

[['[CLS]',
  'invitation',
  'information',
  'push',
  'method',
  'and',
  'system',
  'an',
  'invitation',
  'information',
  'push',
  'method',
  'includes',
  'after',
  'receiving',
  'an',
  'invitation',
  'request',
  'sent',
  'by',
  'micro',
  '##bl',
  '##og',
  'user',
  'server',
  'sending',
  'invitation',
  'information',
  'to',
  'number',
  'of',
  'clients',
  'corresponding',
  'to',
  'invited',
  'users',
  'carried',
  'in',
  'the',
  'invitation',
  'request',
  'wherein',
  'the',
  'invited',
  'users',
  'are',
  'users',
  'who',
  'have',
  'not',
  'registered',
  'micro',
  '##bl',
  '##og',
  'and',
  'the',
  'number',
  'of',
  'the',
  'invited',
  'users',
  'is',
  'greater',
  'than',
  'or',
  'equal',
  'to',
  'each',
  'client',
  'upon',
  'receiving',
  'the',
  'invitation',
  'information',
  'creating',
  'an',
  'invitation',
  'information',
  'guide',
  'to',
  'guide',
  'the',
  'users',
  'who',
  'have',
  'not',
  'registered

In [19]:
np.array(data_with_len_train).shape

(4000, 3)

In [20]:
np.unique(np.array(data_with_len_train)[:, 2], return_counts = True)
# majority are in 512 and above

(array([13, 23, 25, 26, 27, 28, 29, 30, 32, 33, 34, 35, 36, 37, 38, 39, 40,
        41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
        58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
        75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91,
        92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106,
        107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
        120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132,
        133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145,
        146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158,
        159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171,
        172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184,
        185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197,
        198, 199, 200, 201, 202, 204, 205, 206, 207, 208, 209, 210, 211,
        212, 213, 214, 215, 216, 217,

In [21]:
def to_batch(data_with_len, BATCH_SIZE):
    data_with_len = sorted(data_with_len, key=lambda x: x[2])
    sorted_all = [([get_ids(sent_lab[0]),
                get_mask(sent_lab[0]),
                get_segments(sent_lab[0])],
               sent_lab[1])
              for sent_lab in data_with_len] 
    
    train_dataset = tf.data.Dataset.from_generator(lambda: sorted_all,
                                             output_types=(tf.int32, tf.int32))
    

    train_batched = train_dataset.padded_batch(BATCH_SIZE, # this is the pre-processed input to feed into BERT embedding layer and DCNN model
                                       padded_shapes=((3, 512), [2]), # since our label is one-hot encoding format, need to use [2] rather than ()
                                       padding_values=(0, 0))
    return train_batched

In [22]:
# prepare the batches for train set:
train_batched = to_batch(data_with_len_train, 32)

In [23]:
next(iter(train_batched))

(<tf.Tensor: shape=(32, 3, 512), dtype=int32, numpy=
 array([[[  101, 17261, 10099, ...,     0,     0,     0],
         [    1,     1,     1, ...,     0,     0,     0],
         [    0,     0,     0, ...,     0,     0,     0]],
 
        [[  101, 16175,  5162, ...,     0,     0,     0],
         [    1,     1,     1, ...,     0,     0,     0],
         [    0,     0,     0, ...,     0,     0,     0]],
 
        [[  101,  2422, 12495, ...,     0,     0,     0],
         [    1,     1,     1, ...,     0,     0,     0],
         [    0,     0,     0, ...,     0,     0,     0]],
 
        ...,
 
        [[  101, 17782, 21335, ...,     0,     0,     0],
         [    1,     1,     1, ...,     0,     0,     0],
         [    0,     0,     0, ...,     0,     0,     0]],
 
        [[  101,  2835,  3295, ...,     0,     0,     0],
         [    1,     1,     1, ...,     0,     0,     0],
         [    0,     0,     0, ...,     0,     0,     0]],
 
        [[  101, 28406,  1997, ...,     0,     

In [24]:
# prepare the batches for evaluation set:
test_batched = to_batch(data_with_len_test, 32)

In [25]:
next(iter(test_batched))

(<tf.Tensor: shape=(32, 3, 512), dtype=int32, numpy=
 array([[[  101, 19160,  8332, ...,     0,     0,     0],
         [    1,     1,     1, ...,     0,     0,     0],
         [    0,     0,     0, ...,     0,     0,     0]],
 
        [[  101,  9265,  2005, ...,     0,     0,     0],
         [    1,     1,     1, ...,     0,     0,     0],
         [    0,     0,     0, ...,     0,     0,     0]],
 
        [[  101,  4725,  2005, ...,     0,     0,     0],
         [    1,     1,     1, ...,     0,     0,     0],
         [    0,     0,     0, ...,     0,     0,     0]],
 
        ...,
 
        [[  101, 22160,  2102, ...,     0,     0,     0],
         [    1,     1,     1, ...,     0,     0,     0],
         [    0,     0,     0, ...,     0,     0,     0]],
 
        [[  101,  2224,  1997, ...,     0,     0,     0],
         [    1,     1,     1, ...,     0,     0,     0],
         [    0,     0,     0, ...,     0,     0,     0]],
 
        [[  101,  5080,  1998, ...,     0,     

In [26]:
# double check the label distribution
# Validation set
count = 0
sum_positive = 0
for element in test_batched:
    count += len(element[1])
    sum_positive += sum(element[1])

print(f'Sample Count:{count}, Positive Label Count:{sum_positive}, Positive Class Ratio:{sum_positive/count}, Negative Class Ratio = {1-sum_positive/count}')


Sample Count:1000, Positive Label Count:[595 405], Positive Class Ratio:[0.595 0.405], Negative Class Ratio = [0.405 0.595]


In [27]:
# double check the label distribution
# Train set
count = 0
sum_positive = 0
for element in train_batched:
    count += len(element[1])
    sum_positive += sum(element[1])

print(f'Sample Count:{count}, Positive Label Count:{sum_positive}, Positive Class Ratio:{sum_positive/count}, Negative Class Ratio = {1-sum_positive/count}')


Sample Count:4000, Positive Label Count:[2397 1603], Positive Class Ratio:[0.59925 0.40075], Negative Class Ratio = [0.40075 0.59925]


### Model Building

In [28]:
class DCNNBERTEmbedding(tf.keras.Model):
    
    def __init__(self,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.1,
                 name="dcnn"):
        super(DCNNBERTEmbedding, self).__init__(name=name)
        
        self.bert_layer = hub.KerasLayer(
            "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
            trainable=False)

        self.bigram = layers.Conv1D(filters=nb_filters,
                                    kernel_size=5,
                                    padding="valid",
                                    activation="relu")
        self.trigram = layers.Conv1D(filters=nb_filters,
                                     kernel_size=50,
                                     padding="valid",
                                     activation="relu")
        self.fourgram = layers.Conv1D(filters=nb_filters,
                                      kernel_size=100,
                                      padding="valid",
                                      activation="relu")
        self.pool = layers.GlobalMaxPool1D()
        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if nb_classes == 2:
            self.last_dense = layers.Dense(units=2,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=nb_classes,
                                           activation="softmax")
    
    def embed_with_bert(self, all_tokens):
        _, embs = self.bert_layer([all_tokens[:, 0, :],
                                   all_tokens[:, 1, :],
                                   all_tokens[:, 2, :]])
        return embs

    def call(self, inputs, training):
        x = self.embed_with_bert(inputs)

        x_1 = self.bigram(x)
        x_1 = self.pool(x_1)
        x_2 = self.trigram(x)
        x_2 = self.pool(x_2)
        x_3 = self.fourgram(x)
        x_3 = self.pool(x_3)
        
        merged = tf.concat([x_1, x_2, x_3], axis=-1) # (batch_size, 3 * nb_filters)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training)
        output = self.last_dense(merged)
        
        return output

### Training

In [29]:
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = 2

DROPOUT_RATE = 0.2

BATCH_SIZE = 32
NB_EPOCHS = 5

In [30]:
Dcnn = DCNNBERTEmbedding(nb_filters=NB_FILTERS,
                         FFN_units=FFN_UNITS,
                         nb_classes=NB_CLASSES,
                         dropout_rate=DROPOUT_RATE)

In [31]:
if NB_CLASSES == 2:
    Dcnn.compile(loss="binary_crossentropy",
                 optimizer="adam",
                 metrics=["accuracy"])
else:
    Dcnn.compile(loss="sparse_categorical_crossentropy",
                 optimizer="adam",
                 metrics=["sparse_categorical_accuracy"])

In [32]:
checkpoint_path = "ckpt_BERT_CNN_US_5years/"

ckpt = tf.train.Checkpoint(Dcnn=Dcnn)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=2)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored!!")

Latest checkpoint restored!!


In [33]:
class MyCustomCallback(tf.keras.callbacks.Callback):

    def on_epoch_end(self, epoch, logs=None):
        ckpt_manager.save()
        print("Checkpoint saved at {}.".format(checkpoint_path))

In [48]:
# fitting time
Dcnn.fit(train_batched,
         validation_data = (test_batched),
         epochs=NB_EPOCHS,
         callbacks=[MyCustomCallback()])

Epoch 1/5
    125/Unknown - 4415s 35s/step - loss: 0.9421 - accuracy: 0.5767Checkpoint saved at ckpt_BERT_CNN_US_5years/.
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f11520908d0>

In [34]:
# modify from original to_batch in order to not sort to different order because for ensemble we need to preserve the same order among all models
def prediction_to_batch(data_with_len, BATCH_SIZE):
    # data_with_len = sorted(data_with_len, key=lambda x: x[2])
    sorted_all = [([get_ids(sent_lab[0]),
                get_mask(sent_lab[0]),
                get_segments(sent_lab[0])],
               sent_lab[1])
              for sent_lab in data_with_len] 
    
    train_dataset = tf.data.Dataset.from_generator(lambda: sorted_all,
                                             output_types=(tf.int32, tf.int32))
    

    train_batched = train_dataset.padded_batch(BATCH_SIZE, # this is the pre-processed input to feed into BERT embedding layer and DCNN model
                                       padded_shapes=((3, 512), [2]), # make from None to 512 fixed dimension
                                       padding_values=(0, 0)) # drop_remainder = True (optional)
    return train_batched

In [35]:
# prepare the batches for evaluation set:
test_batched_prediction = prediction_to_batch(data_with_len_test, 32)

In [36]:
pred_test = Dcnn.predict(test_batched_prediction)
pred_test

array([[0.84799695, 0.13072672],
       [0.33215982, 0.66788644],
       [0.23617092, 0.7885741 ],
       ...,
       [0.19518188, 0.7800475 ],
       [0.5346638 , 0.48711532],
       [0.8942262 , 0.10214731]], dtype=float32)

In [37]:
np.savetxt('Predict_Output/BERT_CNN_5yr_abstract_title_dev_prob.csv', pred_test)

In [38]:
predicted = [np.argmax(pred) for pred in 
             pred_test]

In [39]:
y_test_binary = df['quality_rank'][4000:].values
y_test_binary

array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,

In [40]:
## Accuracy, Precision, Recall
accuracy = metrics.accuracy_score(y_test_binary, predicted)
auc = metrics.roc_auc_score(y_test_binary, predicted)  # predicted_prob), check doc, seems the second argument required to be shape (n_samples,) for binary case 
                            #multi_class="ovr") # check documentation and seems "ovr" not good for only binary target class
print("Accuracy:",  round(accuracy,3))
print("Auc:", round(auc,3))
print("Detail:")
print(metrics.classification_report(y_test_binary, predicted))



Accuracy: 0.603
Auc: 0.534
Detail:
              precision    recall  f1-score   support

           0       0.61      0.90      0.73       595
           1       0.53      0.17      0.26       405

    accuracy                           0.60      1000
   macro avg       0.57      0.53      0.49      1000
weighted avg       0.58      0.60      0.54      1000

