In [2]:
#!pip install transformers
# reference https://towardsdatascience.com/text-classification-with-nlp-tf-idf-vs-word2vec-vs-bert-41ff868d1794

In [112]:
## for data
import json
import pandas as pd
import numpy as np

## for plotting
import matplotlib.pyplot as plt
import seaborn as sns

## for deep learning
from tensorflow.keras import models, layers, preprocessing as kprocessing
from tensorflow.keras import backend as K

## for bert language model
import transformers

# for checkpoint
import tensorflow as tf

# string processing
import re

from keras.utils.np_utils import to_categorical

from sklearn import feature_extraction, model_selection, naive_bayes, pipeline, manifold, preprocessing, feature_selection, metrics

In [2]:
# to navigate to the data location
import os

# get current directory 
path = os.getcwd() 

# parent directory
parent = os.path.dirname(path)

In [3]:
df_merge_quality = pd.read_csv(parent + '/data/US_patent_abstract_5000_2015_with_title_1_5y.csv')
df_merge_quality.shape

(5000, 30)

In [4]:
df = df_merge_quality[['claims_text', 'quality_rank']]
df

Unnamed: 0,claims_text,quality_rank
0,What is claimed is: \n \n 1. An invi...,0
1,What is claimed is: \n \n 1. An impl...,0
2,What is claimed is: \n \n 1. A spear...,1
3,1. A computer-implemented method for the autom...,1
4,What is claimed is: \n \n 1. A semic...,0
...,...,...
4995,What is claimed is: \n \n 1. A compu...,1
4996,What is claimed is: \n \n 1. A displ...,1
4997,What is claimed is: \n \n 1. A compo...,1
4998,The invention claimed is: \n \n 1. A...,1


In [5]:
## distil-bert tokenizer
tokenizer = transformers.AutoTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)

In [6]:
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    return TAG_RE.sub('', text)

In [7]:
def preprocess_text(sen):
    # Removing html tags
    sentence = remove_tags(sen)

    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)
    
    sentence = re.sub(r"\\", "", sentence)    
    sentence = re.sub(r"\'", "", sentence)    
    sentence = re.sub(r"\"", "", sentence)    
    

    return sentence.strip()

In [8]:
claims = []
sentences = list(df['claims_text'])
for sen in sentences:
    claims.append(preprocess_text(str(sen)))

In [9]:
claims[0]

'What is claimed is An invitation information push method comprising after receiving an invitation request sent by microblog user server sending invitation information to clients corresponding to number of invited users carried in the invitation request wherein the invited users are users who have not registered microblog and the number of the invited users is greater than or equal to and upon receiving the invitation information each client creating an invitation information guide to guide the user who has not registered microblog to register microblog wherein the server sending the invitation information to the clients corresponding to the invited users comprises determining by the server whether one or more of the invited users carried in the invitation request are in restricted list and if none of the invited users carried in the invitation request is in the restricted list sending the invitation information to the clients corresponding to the invited users carried in the invitatio

In [32]:
# organize data for BERT input
def encode_sentence(sent, max_seq_length):
    tok_sent = tokenizer.tokenize(sent)
    length = len(tok_sent)
    if length <= max_seq_length:
        return ["[CLS] "] + tok_sent+ [" [SEP] "] + [" [PAD] "] * (max_seq_length - length) # make sure is ["SEP"] the list * number. Otherwise it doesn't add up to your desired number!
    else: # BERT limited to 512 tokens
        return ["[CLS] "] + tok_sent[:max_seq_length] + [" [SEP] "]

In [54]:
# Generate the input data to feed into BERT
corpus_train = claims[:4000]
maxlen = 510

## add special tokens
# the initial code will have consistency issue if some text are longer than maxlen, and some shorten, and cause error in X_train at the bottom (so I count its unique value => (array([500, 501, 502, 503]), array([ 805,    5,    3, 4187])))
# So I adjust the code to remove the .split(" ") and prcesss use list instead !
# and the process gets more efficient as well!

corpus_tokenized = [encode_sentence(txt, maxlen) for txt in corpus_train]

## generate masks
masks = [[1]*len(txt) + [0]*(maxlen - len(
           txt)) for txt in corpus_tokenized]
    
## padding
txt2seq = [txt + [" [PAD]"]*(maxlen-len(txt)) if len(txt) != maxlen else txt for txt in corpus_tokenized]
    
## generate idx
idx = [tokenizer.convert_tokens_to_ids (seq) for seq in txt2seq]  # I think the problem is here, the tokenizer.encode() seems automatically adding a [CLS] up-front..
#idx = [tokenizer.encode(seq.split(" ")) for seq in txt2seq]    

## generate segments
segments = [] 
for seq in txt2seq:
    temp, i = [], 0
    for token in seq:
        temp.append(i)
        if token == "[SEP]":
             i += 1
    segments.append(temp)
## feature matrix
X_train = [np.asarray(idx, dtype='int32'), 
           np.asarray(masks, dtype='int32'), 
           np.asarray(segments, dtype='int32')]

In [56]:
X_train

[array([[  101,  2054,  2003, ...,  1998,  4773,   102],
        [  101,  2054,  2003, ...,  2029,  2003,   102],
        [  101,  2054,  2003, ...,  1996, 12341,   102],
        ...,
        [  101,  2054,  2003, ...,  2445,  2005,   102],
        [  101,  2057,  4366, ..., 14021, 21332,   102],
        [  101,  1996, 11028, ...,  2011,  2029,   102]], dtype=int32),
 array([[1, 1, 1, ..., 1, 1, 1],
        [1, 1, 1, ..., 1, 1, 1],
        [1, 1, 1, ..., 1, 1, 1],
        ...,
        [1, 1, 1, ..., 1, 1, 1],
        [1, 1, 1, ..., 1, 1, 1],
        [1, 1, 1, ..., 1, 1, 1]], dtype=int32),
 array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int32)]

In [57]:
np.array(X_train).shape

(3, 4000, 512)

In [40]:
i = 0
print("txt: ", claims[0])
print("tokenized:", [tokenizer.convert_ids_to_tokens(idx) for idx in X_train[0][i].tolist()])
print("idx: ", X_train[0][i])
print("mask: ", X_train[1][i])
print("segment: ", X_train[2][i])

txt:  What is claimed is An invitation information push method comprising after receiving an invitation request sent by microblog user server sending invitation information to clients corresponding to number of invited users carried in the invitation request wherein the invited users are users who have not registered microblog and the number of the invited users is greater than or equal to and upon receiving the invitation information each client creating an invitation information guide to guide the user who has not registered microblog to register microblog wherein the server sending the invitation information to the clients corresponding to the invited users comprises determining by the server whether one or more of the invited users carried in the invitation request are in restricted list and if none of the invited users carried in the invitation request is in the restricted list sending the invitation information to the clients corresponding to the invited users carried in the invi

In [89]:
## inputs
idx = layers.Input((maxlen+2), dtype="int32", name="input_idx")
masks = layers.Input((maxlen+2), dtype="int32", name="input_masks")
## pre-trained bert with config
config = transformers.DistilBertConfig(dropout=0.2, 
           attention_dropout=0.2)
config.output_hidden_states = False
nlp = transformers.TFDistilBertModel.from_pretrained('distilbert-base-uncased', config=config)
bert_out = nlp(idx, attention_mask=masks)[0]
## fine-tuning
x = layers.GlobalAveragePooling1D()(bert_out)
x = layers.Dense(64, activation="relu")(x)
y_out = layers.Dense(2, activation='sigmoid')(x)
## compile
model = models.Model([idx, masks], y_out)
for layer in model.layers[:3]:
    layer.trainable = False
model.compile(loss='binary_crossentropy', 
              optimizer='adam', metrics=['accuracy'])
model.summary()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_projector', 'activation_13', 'vocab_layer_norm', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Model: "model_6"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_idx (InputLayer)          [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_masks (InputLayer)        [(None, 512)]        0                                            
__________________________________________________________________________________________________
tf_distil_bert_model_6 (TFDisti ((None, 512, 768),)  66362880    input_idx[0][0]                  
__________________________________________________________________________________________________
global_average_pooling1d_6 (Glo (None, 768)          0           tf_distil_bert_model_6[0][0]     
____________________________________________________________________________________________

In [88]:
# Generate the test input data to feed into BERT
corpus_train = claims[4000:]
maxlen = 510

## add special tokens
# the initial code will have consistency issue if some text are longer than maxlen, and some shorten, and cause error in X_train at the bottom (so I count its unique value => (array([500, 501, 502, 503]), array([ 805,    5,    3, 4187])))
# So I adjust the code to remove the .split(" ") and prcesss use list instead !
# and the process gets more efficient as well!
'''
maxqnans = np.int((maxlen-20)/2)
corpus_tokenized = ["[CLS] "+
             " ".join(tokenizer.tokenize(re.sub(r'[^\w\s]+|\n', '', 
             str(txt).lower().strip()))[:maxlen])+
             " [SEP] " for txt in corpus]  # truncate each claim to the limit maxqnans length
'''

corpus_tokenized = [encode_sentence(txt, maxlen) for txt in corpus_train]

## generate masks
masks = [[1]*len(txt) + [0]*(maxlen - len(
           txt)) for txt in corpus_tokenized]
    
## padding
txt2seq = [txt + [" [PAD]"]*(maxlen-len(txt)) if len(txt) != maxlen else txt for txt in corpus_tokenized]
    
## generate idx
idx = [tokenizer.convert_tokens_to_ids (seq) for seq in txt2seq]  # I think the problem is here, the tokenizer.encode() seems automatically adding a [CLS] up-front..
#idx = [tokenizer.encode(seq.split(" ")) for seq in txt2seq]    

## generate segments
segments = [] 
for seq in txt2seq:
    temp, i = [], 0
    for token in seq:
        temp.append(i)
        if token == "[SEP]":
             i += 1
    segments.append(temp)
## feature matrix
X_test = [np.asarray(idx, dtype='int32'), 
           np.asarray(masks, dtype='int32'), 
           np.asarray(segments, dtype='int32')]

In [82]:
y_train = to_categorical(df['quality_rank'].values[:4000])
y_train

array([[1., 0.],
       [1., 0.],
       [0., 1.],
       ...,
       [1., 0.],
       [0., 1.],
       [1., 0.]], dtype=float32)

In [87]:
y_test = to_categorical(df['quality_rank'].values[4000:])
y_test

array([[1., 0.],
       [0., 1.],
       [1., 0.],
       ...,
       [0., 1.],
       [0., 1.],
       [0., 1.]], dtype=float32)

In [90]:
# setup checkpoint

checkpoint_path = "ckpt_distil-bert_embedding/"

ckpt = tf.train.Checkpoint(model = model) # https://www.tensorflow.org/api_docs/python/tf/train/Checkpoint

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_pathath, max_to_keep=2)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored!!")

In [91]:
class MyCustomCallback(tf.keras.callbacks.Callback):

    def on_epoch_end(self, epoch, logs=None):
        ckpt_manager.save()
        print("Checkpoint saved at {}.".format(checkpoint_path))

In [92]:
## train
training = model.fit(x=X_train, y=y_train, batch_size=64, 
                     epochs=1, validation_data=(X_test, y_test),
                    callbacks=[MyCustomCallback()])



In [93]:
## train more
training = model.fit(x=X_train, y=y_train, batch_size=64, 
                     epochs=1, validation_data=(X_test, y_test),
                    callbacks=[MyCustomCallback()])



In [94]:
## train more
training = model.fit(x=X_train, y=y_train, batch_size=64, 
                     epochs=3, validation_data=(X_test, y_test),
                    callbacks=[MyCustomCallback()])

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [102]:
pred_test = model.predict(X_test)

In [103]:
pred_test

array([[0.7312715 , 0.2684852 ],
       [0.7173048 , 0.30848968],
       [0.49267268, 0.5057405 ],
       ...,
       [0.4282417 , 0.56273687],
       [0.5180499 , 0.47388846],
       [0.51664007, 0.46902952]], dtype=float32)

In [104]:
np.savetxt('Distil-BERT_dev_prob.csv', pred_test)

In [105]:
pred_test.shape

(1000, 2)

In [95]:
y_test

array([[1., 0.],
       [0., 1.],
       [1., 0.],
       ...,
       [0., 1.],
       [0., 1.],
       [0., 1.]], dtype=float32)

In [101]:
1 - sum(df['quality_rank'][4000:].values) / 1000

0.595

In [108]:
predicted = [np.argmax(pred) for pred in 
             pred_test]

In [116]:
print(len(predicted))
predicted = np.array(predicted)

1000


In [111]:
sum(predicted)

156

In [113]:
y_test_binary = df['quality_rank'][4000:].values
y_test_binary

array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,

In [119]:
## Accuracy, Precision, Recall
accuracy = metrics.accuracy_score(y_test_binary, predicted)
auc = metrics.roc_auc_score(y_test_binary, predicted)  # predicted_prob), check doc, seems the second argument required to be shape (n_samples,) for binary case 
                            #multi_class="ovr") # check documentation and seems "ovr" not good for only binary target class
print("Accuracy:",  round(accuracy,3))
print("Auc:", round(auc,3))
print("Detail:")
print(metrics.classification_report(y_test_binary, predicted))

# Accuracy output 0.627 => align with model performance result!

Accuracy: 0.627
Auc: 0.564
Detail:
              precision    recall  f1-score   support

           0       0.63      0.90      0.74       595
           1       0.60      0.23      0.34       405

    accuracy                           0.63      1000
   macro avg       0.62      0.56      0.54      1000
weighted avg       0.62      0.63      0.58      1000

