In [1]:
# Import modules
import pandas as pd
import numpy as np
import bert
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import  Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
from tqdm import tqdm
import matplotlib.pyplot as plt

# string processing
import re
from keras.utils.np_utils import to_categorical
from sklearn import feature_extraction, model_selection, naive_bayes, pipeline, manifold, preprocessing, feature_selection, metrics

print("TensorFlow Version:",tf.__version__)
print("Hub version: ",hub.__version__)
pd.set_option('display.max_colwidth',1000)

TensorFlow Version: 2.2.0
Hub version:  0.8.0


### Start here:

In [2]:
# to navigate to the data location
import os

# get current directory 
path = os.getcwd() 

# parent directory
parent = os.path.dirname(path)

df_merge_quality = pd.read_csv(parent + '/data/US_patent_abstract_5000_2015_with_title_1_5y.csv')
df_merge_quality.shape

(5000, 30)

In [3]:
df = df_merge_quality[['text', 'quality_rank']]
df

Unnamed: 0,text,quality_rank
0,"Invitation information push method and system. An invitation information push method includes after receiving an invitation request sent by a microblog user, a server sending invitation information to a number of clients corresponding to invited users carried in the invitation request, wherein the invited users are users who have not registered microblog, and the number of the invited users N is greater than or equal to 1. Each client, upon receiving the invitation information, creating an invitation information guide to guide the users who have not registered the microblog to register the microblog. The method further comprises, when a predetermined time is reached, a server actively sending invitation information to at least one client corresponding to at least one user who has not registered the microblog.",0
1,Coronal angulating connector. A connector is provided for linear implants such as spinal rods which are disposed within the coronal plane of a body. The connector includes a first portion having a first cavity for disposal therethrough of a first spinal rod. A second portion has a second cavity for the disposal therethrough of a second spinal rod. The second portion is rotatable relative to the first portion. Methods of use are disclosed.,0
2,"Spearfishing apparatus. A device for spearfishing, which device may include a barrel including a first end and a second end, the barrel defining a spear conduit and two band conduits, the spear conduit and the two band conduits extending with the barrel from the first end to the second end. The spear conduit may be coaxial with the longitudinal axis of the barrel. The spear conduit may be located between the two band conduits, and each of the band conduits may be parallel to the spear conduit. The barrel may define a recessed groove, which may extend from the first end of the barrel to the second end of the barrel. The device may further include a handle or other accessories that are releasably engaged with the recessed groove. The device may further include band attachment means, such as a cone-shaped plug, for releasably coupling a band to the barrel.",1
3,"Systems and methods for prioritizing media files in a presentation device. Disclosed are embodiments of systems and methods for prioritizing mobile media player files by providing for the automated addition and/or deletion of media items for a mobile media player. In some embodiments, a statistical method may be provided for inferring which media items on a mobile media player should be deleted based on, for example, user taste data. In some embodiments, new media items may be loaded onto a user&#39;s mobile media player by creating one or more playlists from a playlist builder. The playlist(s) may be created by using user taste data. Rankings may also be created to determine an order for deletion of the media items currently on a mobile media player and/or for addition of new media items to the device.",1
4,Semiconductor integrated circuit. A semiconductor integrated circuit comprises a state holding circuit that inputs an output of one inverter to another inverter with each other; an input circuit that causes a state of the state holding circuit to transition based on a data signal; a first first-conductive transistor that is inserted between an input of the one inverter and an output of the another inverter and is controlled by the data signal; and a first second-conductive transistor that is connected in parallel with the first first-conductive transistor and is controlled by the data signal.,0
...,...,...
4995,"Cross-platform cloud-based map creation. Methods, systems and articles of manufacture for cross-platform cloud based map creation are described herein. A method embodiment includes receiving one or more GIS datasets in their respective source formats, each GIS dataset comprising one or more map assets, translating the GIS datasets into a platform independent format, and providing the translated GIS datasets and their respective map assets in the platform independent format to one or more client platforms. The embodiment further includes identifying missing map assets and metadata that comprise a GIS dataset, tracking map assets that comprise a single translated GIS dataset to maintain the translated GIS dataset as a single entity, and providing the translated GIS datasets to a serving system.",1
4996,"Display substrate. A display substrate includes a gate line extended in one direction of a base substrate, a first data line extended in a direction crossing the gate line, a transverse storage line extended in the extending direction of the gate line and crossing the first data line, a longitudinal storage line extended in the extending direction of the first data line and crossing the transverse storage line, a portion of an overlapping area between the longitudinal storage line and the transverse storage line is exposed in a contact part region having an opening partially exposing the transverse storage line. A contact electrode covers the contact part opening and makes electrical contact with each of the transverse storage line and the longitudinal storage line.",1
4997,"Aminoquinazoline derivatives and their salts and methods of use. The present invention relates to the field of medicine. Provided herein are aminoquinazoline derivatives, their salts and pharmaceutical formulations useful in modulating the protein tyrosine kinase activity, and in modulating inter- and/or intra-cellular signaling. Also provided herein are pharmaceutically acceptable compositions comprising the aminoquinazoline compounds and methods of using the compositions in the treatment of hyperproliferative disorders in mammals, especially humans.",1
4998,"Method and device for displaying information in a vehicle. A method for displaying graphical objects. The graphical objects can be displayed by a control device of a display device in a first or second display mode. The speed of the vehicle is measured, and in a first speed range the graphical objects are displayed in the first display mode and in a second speed range the graphical objects are displayed in the second display mode. Also disclosed is a device for displaying information of graphical objects.",1


In [4]:
df['text'] = df['text'].apply(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [5]:
df.apply(lambda x: sum(x.isnull()), axis=0)

text            0
quality_rank    0
dtype: int64

In [6]:
# Functions for constructing BERT Embeddings: input_ids, input_masks, input_segments and Inputs
MAX_SEQ_LEN=512 # max sequence length

def get_masks(tokens):
    """Masks: 1 for real tokens and 0 for paddings"""
    return [1]*len(tokens) + [0] * (MAX_SEQ_LEN - len(tokens))
 
def get_segments(tokens):
    """Segments: 0 for the first sequence, 1 for the second"""  
    segments = []
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            current_segment_id = 1
    return segments + [0] * (MAX_SEQ_LEN - len(tokens))

def get_ids(tokens, tokenizer):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens,)
    input_ids = token_ids + [0] * (MAX_SEQ_LEN - len(token_ids))
    return input_ids

def create_single_input(sentence, tokenizer, max_len):
    """Create an input from a sentence"""
    stokens = tokenizer.tokenize(sentence)
    stokens = stokens[:max_len]
    stokens = ["[CLS]"] + stokens + ["[SEP]"]
 
    ids = get_ids(stokens, tokenizer)
    masks = get_masks(stokens)
    segments = get_segments(stokens)

    return ids, masks, segments
 
def convert_sentences_to_features(sentences, tokenizer):
    """Convert sentences to features: input_ids, input_masks and input_segments"""
    input_ids, input_masks, input_segments = [], [], []
 
    for sentence in tqdm(sentences,position=0, leave=True):
      ids,masks,segments=create_single_input(sentence,tokenizer,MAX_SEQ_LEN-2)
      assert len(ids) == MAX_SEQ_LEN
      assert len(masks) == MAX_SEQ_LEN
      assert len(segments) == MAX_SEQ_LEN
      input_ids.append(ids)
      input_masks.append(masks)
      input_segments.append(segments)

    return [np.asarray(input_ids, dtype=np.int32), 
          np.asarray(input_masks, dtype=np.int32), 
          np.asarray(input_segments, dtype=np.int32)]

def create_tonkenizer(bert_layer):
    """Instantiate Tokenizer with vocab"""
    vocab_file=bert_layer.resolved_object.vocab_file.asset_path.numpy()
    do_lower_case=bert_layer.resolved_object.do_lower_case.numpy() 
    tokenizer=bert.bert_tokenization.FullTokenizer(vocab_file,do_lower_case)
    return tokenizer

In [7]:
def bert_model(callable_object):
    # Load the pre-trained BERT base model
    bert_layer = hub.KerasLayer(handle=callable_object, trainable=True)  
   
    # BERT layer three inputs: ids, masks and segments
    input_ids = Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32, name="input_ids")           
    input_masks = Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32, name="input_masks")       
    input_segments = Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32, name="segment_ids")
    
    inputs = [input_ids, input_masks, input_segments] # BERT inputs
    pooled_output, sequence_output = bert_layer(inputs) # BERT outputs
    
    # Add a hidden layer
    x = Dense(units=768, activation='relu')(pooled_output)
    x = Dropout(0.1)(x)
 
    # Add output layer
    outputs = Dense(2, activation="sigmoid")(x)

    # Construct a new model
    model = Model(inputs=inputs, outputs=outputs)
    return model

model = bert_model("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1")
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_masks (InputLayer)        [(None, 512)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 512)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 109482241   input_ids[0][0]                  
                                                                 input_masks[0][0]            

In [8]:
df['quality_rank'] = df['quality_rank'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [9]:
# Create examples for training and testing
# df = df.sample(frac=1) # Shuffle the dataset  # already shuffle in the csv file
tokenizer = create_tonkenizer(model.layers[3])
X_train = convert_sentences_to_features(df['text'][:4000], tokenizer)
X_test = convert_sentences_to_features(df['text'][4000:], tokenizer)


# for binary classification, since in the loss function I use binary_crossentropy, looks like prefer this single array
y_train = to_categorical(df['quality_rank'][:4000].values)
y_test =  to_categorical(df['quality_rank'][4000:].values)




100%|██████████| 4000/4000 [00:08<00:00, 499.00it/s]
100%|██████████| 1000/1000 [00:01<00:00, 507.58it/s]


In [10]:
# distribution of label value in train
np.unique([np.argmax(i) for i in y_train], return_counts=True)

(array([0, 1]), array([2397, 1603]))

In [11]:
# distribution of label value in test
np.unique([np.argmax(i) for i in y_test], return_counts=True)

(array([0, 1]), array([595, 405]))

In [12]:
# setup checkpoint

checkpoint_path = "ckpt_bert_fine_tune/"

ckpt = tf.train.Checkpoint(model = model) # https://www.tensorflow.org/api_docs/python/tf/train/Checkpoint

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=2)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored!!")

In [13]:
class MyCustomCallback(tf.keras.callbacks.Callback):

    def on_epoch_end(self, epoch, logs=None):
        ckpt_manager.save()
        print("Checkpoint saved at {}.".format(checkpoint_path))

In [14]:
# Train the model
BATCH_SIZE = 10
EPOCHS = 3

# Use Adam optimizer to minimize the categorical_crossentropy loss
opt = Adam(learning_rate=2e-5)
model.compile(optimizer=opt, 
              loss='binary_crossentropy', 
              metrics=['accuracy'])

# Fit the data to the model
history = model.fit(X_train, y_train,
                    validation_data=(X_test, y_test),
                    epochs=EPOCHS,
                    batch_size=BATCH_SIZE,
                    verbose = 1,
                    callbacks=[MyCustomCallback()])


Epoch 1/3
Epoch 2/3
Epoch 3/3


In [15]:
pred_test = model.predict(X_test)
pred_test

array([[0.18044928, 0.8164135 ],
       [0.7246722 , 0.25552952],
       [0.40924984, 0.57786816],
       ...,
       [0.18092331, 0.7848031 ],
       [0.1933789 , 0.8162036 ],
       [0.3561105 , 0.6628097 ]], dtype=float32)

In [16]:
np.savetxt('Predict_Output/BERT_fine_tune_5yr_abstract_title_dev_prob.csv', pred_test)

In [17]:
predicted = [np.argmax(pred) for pred in 
             pred_test]

In [18]:
y_test_binary = df['quality_rank'][4000:].values
y_test_binary

array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,

In [19]:
## Accuracy, Precision, Recall
accuracy = metrics.accuracy_score(y_test_binary, predicted)
auc = metrics.roc_auc_score(y_test_binary, predicted)  # predicted_prob), check doc, seems the second argument required to be shape (n_samples,) for binary case 
                            #multi_class="ovr") # check documentation and seems "ovr" not good for only binary target class
print("Accuracy:",  round(accuracy,3))
print("Auc:", round(auc,3))
print("Detail:")
print(metrics.classification_report(y_test_binary, predicted))

# Accuracy output 0.627 => align with model performance result!

Accuracy: 0.616
Auc: 0.576
Detail:
              precision    recall  f1-score   support

           0       0.65      0.79      0.71       595
           1       0.54      0.36      0.43       405

    accuracy                           0.62      1000
   macro avg       0.59      0.58      0.57      1000
weighted avg       0.60      0.62      0.60      1000



In [20]:
# To experiment with one more epoch
# => overfitting, not used
history = model.fit(X_train, y_train,
                    validation_data=(X_test, y_test),
                    epochs=1,
                    batch_size=BATCH_SIZE,
                    verbose = 1,
                    callbacks=[MyCustomCallback()])

