In [1]:
# Import modules
import pandas as pd
import numpy as np
import bert
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import  Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
from tqdm import tqdm
import matplotlib.pyplot as plt

# Seed the numpy and TensorFlow random generators so that the shuffle, weight
# initialisation and dropout are as repeatable as possible from a fresh kernel.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Record the library versions next to the results they produced.
print("TensorFlow Version:",tf.__version__)
print("Hub version: ",hub.__version__)
# Raise the column display width so long abstracts are shown up to 1000 characters.
pd.set_option('display.max_colwidth',1000)

TensorFlow Version: 2.1.0
Hub version:  0.8.0


In [2]:
# Load the raw dataset from a CSV file located in the working directory.
patent_abstract = pd.read_csv('patent_abstract_5000.csv')

In [3]:
# Keep only the abstract text and its label column for modelling.
df = patent_abstract.loc[:, ['text', 'quality_rank']]

In [4]:
# Preview the first five rows (five is the default for head()).
df.head()

Unnamed: 0,text,quality_rank
0,"A radiant module (1) comprising: a first hollow body (3) defined by a front wall (301), a rear wall (302), two sides (303), a first top end (304) having a pair of holes (308) made on the corresponding sides (303) and a bottom end (305) having a pair of holes (308) made on the corresponding sides (303); the first hollow body (3) defines a component, in use, for the passage of a heat carrier fluid, a second hollow body (2) or cover casing inside of which is entirely contained the first hollow body (3); the second hollow body (2) has a front wall (201), a rear wall (202), two sides (203), a top (204), having an air discharge section (B) and a pair of holes (4) made on the corresponding sides (203) and a bottom (210) having an air intake section (A) and a pair of holes (4) made on the corresponding sides (203), the second hollow body (2) is configured for generating an inner zone which is able to define a channel (C2) for controlled flow of the air from the intake section (A) of the bo...",0
1,"An array antenna device of this disclosure includes a substrate, a strip conductor with a linear-shape, which is provided on the substrate, and a power feeder that feeds power to the strip conductor, and a plurality of loop elements, a conductor plate, and a plurality of feeding elements. The plurality of loop elements are provided on a first surface of the substrate, and are located along the strip conductor with a specified spacing from each other. Each of the plurality of loop elements has a loop-shape with a notch. The plurality of feeding elements are connected to the strip conductor, and each has a shape extending along a portion of an outer edge of corresponding one of the plurality of loop elements. The conductor plate is provided on a second surface of the substrate.",1
2,A vehicle hood structure capable of improving impact absorption ability prior to a secondary impact when an impacting body impacts the hood. Front end portions 22B of beads 22 of a wave shaped section 20 are arranged in a line in hood plan view. A front wall section 28 is provided further to the hood front side than the wave shaped section 20 and is formed inclined towards the hood bottom side on progression towards the hood front and running substantially along the vehicle width direction. The wave shaped section 20 and the front wall section 28 are connected together by a ledge section 26 formed running substantially along the hood width direction.,0
3,"This invention relates generally to the generation of antibodies, e.g. , monoclonal antibodies including fully human monoclonal antibodies, that recognize Jagged 1 and/or Jagged 2, to antibodies, e.g ., monoclonal antibodies including fully human antibodies that recognize Jagged 1 and/or Jagged 2, and nucleic acid molecules that encode antibodies, e.g. , nucleic acid molecules that encode monoclonal antibodies including fully human cross-reactive antibodies that recognize both Jagged 1 and Jagged 2, and to methods of making the anti- Jagged antibodies and methods of using the anti- Jagged antibodies as therapeutics, prophylactics, and diagnostics. The invention also relates generally to activatable antibodies that include a masking moiety (MM), a cleavable moiety (CM), and an antibody (AB) that specifically bind to Jagged 1 and Jagged 2, and to methods of making and using these activatable anti- Jagged antibodies in a variety of therapeutic, diagnostic and prophylactic indications.",1
4,"Methods and implants to treat anterior cruciate ligament (ACL) injuries are disclosed. The methods involve advancing the insertion of the patellar tendon to the proximal tibia by means of a partial osteotomy and a wedge-shaped cage (30). The wedge-shaped cage is specifically designed to facilitate transfer of not only compressive loads, but also of shear loads due to pull by the patellar tendon at its insertion to the tibial tuberosity. The cage decreases the angle between the patellar tendon and the common tangent plane formed by the condyles of the femur and the condyles of the tibia (sometimes called tibial plateau) and consequently modifies the internal joint force, restoring stability to the joint even if the ACL is ruptured. The methods and implants are applicable to both human and canine patients.",0


In [5]:
# (number of rows, number of columns) of the working frame.
df.shape

(5000, 2)

In [6]:
# Number of missing values in each column (vectorized; no per-column Python loop).
df.isnull().sum()

text            0
quality_rank    0
dtype: int64

In [7]:
# Class balance: how many abstracts carry each quality_rank label.
df['quality_rank'].value_counts()

0    2937
1    2063
Name: quality_rank, dtype: int64

In [8]:
# Share of the most frequent label, i.e. the accuracy obtained by always
# predicting the majority class. Computed from the data instead of from
# numbers copied out of a previous cell's output.
df['quality_rank'].value_counts(normalize=True).max()

0.5874

In [9]:
# Functions for constructing the BERT inputs: input_ids, input_masks and input_segments.
# MAX_SEQ_LEN is the fixed length every encoded sequence is padded to, counting the
# [CLS] and [SEP] markers; it is also the width of the model's Input layers.
MAX_SEQ_LEN=500 # max sequence length, in tokens

def get_masks(tokens):
    """Build the attention mask for one token sequence.

    Returns a list holding a 1 for every entry of ``tokens`` followed by as
    many 0s as are needed to reach ``MAX_SEQ_LEN``.
    """
    real_count = len(tokens)
    mask = [1] * real_count
    # A non-positive pad count multiplies to an empty list, so nothing is appended.
    mask.extend([0] * (MAX_SEQ_LEN - real_count))
    return mask
 
def get_segments(tokens):
    """Build the segment (token-type) ids for one token sequence.

    Every token up to and including the first "[SEP]" gets id 0, every token
    after it gets id 1, and the result is right-padded with 0s up to
    ``MAX_SEQ_LEN``.
    """
    total = len(tokens)
    try:
        # Position just past the first separator; ids switch to 1 from here on.
        boundary = tokens.index("[SEP]") + 1
    except ValueError:
        # No separator at all: the whole sequence belongs to segment 0.
        boundary = total
    segment_ids = [0] * boundary + [1] * (total - boundary)
    padding = [0] * (MAX_SEQ_LEN - total)
    return segment_ids + padding

def get_ids(tokens, tokenizer):
    """Map tokens to vocabulary ids and right-pad with 0 up to ``MAX_SEQ_LEN``.

    ``tokenizer`` must provide ``convert_tokens_to_ids(tokens)`` returning a list.
    """
    vocab_ids = tokenizer.convert_tokens_to_ids(tokens)
    pad_count = MAX_SEQ_LEN - len(vocab_ids)
    return vocab_ids + [0] * pad_count

def create_single_input(sentence, tokenizer, max_len):
    """Encode one sentence into the (ids, masks, segments) triple.

    The sentence is tokenized, cut to at most ``max_len`` tokens, and then
    wrapped in "[CLS]" ... "[SEP]" before being handed to the three helpers.
    """
    word_pieces = tokenizer.tokenize(sentence)[:max_len]
    wrapped = ["[CLS]"] + word_pieces + ["[SEP]"]

    return (
        get_ids(wrapped, tokenizer),
        get_masks(wrapped),
        get_segments(wrapped),
    )
 
def convert_sentences_to_features(sentences, tokenizer):
    """Convert sentences to features: input_ids, input_masks and input_segments.

    Args:
        sentences: Iterable of raw text strings.
        tokenizer: Tokenizer forwarded to ``create_single_input``.

    Returns:
        A list ``[input_ids, input_masks, input_segments]`` of three int32
        arrays, each with one row per sentence and ``MAX_SEQ_LEN`` columns.
        An empty ``sentences`` yields arrays of shape ``(0, MAX_SEQ_LEN)``.

    Raises:
        AssertionError: If an encoded sentence is not exactly ``MAX_SEQ_LEN`` long.
    """
    input_ids, input_masks, input_segments = [], [], []

    for sentence in tqdm(sentences, position=0, leave=True):
        # Two positions are reserved for the [CLS] and [SEP] markers that
        # create_single_input adds around the truncated tokens.
        ids, masks, segments = create_single_input(sentence, tokenizer, MAX_SEQ_LEN - 2)
        assert len(ids) == MAX_SEQ_LEN, "input_ids has the wrong length"
        assert len(masks) == MAX_SEQ_LEN, "input_masks has the wrong length"
        assert len(segments) == MAX_SEQ_LEN, "input_segments has the wrong length"
        input_ids.append(ids)
        input_masks.append(masks)
        input_segments.append(segments)

    # The reshape is a no-op for non-empty input; for empty input it keeps the
    # arrays two-dimensional instead of collapsing to shape (0,).
    return [
        np.asarray(feature, dtype=np.int32).reshape(-1, MAX_SEQ_LEN)
        for feature in (input_ids, input_masks, input_segments)
    ]

def create_tonkenizer(bert_layer):
    """Instantiate a ``FullTokenizer`` from the assets carried by a hub layer.

    Args:
        bert_layer: Object exposing ``resolved_object.vocab_file.asset_path``
            and ``resolved_object.do_lower_case``, each with a ``.numpy()``
            method (in this notebook, the ``hub.KerasLayer`` inside the model).

    Returns:
        A ``bert.bert_tokenization.FullTokenizer`` built from that vocabulary
        file and lower-casing flag.
    """
    resolved = bert_layer.resolved_object
    vocab_file = resolved.vocab_file.asset_path.numpy()
    do_lower_case = resolved.do_lower_case.numpy()
    return bert.bert_tokenization.FullTokenizer(vocab_file, do_lower_case)


# Correctly spelled alias; the original name above is kept for existing callers.
create_tokenizer = create_tonkenizer

In [10]:
def bert_model(callable_object, num_classes=2, hidden_units=768, dropout_rate=0.1):
    """Build a Keras classifier on top of a trainable TF-Hub BERT layer.

    Args:
        callable_object: Handle passed to ``hub.KerasLayer`` (e.g. a TF-Hub URL).
        num_classes: Number of units in the softmax output layer.
        hidden_units: Number of units in the ReLU hidden layer.
        dropout_rate: Dropout rate applied after the hidden layer.

    Returns:
        A ``Model`` taking ``[input_ids, input_masks, segment_ids]`` (each of
        width ``MAX_SEQ_LEN``, dtype int32) and producing class probabilities.
        The model is not compiled.
    """
    # Load the pre-trained BERT layer; trainable=True so it is fine-tuned too
    bert_layer = hub.KerasLayer(handle=callable_object, trainable=True)

    # BERT layer three inputs: ids, masks and segments
    input_ids = Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32, name="input_ids")
    input_masks = Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32, name="input_masks")
    input_segments = Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32, name="segment_ids")

    inputs = [input_ids, input_masks, input_segments]  # BERT inputs
    # Only the pooled output feeds the classifier; the per-token sequence
    # output is not used.
    pooled_output, _ = bert_layer(inputs)

    # Add a hidden layer
    x = Dense(units=hidden_units, activation='relu')(pooled_output)
    x = Dropout(dropout_rate)(x)

    # Add output layer
    outputs = Dense(num_classes, activation="softmax")(x)

    # Construct a new model
    return Model(inputs=inputs, outputs=outputs)

# Build the classifier from the TF-Hub BERT handle and print its layer table.
model = bert_model(
    callable_object="https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1"
)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 500)]        0                                            
__________________________________________________________________________________________________
input_masks (InputLayer)        [(None, 500)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 500)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 109482241   input_ids[0][0]                  
                                                                 input_masks[0][0]            

In [11]:
# Create examples for training and testing
df = df.sample(frac=1) # Shuffle the dataset
tokenizer = create_tonkenizer(model.layers[3])
X_train = convert_sentences_to_features(df['text'][:4000], tokenizer)
X_test = convert_sentences_to_features(df['text'][4000:], tokenizer)

one_hot_encoded = to_categorical(df['quality_rank'].values)
y_train = one_hot_encoded[:4000]
y_test =  one_hot_encoded[4000:]

100%|██████████| 4000/4000 [00:08<00:00, 454.81it/s]
100%|██████████| 1000/1000 [00:02<00:00, 481.70it/s]


In [12]:
# The model ends in a 2-unit softmax and the labels are one-hot encoded, so the
# matching loss is categorical cross-entropy. The optimizer uses the same small
# Adam learning rate as the training cell, so fitting right after this cell
# does not run with the much larger default rate.
model.compile(loss="categorical_crossentropy",
              optimizer=Adam(learning_rate=2e-5),
              metrics=["acc"])

In [12]:
#df = df.sample(frac=1) # Shuffle the dataset
#tokenizer = create_tonkenizer(model.layers[3])
#X_train = convert_sentences_to_features(df['text'][:4000], tokenizer)
#X_test = convert_sentences_to_features(df['text'][4000:], tokenizer)

100%|██████████| 4000/4000 [00:08<00:00, 476.72it/s]
100%|██████████| 1000/1000 [00:01<00:00, 501.51it/s]


In [13]:
#one_hot_encoded = to_categorical(df['quality_rank'].values)
#y_train = one_hot_encoded[:4000]
#y_test =  one_hot_encoded[4000:]

In [None]:
# Training configuration
BATCH_SIZE = 8
EPOCHS = 1

# Compile with Adam (learning rate 2e-5) on the categorical_crossentropy loss,
# tracking accuracy.
opt = Adam(learning_rate=2e-5)
model.compile(
    loss='categorical_crossentropy',
    optimizer=opt,
    metrics=['acc'],
)

# Fit on the training split; the held-out split is passed as validation data.
history = model.fit(
    X_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_test, y_test),
    verbose=1,
)

# Write the trained model to an HDF5 file in the working directory.
model.save('bert_model.h5')

Train on 4000 samples, validate on 1000 samples


In [None]:
#model.fit(X_train, y_train, validation_data=(X_test, y_test),
#          epochs=10, batch_size=50)

Train on 4000 samples, validate on 1000 samples
Epoch 1/10
