In [1]:
# Import modules
import pandas as pd
import numpy as np
import bert
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import  Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
from tqdm import tqdm
import matplotlib.pyplot as plt

print("TensorFlow Version:",tf.__version__)
print("Hub version: ",hub.__version__)
pd.set_option('display.max_colwidth',1000)

TensorFlow Version: 2.1.0
Hub version:  0.8.0


In [2]:
MAX_SENT_LENGTH = 30
MAX_SENTS = 20
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

In [3]:
patent_abstract = pd.read_csv('US_patent_abstract_5000_2015_with_title_1.csv')

In [4]:
df = patent_abstract[['claims_text', 'quality_rank']]

In [5]:
df.head(5)

Unnamed: 0,claims_text,quality_rank
0,"1. A device comprising:\n memory to store instructions, where the device is a first device; and \n a processor to execute the instructions to:\n receive a first message and a second message from a second device, where the first message and the second message differ, and where the first message includes a first header and first event data and the second message includes a second header and second event data, \n identify, based on the first header and the second header, that the first message and the second messages are problem reports, \n process, in response to identifying the first message and the second messages as the problem reports, the first event data to determine that the first event data, in the first message, is associated with a particular reconfiguration information, of a plurality of reconfiguration information corresponding with the second device, \n process, in response to identifying the first message and the second messages as the problem reports, the second event ...",0
1,"The invention claimed is: \n \n 1. A control system for an elevator, comprising:\n a sensor to sense a parameter of the elevator, when the elevator is moving; \n a calculator used to calculate a speed pattern to set a speed of the elevator, using information from the sensor obtained when the elevator is moving; \n a memory to store the speed pattern which sets a speed of the elevator; and \n a motor controller which controls movement of the elevator using the speed pattern stored in the memory. \n \n \n \n 2. A control system for an elevator according to claim 1 , wherein the speed pattern comprises a pattern of a velocity or a pattern of an acceleration of the elevator. \n \n \n 3. A control system for an elevator according to claim 1 , wherein the calculator calculates the speed pattern using the information from the sensor which was obtained while a carrying load state of a car is changed in at least two ways when the elevator is installed...",1
2,"What is claimed is: \n \n 1. An image forming apparatus comprising:\n an image forming unit configured to form an image on a photosensitive member that is rotated by being driven by a motor; \n a detection unit configured to detect density information when a plurality of test patches formed by the image forming unit is irradiated with light, wherein the plurality of test patches is formed based on image data, and the image data is used for forming the plurality of test patches having density unevenness in a predetermined period corresponding to density unevenness occurring due to rotational unevenness of the motor in the predetermined period in a sub-scanning direction of the image and being different in start phase of the density unevenness in the predetermined period; and \n a control unit configured to cause the image forming unit to form an image with density being corrected based on detection results of detecting the density information of the plurality of test patch...",0
3,"The invention claimed is: \n \n 1. A switching power supply device having an input power supply and a load, comprising:\n a switching element disposed between the input power supply and the load; \n an inductive circuit coupled to the input power supply, supplying a current to the switching element; \n a power supply control integrated circuit for controlling the switching element to apply a constant output voltage to the load, the power supply control integrated circuit including\n a control circuit that controls operations of the power supply control integrated circuit, \n a drive circuit that, under the control of the control circuit, drives to turn on or off the switching element, \n a condition setting circuit that, under the control of the control circuit, determines an operation condition of the switching power supply device, and outputs a condition indicating signal indicating the operation condition, \n a control signal output terminal for outputting a drive sign...",1
4,"What is claimed is: \n \n 1. A computer system including instructions stored on a non-transitory computer-readable storage medium, the computer system comprising:\n a processor configured to: \n define, at a shadow system, a shadow program database including data and a structure collectively defining an upgrade to an original program database on an original system; \n define, at the shadow system, a shadow basis database including information related to start-up functions of the shadow system,\n the start-up functions including transitioning the shadow system from a downstate to an upstate, \n the information related to start-up functions including data related to at least one service configured to enable basic functionality of the shadow system, \n the shadow basis database is defined in shadow system during an upstate of the shadow system and an upstate of the original system, \n the shadow basis database is defined based on an original basis database associated with th...",0


In [6]:
df.apply(lambda x: sum(x.isnull()), axis=0)

claims_text     0
quality_rank    0
dtype: int64

In [7]:
df['quality_rank'].value_counts()

0    2896
1    2104
Name: quality_rank, dtype: int64

In [8]:
2896/5000

0.5792

In [31]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/nobu_yamaguchi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [35]:
def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased except
    """
    string = re.sub(r"\\", "", string)    
    string = re.sub(r"\'", "", string)    
    string = re.sub(r"\"", "", string)    
    return string.strip().lower()

from nltk import tokenize

claims = []
labels = []
texts = []

for idx in range(df.claims_text.shape[0]):

    text = clean_str(str(df.iloc[idx]['claims_text']))
    texts.append(text)
    sentences = tokenize.sent_tokenize(text)
    claims.append(sentences)
    labels.append(df.iloc[idx]['quality_rank'])

In [36]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
data = np.zeros((len(texts), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')

In [37]:
for i, sentences in enumerate(claims):
    for j, sent in enumerate(sentences):
        if j < MAX_SENTS:
            wordTokens = text_to_word_sequence(sent)
            k = 0
            for _, word in enumerate(wordTokens):
            #for word in wordTokens:
                if word in tokenizer.word_index.keys():
                    if (k < MAX_SENT_LENGTH) and (tokenizer.word_index[word] < MAX_NB_WORDS):
                        data[i, j, k] = tokenizer.word_index[word]
                        k = k + 1

In [38]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 31722 unique tokens.


In [39]:
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Shape of data tensor: (5000, 20, 30)
Shape of label tensor: (5000, 2)


In [40]:
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]
print('Number of positive and negative reviews in traing and validation set')
print(y_train.sum(axis=0))
print(y_val.sum(axis=0))

Number of positive and negative reviews in traing and validation set
[2320. 1680.]
[576. 424.]


In [30]:
hub_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2", input_shape=[], 
                           dtype=tf.string, trainable=True)

In [None]:
# building Hierachical Attention network
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SENT_LENGTH,
                            trainable=False)

In [13]:
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
import bert

Using TensorFlow backend.


In [20]:
from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, LSTM, GRU, Bidirectional, TimeDistributed
from keras.models import Model

In [14]:
BertTokenizer = bert.bert_tokenization.FullTokenizer
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2",
                            trainable=False)
vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = BertTokenizer(vocabulary_file, to_lower_case)

In [50]:
#bert = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2",
#                            trainable=False)

In [15]:
def tokenize_claims(text_claims):
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text_claims))

In [21]:
sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)
l_lstm = Bidirectional(GRU(100, return_sequences=True))(embedded_sequences)
l_att = AttLayer(100)(l_lstm)
sentEncoder = Model(sentence_input, l_att)

NameError: name 'embedding_layer' is not defined

In [18]:
tokenized_claims = [tokenize_claims(claim) for claim in claims]

In [None]:
input_layer = Input(shape=(1,), dtype="string", name="Input_layer")
embedding_layer = Lambda(ELMoEmbedding, output_shape=(1024, ), name="BERT_Embedding")(input_layer)
BiLSTM = Bidirectional(layers.LSTM(1024, return_sequences= False, recurrent_dropout=0.2, dropout=0.2), name="BiLSTM")(embedding_layer)
Dense_layer_1 = Dense(8336, activation='relu')(BiLSTM)
Dropout_layer_1 = Dropout(0.5)(Dense_layer_1)
Dense_layer_2 = Dense(4168, activation='relu')(Dropout_layer_1)
Dropout_layer_2 = Dropout(0.5)(Dense_layer_2)
output_layer = Dense(1, activation='sigmoid')(Dropout_layer_2)
model = Model(inputs=[input_layer], outputs=output_layer, name="BiLSTM with ELMo Embeddings")
model.summary()
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['acc'])

In [41]:
embedding_layer = Embedding(5000,
                            EMBEDDING_DIM,
                            weights=[tokenized_claims],
                            input_length=MAX_SENT_LENGTH,
                            trainable=False)

In [42]:
sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')

In [43]:
sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)
l_lstm = Bidirectional(GRU(100, return_sequences=True))(embedded_sequences)
l_att = AttLayer(100)(l_lstm)
sentEncoder = Model(sentence_input, l_att)

AttributeError: 'list' object has no attribute 'shape'

In [90]:
data = np.zeros((len(tokenized_claims), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')

In [76]:
data.shape

(5000, 20, 30)

In [77]:
data[1]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [57]:
def build_model():
    input_layer = Input(shape=(1,), dtype="string", name="Input_layer")
    embedding_layer = Lambda(BertEmbedding, output_shape=(1024, ), name="Bert_Embedding")(input_layer)
    BiLSTM = Bidirectional(layers.LSTM(1024, return_sequences= False, recurrent_dropout=0.2, dropout=0.2), name="BiLSTM")(embedding_layer)
    Dense_layer_1 = Dense(8336, activation='relu')(BiLSTM)
    Dropout_layer_1 = Dropout(0.5)(Dense_layer_1)
    Dense_layer_2 = Dense(4168, activation='relu')(Dropout_layer_1)
    Dropout_layer_2 = Dropout(0.5)(Dense_layer_2)
    output_layer = Dense(1, activation='sigmoid')(Dropout_layer_2)
    model = Model(inputs=[input_layer], outputs=output_layer, name="BiLSTM with BERT Embeddings")
    model.summary()
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
    return model
bert_BiDirectional_model = build_model()

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Could not find matching function to call loaded from the SavedModel. Got:
  Positional arguments (3 total):
    * Tensor("inputs:0", shape=(None,), dtype=string)
    * False
    * None
  Keyword arguments: {}

Expected these arguments to match one of the following 4 option(s):

Option 1:
  Positional arguments (3 total):
    * [TensorSpec(shape=(None, None), dtype=tf.int32, name='inputs/0'), TensorSpec(shape=(None, None), dtype=tf.int32, name='inputs/1'), TensorSpec(shape=(None, None), dtype=tf.int32, name='inputs/2')]
    * False
    * None
  Keyword arguments: {}

Option 2:
  Positional arguments (3 total):
    * [TensorSpec(shape=(None, None), dtype=tf.int32, name='input_word_ids'), TensorSpec(shape=(None, None), dtype=tf.int32, name='input_mask'), TensorSpec(shape=(None, None), dtype=tf.int32, name='input_type_ids')]


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Could not find matching function to call loaded from the SavedModel. Got:
  Positional arguments (3 total):
    * Tensor("inputs:0", shape=(None,), dtype=string)
    * False
    * None
  Keyword arguments: {}

Expected these arguments to match one of the following 4 option(s):

Option 1:
  Positional arguments (3 total):
    * [TensorSpec(shape=(None, None), dtype=tf.int32, name='inputs/0'), TensorSpec(shape=(None, None), dtype=tf.int32, name='inputs/1'), TensorSpec(shape=(None, None), dtype=tf.int32, name='inputs/2')]
    * False
    * None
  Keyword arguments: {}

Option 2:
  Positional arguments (3 total):
    * [TensorSpec(shape=(None, None), dtype=tf.int32, name='input_word_ids'), TensorSpec(shape=(None, None), dtype=tf.int32, name='input_mask'), TensorSpec(shape=(None, None), dtype=tf.int32, name='input_type_ids')]


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Could not find matching function to call loaded from the SavedModel. Got:
  Positional arguments (3 total):
    * Tensor("inputs:0", shape=(None,), dtype=string)
    * False
    * None
  Keyword arguments: {}

Expected these arguments to match one of the following 4 option(s):

Option 1:
  Positional arguments (3 total):
    * [TensorSpec(shape=(None, None), dtype=tf.int32, name='inputs/0'), TensorSpec(shape=(None, None), dtype=tf.int32, name='inputs/1'), TensorSpec(shape=(None, None), dtype=tf.int32, name='inputs/2')]
    * False
    * None
  Keyword arguments: {}

Option 2:
  Positional arguments (3 total):
    * [TensorSpec(shape=(None, None), dtype=tf.int32, name='input_word_ids'), TensorSpec(shape=(None, None), dtype=tf.int32, name='input_mask'), TensorSpec(shape=(None, None), dtype=tf.int32, name='input_type_ids')]


ValueError: in converted code:

    /home/nobu_yamaguchi/anaconda3/lib/python3.7/site-packages/tensorflow_hub/keras_layer.py:229 call  *
        result = smart_cond.smart_cond(training,
    /home/nobu_yamaguchi/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/framework/smart_cond.py:56 smart_cond
        return false_fn()
    /home/nobu_yamaguchi/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/saved_model/load.py:438 _call_attribute
        return instance.__call__(*args, **kwargs)
    /home/nobu_yamaguchi/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/eager/def_function.py:568 __call__
        result = self._call(*args, **kwds)
    /home/nobu_yamaguchi/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/eager/def_function.py:606 _call
        results = self._stateful_fn(*args, **kwds)
    /home/nobu_yamaguchi/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/eager/function.py:2362 __call__
        graph_function, args, kwargs = self._maybe_define_function(args, kwargs)
    /home/nobu_yamaguchi/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/eager/function.py:2703 _maybe_define_function
        graph_function = self._create_graph_function(args, kwargs)
    /home/nobu_yamaguchi/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/eager/function.py:2593 _create_graph_function
        capture_by_value=self._capture_by_value),
    /home/nobu_yamaguchi/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/framework/func_graph.py:978 func_graph_from_py_func
        func_outputs = python_func(*func_args, **func_kwargs)
    /home/nobu_yamaguchi/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/eager/def_function.py:439 wrapped_fn
        return weak_wrapped_fn().__wrapped__(*args, **kwds)
    /home/nobu_yamaguchi/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/saved_model/function_deserialization.py:262 restored_function_body
        "\n\n".join(signature_descriptions)))

    ValueError: Could not find matching function to call loaded from the SavedModel. Got:
      Positional arguments (3 total):
        * Tensor("inputs:0", shape=(None,), dtype=string)
        * False
        * None
      Keyword arguments: {}
    
    Expected these arguments to match one of the following 4 option(s):
    
    Option 1:
      Positional arguments (3 total):
        * [TensorSpec(shape=(None, None), dtype=tf.int32, name='inputs/0'), TensorSpec(shape=(None, None), dtype=tf.int32, name='inputs/1'), TensorSpec(shape=(None, None), dtype=tf.int32, name='inputs/2')]
        * False
        * None
      Keyword arguments: {}
    
    Option 2:
      Positional arguments (3 total):
        * [TensorSpec(shape=(None, None), dtype=tf.int32, name='input_word_ids'), TensorSpec(shape=(None, None), dtype=tf.int32, name='input_mask'), TensorSpec(shape=(None, None), dtype=tf.int32, name='input_type_ids')]
        * True
        * None
      Keyword arguments: {}
    
    Option 3:
      Positional arguments (3 total):
        * [TensorSpec(shape=(None, None), dtype=tf.int32, name='inputs/0'), TensorSpec(shape=(None, None), dtype=tf.int32, name='inputs/1'), TensorSpec(shape=(None, None), dtype=tf.int32, name='inputs/2')]
        * True
        * None
      Keyword arguments: {}
    
    Option 4:
      Positional arguments (3 total):
        * [TensorSpec(shape=(None, None), dtype=tf.int32, name='input_word_ids'), TensorSpec(shape=(None, None), dtype=tf.int32, name='input_mask'), TensorSpec(shape=(None, None), dtype=tf.int32, name='input_type_ids')]
        * False
        * None
      Keyword arguments: {}
