In [2]:
import pandas as pd
import json
import os

### Building tokenizer

In [3]:
# Read the patent dump and pull the title and abstract out of every record
# stored under the 'patents' column.
FILE = 'patents.json'
patents_data = pd.read_json(FILE)
title = [
    record['patent_title']
    for record in patents_data['patents']
]
abstract = [
    record['patent_abstract']
    for record in patents_data['patents']
]

In [4]:
# Keep only the patents that have an abstract. Titles and abstracts stay
# aligned by position in the two *_none_removed lists.
# Pairs whose abstract is None are recorded in removed_titles; the previous
# try/except never populated that list because the comparison cannot raise.
titles_none_removed = []
abstract_none_removed = []
removed_titles = []
for t, a in zip(title, abstract):
    if a is not None:
        abstract_none_removed.append(a)
        titles_none_removed.append(t)
    else:
        removed_titles.append((t, a))

In [7]:
titles_none_removed[0]

'"Clarification of black ammonium polyphosphate liquids--recycling of byproduct ""tops"""'

In [8]:
abstract_none_removed[0]

'The process allows essentially all of the nitrogen and P.sub.2 O.sub.5 values in the treated black ammonium polyphosphate liquid to be recovered in the form of valuable clarified product. In the process, a heel of black liquid fertilizer is first clarified by a prior-art procedure using flocculants. The improvement over the prior art picks up with the byproduct tops which is then diluted with the water of formulation required to dissolve additional amounts of ammonium polyphosphate melt. This diluted liquid is filtered, the filter cake containing upwards of 99 percent of the undesirable black carbonaceous material is discarded, and the clear filtrate, which contains essentially all of the nitrogen and P.sub.2 O.sub.5 originally in the tops, is then used in lieu of the prior-art water of formulation to dissolve additional ammonium polyphosphate melt to produce more black ammonium polyphosphate liquid.'

In [10]:
from keras_preprocessing.text import Tokenizer

In [11]:
# Baseline tokenizer: lower-cases the text and strips every character listed
# in `filters` before splitting on spaces.
filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
tokenizer = Tokenizer(
    filters=filters,
    lower=True,
    split=' ',
    num_words=None,
    char_level=False,
)
# Learn the vocabulary from all abstracts, then encode each abstract as a
# list of integer word indices.
tokenizer.fit_on_texts(abstract_none_removed)
sequences = tokenizer.texts_to_sequences(abstract_none_removed)

In [12]:
sequences[0][:10]

[1, 18, 548, 765, 309, 3, 1, 393, 4, 1032]

In [13]:
' '.join(tokenizer.index_word[ind] for ind in sequences[0][:10])

'the process allows essentially all of the nitrogen and p'

In [14]:
abstract_none_removed[0][:100]

'The process allows essentially all of the nitrogen and P.sub.2 O.sub.5 values in the treated black a'

In [15]:
# Reduced filter set: compared with `filters`, the characters , . : ; are no
# longer stripped, so punctuation stays attached to the surrounding token
# (e.g. 'P.sub.2', 'process,').
filters_punct = '!"#$%&()*+-/<=>?@[\\]^_`{|}~\t\n'

In [16]:
# Second tokenizer: preserves the original casing (lower=False) and uses the
# reduced filter set, so sentence punctuation survives tokenisation.
tokenizer_punct = Tokenizer(filters=filters_punct, lower=False, split=' ', num_words=None, char_level=False)

In [17]:
# Restrict further processing to the first 4000 abstracts and their titles.
abstract_part = abstract_none_removed[:4000]
title_part = titles_none_removed[:4000]

In [18]:
# Fit the case/punctuation-preserving tokenizer on the 4000-abstract subset
# and encode that subset as lists of word indices.
# NOTE: this rebinds `sequences`, replacing the encoding produced earlier by
# the lower-casing `tokenizer`.
tokenizer_punct.fit_on_texts(abstract_part)
sequences = tokenizer_punct.texts_to_sequences(abstract_part)

In [19]:
sequences[0][:10]

[9, 35, 420, 838, 332, 3, 1, 688, 4, 10492]

In [20]:
' '.join(tokenizer_punct.index_word[ind] for ind in sequences[0][:10])

'The process allows essentially all of the nitrogen and P.sub.2'

### Building features and labels

In [22]:
import numpy as np

In [23]:
# Slide a window over every encoded abstract: each run of `training_size`
# consecutive token ids becomes one feature row, and the token that
# immediately follows it becomes the label.
training_size = 50
features = []
labels = []
for seq in sequences:
    # Abstracts with at most `training_size` tokens yield no examples.
    for target_pos in range(training_size, len(seq)):
        # features --> seq[target_pos - 50 : target_pos] (50 tokens)
        features.append(seq[target_pos - training_size:target_pos])
        # label --> seq[target_pos], the next token after the window
        labels.append(seq[target_pos])
# Stack the windows into a 2-D array (one row per example); labels stays a list.
features = np.array(features)

In [24]:
features[0]

array([    9,    35,   420,   838,   332,     3,     1,   688,     4,
       10492, 10493,   973,     8,     1,   377,   888,  1561,  7414,
          31,     5,    20,   232,     8,     1,    72,     3,  1801,
        2701,   800,    68,     1,   455,     2, 14184,     3,   888,
          31,  4002,     6,    26,  2701,    15,     2,   406,  2498,
        2343,    97, 14185,     9,   801])

In [25]:
' '.join(tokenizer_punct.index_word[ind] for ind in features[0])

'The process allows essentially all of the nitrogen and P.sub.2 O.sub.5 values in the treated black ammonium polyphosphate liquid to be recovered in the form of valuable clarified product. In the process, a heel of black liquid fertilizer is first clarified by a prior art procedure using flocculants. The improvement'

In [26]:
labels[0], tokenizer_punct.index_word[labels[0]]

(254, 'over')

In [27]:
features.shape

(324747, 50)

### Building one-hot label vectors

In [28]:
# Vocabulary size used for the embedding input and the softmax output.
# +1 because Keras Tokenizer word indices start at 1 (0 is reserved), so valid
# ids run from 0 to len(index_word) inclusive.
num_words = len(tokenizer_punct.index_word) + 1
print(num_words)

22896


In [29]:
# One-hot encode the labels: one row per training example, one column per
# vocabulary index, with a single 1 at the column of the target word.
labels_vector = np.zeros((len(features), num_words), dtype=np.int8)
row_ids = np.arange(len(labels))
col_ids = np.asarray(labels, dtype=np.intp)
labels_vector[row_ids, col_ids] = 1

In [30]:
tokenizer_punct.index_word[np.argmax(labels_vector[0])]

'over'

### RNN

In [31]:
# Load the pre-trained GloVe table as strings: the first column is the word,
# the remaining columns are its vector components.
EMBEDDINGS_FILE = 'glove.6B.100d.txt'
glove = np.loadtxt(EMBEDDINGS_FILE, dtype=str, comments=None)
words = glove[:, 0]
vectors = glove[:, 1:].astype(float)
# Map each word to its embedding vector for dictionary lookup.
word_lookup = dict(zip(words, vectors))

In [32]:
word_lookup['the']

array([-0.038194, -0.24487 ,  0.72812 , -0.39961 ,  0.083172,  0.043953,
       -0.39141 ,  0.3344  , -0.57545 ,  0.087459,  0.28787 , -0.06731 ,
        0.30906 , -0.26384 , -0.13231 , -0.20757 ,  0.33395 , -0.33848 ,
       -0.31743 , -0.48336 ,  0.1464  , -0.37304 ,  0.34577 ,  0.052041,
        0.44946 , -0.46971 ,  0.02628 , -0.54155 , -0.15518 , -0.14107 ,
       -0.039722,  0.28277 ,  0.14393 ,  0.23464 , -0.31021 ,  0.086173,
        0.20397 ,  0.52624 ,  0.17164 , -0.082378, -0.71787 , -0.41531 ,
        0.20335 , -0.12763 ,  0.41367 ,  0.55187 ,  0.57908 , -0.33477 ,
       -0.36559 , -0.54857 , -0.062892,  0.26584 ,  0.30205 ,  0.99775 ,
       -0.80481 , -3.0243  ,  0.01254 , -0.36942 ,  2.2167  ,  0.72201 ,
       -0.24978 ,  0.92136 ,  0.034514,  0.46745 ,  1.1079  , -0.19358 ,
       -0.074575,  0.23353 , -0.052062, -0.22044 ,  0.057162, -0.15806 ,
       -0.30798 , -0.41625 ,  0.37972 ,  0.15006 , -0.53212 , -0.2055  ,
       -1.2526  ,  0.071624,  0.70565 ,  0.49744 , 

In [33]:
embedding_matrix = np.zeros(shape=(num_words, vectors.shape[1]))

In [34]:
# Copy the pre-trained GloVe vector of every vocabulary word into its row of
# embedding_matrix; words with no GloVe entry keep their all-zero row.
# tokenizer_punct preserves case (lower=False) while the glove.6B vectors are
# uncased, so retry with the lower-cased token before giving up. Otherwise
# capitalised tokens such as 'The' never receive a pre-trained vector.
for i, word in tokenizer_punct.index_word.items():
    vector = word_lookup.get(word)
    if vector is None:
        vector = word_lookup.get(word.lower())
    if vector is not None:
        embedding_matrix[i, :] = vector

In [35]:
embedding_matrix[1]

array([-0.038194, -0.24487 ,  0.72812 , -0.39961 ,  0.083172,  0.043953,
       -0.39141 ,  0.3344  , -0.57545 ,  0.087459,  0.28787 , -0.06731 ,
        0.30906 , -0.26384 , -0.13231 , -0.20757 ,  0.33395 , -0.33848 ,
       -0.31743 , -0.48336 ,  0.1464  , -0.37304 ,  0.34577 ,  0.052041,
        0.44946 , -0.46971 ,  0.02628 , -0.54155 , -0.15518 , -0.14107 ,
       -0.039722,  0.28277 ,  0.14393 ,  0.23464 , -0.31021 ,  0.086173,
        0.20397 ,  0.52624 ,  0.17164 , -0.082378, -0.71787 , -0.41531 ,
        0.20335 , -0.12763 ,  0.41367 ,  0.55187 ,  0.57908 , -0.33477 ,
       -0.36559 , -0.54857 , -0.062892,  0.26584 ,  0.30205 ,  0.99775 ,
       -0.80481 , -3.0243  ,  0.01254 , -0.36942 ,  2.2167  ,  0.72201 ,
       -0.24978 ,  0.92136 ,  0.034514,  0.46745 ,  1.1079  , -0.19358 ,
       -0.074575,  0.23353 , -0.052062, -0.22044 ,  0.057162, -0.15806 ,
       -0.30798 , -0.41625 ,  0.37972 ,  0.15006 , -0.53212 , -0.2055  ,
       -1.2526  ,  0.071624,  0.70565 ,  0.49744 , 

In [36]:
tokenizer_punct.index_word.get(1)

'the'

In [37]:
word_lookup['the']

array([-0.038194, -0.24487 ,  0.72812 , -0.39961 ,  0.083172,  0.043953,
       -0.39141 ,  0.3344  , -0.57545 ,  0.087459,  0.28787 , -0.06731 ,
        0.30906 , -0.26384 , -0.13231 , -0.20757 ,  0.33395 , -0.33848 ,
       -0.31743 , -0.48336 ,  0.1464  , -0.37304 ,  0.34577 ,  0.052041,
        0.44946 , -0.46971 ,  0.02628 , -0.54155 , -0.15518 , -0.14107 ,
       -0.039722,  0.28277 ,  0.14393 ,  0.23464 , -0.31021 ,  0.086173,
        0.20397 ,  0.52624 ,  0.17164 , -0.082378, -0.71787 , -0.41531 ,
        0.20335 , -0.12763 ,  0.41367 ,  0.55187 ,  0.57908 , -0.33477 ,
       -0.36559 , -0.54857 , -0.062892,  0.26584 ,  0.30205 ,  0.99775 ,
       -0.80481 , -3.0243  ,  0.01254 , -0.36942 ,  2.2167  ,  0.72201 ,
       -0.24978 ,  0.92136 ,  0.034514,  0.46745 ,  1.1079  , -0.19358 ,
       -0.074575,  0.23353 , -0.052062, -0.22044 ,  0.057162, -0.15806 ,
       -0.30798 , -0.41625 ,  0.37972 ,  0.15006 , -0.53212 , -0.2055  ,
       -1.2526  ,  0.071624,  0.70565 ,  0.49744 , 

In [39]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Masking, Embedding

In [40]:
# Model definition: frozen pre-trained embeddings -> masking -> LSTM ->
# dense hidden layer with dropout -> softmax over the whole vocabulary.
model = Sequential([
    # Token id -> 100-d vector taken from embedding_matrix; weights are frozen.
    Embedding(input_dim=num_words,
              input_length=training_size,
              weights=[embedding_matrix],
              output_dim=100,
              trainable=False,
              mask_zero=True),
    # Masks timesteps whose embedding vector is entirely 0.0.
    Masking(mask_value=0.0),
    # Only the final LSTM state is passed on (return_sequences=False).
    LSTM(64, return_sequences=False, dropout=0.1, recurrent_dropout=0.1),
    Dense(64, activation='relu'),
    Dropout(0.5),
    # One output probability per vocabulary index.
    Dense(num_words, activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])






Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



In [41]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 100)           2289600   
_________________________________________________________________
masking_1 (Masking)          (None, 50, 100)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                42240     
_________________________________________________________________
dense_1 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 22896)             1488240   
Total params: 3,824,240
Trainable params: 1,534,640
Non-trainable params: 2,289,600
__________________________________________________________

In [42]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Create callbacks
# The checkpoint is written to ./models/model.h5. Older Keras releases do not
# create missing directories, so make sure the folder exists before training
# instead of failing at the end of the first epoch.
os.makedirs('./models', exist_ok=True)
# EarlyStopping ends training after 5 epochs without val_loss improvement;
# ModelCheckpoint keeps only the best full model (architecture + weights).
callbacks = [EarlyStopping(monitor='val_loss', patience=5),
             ModelCheckpoint('./models/model.h5', save_best_only=True, save_weights_only=False)]

In [43]:
features.shape, labels_vector.shape

((324747, 50), (324747, 22896))

In [44]:
# Sequential (unshuffled) split: the first 100000 examples are used for
# training and the following 10000 for validation.
X_train, X_valid = features[:100000], features[100000:110000]
y_train, y_valid = labels_vector[:100000], labels_vector[100000:110000]

In [45]:
# Train for up to 150 epochs in batches of 2048 examples. The callbacks may
# stop the run early (EarlyStopping on val_loss) and keep the best model on
# disk (ModelCheckpoint). `history` holds the object returned by fit.
history = model.fit(X_train,  y_train, 
                    batch_size=2048, epochs=150,
                    callbacks=callbacks,
                    validation_data=(X_valid, y_valid))

Train on 100000 samples, validate on 10000 samples
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150


Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150


In [46]:
from keras.models import load_model
# Reload the checkpoint kept by ModelCheckpoint (save_best_only=True); this
# replaces the in-memory model left over from the final training epoch.
model = load_model('./models/model.h5')

In [47]:
X_valid.shape

(10000, 50)

In [48]:
y_valid.shape

(10000, 22896)

In [49]:
model.evaluate(X_valid, y_valid)



[6.553760202026367, 0.1452]

In [77]:
X = ['The process allows essentially all of the nitrogen and P.sub.2 O.sub.5 values in the treated black ammonium polyphosphate liquid to be recovered in the form of valuable clarified product. In the process, a heel of black liquid fertilizer is first clarified by a prior-art procedure using flocculants. The improvement over the prior art picks up with the byproduct tops which is then diluted with the water of formulation required to dissolve additional amounts of ammonium polyphosphate melt. This diluted liquid is filtered']

In [149]:
# Encode the prompt with the training tokenizer and take tokens 30..79
# (50 tokens, matching training_size) as a single-row batch of shape (1, 50).
transformed = np.array(tokenizer_punct.texts_to_sequences(X)[0][30:80]).reshape(1, -1)

In [150]:
transformed.shape

(1, 50)

In [151]:
# Predicted next-word index for the prompt window. Sequential.predict_classes
# has been removed from newer Keras/TensorFlow releases; taking the arg-max of
# predict() over the class axis returns the same class ids on every version.
np.argmax(model.predict(transformed), axis=-1)

array([4])

In [152]:
' '.join(tokenizer_punct.index_word[ind] for ind in transformed[0][:50])

'the process, a heel of black liquid fertilizer is first clarified by a prior art procedure using flocculants. The improvement over the prior art picks up with the byproduct tops which is then diluted with the water of formulation required to dissolve additional amounts of ammonium polyphosphate melt. This diluted'

In [105]:
tokenizer_punct.index_word[3]

'of'

In [113]:
np.append(transformed,3)

array([    9,    35,   420,   838,   332,     3,     1,   688,     4,
       10492, 10493,   973,     8,     1,   377,   888,  1561,  7414,
          31,     5,    20,   232,     8,     1,    72,     3,  1801,
        2701,   800,    68,     1,   455,     2, 14184,     3,   888,
          31,  4002,     6,    26,  2701,    15,     2,   406,  2498,
        2343,    97, 14185,     9,   801,     3])

In [139]:
# NOTE(review): this cell was executed out of order (In [139]).
# `generated_indexes` is only assigned in a later cell, so on a fresh
# Restart & Run All this line raises NameError.
generated_indexes

array([    9,    35,   420,   838,   332,     3,     1,   688,     4,
       10492, 10493,   973,     8,     1,   377,   888,  1561,  7414,
          31,     5,    20,   232,     8,     1,    72,     3,  1801,
        2701,   800,    68,     1,   455,     2, 14184,     3,   888,
          31,  4002,     6,    26,  2701,    15,     2,   406,  2498,
        2343,    97, 14185,     9,   801,     3,     1,     9,     9,
           9,     9,     9,     9,     9,     9])

In [153]:
# Greedy text generation: repeatedly feed the most recent `size` token ids to
# the model and append the most probable next token.
# Start from a flat 1-D copy of the prompt ids. The previous version sliced the
# 2-D (1, 50) array along its first axis and only worked because np.append
# flattens its input after the first iteration.
generated_indexes = transformed.ravel()
size = transformed.shape[1]
for i in range(10):
    last_index = size + i
    # Window of the latest `size` ids, shaped as a single-row batch.
    to_pass = generated_indexes[i:last_index].reshape(1, -1)
    # arg-max over the softmax output; this replaces predict_classes, which
    # newer Keras/TensorFlow releases no longer provide.
    next_index = np.argmax(model.predict(to_pass), axis=-1)
    generated_indexes = np.append(generated_indexes, next_index)
print(' '.join(tokenizer_punct.index_word[ind] for ind in generated_indexes))

the process, a heel of black liquid fertilizer is first clarified by a prior art procedure using flocculants. The improvement over the prior art picks up with the byproduct tops which is then diluted with the water of formulation required to dissolve additional amounts of ammonium polyphosphate melt. This diluted and a second and a second portion of the first


In [154]:
generated_indexes

array([    1,   455,     2, 14184,     3,   888,    31,  4002,     6,
          26,  2701,    15,     2,   406,  2498,  2343,    97, 14185,
           9,   801,   254,     1,   406,  2498,  7415,   248,    13,
           1,  2206,  6577,    17,     6,    91,  2499,    13,     1,
          27,     3,  7416,   555,     5,  3362,   342,   879,     3,
        1561,  7414,  5886,   166,  2499,     4,     2,    37,     4,
           2,    37,    45,     3,     1,    26])

In [148]:
# Predict the next token for the window shifted by three positions after
# appending ids 3, 1 and 9. Uses arg-max over predict() because
# Sequential.predict_classes is unavailable in newer Keras/TensorFlow releases.
np.argmax(model.predict(np.append(transformed, [3, 1, 9])[3:53].reshape(1,-1)), axis=-1)

array([9])

In [155]:
import pickle

In [156]:
# Persist the tokenizer that produced the model's training sequences.
# The model was trained on tokenizer_punct indices; the lower-casing
# `tokenizer` has a different vocabulary, so pickling it would give mismatched
# word ids at inference time.
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer_punct, handle, protocol=pickle.HIGHEST_PROTOCOL)