## abs_claims

In [1]:
# import packages
import random
from pickle import load

import matplotlib.pyplot as plt
import pandas as pd
from numpy import array

from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.merge import concatenate
from keras import backend as K
import keras
import keras_metrics
import tensorflow as tf
from tensorflow import keras
import keras_metrics as km

In [2]:
# load the dataset from CSV into a DataFrame
final_df = pd.read_csv('/project/sample.csv')

In [3]:
# preview the first rows to sanity-check the loaded columns
final_df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,lens_id,date_published,publication_type,cpc,yo2,abstract.text_sw,biblio.invention_title.text_sw,claims.claims_sw,biblio.invention_title.text_token
0,99966,103917,165-470-842-755-208,2016-02-23,GRANTED_PATENT,"A61B17/0401,A61B17/0401,A61B2017/0404,A61B2017...",0,device method pushing anchor bore bone provide...,flipp tack pusher,a method for pushing an anchor through a bor...,"['flipp', 'tack', 'pusher']"
1,663454,691679,113-864-851-746-050,2016-12-15,PATENT_APPLICATION,"G03B7/097,H04N5/2352,H04N5/2352,H04N5/2353,H04...",0,imaging apparatus includes lens unit body unit...,unknown,an imaging apparatus comprising a lens uni...,['unknown']
2,540952,563920,015-657-426-811-531,2016-10-04,GRANTED_PATENT,"H04N19/154,H04N19/154,H04N7/17318,H04N7/17318,...",0,method extracting information encoded bit stre...,unknown,a method in a decoding device of adapting a ...,['unknown']
3,340420,354497,189-965-709-133-536,2016-05-10,GRANTED_PATENT,"C10G47/00,C10G47/00,C10G67/04,C10G67/04,C10G67...",0,hydrocarbon feedstock hydrocracked hydrocracki...,high quality middle distillate production process,a process for producing reduced aromatic hyd...,"['high', 'quality', 'middle', 'distillate', 'p..."
4,51087,53037,015-709-759-851-330,2016-03-17,PATENT_APPLICATION,"H02M3/33507,H02M3/33507,H02M1/32,H02M1/32,",0,operation flybuck converter first output capac...,overcurrent recovery in flybuck converters,a method of operating an isolated buck conv...,"['overcurrent', 'recovery', 'in', 'flybuck', '..."


In [4]:
# (rows, columns) of the loaded frame
final_df.shape

(50000, 11)

In [5]:
# separate the label column from the features and build the combined text fields
y = final_df['yo2']
X = final_df.drop(columns = 'yo2')

# str + NaN evaluates to NaN, so a single missing title/abstract/claims value
# would turn the whole combined field into NaN and later break s.split() and
# tokenisation. Treat missing text as an empty string instead.
title_text = X['biblio.invention_title.text_sw'].fillna('')
abstract_text = X['abstract.text_sw'].fillna('')
claims_text = X['claims.claims_sw'].fillna('')

# combined text variants (space-separated)
X['tac'] = title_text + ' ' + abstract_text + ' ' + claims_text
X['title_abs'] = title_text + ' ' + abstract_text
X['title_claims'] = title_text + ' ' + claims_text
X['abs_claims'] = abstract_text + ' ' + claims_text

# single-field aliases with shorter names
X['abstract'] = abstract_text
X['title'] = title_text
X['claims'] = claims_text

In [6]:
# hold out 25% of the abstract+claims documents for testing; fixed seed keeps the split repeatable
trainX, testX, trainy, testy = train_test_split(
    X['abs_claims'],
    y,
    test_size=0.25,
    random_state=42,
)

In [7]:
# confirm that features and labels have matching sizes in each split
trainX.shape,trainy.shape,testX.shape,testy.shape

((37500,), (37500,), (12500,), (12500,))

In [8]:
from pickle import dump

def save_dataset(dataset, filename):
    """Pickle ``dataset`` to ``filename`` and print a confirmation line.

    Args:
        dataset: any picklable object (here a ``[texts, labels]`` pair).
        filename: path of the file to write; an existing file is overwritten.
    """
    # the context manager closes (and therefore flushes) the file even if
    # pickling fails part-way through
    with open(filename, 'wb') as handle:
        dump(dataset, handle)
    print('Saved: %s' % filename)
    
# persist the training texts and labels together as one pickled list
save_dataset([trainX,trainy], 'train_abs_claims.pkl') 

Saved: train_abs_claims.pkl


In [9]:
import csv
import pydot
# pickle file written by the save_dataset call above; reloaded by the training cell
filename = 'train_abs_claims.pkl'

In [10]:
# load a clean dataset
def load_dataset(filename):
    """Return the object unpickled from ``filename``.

    Args:
        filename: path to a pickle file, e.g. one written by ``save_dataset``.

    Returns:
        Whatever object was pickled into the file.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load files
    # produced by this notebook or another trusted source.
    # The context manager guarantees the handle is closed after reading.
    with open(filename, 'rb') as handle:
        return load(handle)

# fit a tokenizer
def create_tokenizer(lines):
    """Build a Keras ``Tokenizer`` with default settings and fit it on ``lines``.

    Args:
        lines: iterable of text documents used to build the vocabulary.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

# calculate the maximum document length
def max_length(lines):
    """Return the largest whitespace-separated token count among ``lines``.

    Args:
        lines: iterable of strings.

    Returns:
        The token count of the longest document, or 0 when ``lines`` is empty
        (previously an empty input raised ``ValueError`` from ``max``).
    """
    # generator avoids materialising a throwaway list of lengths
    return max((len(doc.split()) for doc in lines), default=0)

# encode a list of lines
def encode_text(tokenizer, lines, length):
    """Convert text lines into a fixed-width matrix of integer token ids.

    Args:
        tokenizer: fitted tokenizer providing ``texts_to_sequences``.
        lines: iterable of text documents to encode.
        length: value passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        The result of ``pad_sequences`` with ``padding='post'``.
    """
    sequences = tokenizer.texts_to_sequences(lines)
    return pad_sequences(sequences, maxlen=length, padding='post')

# build one convolutional channel of the multichannel model
def _build_channel(length, vocab_size, kernel_size):
    """Create one input -> embedding -> conv -> dropout -> pool -> flatten branch.

    Args:
        length: number of token ids per input sequence.
        vocab_size: size of the embedding vocabulary.
        kernel_size: width of the 1-D convolution window for this branch.

    Returns:
        ``(inputs, flat)``: the branch's ``Input`` tensor and its flattened output.
    """
    inputs = Input(shape=(length,))
    embedding = Embedding(vocab_size, 100)(inputs)
    conv = Conv1D(filters=32, kernel_size=kernel_size, activation='relu')(embedding)
    drop = Dropout(0.5)(conv)
    pool = MaxPooling1D(pool_size=2)(drop)
    flat = Flatten()(pool)
    return inputs, flat


# define the model
def define_model(length, vocab_size):
    """Build and compile the three-channel text CNN.

    The three channels are identical except for their convolution kernel
    sizes (2, 3 and 4). Their flattened outputs are concatenated and fed
    through a 500-unit ReLU layer into a single sigmoid output.

    Side effects: prints the model summary and writes the architecture
    diagram to ``multichannel_abs_claims.png``.

    Args:
        length: number of token ids per input sequence.
        vocab_size: size of the embedding vocabulary.

    Returns:
        The compiled model, which expects a list of three input arrays.
    """
    # channels are created in kernel-size order 2, 3, 4 - the same order the
    # previously hand-written channel 1/2/3 code used
    channels = [_build_channel(length, vocab_size, kernel_size) for kernel_size in (2, 3, 4)]
    channel_inputs = [inputs for inputs, _ in channels]
    channel_outputs = [flat for _, flat in channels]

    # merge
    merged = concatenate(channel_outputs)
    # interpretation
    dense1 = Dense(500, activation='relu')(merged)
    outputs = Dense(1, activation='sigmoid')(dense1)
    model = Model(inputs=channel_inputs, outputs=outputs)
    # compile
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[tf.keras.metrics.Precision()])
    # summarize: summary() prints by itself and returns None, so wrapping it
    # in print() only added a stray "None" line
    model.summary()
    plot_model(model, show_shapes=True, to_file='multichannel_abs_claims.png')
    return model

In [11]:
# load training dataset
trainLines, trainLabels = load_dataset(filename)

# create tokenizer
tokenizer = create_tokenizer(trainLines)

# calculate max document length
# NOTE(review): every document is padded to the longest one. In the recorded
# run that was 18950 tokens, giving a (37500, 18950) input matrix, and the run
# was interrupted. Consider capping the length if training is too slow or
# memory-hungry.
length = max_length(trainLines)

# calculate vocabulary size (+1 leaves index 0 free for padding)
vocab_size = len(tokenizer.word_index) + 1
print('Max document length: %d' % length)
print('Vocabulary size: %d' % vocab_size)

# encode data
trains_X = encode_text(tokenizer, trainLines, length)
print(trains_X.shape)

# define model
model = define_model(length, vocab_size)

# weight each class by the inverse of its relative frequency so the rarer
# class contributes more to the loss
class_frequency = trainLabels.value_counts() / len(trainLabels)
inverse_frequency = 1 / class_frequency
class_weights = {0: inverse_frequency[0], 1: inverse_frequency[1]}

# fit model; keep the returned History object, which the plotting code below
# needs (it was previously discarded, so `history` raised NameError)
history = model.fit([trains_X,trains_X,trains_X], array(trainLabels), epochs=7, batch_size=16, class_weight=class_weights)

#list all data in history
print(history.history.keys())

# summarize history for loss
plt.plot(history.history['loss'])
legend_labels = ['train']
# 'val_loss' is only recorded when fit() receives validation data
# (validation_split / validation_data); plot it only if it is present
if 'val_loss' in history.history:
    plt.plot(history.history['val_loss'])
    legend_labels.append('test')
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(legend_labels, loc='upper left')
plt.show()

# save the model
model.save('model_abs_claims.h5')

Max document length: 18950
Vocabulary size: 81326
(37500, 18950)
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 18950)]      0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 18950)]      0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 18950)]      0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 18950, 100)   8132600     input_1[0][0]                    
_____________________________

KeyboardInterrupt: 