### Import Data

In [2]:
import pandas as pd
import numpy as np


# string processing
import re

from keras.utils.np_utils import to_categorical

from sklearn import feature_extraction, model_selection, naive_bayes, pipeline, manifold, preprocessing, feature_selection, metrics

In [3]:
# to navigate to the data location
import os

# get current directory 
path = os.getcwd() 

# parent directory
parent = os.path.dirname(path)

df_merge_quality = pd.read_csv(parent + '/data/US_patent_abstract_5000_2015_with_title_1_5y.csv')
df_merge_quality.shape

(5000, 30)

In [4]:
df = df_merge_quality[['text', 'quality_rank']]
df

Unnamed: 0,text,quality_rank
0,Invitation information push method and system....,0
1,Coronal angulating connector. A connector is p...,0
2,Spearfishing apparatus. A device for spearfish...,1
3,Systems and methods for prioritizing media fil...,1
4,Semiconductor integrated circuit. A semiconduc...,0
...,...,...
4995,Cross-platform cloud-based map creation. Metho...,1
4996,Display substrate. A display substrate include...,1
4997,Aminoquinazoline derivatives and their salts a...,1
4998,Method and device for displaying information i...,1


In [5]:
df['text'].iloc[0]

'Invitation information push method and system. An invitation information push method includes after receiving an invitation request sent by a microblog user, a server sending invitation information to a number of clients corresponding to invited users carried in the invitation request, wherein the invited users are users who have not registered microblog, and the number of the invited users N is greater than or equal to 1. Each client, upon receiving the invitation information, creating an invitation information guide to guide the users who have not registered the microblog to register the microblog. The method further comprises, when a predetermined time is reached, a server actively sending invitation information to at least one client corresponding to at least one user who has not registered the microblog.'

In [6]:
df['text'] = df['text'].apply(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


### Create a model

In [7]:
import numpy as np
import pandas as pd
import pickle as cPickle
from collections import defaultdict
import re

from bs4 import BeautifulSoup

import sys
import os
import tensorflow as tf
from tensorflow import keras

from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, LSTM, GRU, Bidirectional, TimeDistributed
from keras.models import Model

from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializers

In [8]:
# more CNN library
import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.initializers import Constant


In [9]:
MAX_SEQUENCE_LENGTH = 1500  # [Steven] I added for the CNN, to take only the 500 words.
# MAX_SENT_LENGTH = 100
# MAX_SENTS = 15
MAX_NB_WORDS = 30000  # [Steven] this should be number of unique word / vocabulary
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

In [10]:
def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased except
    """
    string = re.sub(r"\\", "", str(string))    
    string = re.sub(r"\'", "", str(string))    
    string = re.sub(r"\"", "", str(string))    
    return string.strip().lower()

In [11]:

data_train = df
data_train

Unnamed: 0,text,quality_rank
0,Invitation information push method and system....,0
1,Coronal angulating connector. A connector is p...,0
2,Spearfishing apparatus. A device for spearfish...,1
3,Systems and methods for prioritizing media fil...,1
4,Semiconductor integrated circuit. A semiconduc...,0
...,...,...
4995,Cross-platform cloud-based map creation. Metho...,1
4996,Display substrate. A display substrate include...,1
4997,Aminoquinazoline derivatives and their salts a...,1
4998,Method and device for displaying information i...,1


In [12]:
data_train.shape

(5000, 2)

In [13]:
import nltk
from nltk import tokenize

abstracts = [] # abstracts is list of list of list to hold each sentences of each abstract (the most complete data)
labels = [] # label is just a list holding our label which is quality_index
texts = []  # texts to hold each complete abstract as list of list (note: abstract not breaking up to sentence level)
for idx in range(data_train.text.shape[0]): # for each row
    #text = BeautifulSoup(data_train.text[idx])
    #print(clean_str(str(data_train.iloc[idx]['text'])))
    text = clean_str(str(data_train.iloc[idx]['text'])) # text is each complete abstract
    texts.append(text) # texts to hold each complete abstract as list of string (note: abstract not breaking up to sentence level)
    sentences = tokenize.sent_tokenize(text) # sentences is list of string holding each complete sentence of one abstract (but it's just an intermediate variable, not used directly in later code)
    abstracts.append(sentences) # abstracts is list of list of string to hold each sentences of each abstract (the most complete data) (this is what we use )
    labels.append(data_train.iloc[idx]['quality_rank']) # label is just a list holding our label which is quality_index

In [14]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS) # intend to use next line .fit_on_texts to index each word within specific abstract at current iteration/loop, the more frequent word has lower index number, it is a dictionary format, it's like a unique vocabulary index
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
# data = np.zeros((len(texts), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')

In [15]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 16864 unique tokens.


In [16]:
len(sequences)

5000

In [17]:
# create array of equal length to feed into model
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post') # The default is pre-padding, should I try post-padding ? => padding='post', truncating='post'. I think it has to do with whether the beginning words are more important, or later words more important. You may want to try both approaches and compare result?
data

array([[ 3894,    45,  2436, ...,     0,     0,     0],
       [ 9311, 11718,   278, ...,     0,     0,     0],
       [ 9312,    31,     2, ...,     0,     0,     0],
       ...,
       [ 9310,  1518,     4, ...,     0,     0,     0],
       [   15,     4,    12, ...,     0,     0,     0],
       [   24,     4,    15, ...,     0,     0,     0]], dtype=int32)

In [18]:
data.shape

(5000, 1500)

In [19]:
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Shape of data tensor: (5000, 1500)
Shape of label tensor: (5000, 2)


In [20]:

num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
num_validation_samples

1000

In [21]:
x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]

print('Number of positive and negative reviews in traing and validation set')
print(y_train.sum(axis=0))
print(y_val.sum(axis=0))


Number of positive and negative reviews in traing and validation set
[2397. 1603.]
[595. 405.]


In [22]:
y_train

array([[1., 0.],
       [1., 0.],
       [0., 1.],
       ...,
       [1., 0.],
       [0., 1.],
       [1., 0.]], dtype=float32)

In [23]:
y_val

array([[1., 0.],
       [0., 1.],
       [1., 0.],
       ...,
       [0., 1.],
       [0., 1.],
       [0., 1.]], dtype=float32)

In [24]:
# to navigate to the data location
import os

# get current directory 
path = os.getcwd() 

# parent directory
parent = os.path.dirname(path)

In [25]:
# import GloVe word embedding
GLOVE_DIR = "" 
embeddings_index = {}
f = open(os.path.join(parent, GLOVE_DIR, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

In [26]:
print('Total %s word vectors.' % len(embeddings_index))

Total 400000 word vectors.


In [27]:
len(word_index) + 1

16865

In [28]:
# prepare embedding matrix
num_words = min(MAX_NB_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))  # EMBEDDING_DIM = 100 since we import 100d GloVe
for word, i in word_index.items():
    if i >= MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [29]:
# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

In [30]:
import os, sys, re, json, time, datetime, shutil
from numpy.random import seed
seed(5)
import tensorflow
tensorflow.random.set_seed(5)

In [31]:
# Parameter setting
epochs = 10
embed_dim = EMBEDDING_DIM  # already specify with EMBEDDING_DIM variable above
dense_layer_dims = [100]
dropout_rate = 0.2
num_filters = [25, 25, 50, 50, 50]
kernel_sizes = [3, 4, 15, 50, 150]
num_classes = labels.shape[1] # 

# Model building
start_time = time.time()
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
h = embedding_layer(sequence_input) # remember embedding_layer assign with GloVe's weight

conv_layers_for_all_kernel_sizes = []
counter = 0
for kernel_size, filters in zip(kernel_sizes, num_filters): 
    conv_layer = keras.layers.Conv1D(filters=filters, kernel_size=kernel_size, activation='relu')(h)
    conv_layer = keras.layers.GlobalMaxPooling1D()(conv_layer)
    conv_layers_for_all_kernel_sizes.append(conv_layer)
    counter += 1
    #print(conv_layers_for_all_kernel_sizes)
if counter == 1: # need this since layers.concatenate throw error when there is only 1 item in conv_layers_for_all_kernel_sizes
    h = conv_layer
else:
    h = keras.layers.concatenate(conv_layers_for_all_kernel_sizes, axis=1) # Concat the feature maps from each different size.

    h = keras.layers.Dropout(rate=dropout_rate)(h) # Dropout can help with overfitting (improve generalization) by randomly 0-ing different subsets of values in the vector

for dim in dense_layer_dims:
    h = keras.layers.Dense(dim, activation = 'relu')(h) # [Steven] I believe this add additional fully-connected hidden layer with the hope to increase model accuracy

prediction = keras.layers.Dense(num_classes, activation='sigmoid')(h)

model = keras.Model(inputs=sequence_input, outputs=prediction)
model.compile(optimizer='adam',
              loss='binary_crossentropy',  # From information theory notebooks.
              metrics=['accuracy'])        # What metric to output as we train.



In [32]:
# setup checkpoint

checkpoint_path = "ckpt_CNN_US_5years/"

ckpt = tf.train.Checkpoint(model = model) 

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=2)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored!!")

Latest checkpoint restored!!


In [33]:
class MyCustomCallback(tf.keras.callbacks.Callback):

    def on_epoch_end(self, epoch, logs=None):
        ckpt_manager.save()
        print("Checkpoint saved at {}.".format(checkpoint_path))

In [37]:
# train
model.reset_states()
model.fit(x_train, 
          y_train, 
          epochs=epochs, 
          validation_data=(x_val, y_val),
         callbacks=[MyCustomCallback()])


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f707bdd5550>

In [36]:
# restore epoch 9

checkpoint_path = "ckpt_CNN_US_5years/"

ckpt = tf.train.Checkpoint(model = model) 

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=2)

if ckpt_manager.latest_checkpoint:
    ckpt.restore('ckpt_CNN_US_5years/ckpt-9')
    print("Latest checkpoint restored!!")

Latest checkpoint restored!!


In [37]:
pred_test = model.predict(x_val)
pred_test

array([[0.73568666, 0.26373106],
       [0.31567356, 0.6768236 ],
       [0.21639621, 0.81224877],
       ...,
       [0.00127256, 0.99828047],
       [0.2456066 , 0.7296507 ],
       [0.6932094 , 0.3666255 ]], dtype=float32)

In [38]:
np.savetxt('Predict_Output/CNN_5yr_abstract_title_dev_prob.csv', pred_test)

In [39]:
predicted = [np.argmax(pred) for pred in 
             pred_test]

In [40]:
y_test_binary = df['quality_rank'][4000:].values


In [42]:
## Accuracy, Precision, Recall
accuracy = metrics.accuracy_score(y_test_binary, predicted)
auc = metrics.roc_auc_score(y_test_binary, predicted)  # predicted_prob), check doc, seems the second argument required to be shape (n_samples,) for binary case 
                            #multi_class="ovr") # check documentation and seems "ovr" not good for only binary target class
print("Accuracy:",  round(accuracy,3))
print("Auc:", round(auc,3))
print("Detail:")
print(metrics.classification_report(y_test_binary, predicted))



Accuracy: 0.589
Auc: 0.545
Detail:
              precision    recall  f1-score   support

           0       0.62      0.77      0.69       595
           1       0.49      0.32      0.38       405

    accuracy                           0.59      1000
   macro avg       0.56      0.55      0.54      1000
weighted avg       0.57      0.59      0.57      1000

