## Doc Classification Model Test

In [1]:
import csv
import pickle
import json
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
# Build a small test sample from the hashed-document CSV.
# The code treats comma-field 0 of each line as the label and comma-field 1
# as the space-separated hashed words of the document.
labels = []
hashed_text = []

MIN_WORDS = 10         # a row must have at least MIN_WORDS
TEST_INTERVAL = 1200   # keep one qualifying row out of every TEST_INTERVAL
# NOTE(review): absolute, machine-specific path -- consider making the data
# directory configurable so the notebook runs on other machines.
DATA_PATH = 'C:\\Users\\Barnwaldo\\Documents\\Labs\\HeavyWater\\shuffled-full-set-hashed.csv'
line_ctr = 0           # counts only rows that passed the MIN_WORDS filter

# read in data from file; add test record every TEST_INTERVAL
with open(DATA_PATH, mode='r', encoding='utf-8') as lines:
    for line in lines:
        items = line.split(',')
        if len(items) < 2:
            # Blank or malformed line (no comma): there is no text field,
            # so indexing items[1] would raise IndexError. Skip it.
            continue
        text = items[1].split(' ')
        # only the last token can carry the trailing newline
        text[-1] = text[-1].rstrip()
        tlen = len(text)
        if tlen < MIN_WORDS:
            continue
        # update labels and hashed_text lists
        if line_ctr % TEST_INTERVAL == 0:
            labels.append(items[0])
            hashed_text.append(text)
        line_ctr += 1

print("Length Hashed Text: {} -- Length Labels: {}".format(len(hashed_text), len(labels)))

Length Hashed Text: 52 -- Length Labels: 52


In [3]:
# Read in Tokenizer and LabelIndexer
# NOTE(security): pickle.load can execute arbitrary code while loading.
# Only load a tokenizer.pickle that you created yourself or otherwise trust.
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

# label_indexer: label name -> class index (used as the one-hot column index)
with open('label_indexer.json') as json_file:
    label_indexer = json.load(json_file)

# label_dict: inverse lookup, class index -> label name
label_dict = {val: key for key, val in label_indexer.items()}

In [4]:
# Turn every document (a list of hashed words) into one flat list of token ids
MAX_INDEX = 1037934  # the models below are built with MAX_INDEX + 1 embedding rows
raw_tokens = []

for words in hashed_text:
    # Each word is handed to the tokenizer as its own "text", so the result
    # is one id list per word; merge those lists in their original order.
    flat_ids = []
    for word_ids in tokenizer.texts_to_sequences(words):
        flat_ids.extend(word_ids)
    raw_tokens.append(flat_ids)

print("Length Raw Tokens: {}".format(len(raw_tokens)))

Length Raw Tokens: 52


In [5]:
# Fixed-length model input: keep the first MAX_LEN ids of each row and
# right-pad shorter rows with zeros; result is an int32 Numpy matrix
MAX_LEN = 200
X = np.zeros((len(raw_tokens), MAX_LEN), dtype=np.int32)

for row, token_ids in enumerate(raw_tokens):
    kept = token_ids[:MAX_LEN]
    # cells beyond len(kept) keep their initial zero, which is the padding
    X[row, :len(kept)] = kept

print("Number of Preprocessed Text Entries = {}".format(len(X)))

Number of Preprocessed Text Entries = 52


In [6]:
# One-hot encode the labels: row r gets a 1 in the column given by
# label_indexer for that row's label name, 0 everywhere else
NUM_LABELS = 14

y = np.zeros((len(labels), NUM_LABELS), dtype=np.int32)

for row, label_name in enumerate(labels):
    y[row, label_indexer[label_name]] = 1

print("Number of Label Vector Entries = {}".format(len(y)))

Number of Label Vector Entries = 52


In [7]:
# Separable CNN Model
from tensorflow.keras import models, initializers
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import Dense, Dropout, Embedding, SeparableConv1D, MaxPooling1D, GlobalAveragePooling1D, BatchNormalization, ReLU

def sepcnn(max_word_index, num_feature, num_class, embedding_dim=100, kernel_size=3, pool_size=3, filters=64, dropout=0.1, activation='sigmoid'):
    """Build and compile the separable-convolution text classifier.

    Architecture: Embedding -> three blocks of two
    (SeparableConv1D, BatchNormalization, ReLU) stacks, where the first two
    blocks use ``filters`` channels and the third uses ``filters * 2`` ->
    GlobalAveragePooling1D -> Dropout -> Dense.

    Args:
        max_word_index: first argument of the Embedding layer (number of rows
            in the embedding table).
        num_feature: ``input_length`` of the Embedding layer.
        num_class: number of units in the final Dense layer.
        embedding_dim: size of each embedding vector.
        kernel_size: kernel size shared by every SeparableConv1D layer.
        pool_size: accepted for interface compatibility; not used in the body.
        filters: base number of convolution filters.
        dropout: rate of the Dropout layer placed before the Dense layer.
        activation: activation of the final Dense layer.

    Returns:
        The Sequential model, compiled with the 'adam' optimizer,
        'binary_crossentropy' loss and the 'accuracy' metric.
    """
    model = models.Sequential()
    # Embedding layer.
    model.add(Embedding(max_word_index, embedding_dim, input_length=num_feature))

    # CNN Blocks 1-3: two conv/norm/activation stacks per block. Layers are
    # created in the same order as a hand-unrolled version, so weights saved
    # from such a model still line up layer by layer.
    for block_filters in (filters, filters, filters * 2):
        for _ in range(2):
            model.add(SeparableConv1D(filters=block_filters, kernel_size=kernel_size, padding='same'))
            model.add(BatchNormalization())
            model.add(ReLU())
    model.add(GlobalAveragePooling1D())

    # Fully Connected Layer
    model.add(Dropout(rate=dropout))
    model.add(Dense(num_class, activation=activation))

    # try using different optimizers and different optimizer configs
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [8]:
# BiLSTM Model
from tensorflow.keras.layers import LSTM, Bidirectional

def bilstm(max_word_index, num_feature, num_class, embedding_dim=100, dropout=0.1, activation='sigmoid'):
    """Build and compile the bidirectional-LSTM text classifier.

    Architecture: Embedding -> Bidirectional(LSTM(64)) -> Dropout -> Dense.

    Args:
        max_word_index: first argument of the Embedding layer (number of rows
            in the embedding table).
        num_feature: ``input_length`` of the Embedding layer.
        num_class: number of units in the final Dense layer.
        embedding_dim: size of each embedding vector.
        dropout: rate of the Dropout layer placed before the Dense layer.
        activation: activation of the final Dense layer.

    Returns:
        The Sequential model, compiled with the 'adam' optimizer,
        'binary_crossentropy' loss and the 'accuracy' metric.
    """
    model = models.Sequential([
        # Embedding layer.
        Embedding(max_word_index, embedding_dim, input_length=num_feature),
        # BiLSTM layer
        Bidirectional(LSTM(64)),
        # Fully Connected Layer
        Dropout(rate=dropout),
        Dense(num_class, activation=activation),
    ])
    # try using different optimizers and different optimizer configs
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [9]:
# Build both architectures with identical input/output dimensions, then
# restore their weights from the .h5 files in the working directory
vocab_size = MAX_INDEX + 1
cnn_model = sepcnn(vocab_size, MAX_LEN, NUM_LABELS)
lstm_model = bilstm(vocab_size, MAX_LEN, NUM_LABELS)

for net, weights_file in ((cnn_model, 'sep_cnn_weights.h5'),
                          (lstm_model, 'bilstm_weights.h5')):
    net.load_weights(weights_file)

In [12]:
# Run both models on every test record and print their class probabilities
# next to the one-hot ground truth. Each model's raw (1, NUM_LABELS) output
# is collected in cnn_results / lstm_results.
cnn_results = []
lstm_results = []
np.set_printoptions(formatter={'float': '{: 0.3f}'.format}, linewidth=120)

print("##### LABELS, CNN_RESULTS, BiLSTM_RESULTS Output Probabilities #####")
# NOTE(review): predictions are made one record at a time; a single batched
# predict(X) per model would likely be faster if per-record calls are not needed.
for i in range(len(X)):
    # slice (not index) so the record keeps its leading batch axis: (1, MAX_LEN)
    record = X[i:i + 1]
    cnn_predict = cnn_model.predict(record)
    lstm_predict = lstm_model.predict(record)
    # the reported "Predicted" index/label is taken from the CNN output only
    predicted_label_index = np.argmax(cnn_predict, axis=1)
    predicted_label = label_dict[predicted_label_index[0]]
    print("------{}------ Predicted Index: {} -- Predicted Label: {}".format(i, predicted_label_index[0], predicted_label))
    # np.float was removed from NumPy (deprecated 1.20, removed 1.24);
    # np.float64 is the same dtype the old alias resolved to.
    print("LABELS: {}".format(y[i, :].astype(np.float64)))
    print("CNN   : {}".format(cnn_predict[0, :]))
    print("BiLSTM: {}".format(lstm_predict[0, :]))
    cnn_results.append(cnn_predict)
    lstm_results.append(lstm_predict)

##### LABELS, CNN_RESULTS, BiLSTM_RESULTS Output Probabilities #####
------0------ Predicted Index: 0 -- Predicted Label: DELETION OF INTEREST
LABELS: [ 1.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000]
CNN   : [ 1.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000]
BiLSTM: [ 0.759  0.011  0.000  0.005  0.351  0.044  0.002  0.014  0.018  0.000  0.001  0.054  0.001  0.000]
------1------ Predicted Index: 3 -- Predicted Label: POLICY CHANGE
LABELS: [ 0.000  0.000  0.000  1.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000]
CNN   : [ 0.000  0.000  0.000  1.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000]
BiLSTM: [ 0.000  0.000  0.004  0.990  0.003  0.025  0.000  0.000  0.010  0.000  0.000  0.000  0.001  0.003]
------2------ Predicted Index: 2 -- Predicted Label: BILL
LABELS: [ 0.000  0.000  1.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.00