We build a CNN model using Keras that combines text (character-level) and layout inputs for section header
classification. This is top-level section header classification: the two classes are section header and regular text.

In [None]:
import tensorflow as tf
from tensorflow.contrib import keras
from tensorflow.contrib.keras import models
from tensorflow.contrib.keras import datasets
from tensorflow.contrib.keras import layers
from tensorflow.contrib.keras import preprocessing
from tensorflow.contrib.keras import backend as K
from tensorflow.contrib.keras import callbacks
from tensorflow.contrib.keras import utils

from collections import defaultdict
import pandas as pd
import numpy as np
import os
import json
import pickle
import sys
import random
import glob
from sklearn import metrics,cross_validation
import csv
from collections import Counter
import warnings
import string
import re
import nltk
# Silence every Python warning for the rest of the session. Both calls install
# a catch-all "ignore" filter, so the second one is redundant with the first.
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

# Short alias for tf.contrib.learn (not referenced by the cells shown below).
learn = tf.contrib.learn
# Raise the csv module's per-field size limit to the largest value it accepts.
csv.field_size_limit(sys.maxsize)
# Only show TensorFlow log messages at WARN level or above.
tf.logging.set_verbosity(tf.logging.WARN)

In [None]:
# Create the TensorFlow session that Keras will run on.
config = tf.ConfigProto()
# Cap this process at 95% of the GPU's memory.
config.gpu_options.per_process_gpu_memory_fraction = 0.95
#config.gpu_options.allow_growth=True
# Expose only GPU device 0 to this process.
config.gpu_options.visible_device_list="0"
session = tf.Session(config=config)
# Register the session so that every Keras layer/model below uses it.
keras.backend.set_session(session)
# NOTE(review): this just sets an attribute on the backend module; nothing in
# the visible cells reads it -- confirm it has any effect with this backend.
K.gpu_setup = ["gpu0"]

In [None]:
# Hyper-parameters shared by the model-building and training cells.
MAX_DOCUMENT_LENGTH=100  # length of the character input sequence fed to the text branch
maxlen_word=20           # not referenced by the cells shown in this notebook
num_classes=2            # width of the final softmax layer
embedding_dims=100       # output size of both Embedding layers
filters=250              # number of Conv1D filters in each branch
kernel_size = 3          # Conv1D window length in each branch
hidden_dims = 100        # width of the per-branch hidden Dense layer
batch_size=128           # mini-batch size passed to model.fit

In [None]:
# Run parameters; both end up in the report file name and in its header.
n_files=60
n_epoch=100
load_from_vector=False
model_dir="cnn_both_layout_and_text_section_header/"

# Open the plain-text report and write its two header lines. The handle `f`
# is deliberately left open: later cells append the metrics and close it.
f = open("report_sections_cnn_both_layout_and_text_09_06_17_files_%s_epoch_%s.txt" % (n_files, n_epoch), "w")
f.write("Number of files: %s\nNumber of epoch: %s\n" % (n_files, n_epoch))

# Make sure the directory used for logs, checkpoints and the saved model exists.
if not os.path.exists(model_dir):
    os.makedirs(model_dir)

Load previously saved data

In [None]:
# Load the previously saved character vectors, layout vectors and target
# vectors for the training and test sets.
#
# NOTE(review): the files are read from "cnn_only_text_section_header/", not
# from model_dir -- presumably they were produced by the text-only notebook;
# confirm that this is the intended source.
# NOTE(security): pickle.load can execute arbitrary code while unpickling, so
# only load files that this project produced itself.

def _load_pickle(path):
    """Return the object unpickled from `path`.

    The file is opened in binary mode inside a `with` block so the handle is
    closed as soon as loading finishes (the bare `pickle.load(open(...))` form
    left every file handle open).
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)

x_train_char = _load_pickle("cnn_only_text_section_header/x_train_char")
x_train_layout = _load_pickle("cnn_only_text_section_header/x_train_layout")
y_train = _load_pickle("cnn_only_text_section_header/y_train")

x_test_char = _load_pickle("cnn_only_text_section_header/x_test_char")
x_test_layout = _load_pickle("cnn_only_text_section_header/x_test_layout")
y_test = _load_pickle("cnn_only_text_section_header/y_test")




Build the text (character) input branch of the model

In [None]:
# Text branch: embedding -> dropout -> 1-D convolution -> global max pooling
# -> hidden dense layer -> dropout -> ReLU.
max_char=256  # embedding vocabulary size; presumably one entry per byte value -- TODO confirm

# One integer id per position, MAX_DOCUMENT_LENGTH positions per sample.
input_char = layers.Input(name='input_char', shape=(MAX_DOCUMENT_LENGTH,))

embedding_char = layers.Embedding(
    input_dim=max_char,
    output_dim=embedding_dims,
    input_length=MAX_DOCUMENT_LENGTH,
)(input_char)
dropout1_char = layers.Dropout(0.2)(embedding_char)

# Unpadded ('valid') convolution with stride 1 over the embedded sequence.
conv1D_char = layers.Conv1D(
    filters=filters,
    kernel_size=kernel_size,
    strides=1,
    padding='valid',
    activation='relu',
)(dropout1_char)

# Keep the strongest response of each filter across the sequence.
globalMaxPooling1_char = layers.GlobalMaxPooling1D()(conv1D_char)

# The hidden layer is linear here; dropout is applied before the ReLU.
dense_hidden_char = layers.Dense(hidden_dims)(globalMaxPooling1_char)
dropout2_char = layers.Dropout(0.2)(dense_hidden_char)
activation_char = layers.Activation('relu')(dropout2_char)


In [None]:
# Layout branch: embedding -> 1-D convolution -> global max pooling
# -> hidden dense layer -> dropout -> ReLU.
print('Build model for layout...')
# Number of layout features per sample (second axis of the training matrix).
max_layout=x_train_layout.shape[1]

input_layout =  layers.Input(shape=(max_layout,), name = 'input_layout')

# NOTE(review): input_dim is the embedding's vocabulary size (largest index + 1),
# but it is set to the number of layout features. This is only valid if every
# layout value is an integer in [0, max_layout) -- verify against the data.
embedding_layout = layers.Embedding(output_dim=embedding_dims, input_dim=max_layout, 
                                     input_length=max_layout)(input_layout)

conv1D_layout= layers.Conv1D(filters,kernel_size,padding='valid',activation='relu',
                              strides=1)(embedding_layout)
# Keep the strongest response of each filter across the feature axis.
globalMaxPooling1_layout = layers.GlobalMaxPooling1D(name = 'maxpooling_layout')(conv1D_layout)

dense_hidden_layout = layers.Dense(hidden_dims)(globalMaxPooling1_layout)

# As in the text branch, dropout is applied before the ReLU.
dropout_layout= layers.Dropout(0.2)(dense_hidden_layout)
activation_layout = layers.Activation("relu")(dropout_layout)



In [None]:
# Merge the text branch and the layout branch by concatenating their outputs.
# Each branch ends in a hidden_dims-wide layer, so `merged` is 2*hidden_dims wide.
merged= layers.concatenate([activation_char, activation_layout])

# Hidden layer over the merged features (same width as its input), followed
# by dropout and then ReLU.
merged_dense_hidden = layers.Dense(hidden_dims+hidden_dims)(merged)
merged_dropout= layers.Dropout(0.2)(merged_dense_hidden)
merged_activation = layers.Activation("relu")(merged_dropout)


In [None]:
# Softmax classification layer with one unit per class (num_classes).
main_output = layers.Dense(num_classes, activation='softmax', name='main_output')(merged_activation)

In [None]:
# Two-input model: character sequence + layout features -> class probabilities.
model = models.Model(inputs=[input_char, input_layout], outputs=[main_output])


In [None]:
# Train the softmax output with categorical cross-entropy using Adam, and
# track accuracy (reported as acc / val_acc during training).
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# summary() prints the layer table itself and returns None, so wrapping it in
# a print statement only added a stray "None" line (and was Python 2-only syntax).
model.summary()

In [None]:
# TensorBoard logging into model_dir: the graph is written, and histograms and
# embeddings are logged every 50 epochs. embeddings_layer_names=None leaves the
# choice of embedding layers to the callback's default.
tb_callback = callbacks.TensorBoard(log_dir=model_dir, histogram_freq=50, write_graph=True,
                          embeddings_freq=50,embeddings_layer_names=None)

In [None]:
# Checkpoint file name template; Keras fills in the epoch number and the
# validation accuracy of that epoch.
filepath=model_dir+"weights-improvement-{epoch:02d}-{val_acc:.9f}.hdf5"
# Save a checkpoint only when val_acc improves on the best value seen so far.
checkpoint = callbacks.ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
# Callback object that records the per-epoch metrics during fit.
history = callbacks.History()

In [None]:
# Feed the two named inputs by the names given to the Input layers.
train_inputs = {"input_char": x_train_char, "input_layout": x_train_layout}
test_inputs = {"input_char": x_test_char, "input_layout": x_test_layout}

# NOTE(review): the test split doubles as validation data, and the checkpoint
# callback selects on val_acc -- so scores on this split are not a clean
# held-out estimate. Confirm this is acceptable.
model.fit(
    train_inputs,
    y_train,
    validation_data=(test_inputs, y_test),
    epochs=n_epoch,
    batch_size=batch_size,
    shuffle=True,
    verbose=1,
    callbacks=[tb_callback, checkpoint, history],
)


In [None]:
# Save the model as it stands after the last epoch. The best-val_acc weights
# are stored separately by the checkpoint callback.
model.save(model_dir+"final_model")

Calculate scores for the final model.

In [None]:
# Run inference once and reuse the class probabilities (the original called
# model.predict twice on the same inputs).
probabilities = np.asarray(model.predict({"input_char":x_test_char,"input_layout":x_test_layout}))

# ROC AUC needs a continuous score for the positive class (column 1), not the
# hard 0/1 decision: scoring on argmax labels collapses the ROC curve to a
# single operating point.
score = probabilities[:, 1]
# Hard class decision. argmax on the raw probabilities gives the same labels
# as the former round-then-argmax for a two-class softmax.
predict = probabilities.argmax(1)
# True class index recovered from the 2-D target array.
targ = y_test.argmax(1)

auc = metrics.roc_auc_score(targ, score)
confusion = metrics.confusion_matrix(targ, predict)
# precision / recall / F1 use scikit-learn's defaults (binary, positive label 1).
precision = metrics.precision_score(targ, predict)
recall = metrics.recall_score(targ, predict)
f1s = metrics.f1_score(targ, predict)
kappa = metrics.cohen_kappa_score(targ, predict)

In [None]:
# The value previously reported under "Accuracy" was `auc` (ROC AUC). Compute
# the real accuracy here and report the AUC under its own label.
accuracy = metrics.accuracy_score(targ, predict)

# (label, value) pairs in report order; each one is echoed to the notebook and
# appended to the report file `f` opened earlier.
report_lines = [
    ("Accuracy", accuracy),
    ("ROC AUC", auc),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-Score", f1s),
    ("Kappa", kappa),
]
for label, value in report_lines:
    line = "%s: %s" % (label, value)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(line)
    f.write("\n" + line)

confusion_text = "Confusion Matrix: \n" + str(confusion)
print(confusion_text)
f.write("\n" + confusion_text)


In [None]:
# Single-argument print(...) matches the call form already used when building
# the layout branch and runs on both Python 2 and Python 3.
print("Done")
# Append the closing line and release the report file handle opened earlier.
f.write("\nDone: See report file for more details result")
f.close()