Developing a deep neural network for the section header dataset

In [1]:
from sklearn import metrics, cross_validation
import tensorflow as tf
import skflow
import csv
from collections import defaultdict
import numpy as np
import sys
from sklearn.externals import joblib
import json
import re
import nltk
from collections import Counter
import codecs

In [2]:
# Input CSV of labelled feature rows used for training and the held-out test split.
# Alternative input file (disabled):
#datafile= "data_for_weka_all.csv"
datafile= "data_for_weka.csv"
# Second CSV with the same columns, loaded separately as the acceptance-test set.
acceptance_file= "acceptance_test_data_for_weka_03_02_16.csv"
# Number of times each row of `datafile` is appended to the dataset (1 = no duplication).
duplicate_samples=1


In [3]:
# Convert the section header feature CSVs into the dict layout used for the deep neural network.

# Column order of every feature vector; shared by the training and the acceptance set.
FEATURE_NAMES = ['pos_nnp', 'number_dot', 'header_2', 'seq_number', 'all_upper', 'header_1', 'at_least_3_lines_upper', 'text_len_group', 'font_weight', 'higher_line_space', 'header_0', 'bold_italic', 'colon', 'title_case', 'without_verb_higher_line_space']


def load_feature_csv(path, repeat=1):
    """Read a feature CSV into a dataset dictionary.

    path   -- CSV file with a header row containing a 'class' column and one
              column per entry of FEATURE_NAMES; every value is parsed with int().
    repeat -- how many times each row is appended (1 keeps the file as is).

    Returns a defaultdict (missing keys yield None) with the keys
    'target_names', 'feature_names', 'target' and 'data'.
    Raises KeyError for a missing column and ValueError for a non-integer value.
    """
    dataset = defaultdict(lambda : None)
    # Index in this list == label value: the later cells report count(0) as
    # negative and count(1) as positive, so 'no' has to come first.
    dataset['target_names'] = ['no', 'yes']
    dataset['feature_names'] = list(FEATURE_NAMES)
    dataset['target'] = []
    dataset['data'] = []

    # 'rb' is the mode the Python 2 csv module expects.
    with open(path, 'rb') as csvfile:
        for row in csv.DictReader(csvfile):
            label = int(row['class'])
            features = [int(row[name]) for name in FEATURE_NAMES]
            for _ in range(repeat):
                dataset['target'].append(label)
                # A fresh list per copy so duplicated samples never share storage.
                dataset['data'].append(list(features))
    return dataset


sh_dataset = load_feature_csv(datafile, repeat=duplicate_samples)

# for acceptance test
sh_acceptance = load_feature_csv(acceptance_file)


        
        

In [4]:
# Hold out 20% of the labelled rows as a test set; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    sh_dataset['data'],
    sh_dataset['target'],
    test_size=0.2,
    random_state=42)

# Sizes of the full set and of both partitions.
print("Total samples:  {0}".format(len(sh_dataset['data'])))
print("Training samples:  {0}".format(len(X_train)))
print("Test samples:  {0}".format(len(X_test)))

# Class balance over both partitions (label 0 = negative, label 1 = positive).
print("Total negative samples:  {0}".format(y_train.count(0) + y_test.count(0)))
print("Total positive samples:  {0}".format(y_train.count(1) + y_test.count(1)))

# Convert the four partitions to float64 numpy arrays.
X_train, X_test, y_train, y_test = (
    np.array(part, dtype='float64')
    for part in (X_train, X_test, y_train, y_test))

# Acceptance test data converted to numpy arrays the same way.
X_acceptance = np.array(sh_acceptance['data'], dtype='float64')
y_acceptance = np.array(sh_acceptance['target'], dtype='float64')


Total samples:  476
Training samples:  380
Test samples:  96
Total negative samples:  238
Total positive samples:  238


In [5]:
# Sanity check: show the training feature matrix followed by its shape tuple.
print X_train, X_train.shape

[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 1.  0.  1. ...,  0.  1.  0.]
 [ 1.  0.  1. ...,  0.  0.  1.]
 ..., 
 [ 0.  1.  0. ...,  0.  0.  0.]
 [ 0.  0.  1. ...,  0.  0.  1.]
 [ 1.  0.  0. ...,  0.  0.  0.]] (380, 15)


The following cell creates a network of five fully connected hidden layers with 100, 100, 200, 100 and 100 units respectively, using the default rectified linear unit (ReLU) activations.

In [6]:
# Build a DNN classifier with five hidden layers of 100, 100, 200, 100 and 100 units
# for the two-class (header / not header) problem.
classifier = skflow.TensorFlowDNNClassifier(hidden_units=[100, 100, 200, 100, 100],
    n_classes=2, steps=10000,learning_rate=0.01)


In [7]:
# Fit on the training split, then evaluate on the held-out test split and on
# the separate acceptance set.
classifier.fit(X_train, y_train,logdir='result_dnn_section_header')


def print_evaluation(y_true, y_pred):
    """Print the classification report and confusion matrix for one data set.

    y_true -- reference labels.
    y_pred -- labels predicted by the classifier for the same samples.

    Returns the confusion matrix so the caller can keep it.
    """
    print(metrics.classification_report(y_true, y_pred))

    # Printing the confusion matrix
    print("Confusion Matrix")
    matrix = metrics.confusion_matrix(y_true, y_pred)
    print(matrix)
    return matrix


# Predict once and reuse the result for both the accuracy and the detailed report.
predicted = classifier.predict(X_test)
score = metrics.accuracy_score(y_test, predicted)
print('Accuracy: {0:f}'.format(score))

print("\n\nMore details:")
cm = print_evaluation(y_test, predicted)


# Predicting based on acceptance dataset
print("\n\nFor Acceptance test:")
predicted = classifier.predict(X_acceptance)
cm = print_evaluation(y_acceptance, predicted)




Step #100, epoch #8, avg. train loss: 0.61114
Step #200, epoch #16, avg. train loss: 0.38963
Step #300, epoch #25, avg. train loss: 0.31890
Step #400, epoch #33, avg. train loss: 0.28694
Step #500, epoch #41, avg. train loss: 0.26736
Step #600, epoch #50, avg. train loss: 0.24907
Step #700, epoch #58, avg. train loss: 0.23929
Step #800, epoch #66, avg. train loss: 0.22402
Step #900, epoch #75, avg. train loss: 0.21772
Step #1000, epoch #83, avg. train loss: 0.20859
Step #1100, epoch #91, avg. train loss: 0.20242
Step #1200, epoch #100, avg. train loss: 0.19384
Step #1300, epoch #108, avg. train loss: 0.18737
Step #1400, epoch #116, avg. train loss: 0.17752
Step #1500, epoch #125, avg. train loss: 0.17211
Step #1600, epoch #133, avg. train loss: 0.16177
Step #1700, epoch #141, avg. train loss: 0.16331
Step #1800, epoch #150, avg. train loss: 0.15031
Step #1900, epoch #158, avg. train loss: 0.14823
Step #2000, epoch #166, avg. train loss: 0.14174
Step #2100, epoch #175, avg. train loss: 

Process the raw annotated input data for the classifier. The following cell computes the feature fields the deep learning classifier expects from each text line.
The resulting dataset is used to test the trained classifier.

In [8]:
class Bunch(dict):
    """Dataset container: a dict whose keys can also be read and written
    as attributes (``bunch.data`` is the same object as ``bunch['data']``)."""
    def __init__(self, **kwargs):
        super(Bunch, self).__init__()
        self.update(kwargs)
        # Use the dict itself as the attribute namespace so both views stay in sync.
        self.__dict__ = self

def text_delexicalization(text):
    """Replace numbering tokens in ``text`` with the placeholder ``"#number "``.

    Two kinds of token are replaced (letters are matched case-insensitively):

    * a number such as ``1``, ``2.3`` or ``4.`` at the very start of the string;
    * any run of letters followed by a period and one whitespace character,
      anywhere in the string -- the ``^`` anchor binds only to the first
      alternative of the pattern.

    Text following a replaced token is kept untouched, so a leading ``"1. "``
    becomes ``"#number  "`` (the original space survives after the placeholder).

    Returns the rewritten string.
    """
    placeholder = "#number "
    # Raw string literal: the pattern is unchanged, but '\d', '\.' and '\s' are no
    # longer invalid escape sequences of an ordinary string literal.
    numbering = re.compile(r'^(\d+(\.\d+)*(\.)?)|([a-z]+\.\s)', re.IGNORECASE)
    return numbering.sub(placeholder, text)

def _same_page(lines, first, second):
    """True when the rows at indexes ``first`` and ``second`` carry the same 'page-number'."""
    return lines[first]["page-number"] == lines[second]["page-number"]


def _line_gap(lines, first, second):
    """Absolute difference between the 'y-pos-l' values of two rows."""
    return abs(float(lines[first]["y-pos-l"]) - float(lines[second]["y-pos-l"]))


def _has_higher_line_space(lines, idx, avg_line_space, minimum_line_space):
    """Return 1 when row ``idx`` is separated from its neighbours by extra space, else 0.

    * interior row: previous, current and next row must share a page; the gap
      to the next row must exceed ``avg_line_space`` and the gap to the previous
      row must exceed ``minimum_line_space``;
    * last row: same page as the previous row and gap to it above the average;
    * first row: same page as the next row and gap to it above the average.
    """
    last = len(lines) - 1
    if 0 < idx < last:
        if _same_page(lines, idx - 1, idx) and _same_page(lines, idx, idx + 1):
            if (_line_gap(lines, idx, idx + 1) > avg_line_space
                    and _line_gap(lines, idx - 1, idx) > minimum_line_space):
                return 1
    elif idx > 0:
        if _same_page(lines, idx - 1, idx):
            if _line_gap(lines, idx - 1, idx) > avg_line_space:
                return 1
    elif idx < last:
        if _same_page(lines, idx, idx + 1):
            if _line_gap(lines, idx, idx + 1) > avg_line_space:
                return 1
    return 0


def generate_dataset(ann_file, max_line_gap=50.0):
    """Build the 15-feature section header dataset from an annotated CSV file.

    ann_file     -- path of a CSV file with one row per text line and the columns
                    'file_name', 'text', 'class', 'font_size', 'font-weight',
                    'font-family', 'page-number' and 'y-pos-l'.
    max_line_gap -- vertical gaps larger than this are left out of the per-file
                    line spacing statistics (default 50, the former constant).

    Rows are grouped by 'file_name'; the spacing statistics and the most
    frequent font size / weight are computed per file and every row is then
    turned into one feature vector ordered like ``feature_names``.

    Returns a Bunch with ``data`` (feature vectors), ``target`` (1 when 'class'
    is "yes", else 0), ``feature_names``, ``target_names``, ``rawtext``
    (delexicalized text), ``no_delex_rawtext`` (original text) and
    ``filenames`` (source file name of each row).

    Written for Python 2: CSV fields are byte strings and are decoded as UTF-8
    where unicode text is needed.
    """
    target_names = ['no','yes']
    feature_names = ['pos_nnp', 'number_dot', 'header_2', 'seq_number', 'all_upper', 'header_1', 'at_least_3_lines_upper', 'text_len_group', 'font_weight', 'higher_line_space', 'header_0', 'bold_italic', 'colon', 'title_case', 'without_verb_higher_line_space']
    auxiliary_verb = ["is","was","were","am","are","may","might","be","will","shall","should","must","need","have","can","could","ought","would"]

    target = []
    data = []
    rawtext = []
    no_delex_rawtext = []
    file_names = []

    # Group the CSV rows by the document they belong to.
    all_json_objs = {}
    with open(ann_file, 'rb') as csvfile:
        for row in csv.DictReader(csvfile):
            all_json_objs.setdefault(row['file_name'], []).append(row)

    for file_name, file_lines in all_json_objs.items():
        print("processing file " + file_name)
        n_lines = len(file_lines)

        # Font values arrive as CSV strings; convert once so that every later
        # comparison is numeric instead of lexicographic ("9" > "12" as strings).
        font_sizes = [float(line['font_size']) for line in file_lines]
        font_weights = [float(line['font-weight']) for line in file_lines]

        # Line spacing statistics over consecutive rows on the same page.
        # Iterating by index keeps the position in step with the row even when
        # an oversized gap is skipped.
        total_gap = 0.0
        minimum_line_space = 100.0
        counted_lines = 0
        for idx in range(n_lines - 1):
            if not _same_page(file_lines, idx, idx + 1):
                continue
            gap = _line_gap(file_lines, idx, idx + 1)
            if gap > max_line_gap:
                continue
            total_gap += gap
            if gap < minimum_line_space:
                minimum_line_space = gap
            counted_lines += 1
        avg_line_space = total_gap / counted_lines if counted_lines != 0 else 0.0

        # The most frequent font size / weight of the file is the baseline that
        # header candidates are compared against.
        modal_font_size = Counter(font_sizes).most_common(1)[0][0]
        modal_font_weight = Counter(font_weights).most_common(1)[0][0]

        for idx, line in enumerate(file_lines):
            raw_text = line["text"]
            unicode_text = raw_text.decode('utf-8')
            words = unicode_text.split(" ")
            font_family = line["font-family"].lower()
            font_size = font_sizes[idx]
            font_weight = font_weights[idx]

            # Length bucket: 1 (< 4 tokens), 2 (< 6 tokens), 3 (longer).
            token_count = len(raw_text.split())
            if token_count < 4:
                text_len_group = 1
            elif token_count < 6:
                text_len_group = 2
            else:
                text_len_group = 3

            colon = 1 if ":" in unicode_text else 0

            # Starts with a number or a single letter followed by a period.
            number_dot = 1 if re.match(r"((\d+|[a-z])\s?\.)", raw_text, re.IGNORECASE) else 0

            # Starts with a decimal or roman-numeral style sequence number.
            # NOTE(review): with IGNORECASE any text starting with one of the
            # letters M, D, C, L, X, V, I matches; kept unchanged -- confirm intent.
            seq_number = 1 if re.match(r"(\d+|(([MDCLXVI])M*(C[MD]|D?C*)(X[CL]|L?X*)(I[XV]|V?I*)))(\s|\.|\))?\d*", raw_text, re.IGNORECASE) else 0

            # Case features.
            all_upper = 1 if raw_text.isupper() else 0
            at_least_3_lines_upper = 0
            if all_upper and 0 < idx < n_lines - 1:
                if file_lines[idx - 1]["text"].isupper() and file_lines[idx + 1]["text"].isupper():
                    at_least_3_lines_upper = 1

            title_words = sum(1 for word in words if word.istitle())
            title_case = 1 if title_words / float(len(words)) > 0.50 else 0

            # Exact, case-sensitive match of a whole space-separated word.
            has_auxiliary_verb = any(verb in words for verb in auxiliary_verb)

            # Line spacing feature, and the same feature restricted to rows
            # that contain no auxiliary verb.
            higher_line_space = _has_higher_line_space(file_lines, idx, avg_line_space, minimum_line_space)
            without_verb_higher_line_space = 0 if has_auxiliary_verb else higher_line_space

            # Font features relative to the file's most frequent size / weight.
            header_0 = 1 if font_size > modal_font_size else 0
            header_1 = 1 if font_size >= modal_font_size and font_weight > modal_font_weight else 0
            header_2 = 1 if font_size >= modal_font_size and "bold" in font_family else 0
            font_weight_flag = 1 if font_weight > modal_font_weight else 0
            bold_italic = 1 if "bold" in font_family and "italic" in font_family else 0

            # POS tagging: set when more than half of the tags are NN or NNP.
            tokens = nltk.word_tokenize(unicode_text)
            tags = nltk.pos_tag(nltk.Text(tokens))
            tag_counts = Counter(tag for _, tag in tags)
            total_tags = sum(tag_counts.values())
            # Integer form of (NNP + NN) / total > 0.5; a missing tag counts as 0.
            pos_nnp = 1 if 2 * (tag_counts["NNP"] + tag_counts["NN"]) > total_tags else 0

            target.append(1 if line['class'] == "yes" else 0)
            # Same order as feature_names.
            data.append([pos_nnp, number_dot, header_2, seq_number, all_upper, header_1, at_least_3_lines_upper, text_len_group, font_weight_flag, higher_line_space, header_0, bold_italic, colon, title_case, without_verb_higher_line_space])
            rawtext.append(text_delexicalization(raw_text))
            no_delex_rawtext.append(raw_text)
            file_names.append(file_name)

    return Bunch(data=data, feature_names=feature_names,target_names=target_names,target=target,rawtext=rawtext,no_delex_rawtext=no_delex_rawtext,filenames=file_names)
    

In [9]:
# Compute the classifier features for every row of the annotated test CSV;
# generate_dataset prints one "processing file" line per distinct 'file_name'.
test_dataset = generate_dataset("testset_acrobat_section_header.csv")

processing file 47c96cb39cc465d14c3cf90f7cae0014f1cc285a.file.pdf.tetml
processing file 565ad0cb1ff1baa09c498f981f697b1472c9f118.pdf.tetml
processing file bb1dfc701ed1df96de2f0b46cecd65c181007e68.file.pdf.tetml
processing file 834851bfb54f2b09aae586450df167ec4cca66e0.file.pdf.tetml
processing file 9cb270020c595cd07108750c9321bd6123dba8d9.pdf.tetml
processing file b2fb634f43889e936ad39ce5e4562ceff84e5604.file.pdf.tetml
processing file 151d93b34c6fc7b663d88e6ecf585f6e0054921b.pdf.tetml
processing file 1392203bd3e5ecc516bc30fb53dc55c672d53179.pdf.tetml
processing file 51c69584da480798a6218f47d8fa18339d561173.pdf.tetml


In [10]:
print("Feature length:  {0}".format(len(test_dataset.data[0])))
print("Feature names:  {0}".format(test_dataset.feature_names))

# for test data
sh_test = defaultdict(lambda : None)
sh_test['target_names'] =['no','yes']
sh_test['feature_names'] = list(test_dataset.feature_names)
sh_test['target'] =[]
sh_test['data'] = list(test_dataset.data)

# Generated test data converted to a numpy array. It gets its own name so the
# X_test produced by the train/test split (and paired with y_test) is not
# overwritten; otherwise re-running the evaluation cell would mix both sets.
X_section_test = np.array(sh_test['data'],dtype='float64')

# Predicting section headers for the generated test dataset
print("\n\nTesting the classifier:")
predicted = classifier.predict(X_section_test)

# Write every line predicted as a header (label 1), grouped under the name of
# its source document. The with-block closes the file even if a write fails.
unique_file_list=[]
with codecs.open("result_section_header_deeplearning.txt", "w",encoding="utf-8") as out_file:
    for i in range(len(predicted)):
        if predicted[i] != 1:
            continue
        document_name = test_dataset.filenames[i].split(".tetml")[0]
        if document_name not in unique_file_list:
            # First header found for this document: emit the document banner.
            unique_file_list.append(document_name)
            out_file.write("\n\n")
            out_file.write(document_name)
            out_file.write("\n")
            out_file.write("======================================================")
            out_file.write("\n")
        out_file.write(test_dataset.no_delex_rawtext[i].decode('utf-8'))
        out_file.write("\n")

print("Done")

Feature length:  15
Feature names:  ['pos_nnp', 'number_dot', 'header_2', 'seq_number', 'all_upper', 'header_1', 'at_least_3_lines_upper', 'text_len_group', 'font_weight', 'higher_line_space', 'header_0', 'bold_italic', 'colon', 'title_case', 'without_verb_higher_line_space']


Testing the classifier:
Done
