In [1]:

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter, PDFPageAggregator
from pdfminer.pdfdevice import PDFDevice
from pdfminer.pdfpage import PDFTextExtractionNotAllowed, PDFPage
from pdfminer.layout import LAParams, LTTextBox, LTLine, LTFigure, LTImage, LTRect, LTTextLine
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
from owlready2 import *
from nltk.stem.porter import *
import io
import re
import os
from nltk.stem.porter import *
import operator
from PyPDF2 import PdfFileReader
import pandas as pd




################################ INITIALIZE VARIABLES ##############################################################

base_path = 'desktop/ml/'
path = '3AC1DACD68A35562618B2A9D7B92DE841964B.pdf'
my_pdf = os.path.join(base_path+"/"+path)

#keyword frequency threshold value, for the entire document
threshold = 0.03
#keyword frequency threshold value, for each page
page_threshold = 0.005
#number of levels of the ontology to display to the user
number_of_levels = 3
class_names = ['DRE Technologies', 'DRE Policies', 'DRE Economics', 'DRE Impacts']
ontology_path = "file:///users/jadewu/downloads/root-ontology-v7.owl"
print_ontology = False
print_stemmed_ontology = False
display_frequency_matrix = False
print_matched_words = True
print_pmatches_by_page = True
print_pmatches_by_keyword = True
print_all_chains = False
print_simplified_chains = True



In [4]:

###################################### LOAD/PROCESS THE ONTOLOGY ##################################################


stemmer = PorterStemmer()
onto = get_ontology(ontology_path).load()

#store all terms in the ontology into "arr"
arr = []
for i in class_names:
    for j in onto.search(label = i)[0].instances():
        arr.append(j.label)
for m in onto.classes():
    arr.append(m.label)
    
if print_ontology:
    print("the entire ontology: ")
    print(arr)

#stem the ontology {key, value} = {stemmed word, original word}, store into "arr_stemmed_ontology"
arr_stemmed_ontology = []
for i in range(len(arr)): 
    real_label = ''       
    #CASE #1: ['preferred label', 'label 2', 'label 3', ...] (or, there are multiple labels for a term in ontology)
    if (len(arr[i]) > 1):  
        for j in range(len(arr[i])):
            is_preflabel = len(onto.search(prefLabel = arr[i][j])) 
            if (is_preflabel == 0):      
                continue
            real_label = arr[i][j]     
        if (real_label == ''):
            raise Exception(arr[i][j] + " has no preferred label")     
        for x in range(len(arr[i])):
            arr_stemmed2 = ''
            word = arr[i][x].split()
            for k in range(len(word)):
                arr_stemmed2 = arr_stemmed2 + stemmer.stem(word[k].lower()) + " "   
            arr_stemmed_ontology.append((arr_stemmed2.strip(), real_label)) 
    #CASE #2: ['only label']
    else:               
        real_label = arr[i][0]
        arr_stemmed2 = ''
        word = real_label.split()
        for k in range(len(word)):
            arr_stemmed2 = arr_stemmed2 + stemmer.stem(word[k].lower()) + " "   
        arr_stemmed_ontology.append((arr_stemmed2.strip(), " ".join(word))) 
       
    
if print_stemmed_ontology:
    print("\nstemmed instances:")
    print(arr_stemmed_ontology)




#################################### FINDING MATCHES BETWEEN PDF AND ONTOLOGY #####################################


'''CREATE A FREQUENCY MATRIX WHERE:
    ROWS = PAGE NUMBER
    COLUMNS = ONTOLOGY
    CELLS = # OF TIMES A KEYWORD IN ONTOLOGY APPEARS ON PAGE __'''


#function to process pdf text, for each page
def process_pdf(page_text):
    page_text = re.sub("\.", "", page_text)
    page_text = re.sub("[0-9]+", "", page_text)
    page_text = re.sub("-\n", "", page_text)
    page_text = page_text.lower()
    page_text = re.sub("(\W+)", " ", page_text)
    page_text = re.sub("ﬁ", "fi", page_text)
    pdf_arr = page_text.split()
    pdf_arr_stemmed = []
    for i in range(len(pdf_arr)):
        pdf_arr_stemmed.append(stemmer.stem(pdf_arr[i]))
    page_text = " ".join(pdf_arr_stemmed)
    return page_text


#initialize frequency matrix
frequency_matrix = []

fp = open(my_pdf, "rb")
parser = PDFParser(fp)
document = PDFDocument(parser)
if not document.is_extractable:
    raise PDFTextExtractionNotAllowed
rsrcmgr = PDFResourceManager()
laparams = LAParams()
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
#initialize page counter
page_number = 0
#variable to store processed text for the whole pdf document
doc_text = ""
for page in PDFPage.create_pages(document):
    #variable to store processed text for each page
    page_text = ""
    interpreter.process_page(page)
    layout = device.get_result()
    for lt_obj in layout:
        if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
            page_text += lt_obj.get_text()
    page_text = process_pdf(page_text)
    doc_text += page_text
    frequency_matrix.append([]) 
    for i in range(len(arr_stemmed_ontology)):
        key = arr_stemmed_ontology[i][0] #[0] = stemmed keyword
        value = arr_stemmed_ontology[i][1] #[1] = original keyword
        regex = key
        freq = re.findall(regex, page_text)
        frequency_matrix[page_number].append(len(freq))
    page_number += 1

#converting frequency matrix into dataframe, renaming column headers
column_headers = []
for tup in arr_stemmed_ontology:
    column_headers.append(tup[1])
frequency_matrix_df = pd.DataFrame(frequency_matrix, columns = column_headers)

#display frequency matrix
if display_frequency_matrix:
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        display(frequency_matrix_df)

        
'''DISPLAY MATCHES, ON A PAGE BY PAGE BASIS:
    FIRST DICTIONARY: pmatches_by_page: {key, value} = {page #, frequent word appeareances according to page_threshold}
    SECOND DICTIONARY: pmatches_by_keyword: {key, value} = {matched word, pages that word appears on, in order of importance}'''


pdf2 = PdfFileReader(open(my_pdf, 'rb'))
num_of_pages = pdf2.getNumPages()

'''FIRST DICTIONARY'''
pmatches_by_page = {}
for p in range(num_of_pages):
    page = pdf2.getPage(p)
    num_of_words_page = (len(page.extractText()))
    pmatches_by_page[p] = []
    if (num_of_words_page == 0):
        continue
    for x in range(len(frequency_matrix[p])):     
        if (frequency_matrix[p][x]/num_of_words_page > page_threshold):
                pmatches_by_page[p].append(arr_stemmed_ontology[x][1])

if print_pmatches_by_page:   
    print("\nMATCHES BY PAGE: ")
    for key in pmatches_by_page:
        print(str(key+1), pmatches_by_page[key])


'''FIND MATCHES BETWEEN ONTOLOGY AND THE ENTIRE DOCUMENT WHERE:
    matches_count (dictionary) = number of times each ontology word appears in the entire document
    matches_freq (dictionary) = number of times each ontology word appears in the entire document/total number of words'''

matches_count = {}
matches_freq = {}

#num_of_words = total number of relevant words in the document
num_of_words = len(doc_text.split())
for i in range(len(arr_stemmed_ontology)):
    key = arr_stemmed_ontology[i][0]
    value = arr_stemmed_ontology[i][1]
    regex = key
    #print(regex)
    freq = re.findall(regex, doc_text)
    if (len(freq)/num_of_words)*100 < threshold:
        continue 
    if value not in matches_count.keys():
        matches_count[value] = len(freq)
        matches_freq[value] = round((len(freq)/num_of_words)*100, 5) 
    else:
        matches_count[value] += len(freq)
        matches_freq[value] += round((len(freq)/num_of_words)*100, 5)

    
#Sort matches
matches_count_sorted = sorted(matches_count.items(), key=operator.itemgetter(1), reverse=True)
matches_freq_sorted = sorted(matches_freq.items(), key=operator.itemgetter(1), reverse=True)


#initialize array of matches, sorted
matches_sorted = [] 
for i in range(len(matches_freq_sorted)):
    matches_sorted.append(matches_freq_sorted[i][0])


#print matched words with count and frequency

if print_matched_words:
    print("\nMATCHED WORDS, FOR ENTIRE DOCUMENT: ")
    print("Total number of matched words: " + str(len(matches_sorted)))
    for word in matches_sorted:
        print (word + " appears "+ str(matches_count[word]) + " times with frequency " + str(matches_freq[word]))


'''SECOND DICTIONARY'''
pmatches_by_keyword = {}
for word in matches_sorted:  
    if word not in pmatches_by_keyword.keys():
        pmatches_by_keyword[word] = []
    for position, header in enumerate(frequency_matrix_df.columns.values.tolist()):
        if (header == word):
            column_num = position
    page_list = frequency_matrix_df.iloc[:,column_num].tolist()
    #print(word)
    #print(page_list)
    while(max(page_list) != 0):
        #print("while loop")
        index = page_list.index(max(page_list))
        pmatches_by_keyword[word].append(index+1)
        page_list[index] = 0
      
    
    
if print_pmatches_by_keyword:   
    print("\nMATCHES BY KEYWORD: ")
    for key in pmatches_by_keyword:
        print(key, pmatches_by_keyword[key])


###################################### FINDING PARENT CHAINS OF EACH KEYWORD ####################################


#recursive function to find the parent of a word

#word = word match found; TYPE = STRING 
def findParent(word, chain):        
    parent = onto.search(label = word)
    parent = parent[0].is_a[0]
    parent = parent.label
    if len(parent) == 0:
        return
    chain.insert(0, parent[0])
    findParent(parent[0], chain)

#finding all parent chains of all matched words

all_chains = []
for i in range(len(matches_sorted)): 
    #print(matches_sorted[i])
    parent = []
    parent_chain = [matches_sorted[i]]
    findParent(matches_sorted[i], parent_chain)
    all_chains.append(parent_chain)
    #print(parent_chain) 

if print_all_chains:
    print("\nALL CHAINS: ")
    print ("Total number of chains: " + str(len(all_chains)))
    for i in all_chains:
        print(i)
   


 ##########################################  CREATE TREE WITH PATHS   ###########################################

class Node(object):
    def __init__(self,data=''):
        self.visited = False
        self.data = data
        self.child = []
        
    def createNode(self, data):
        return Node(data)
    
    def createChildren(self,info):
        for i in range(len(info)):
            n = self.createNode(info[i])
            self.child.append(n)
    
    def add_children(self, count): 
        for i in range(len(self.child)):
            kids = set([])
            for j in range(len(all_chains)):
                if (count >= len(all_chains[j])):
                    continue
                if (count < len(all_chains[j])):    
                    if (all_chains[j][count - 1] == self.child[i].data):
                        kids.add(all_chains[j][count])
            if (len(list(kids)) == 0):
                return
            self.child[i].createChildren(list(kids))
            self.child[i].add_children(count+1)
          
    def traverse(self,local_path):
        path = []
        if (self.data is not ''):
            local_path.append(self.data)
        if len(self.child) != 0:
            for n in self.child:
                path.extend(n.traverse(local_path[:]))
        else:
            path.append(local_path)
        return path


'''TRAVERSE THROUGH THE TREE,
    PRINT OUT A SIMPLIFIED LIST OF PARENT CHAIN PATHS
    ie, A -> B -> C -> D
    [A, B, C] [A, B] -> ONLY [A, B, C] IS PRINTED'''

root = Node()

#initializing first layer of tree
first_level = set([])
for i in range(len(all_chains)):
    first_level.add(all_chains[i][0])
root.createChildren(list(first_level))

#create the rest of the tree
root.add_children(1)

#traverse through the paths of the tree
path = root.traverse([])
if print_simplified_chains:
    print("\nSIMPLIFIED CHAINS: ")
    print ("Total number of chains: "+str(len(path)))
    for i in path:
        print(i[:number_of_levels])


    




MATCHES BY PAGE: 
1 ['Health', 'Affordability']
2 ['Education', 'Wind']
3 ['Private', 'Wind']
4 []
5 ['Tools', 'Market', 'Cost']
6 ['Wind', 'Cost']
7 ['Investment', 'Cost']
8 ['Subsidization', 'Affordability', 'Cost']
9 ['Benefits', 'Education', 'Market', 'Cost']
10 ['Utility', 'Innovations', 'Benefits', 'Education', 'Investment', 'Market', 'Cost', 'Customer', 'Financing']
11 ['Wind', 'Cost']
12 ['Benefits', 'Wind', 'Market', 'Financing']

MATCHED WORDS, FOR ENTIRE DOCUMENT: 
Total number of matched words: 18
Cost appears 60 times with frequency 0.69029
Market appears 22 times with frequency 0.25311
Education appears 16 times with frequency 0.18408
Wind appears 16 times with frequency 0.18408
Benefits appears 15 times with frequency 0.17257
Investment appears 14 times with frequency 0.16107
Affordability appears 10 times with frequency 0.11505
Financing appears 10 times with frequency 0.11505
Innovations appears 8 times with frequency 0.09204
Subsidization appears 8 times with frequen