In [2]:
from owlready2 import *
from nltk.stem.porter import *
import io
import re
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from owlready2 import *
from nltk.stem.porter import *
import operator

########################################################VARIABLES#####################################################
base_path = 'desktop/OneDrive_2018-05-27/ml/'
path = 'A9305D8D85DB11365CB8B4F7EC8F65951C0DD97.pdf'
my_pdf = os.path.join(base_path+"/"+path)
threshold = 0.03
number_of_levels = 3
class_names = ['DRE Technologies', 'DRE Policies', 'DRE Economics', 'DRE Impacts']
ontology_path = "file:///users/brookeerickson/downloads/root-ontology-v7.owl"
print_ontology = True
print_stems = False

def convert_pdf_to_txt(path):
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    maxpages = 0
    caching = True
    pagenos = set()
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,caching=caching,check_extractable=True):
        interpreter.process_page(page)
    text = retstr.getvalue()
    fp.close()
    device.close()
    retstr.close()
    return text

#stemming 
stemmer = PorterStemmer()
pdf = convert_pdf_to_txt(my_pdf)

#pre-processing text
pdf = re.sub("\.", "", pdf)
pdf = re.sub("[0-9]+", "", pdf)
pdf = re.sub("-\n", "", pdf)
pdf = pdf.lower()
pdf = re.sub("(\W+)", " ", pdf)
pdf = re.sub("ﬁ", "fi", pdf)
pdf_arr = pdf.split()


#number of words after stemming
num_of_words = len(pdf_arr)
print("# of words in pdf: " + str(num_of_words))

#join stemmed list into one string
pdf_arr_stemmed = []
for i in range(len(pdf_arr)):
    pdf_arr_stemmed.append(stemmer.stem(pdf_arr[i]))
pdf_string = " ".join(pdf_arr_stemmed)

###################################################FINDING MATCHES###############################################



stemmer = PorterStemmer()
onto = get_ontology(ontology_path).load()
    
##SAMPLE RECURSION
'''test = 'Private'
print(len(onto.search(label = test)))
parent = (onto.search(label = test)[0].is_a)[0].label
print(parent)
grandparent = (onto.search(label = parent)[0].is_a)[0].label
print(grandparent)
grandgrandparent = (onto.search(label = grandparent)[0].is_a)[0].label
print(grandgrandparent)'''



#########PRINTING ONTOLOGY
if print_ontology:
    arr = []
    for i in class_names:
        for j in onto.search(label = i)[0].instances():
        #for j in i.instances():
            arr.append(j.label)

    for m in onto.classes():
        arr.append(m.label)
    print('\neverything:')
    print(arr) ##list of synonym lists




##########STEMMING ONTOLOGY
print()
arr_stemmed_ontology = []
for i in range(len(arr)): ##iterate through each element in arr
    real_label = ''       ##initialize
    if (len(arr[i]) > 1):  ##CASE #1: ['preferred label', 'label 2', 'label 3', ...]
        for j in range(len(arr[i])): ## loop through the synonyms list
            is_preflabel = len(onto.search(prefLabel = arr[i][j])) 
            if (is_preflabel == 0):      ##if preferred label is the one being used
                continue
            real_label = arr[i][j]       ## else
        if (real_label == ''):
            raise Exception(arr[i][j] + " has no preferred label")     
        for x in range(len(arr[i])):
            arr_stemmed2 = ''
            word = arr[i][x].split()
            for k in range(len(word)):
                arr_stemmed2 = arr_stemmed2 + stemmer.stem(word[k].lower()) + " "   
            arr_stemmed_ontology.append((arr_stemmed2.strip(), real_label)) 
    else:               ##CASE #2: ['only label']
        real_label = arr[i][0]
        arr_stemmed2 = ''
        word = real_label.split()
        for k in range(len(word)):
            arr_stemmed2 = arr_stemmed2 + stemmer.stem(word[k].lower()) + " "   
        arr_stemmed_ontology.append((arr_stemmed2.strip(), " ".join(word))) 
if print_stems:
    print("\nstemmed instances:")
    print(arr_stemmed_ontology)




#MATCHING PDF DOC TO ONTOLOGY
matches_count = {}
matches_freq = {}
for i in range(len(arr_stemmed_ontology)):
    key = arr_stemmed_ontology[i][0]
    value = arr_stemmed_ontology[i][1]
    regex = key
    #print(regex)
    freq = re.findall(regex, pdf_string)
    if (len(freq)/num_of_words)*100 < threshold:
        continue 
    if value not in matches_count.keys():
        matches_count[value] = len(freq)
        matches_freq[value] = round((len(freq)/num_of_words)*100, 5) 
    else:
        matches_count[value] += len(freq)
        matches_freq[value] += round((len(freq)/num_of_words)*100, 5)

    
#SORT MATCHES
matches_count_sorted = sorted(matches_count.items(), key=operator.itemgetter(1), reverse=True)
matches_freq_sorted = sorted(matches_freq.items(), key=operator.itemgetter(1), reverse=True)

#print("\nmatches (count)")
#print(matches_count_sorted)
#print("\nmatches (frequency) IN PERCENT")
#print(matches_freq_sorted)

#RECURSIVELY FIND PARENTS ...


#INITIALIZE ARRAY OF MATCHES
matches_sorted = [] #a list of strings
for i in range(len(matches_freq_sorted)):
    matches_sorted.append(matches_freq_sorted[i][0])

#print("\nWord matches: ")
#print(matches_sorted)


#PRINT MATCHES
def print_matches(matches):
    for word in matches:
        print (word + " appears "+ str(matches_count[word]) + " times with frequency " + str(matches_freq[word]))
print_matches(matches_sorted)

#RECURSIVE FUNCTION
def findParent(word, chain):        #word = word match found; TYPE = STRING 
    parent = onto.search(label = word)
    parent = parent[0].is_a[0]
    parent = parent.label
    if len(parent) == 0:
        return
    chain.insert(0, parent[0])
    findParent(parent[0], chain)

#FINDING ALL PARENT CHAINS OF MATCHED WORDS
#print("\nParent chains for each match: ")
all_chains = []
for i in range(len(matches_sorted)): 
    #print(matches_sorted[i])
    parent = []
    parent_chain = [matches_sorted[i]]
    findParent(matches_sorted[i], parent_chain)
    all_chains.append(parent_chain)
    #print(parent_chain) 
#print("all chains: ")
#print(all_chains)
#print (len(all_chains))


################################################CREATE TREE WITH PATHS################################################
class Node(object):
    def __init__(self,data=''):
        self.visited = False
        self.data = data
        self.child = []
        
    def createNode(self, data):
        return Node(data)
    
    def createChildren(self,info):
        for i in range(len(info)):
            n = self.createNode(info[i])
            self.child.append(n)
    
    def add_children(self, count): 
        for i in range(len(self.child)):
            kids = set([])
            for j in range(len(all_chains)):
                if (count >= len(all_chains[j])):
                    continue
                if (count < len(all_chains[j])):    
                    if (all_chains[j][count - 1] == self.child[i].data):
                        kids.add(all_chains[j][count])
            if (len(list(kids)) == 0):
                return
            self.child[i].createChildren(list(kids))
            self.child[i].add_children(count+1)
          
    def traverse(self,local_path):
        path = []
        if (self.data is not ''):
            local_path.append(self.data)
        if len(self.child) != 0:
            for n in self.child:
                path.extend(n.traverse(local_path[:]))
        else:
            path.append(local_path)
        return path
    

root = Node()

##initializing first layer of tree
first_level = set([])
for i in range(len(all_chains)):
    first_level.add(all_chains[i][0])
root.createChildren(list(first_level))

##create the rest of the tree
root.add_children(1)

##traverse through the paths of the tree
path = root.traverse([])
print ("\nNumber of total paths: "+str(len(path)))
for i in path:
    print(i[:number_of_levels])


    

# of words in pdf: 13447

everything:
[['Cooking Fuels'], ['Small Scale'], ['Carpentry'], ['Welding'], ['Kerosene Stoves'], ['Alcohol Stoves'], ['Plant Oil Cookers'], ['Grid Interconnection Legislation'], ['Standards Agencies'], ['Minimum Quality Standards'], ['Licensing and Permitting'], ['Inspection Requirements'], ['Engines'], ['Solar Home Systems'], ['Solar Lighting Kits'], ['Biodiesel Water Pumps'], ['Solar Dryers'], ['Solar Thermal'], ['Dung-burning Cook stoves'], ['Coal Stoves'], ['Consumer Confidence'], ['Impact of non-certified products', 'non certified', 'non-certified'], ['LPG Stoves'], ['Biogas Stoves'], ['Gasifier Stoves'], ['Cooler'], ['Miller'], ['Grinder'], ['Fodder chopper'], ['Dryer'], ['Thresher'], ['Pumps'], ['Electric Razors'], ['Project Implementation'], ['Technical Training'], ['System Operations and Maintenance'], ['DC Optimizers'], ['Energy Storage Technology'], ['Hybrid Model'], ['Utility'], ['Personal', 'Private'], ['Cold Storage'], ['Commercial lighting'], [

In [3]:
number_of_levels = 5

class Node(object):
    def __init__(self,data=''):
        self.visited = False
        self.data = data
        self.child = []
        
    def createNode(self, data):
        return Node(data)
    
    def createChildren(self,info):
        for i in range(len(info)):
            n = self.createNode(info[i])
            self.child.append(n)
    
    def add_children(self, count): 
        for i in range(len(self.child)):
            kids = set([])
            for j in range(len(all_chains)):
                if (count >= len(all_chains[j])):
                    continue
                if (count < len(all_chains[j])):    
                    if (all_chains[j][count - 1] == self.child[i].data):
                        kids.add(all_chains[j][count])
            if (len(list(kids)) == 0):
                return
            self.child[i].createChildren(list(kids))
            self.child[i].add_children(count+1)
          
    def traverse(self,local_path):
        path = []
        if (self.data is not ''):
            local_path.append(self.data)
        if len(self.child) != 0:
            for n in self.child:
                path.extend(n.traverse(local_path[:]))
        else:
            path.append(local_path)
        return path
    

root = Node()

##initializing first layer of tree
first_level = set([])
for i in range(len(all_chains)):
    first_level.add(all_chains[i][0])
root.createChildren(list(first_level))

##create the rest of the tree
root.add_children(1)

##traverse through the paths of the tree
path = root.traverse([])
print ("number of total paths: "+str(len(path)))
for i in path:
    print(i[:number_of_levels])


number of total paths: 12
['DRE Technologies', 'Productive Use of Electrcity Appliances', 'ICT']
['DRE Technologies', 'Stand-alone Systems', 'Solar Home Systems']
['DRE Technologies', 'Mini-Grid Technology', 'Hydropower', 'Pico']
['DRE Technologies', 'Asset Ownership Models', 'Devices and Home Systems', 'Third Party Lease', 'Innovations']
['DRE Impacts', 'Socio-economic impacts', 'Affordability']
['DRE Impacts', 'Socio-economic impacts', 'Employment']
['DRE Impacts', 'Impact Evaluation', 'Quantitative Impact Evaluation', 'Quantitative Methods', 'Sampling']
['DRE Impacts', 'Impact Evaluation', 'Tools']
['DRE Impacts', 'Impact Evaluation', 'Qualitative Impact Evaluation']
['DRE Impacts', 'Social Impacts', 'Education']
['DRE Economics', 'Market', 'Customer']
['DRE Economics', 'Investment', 'Financing', 'Government Finance', 'Credits']


In [None]:
##SAMPLE RECURSION
test = 'Personal'
print("original word: " + test)
parent = onto.search(label = "Mini-Grid/Mini-Utility")[0]
print(type(parent))
child = parent.instances()
#for i in child:
    #print("instances of parent: ")
#    print(i.annotation_properties())

print(onto.search(prefLabel = "Loans"))
#for i in onto.annotation_properties():
#    print(type(i))
#    print(i)
'''for j in range(len(i.label)):
        if (test == i.label[j]):
            continue
        print(i.label[j])'''


'''inst = onto.search(label = "DRE Technologies")[0].instances()
for i in inst:
    print(i.label)
print(inst)

indi_len = len(onto.search(label = "Asset Ownership Models")[0].individuals())
print("instances: " + str(inst_len))
print("ind: " + str(indi_len))'''

'''print(parent_obj[0].type)
print(type(parent_obj))
print(parent_obj.individuals())
tmp = onto.search(label = parent.classes())[1].label
print(tmp)'''



In [None]:
for i in onto.classes():
    print(i.label)
    for j in i.instances():
        print(j.label)