In [1]:
from helper import *

# Read PDF

In [5]:
def create_doc_tree(filename):
    pdf_file = filename + '.pdf'
    docx_file = filename + '.docx'
    # if docx doesn't exist
    # parse(pdf_file, docx_file)

    document = Document(docx_file)
    id_ = 0
    text = {}
    adj_list = {}
    parent = {}
    prev_titles = []
    for para in document.paragraphs:
        if para.text:
            id_ += 1
            # get size
            size = 0
            if para.style.font.size != None:
                size = para.style.font.size/12700
            for run in para.runs:
                if run.font.size:
                    size = max(size, run.font.size/12700)
            # get prev title list
            while len(prev_titles) and prev_titles[-1]['size'] <= size:
                prev_titles.pop()
            if prev_titles:
                adj_list[prev_titles[-1]['id']].append(id_)
                parent[id_] = prev_titles[-1]['id']
            else:
                parent[id_] = None
            prev_titles.append({'size': size, 'id': id_})
            adj_list[id_] = []
            text[id_] = preprocess_paragraph(para.text)
    return text, adj_list, parent

def create_kg(documents):
    subject_edges = set([])
    modifier_edges = set([])
    subject_modifier_edges = set([])
    relation_edges = []
    other_edges = set([])

    did = 0
    sid = 0
    eid = 0
    pid = 0
    tid = 0
    count = 0
    eid_to_text = {}
    text_to_eid = {}
    sid_to_text = {}
    text_to_sid = {}
    pid_to_text = {}
    text_to_pid = {}
    tid_to_text = {}
    text_to_tid = {}
    did_to_text = {}
    text_to_did = {}
    entities = set([])
    # for each document get the paragraphs and topics
    for doc in documents:
        if doc not in text_to_did:
            text_to_did[doc] = did
            did_to_text[did] = doc
            did += 1
        # read document in python
        text, adj_list, parent = create_doc_tree(doc)
        
        # traverse doc tree to identify paragraphs and topics in the document
        for node in adj_list:
            if adj_list[node]:
                if text[node] not in text_to_tid:
                    text_to_tid[text[node]] = tid
                    tid_to_text[tid] = text[node]
                    tid += 1
                if parent[node]:
                    if text[parent[node]] not in text_to_tid:
                        text_to_tid[text[parent[node]]] = tid
                        tid_to_text[tid] = text[parent[node]]
                        tid += 1
                    other_edges.add(('t_' + str(text_to_tid[text[node]]), 'about_concept', 't_' + str(text_to_tid[text[parent[node]]])))
            else:
                if text[node] not in text_to_pid:
                    text_to_pid[text[node]] = pid
                    pid_to_text[pid] = text[node]
                    pid += 1
                if text[parent[node]] not in text_to_tid:
                    text_to_tid[text[parent[node]]] = tid
                    tid_to_text[tid] = text[parent[node]]
                    tid += 1
                other_edges.add(('p_' + str(text_to_pid[text[node]]), 'about_concept', 't_' + str(text_to_tid[text[parent[node]]])))
                other_edges.add(('p_' + str(text_to_pid[text[node]]), 'from_document', 'd_' + str(text_to_did[doc])))
    
    # for each paragraph get sentences
    for pid in pid_to_text:
        sentences = split_into_sentences(pid_to_text[pid])
        for sentence in sentences:
            if sentence not in text_to_sid:
                text_to_sid[sentence] = sid
                sid_to_text[sid] = sentence
                sid += 1
            other_edges.add(('p_' + str(pid), 'contains_sentence', 's_' + str(text_to_sid[sentence])))
    
    # for each sentence get extractions
    for sid in sid_to_text:
        extractions = extract(sid_to_text[sid])
        if extractions:
            count += 1
            for extraction in extractions:
                eid_to_text[eid] = extraction
                other_edges.add(('s_' + str(sid), 'contains_extraction', 'e_' + str(eid)))
                eid += 1
        else:
            keywords = find_keywords(sid_to_text[sid])
            for keyword in keywords:
                other_edges.add(('s_' + str(sid), 'about_entity', keyword[0]))
                entities.add(keyword[0])
    
    # canonicalise the obtained extractions
    canonicalise(eid_to_text)
    
    # for each extraction create entities and relations
    for eid in eid_to_text:
        ext = eid_to_text[eid]
        
        subject_edges.add(('e_' + str(eid), 'subject', ext['subject']))
        entities.add(ext['subject'])
        
        relation_edges.append(('e_' + str(eid), ext['relation'], ext['object'], list(ext['rel_synsets'])))
        entities.add(ext['object'])
        
        for modifier in ext['modifiers']:
            modifier_edges.add(('e_' + str(eid), modifier['m_rel'], modifier['m_obj']))
            entities.add(modifier['m_obj'])
            
        for subject_modifier in ext['subject_modifiers']:
            subject_modifier_edges.add(('e_' + str(eid), subject_modifier['m_rel'], subject_modifier['m_obj']))
            entities.add(subject_modifier['m_obj'])
    
    print(did, tid, pid, sid, eid, count, round(10000 * count/sid)/100)
    
    vertices = {
        'documents': list({'id': 'd_' + str(k), 'text': did_to_text[k]} for k in did_to_text.keys()),
        'topics': list({'id': 't_' + str(k), 'text': tid_to_text[k]} for k in tid_to_text.keys()),
        'paragraphs': list({'id': 'p_' + str(k), 'text': pid_to_text[k]} for k in pid_to_text.keys()),
        'sentences': list({'id': 's_' + str(k), 'text': sid_to_text[k]} for k in sid_to_text.keys()),
        'extractions': list({'id': 'e_' + str(k), 'body': eid_to_text[k]} for k in eid_to_text.keys()),
        'entities': list(entities)
    }
    
    triples = {
        'main': list(other_edges),
        'subjects': list(subject_edges),
        'modifiers': list(modifier_edges),
        'subject_modifiers': list(subject_modifier_edges),
        'relations': list(relation_edges)
    }
    return vertices, triples

In [6]:
print(extract('Any condition arising in the B.Tech. program and not covered in the regulations shall be referred to the UG committee.'))
print(extract('I create and play games and puzzles and play sports.'))
print(extract('I create games if I have time.'))
print(extract('If I have time I create games.'))
print(extract('I have time which is used for games.'))
print(extract('Any problem shall be referred to the UG committee which may refer it to the Senate.'))
print(nlp('There will not be any late registration in the summer term and a student shall not be allowed to add a course after registration.'))
print(get_sentence_structure('After graduating from Columbia University in 1983, he worked as a community organizer in Chicago'))

[]
[{'subject': 'I', 'relation': 'create', 'object': 'games', 'modifiers': [], 'subject_modifiers': [], 'condition': None}, {'subject': 'I', 'relation': 'create', 'object': 'puzzles', 'modifiers': [], 'subject_modifiers': [], 'condition': None}, {'subject': 'I', 'relation': 'play', 'object': 'games', 'modifiers': [], 'subject_modifiers': [], 'condition': None}, {'subject': 'I', 'relation': 'play', 'object': 'puzzles', 'modifiers': [], 'subject_modifiers': [], 'condition': None}, {'subject': 'I', 'relation': 'play', 'object': 'sports', 'modifiers': [], 'subject_modifiers': [], 'condition': None}]
[{'subject': 'I', 'relation': 'create', 'object': 'games', 'modifiers': [], 'subject_modifiers': [], 'condition': 'I have time'}]
[{'subject': 'I', 'relation': 'create', 'object': 'games', 'modifiers': [], 'subject_modifiers': [], 'condition': 'I have time'}]
[{'subject': 'I', 'relation': 'have', 'object': 'time', 'modifiers': [{'m_rel': 'which is used for', 'm_obj': 'games'}], 'subject_modifie

In [7]:
triples = None
vertices = None
documents = ['../data/files/UG-Regulations']

vertices, triples = create_kg(documents)
print('did, tid, pid, sid, eid, count, percentage')

ERROR: should
ERROR: expulsion
1 60 132 290 171 118 40.69
did, tid, pid, sid, eid, count, percentage


In [9]:
write_json({'vertices': vertices, 'edges': triples}, '../data/handbook_graph.json')

# 

In [5]:
1 101 361 725 207 588 done 
1 87 300 665 202 582 done remove paras with < 5 words
1 87 284 648 202 582 done remove paras with < 6 words
1 86 264 628 199 579 done remove paras with < 7 words
2 91 307 697 241 671 done with 2 docs (+69 sentences, +42 extractions)
3 92 318 721 256 723 done with 3 docs (+24 sentences, +15 extractions)
3 92 318 721 256 717 done word sense disambiguation (-6 entities)

SyntaxError: invalid syntax (<ipython-input-5-9cd3be5a8dc6>, line 1)

In [16]:
print(nlp('when the course is replaced or repeated, the new grade will be used for computation of the CGPA'))

when the course is replaced or repeated, the new grade will be used for computation of the CGPA
