In [1]:
from helper import *

# Read PDF

In [7]:
def create_doc_tree(filename):
    pdf_file = filename + '.pdf'
    docx_file = filename + '.docx'
    # if docx doesn't exist
    # parse(pdf_file, docx_file)

    document = Document(docx_file)
    id_ = 0
    text = {}
    adj_list = {}
    parent = {}
    prev_titles = []
    for para in document.paragraphs:
        if para.text:
            id_ += 1
            # get size
            size = 0
            if para.style.font.size != None:
                size = para.style.font.size/12700
            for run in para.runs:
                if run.font.size:
                    size = max(size, run.font.size/12700)
            # get prev title list
            while len(prev_titles) and prev_titles[-1]['size'] <= size:
                prev_titles.pop()
            if prev_titles:
                adj_list[prev_titles[-1]['id']].append(id_)
                parent[id_] = prev_titles[-1]['id']
            else:
                parent[id_] = None
            prev_titles.append({'size': size, 'id': id_})
            adj_list[id_] = []
            text[id_] = preprocess_paragraph(para.text)
    return text, adj_list, parent

def create_kg(documents):
    subject_edges = set([])
    modifier_edges = set([])
    subject_modifier_edges = set([])
    relation_edges = []
    other_edges = set([])

    did = 0
    tid = 10000
    pid = 20000
    sid = 30000
    eid = 40000
    xid = 50000
    count = 0
    eid_to_text = {}
    text_to_eid = {}
    sid_to_text = {}
    text_to_sid = {}
    pid_to_text = {}
    text_to_pid = {}
    tid_to_text = {}
    text_to_tid = {}
    did_to_text = {}
    text_to_did = {}
    xid_to_text = {}
    text_to_xid = {}
    # for each document get the paragraphs and topics
    for doc in documents:
        if doc not in text_to_did:
            text_to_did[doc] = did
            did_to_text[did] = doc
            did += 1
        # read document in python
        text, adj_list, parent = create_doc_tree(doc)
        
        # traverse doc tree to identify paragraphs and topics in the document
        for node in adj_list:
            if adj_list[node]:
                if text[node] not in text_to_tid:
                    text_to_tid[text[node]] = tid
                    tid_to_text[tid] = text[node]
                    tid += 1
                if parent[node]:
                    if text[parent[node]] not in text_to_tid:
                        text_to_tid[text[parent[node]]] = tid
                        tid_to_text[tid] = text[parent[node]]
                        tid += 1
                    other_edges.add((text_to_tid[text[node]], 'about_concept', text_to_tid[text[parent[node]]]))
            else:
                if text[node] not in text_to_pid:
                    text_to_pid[text[node]] = pid
                    pid_to_text[pid] = text[node]
                    pid += 1
                if text[parent[node]] not in text_to_tid:
                    text_to_tid[text[parent[node]]] = tid
                    tid_to_text[tid] = text[parent[node]]
                    tid += 1
                other_edges.add((text_to_pid[text[node]], 'about_concept', text_to_tid[text[parent[node]]]))
                other_edges.add((text_to_pid[text[node]], 'from_document', text_to_did[doc]))
    
    # convert topics to lower case
    for tid in tid_to_text:
        tid_to_text[tid] = {
            'text': tid_to_text[tid].lower(),
            'keywords': [k[0] for k in find_keywords(tid_to_text[tid].lower())],
            'tags': list(set([tag for k in find_keywords(tid_to_text[tid].lower()) for tag in k[1]]))
        }
    
    # for each paragraph get sentences
    for pid in pid_to_text:
        sentences = split_into_sentences(pid_to_text[pid])
        for sentence in sentences:
            if sentence not in text_to_sid:
                text_to_sid[sentence] = sid
                sid_to_text[sid] = sentence
                sid += 1
            other_edges.add((pid, 'contains_sentence', text_to_sid[sentence]))
    
    # for each sentence get extractions
    for sid in sid_to_text:
        extractions = extract(sid_to_text[sid])
        if extractions:
            count += 1
            for extraction in extractions:
                eid_to_text[eid] = extraction
                other_edges.add((sid, 'contains_extraction', eid))
                eid += 1
        else:
            keywords = find_keywords(sid_to_text[sid])
            for keyword in keywords:
                if keyword[0] not in text_to_xid:
                    text_to_xid[keyword[0]] = xid
                    xid_to_text[xid] = keyword
                    xid += 1
                other_edges.add((sid, 'about_entity', text_to_xid[keyword[0]]))
    
    # canonicalise the obtained extractions
    canonicalise(eid_to_text)
    
    # for each extraction create entities and relations
    for eid in eid_to_text:
        ext = eid_to_text[eid]
        
        if ext['subject'][0] not in text_to_xid:
            text_to_xid[ext['subject'][0]] = xid
            xid_to_text[xid] = ext['subject']
            xid += 1
        
        subject_edges.add((eid, 'subject', text_to_xid[ext['subject'][0]]))

        if ext['object'][0] not in text_to_xid:
            text_to_xid[ext['object'][0]] = xid
            xid_to_text[xid] = ext['object']
            xid += 1

        relation_edges.append((eid, ext['relation'], text_to_xid[ext['object'][0]], list(ext['rel_synsets'])))
        
        for modifier in ext['modifiers']:
            if modifier['m_obj'][0] not in text_to_xid:
                text_to_xid[modifier['m_obj'][0]] = xid
                xid_to_text[xid] = modifier['m_obj']
                xid += 1

            modifier_edges.add((eid, modifier['m_rel'], text_to_xid[modifier['m_obj'][0]]))
            
        for subject_modifier in ext['subject_modifiers']:
            if subject_modifier['m_obj'][0] not in text_to_xid:
                text_to_xid[subject_modifier['m_obj'][0]] = xid
                xid_to_text[xid] = subject_modifier['m_obj']
                xid += 1

            subject_modifier_edges.add((eid, subject_modifier['m_rel'], text_to_xid[subject_modifier['m_obj'][0]]))
    
    print(did, tid, pid, sid, eid, xid, count, round(10000 * count/sid)/100)
    
    offset = {
        'documents': 0,
        'topics': len(did_to_text),
        'paragraphs': len(did_to_text) + len(tid_to_text),
        'sentences': len(did_to_text) + len(tid_to_text) + len(pid_to_text),
        'extractions': len(did_to_text) + len(tid_to_text) + len(pid_to_text) + len(sid_to_text),
        'entities':  len(did_to_text) + len(tid_to_text) + len(pid_to_text) + len(sid_to_text) + len(eid_to_text)
    }
    
    vertices = {
        'documents': list({'id': k, 'text': did_to_text[k]} for k in did_to_text.keys()),
        'topics': list({'id': k, 'text': tid_to_text[k]['text'], 'keywords': tid_to_text[k]['keywords'], 'tags': tid_to_text[k]['tags']} for k in tid_to_text.keys()),
        'paragraphs': list({'id': k, 'text': pid_to_text[k]} for k in pid_to_text.keys()),
        'sentences': list({'id': k, 'text': sid_to_text[k]} for k in sid_to_text.keys()),
        'extractions': list({'id': k, 'body': eid_to_text[k]} for k in eid_to_text.keys()),
        'entities': list({'id': k, 'text': xid_to_text[k][0], 'tags': xid_to_text[k][1], 'tokens': xid_to_text[k][2]} for k in xid_to_text.keys())
    }
    
    triples = {
        'main': list(other_edges),
        'subjects': list(subject_edges),
        'modifiers': list(modifier_edges),
        'subject_modifiers': list(subject_modifier_edges),
        'relations': list(relation_edges)
    }
    return vertices, triples

In [8]:
print(extract('Any condition arising in the B.Tech. program and not covered in the regulations shall be referred to the UG committee.'))
print(extract('I create and play games and puzzles and play sports.'))
print(extract('I create games if I have time.'))
print(extract('If I have time I create games.'))
print(extract('I have time which is used for games.'))
print(extract('Any problem shall be referred to the UG committee which may refer it to the Senate.'))
print(nlp('There will not be any late registration in the summer term and a student shall not be allowed to add a course after registration.'))
print(get_sentence_structure('After graduating from Columbia University in 1983, he worked as a community organizer in Chicago'))

[]
[{'subject': 'I', 'relation': 'create', 'object': 'games', 'modifiers': [], 'subject_modifiers': [], 'condition': None}, {'subject': 'I', 'relation': 'create', 'object': 'puzzles', 'modifiers': [], 'subject_modifiers': [], 'condition': None}, {'subject': 'I', 'relation': 'play', 'object': 'games', 'modifiers': [], 'subject_modifiers': [], 'condition': None}, {'subject': 'I', 'relation': 'play', 'object': 'puzzles', 'modifiers': [], 'subject_modifiers': [], 'condition': None}, {'subject': 'I', 'relation': 'play', 'object': 'sports', 'modifiers': [], 'subject_modifiers': [], 'condition': None}]
[{'subject': 'I', 'relation': 'create', 'object': 'games', 'modifiers': [], 'subject_modifiers': [], 'condition': 'I have time'}]
[{'subject': 'I', 'relation': 'create', 'object': 'games', 'modifiers': [], 'subject_modifiers': [], 'condition': 'I have time'}]
[{'subject': 'I', 'relation': 'have', 'object': 'time', 'modifiers': [{'m_rel': 'which is used for', 'm_obj': 'games'}], 'subject_modifie

In [9]:
triples = None
vertices = None
documents = ['../data/files/UG-Regulations']

vertices, triples = create_kg(documents)
print('did, tid, pid, sid, eid, count, percentage')

ERROR: should
ERROR: expulsion
1 10059 20132 30290 40171 50760 118 0.39
did, tid, pid, sid, eid, count, percentage


In [10]:
write_json({'vertices': vertices, 'edges': triples}, '../neo4j/graph.json')

In [6]:
print(nlp('procedure').similarity(nlp('process')))

0.657118382755142
