In [1]:
from helper import *
from tqdm import tqdm

In [11]:
def create_doc_tree(filename):
    pdf_file = filename + '.pdf'
    docx_file = filename + '.docx'
    # if docx doesn't exist
    # parse(pdf_file, docx_file)

    document = Document(docx_file)
    id_ = 0
    text = {}
    adj_list = {}
    parent = {}
    prev_titles = []
    for para in document.paragraphs:
        if para.text:
            id_ += 1
            # get size
            size = 0
            if para.style.font.size != None:
                size = para.style.font.size/12700
            for run in para.runs:
                if run.font.size:
                    size = max(size, run.font.size/12700)
            # get prev title list
            while len(prev_titles) and prev_titles[-1]['size'] >= size:
                prev_titles.pop()
            if prev_titles:
                adj_list[prev_titles[-1]['id']].append(id_)
                parent[id_] = prev_titles[-1]['id']
            else:
                parent[id_] = None
            prev_titles.append({'size': size, 'id': id_})
            adj_list[id_] = []
            text[id_] = preprocess_paragraph(para.text)
    return text, adj_list, parent

def create_tree(document):
    id_ = 0
    text = {}
    adj_list = {}
    parent = {}
    prev_titles = []
    for element in document['content']:
        id_ += 1
        # get size
        size = element[1]
        # get prev title list
        while len(prev_titles) and prev_titles[-1]['size'] >= size:
            prev_titles.pop()
        if prev_titles:
            adj_list[prev_titles[-1]['id']].append(id_)
            parent[id_] = prev_titles[-1]['id']
        else:
            parent[id_] = None
        prev_titles.append({'size': size, 'id': id_})
        adj_list[id_] = []
        text[id_] = preprocess_paragraph(element[0])
    return text, adj_list, parent

def create_kg(documents):
    subject_edges = set([])
    modifier_edges = set([])
    subject_modifier_edges = set([])
    relation_edges = []
    other_edges = set([])

    did = 0
    tid = 10000
    pid = 20000
    sid = 30000
    eid = 40000
    xid = 50000
    count = 0
    eid_to_text = {}
    text_to_eid = {}
    sid_to_text = {}
    text_to_sid = {}
    pid_to_text = {}
    text_to_pid = {}
    tid_to_text = {}
    text_to_tid = {}
    did_to_text = {}
    text_to_did = {}
    xid_to_text = {}
    text_to_xid = {}
    # for each document get the paragraphs and topics
    for doc in tqdm(documents):
        if doc['format'] != 'html' or not 'body' in doc:
            continue
        if doc['name'] in text_to_did:
            continue
        text_to_did[doc['name']] = did
        did_to_text[did] = doc
        did += 1
        # read document in python
        text, adj_list, parent = create_tree(doc)
        
        # traverse doc tree to identify paragraphs and topics in the document
        for node in adj_list:
            if adj_list[node]:
                if text[node] not in text_to_tid:
                    text_to_tid[text[node]] = tid
                    tid_to_text[tid] = text[node]
                    tid += 1
                if parent[node]:
                    if text[parent[node]] not in text_to_tid:
                        text_to_tid[text[parent[node]]] = tid
                        tid_to_text[tid] = text[parent[node]]
                        tid += 1
                    other_edges.add((text_to_tid[text[node]], 'about_concept', text_to_tid[text[parent[node]]]))
            else:
                if text[node] not in text_to_pid:
                    text_to_pid[text[node]] = pid
                    pid_to_text[pid] = text[node]
                    pid += 1
                if parent[node]:
                    if text[parent[node]] not in text_to_tid:
                        text_to_tid[text[parent[node]]] = tid
                        tid_to_text[tid] = text[parent[node]]
                        tid += 1
                    other_edges.add((text_to_pid[text[node]], 'about_concept', text_to_tid[text[parent[node]]]))
                    other_edges.add((text_to_pid[text[node]], 'from_document', text_to_did[doc['name']]))
    # convert topics to lower case
    for tid in tqdm(tid_to_text):
        tid_to_text[tid] = {
            'text': tid_to_text[tid].lower(),
            'keywords': [k[0] for k in find_keywords(tid_to_text[tid].lower())],
            'tags': list(set([tag for k in find_keywords(tid_to_text[tid].lower()) for tag in k[1]]))
        }
    
    # for each paragraph get sentences
    for pid in tqdm(pid_to_text):
        sentences = split_into_sentences(pid_to_text[pid])
        for sentence in sentences:
            sentence = {
                'text': sentence, 
                'stemmed_tokens': get_stemmed_sentence_tokens(sentence)
            }
            if sentence['text'] not in text_to_sid:
                text_to_sid[sentence['text']] = sid
                sid_to_text[sid] = sentence
                sid += 1
            other_edges.add((pid, 'contains_sentence', text_to_sid[sentence['text']]))
    
    # for each sentence get extractions
    for sid in tqdm(sid_to_text):
        extractions = extract(sid_to_text[sid]['text'])
        if extractions:
            count += 1
            for extraction in extractions:
                eid_to_text[eid] = extraction
                other_edges.add((sid, 'contains_extraction', eid))
                eid += 1
        else:
            keywords = find_keywords(sid_to_text[sid]['text'])
            for keyword in keywords:
                if keyword[0] not in text_to_xid:
                    text_to_xid[keyword[0]] = xid
                    xid_to_text[xid] = keyword
                    xid += 1
                other_edges.add((sid, 'about_entity', text_to_xid[keyword[0]]))
    
    # canonicalise the obtained extractions
    canonicalise(eid_to_text)
    
    # for each extraction create entities and relations
    for eid in tqdm(eid_to_text):
        ext = eid_to_text[eid]
        
        if ext['subject'][0] not in text_to_xid:
            text_to_xid[ext['subject'][0]] = xid
            xid_to_text[xid] = ext['subject']
            xid += 1
        
        subject_edges.add((eid, 'subject', text_to_xid[ext['subject'][0]]))

        if ext['object'][0] not in text_to_xid:
            text_to_xid[ext['object'][0]] = xid
            xid_to_text[xid] = ext['object']
            xid += 1

        relation_edges.append((eid, ext['relation'], text_to_xid[ext['object'][0]], list(ext['rel_synsets'])))
        
        for modifier in ext['modifiers']:
            if modifier['m_obj'][0] not in text_to_xid:
                text_to_xid[modifier['m_obj'][0]] = xid
                xid_to_text[xid] = modifier['m_obj']
                xid += 1

            modifier_edges.add((eid, modifier['m_rel'], text_to_xid[modifier['m_obj'][0]]))
            
        for subject_modifier in ext['subject_modifiers']:
            if subject_modifier['m_obj'][0] not in text_to_xid:
                text_to_xid[subject_modifier['m_obj'][0]] = xid
                xid_to_text[xid] = subject_modifier['m_obj']
                xid += 1

            subject_modifier_edges.add((eid, subject_modifier['m_rel'], text_to_xid[subject_modifier['m_obj'][0]]))
    
    print(did, tid, pid, sid, eid, xid, count, round(10000 * count/sid)/100)
    
    offset = {
        'documents': 0,
        'topics': len(did_to_text),
        'paragraphs': len(did_to_text) + len(tid_to_text),
        'sentences': len(did_to_text) + len(tid_to_text) + len(pid_to_text),
        'extractions': len(did_to_text) + len(tid_to_text) + len(pid_to_text) + len(sid_to_text),
        'entities':  len(did_to_text) + len(tid_to_text) + len(pid_to_text) + len(sid_to_text) + len(eid_to_text)
    }
    
    vertices = {
        'documents': list({'id': k, 'text': did_to_text[k]['name'], 'source': did_to_text[k]['link']} for k in did_to_text.keys()),
        'topics': list({'id': k, 'text': tid_to_text[k]['text'], 'keywords': tid_to_text[k]['keywords'], 'tags': tid_to_text[k]['tags']} for k in tid_to_text.keys()),
        'paragraphs': list({'id': k, 'text': pid_to_text[k]} for k in pid_to_text.keys()),
        'sentences': list({'id': k, 'text': sid_to_text[k]['text'], 'stemmed_tokens': sid_to_text[k]['stemmed_tokens']} for k in sid_to_text.keys()),
        'extractions': list({'id': k, 'body': eid_to_text[k]} for k in eid_to_text.keys()),
        'entities': list({'id': k, 'text': xid_to_text[k][0], 'tags': xid_to_text[k][1], 'tokens': xid_to_text[k][2]} for k in xid_to_text.keys())
    }
    
    triples = {
        'main': list(other_edges),
        'subjects': list(subject_edges),
        'modifiers': list(modifier_edges),
        'subject_modifiers': list(subject_modifier_edges),
        'relations': list(relation_edges)
    }
    return vertices, triples

In [12]:
print(extract('Any condition arising in the B.Tech. program and not covered in the regulations shall be referred to the UG committee.'))
print(extract('I create and play games and puzzles and play sports.'))
print(extract('I create games if I have time.'))
print(extract('If I have time I create games.'))
print(extract('I have time which is used for games.'))
print(extract('Any problem shall be referred to the UG committee which may refer it to the Senate.'))
print(nlp('There will not be any late registration in the summer term and a student shall not be allowed to add a course after registration.'))
print(get_sentence_structure('After graduating from Columbia University in 1983, he worked as a community organizer in Chicago'))

[]
[{'subject': 'I', 'relation': 'create', 'object': 'games', 'modifiers': [], 'subject_modifiers': [], 'condition': None}, {'subject': 'I', 'relation': 'create', 'object': 'puzzles', 'modifiers': [], 'subject_modifiers': [], 'condition': None}, {'subject': 'I', 'relation': 'play', 'object': 'games', 'modifiers': [], 'subject_modifiers': [], 'condition': None}, {'subject': 'I', 'relation': 'play', 'object': 'puzzles', 'modifiers': [], 'subject_modifiers': [], 'condition': None}, {'subject': 'I', 'relation': 'play', 'object': 'sports', 'modifiers': [], 'subject_modifiers': [], 'condition': None}]
[{'subject': 'I', 'relation': 'create', 'object': 'games', 'modifiers': [], 'subject_modifiers': [], 'condition': 'I have time'}]
[{'subject': 'I', 'relation': 'create', 'object': 'games', 'modifiers': [], 'subject_modifiers': [], 'condition': 'I have time'}]
[{'subject': 'I', 'relation': 'have', 'object': 'time', 'modifiers': [{'m_rel': 'which is used for', 'm_obj': 'games'}], 'subject_modifie

In [13]:
triples = None
vertices = None
documents = [{"name": "UG Regulations", "path": '../data/files/UG-Regulations', "source": "https://www.iiitd.ac.in/sites/default/files/docs/education/2019/2019-August-UG-Regulations.pdf"}]
documents = read_json('../data/files/iiit_website_content.json')[:16]
vertices, triples = create_kg(documents)
print('did, tid, pid, sid, eid, count, percentage')

100%|██████████| 100/100 [00:43<00:00,  2.31it/s]
100%|██████████| 489/489 [00:36<00:00, 13.33it/s]
  0%|          | 2/1365 [00:00<01:09, 19.67it/s]

{10000: {'text': 'academic dishonesty policy ', 'keywords': ['academic dishonesty policy'], 'tags': []}, 10001: {'text': 'plagiarism is the offence of taking undue credit for someone else’s work. ', 'keywords': ['plagiarism', 'offence', 'taking undue credit', 'someone', '’s work'], 'tags': ['number', 'credit']}, 10002: {'text': 'text plagiarism: ', 'keywords': ['text plagiarism'], 'tags': []}, 10003: {'text': 'diagram & code plagiarism: ', 'keywords': ['diagram', 'code plagiarism'], 'tags': []}, 10004: {'text': 'idea plagiarism: ', 'keywords': ['idea plagiarism'], 'tags': []}, 10005: {'text': 'auto-plagiarism: ', 'keywords': ['auto plagiarism'], 'tags': []}, 10006: {'text': 'frequently asked questions about plagiarism ', 'keywords': ['question', 'plagiarism'], 'tags': []}, 10007: {'text': 'is it okay if i am not submitting verbatim copy, but (1) modify few phrases here and there (2) rewrite some of the sentences (3) modify the solution sentence by sentence (4) copy only a small part (5

100%|██████████| 1365/1365 [00:41<00:00, 32.91it/s]
100%|██████████| 2057/2057 [02:14<00:00, 15.33it/s]


ERROR: verbatim
ERROR: should
ERROR: should
ERROR: should
ERROR: would
ERROR: would
ERROR: should
ERROR: should
ERROR: should
ERROR: should
ERROR: should
ERROR: should
ERROR: should
ERROR: should
ERROR: should
ERROR: should
ERROR: should
ERROR: should
ERROR: should
ERROR: should
ERROR: should


100%|██████████| 742/742 [00:00<00:00, 138435.73it/s]

82 10488 21364 32056 40741 54061 483 1.51
did, tid, pid, sid, eid, count, percentage





In [14]:
write_json({'vertices': vertices, 'edges': triples}, '../neo4j/iiit_website_graph.json')

In [None]:
print(nlp('graduation').similarity(nlp('pass')))

In [None]:
print(nlp('graduation')[0].lemma_)