In [21]:
import re
import pandas as pd
import bs4
import requests
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')

from spacy.matcher import Matcher 
from spacy.tokens import Span 

import networkx as nx

import matplotlib.pyplot as plt
from tqdm import tqdm
import nltk
import pdfplumber
import joblib
pd.set_option('display.max_colwidth', 200)
%matplotlib inline

import pandas as pd #for handling csv and csv contents
from rdflib import Graph, Literal, RDF, URIRef, Namespace #basic RDF handling
from rdflib.namespace import FOAF , XSD #most common namespaces
import urllib.parse #for parsing strings to URI's
import warnings
warnings.filterwarnings("ignore")


# Let's get the data and preprocess it

## Loading Data

In [2]:
def load_data(deb, fin, path='docs/PMBOK 5th.pdf'):
    """Extract and concatenate the text of a range of PDF pages.

    Parameters
    ----------
    deb : int
        Index into ``pdf.pages`` of the first page to read (inclusive).
    fin : int
        Index into ``pdf.pages`` one past the last page to read (exclusive).
    path : str, optional
        PDF file to open. Defaults to the document used throughout this
        notebook, so existing two-argument calls behave as before.

    Returns
    -------
    str
        Text of pages ``deb`` .. ``fin - 1`` joined with no separator.
    """
    # The context manager closes the underlying file even if extraction
    # fails part-way; previously every call left an open PDF handle behind.
    with pdfplumber.open(path) as pdf:
        # extract_text() may yield None for a page with no extractable text
        # (seen in some pdfplumber releases); treat that as an empty string
        # so the concatenation cannot raise TypeError.
        return ''.join(
            (pdf.pages[page_index].extract_text() or '')
            for page_index in range(deb, fin)
        )

In [3]:
# Read six page ranges from the PDF, one text blob per range.
# Each pair is (first page index, index one past the last page).
# NOTE(review): the fourth range ends at 368 while the fifth starts at 367,
# so page index 367 lands in both P4 and P5 — confirm this overlap is wanted.
P1, P2, P3, P4, P5, P6 = (
    load_data(first_page, stop_page)
    for first_page, stop_page in (
        (334, 344),
        (344, 353),
        (353, 359),
        (359, 368),
        (367, 374),
        (374, 381),
    )
)

## Cleaning The Data

In [4]:
def clean(text):
    """Normalise raw PDF text before sentence splitting.

    Steps, applied in this order: remove digits; turn tab/newline-style
    whitespace characters into spaces; drop bullet and hyphen characters;
    collapse runs of whitespace into one space; spell out '&' as 'and';
    drop '©'; rewrite '—The' as ' is the'; lower-case everything; and
    finally remove the licence footer string that recurs in the text.

    Parameters
    ----------
    text : str
        Raw text as extracted from the PDF.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    # Raw strings keep regex escapes such as \d and \s out of Python's own
    # string-escape processing, which flags them as invalid escape sequences.
    text = re.sub(r'[\d]+', '', text)
    text = re.sub(r'[\t\n\r\f\v]', ' ', text)
    text = re.sub(r'[•-]', '', text)
    text = re.sub(r'\s\s+', ' ', text)
    text = text.replace('&', "and").replace('©', "").replace('—The', " is the").lower()
    # The footer literal must match the text *after* the steps above (digits
    # gone, lower-cased), which is why it looks the way it does.
    # NOTE(review): the literal embeds a licensee name; consider moving it out
    # of the notebook source.
    text = text.replace('  project management institute. a guide to the project management body of knowledge (pmbok® guide) – fifth edition licensed to: jorge diego fuentes sanchez pmi memberid: this copy is a pmi member benefit, not for distribution, sale, or reproduction.','')
    return text

In [5]:
# Run every raw page-range text through the same cleaning routine.
P1_clean, P2_clean, P3_clean, P4_clean, P5_clean, P6_clean = map(
    clean, (P1, P2, P3, P4, P5, P6)
)

# Now we have to split our data into sentences

In [8]:
# Split each cleaned text into a list of sentence strings.
P1_SENT, P2_SENT, P3_SENT, P4_SENT, P5_SENT, P6_SENT = (
    nltk.tokenize.sent_tokenize(cleaned_text)
    for cleaned_text in (
        P1_clean, P2_clean, P3_clean, P4_clean, P5_clean, P6_clean,
    )
)

In [9]:
# One single-column frame per page range; each row holds one sentence.
df1, df2, df3, df4, df5, df6 = [
    pd.DataFrame({'sentence': sentence_list})
    for sentence_list in (P1_SENT, P2_SENT, P3_SENT, P4_SENT, P5_SENT, P6_SENT)
]

# Entity and Relation Extraction

## Extracting Entities

In [10]:
def get_entities(sent):
    """Extract a rough (subject, object) entity pair from one sentence.

    The sentence is parsed with the module-level ``nlp`` pipeline and its
    tokens are scanned left to right. Compound and modifier tokens are
    remembered and prepended to the next token whose dependency label
    contains "subj" (first entity) or "obj" (second entity). A later
    subject/object token overwrites an earlier one.

    Parameters
    ----------
    sent : str
        A single sentence.

    Returns
    -------
    list of str
        ``[ent1, ent2]`` with surrounding whitespace stripped; an entry is
        the empty string when no matching token was seen.
    """
    ent1 = ""
    ent2 = ""

    prv_tok_dep = ""   # dependency tag of the previous non-punctuation token
    prv_tok_text = ""  # text of the previous non-punctuation token

    prefix = ""
    modifier = ""

    for tok in nlp(sent):
        # Punctuation is skipped entirely, including the bookkeeping below.
        if tok.dep_ == "punct":
            continue

        # Remember compound words; chain with the previous one if it was
        # also a compound.
        if tok.dep_ == "compound":
            prefix = tok.text
            if prv_tok_dep == "compound":
                prefix = prv_tok_text + " " + tok.text

        # Remember modifiers (labels ending in "mod"); prepend the previous
        # token when that one was a compound.
        if tok.dep_.endswith("mod"):
            modifier = tok.text
            if prv_tok_dep == "compound":
                modifier = prv_tok_text + " " + tok.text

        # Substring membership instead of `str.find(...) == True`: find()
        # returns an index, so the old test only held when the substring sat
        # at position 1 (as in "nsubj") and worked by coincidence.
        if "subj" in tok.dep_:
            ent1 = modifier + " " + prefix + " " + tok.text
            # Start collecting afresh for the object side. The prv_tok_*
            # variables need no reset here: they are overwritten at the end
            # of this iteration.
            prefix = ""
            modifier = ""

        if "obj" in tok.dep_:
            ent2 = modifier + " " + prefix + " " + tok.text

        prv_tok_dep = tok.dep_
        prv_tok_text = tok.text

    return [ent1.strip(), ent2.strip()]

Now we can use this function to extract these entity pairs for all the sentences in our data:

In [11]:
def make_entity(df):
    """Return the entity pair of every sentence in ``df["sentence"]``.

    The result is a list with one ``get_entities`` result per row, in the
    same order as the column.
    """
    return [get_entities(sentence) for sentence in df["sentence"]]

In [12]:
# Entity pairs for each of the six sentence frames.
(
    entity_pairs1,
    entity_pairs2,
    entity_pairs3,
    entity_pairs4,
    entity_pairs5,
    entity_pairs6,
) = map(make_entity, (df1, df2, df3, df4, df5, df6))

## Relation / Predicate Extraction

In [14]:
def get_relation(sent):
    """Return the relation (predicate) phrase of a sentence.

    The sentence is parsed with the module-level ``nlp`` pipeline and
    matched against a pattern made of the ROOT token optionally followed by
    a preposition, an agent and an adjective. The text of the last match
    reported by the matcher is returned.

    Parameters
    ----------
    sent : str
        A single sentence.

    Returns
    -------
    str
        Text of the last matched span, or the empty string when the matcher
        finds nothing (for example for an empty sentence).
    """
    doc = nlp(sent)

    matcher = Matcher(nlp.vocab)

    # ROOT, then up to three optional trailing tokens.
    pattern = [
        {'DEP': 'ROOT'},
        {'DEP': 'prep', 'OP': "?"},
        {'DEP': 'agent', 'OP': "?"},
        {'POS': 'ADJ', 'OP': "?"},
    ]

    # Patterns are passed as a list in the second argument. The older
    # `add(name, None, pattern)` form puts the callback slot second and is
    # rejected by spaCy v3.
    matcher.add("matching_1", [pattern])

    matches = matcher(doc)
    if not matches:
        # Indexing the last match of an empty list would raise IndexError.
        return ""

    _match_id, start, end = matches[-1]
    return doc[start:end].text

In [16]:
# Relation phrase for every sentence, kept in row order per frame.
relations1 = list(map(get_relation, df1['sentence']))
relations2 = list(map(get_relation, df2['sentence']))
relations3 = list(map(get_relation, df3['sentence']))
relations4 = list(map(get_relation, df4['sentence']))
relations5 = list(map(get_relation, df5['sentence']))
relations6 = list(map(get_relation, df6['sentence']))

# Building our Datasets [*'subject'* , *'property'* , *'object'*]

In [17]:
# First element of every entity pair: the subject side.
subject1 = [head for head, _tail in entity_pairs1]
subject2 = [head for head, _tail in entity_pairs2]
subject3 = [head for head, _tail in entity_pairs3]
subject4 = [head for head, _tail in entity_pairs4]
subject5 = [head for head, _tail in entity_pairs5]
subject6 = [head for head, _tail in entity_pairs6]

# Second element of every entity pair: the object side.
target1 = [tail for _head, tail in entity_pairs1]
target2 = [tail for _head, tail in entity_pairs2]
target3 = [tail for _head, tail in entity_pairs3]
target4 = [tail for _head, tail in entity_pairs4]
target5 = [tail for _head, tail in entity_pairs5]
target6 = [tail for _head, tail in entity_pairs6]

# Rebind df1..df6 from sentence frames to subject/property/object frames.
# The names are reused, so the sentence frames are no longer reachable
# through them after this cell.
df1 = pd.DataFrame(dict(subject=subject1, property=relations1, object=target1))
df2 = pd.DataFrame(dict(subject=subject2, property=relations2, object=target2))
df3 = pd.DataFrame(dict(subject=subject3, property=relations3, object=target3))
df4 = pd.DataFrame(dict(subject=subject4, property=relations4, object=target4))
df5 = pd.DataFrame(dict(subject=subject5, property=relations5, object=target5))
df6 = pd.DataFrame(dict(subject=subject6, property=relations6, object=target6))

# Ontology

## Drop NaN Values

In [None]:
# Discard every row that has a missing value in any column.
# NOTE(review): get_entities and get_relation return strings (possibly
# empty), so there may be nothing for dropna to remove here — confirm.
df1 = df1.dropna(axis=0, how='any')
df2 = df2.dropna(axis=0, how='any')
df3 = df3.dropna(axis=0, how='any')
df4 = df4.dropna(axis=0, how='any')
df5 = df5.dropna(axis=0, how='any')
df6 = df6.dropna(axis=0, how='any')

## Creating Namespaces

In [22]:
# Graph that accumulates every triple added further down.
g = Graph()

# URI prefixes: `ppl` for subject nodes, `loc` for object nodes and
# `schema` for the predicates that link them.
schema = Namespace("http://schema.org/")
loc = Namespace("http://mylocations.org/addresses/")
ppl = Namespace("http://example.org/people/")

In [None]:
def process(df):
    """Replace spaces with underscores in the three triple columns.

    The 'subject', 'property' and 'object' columns are rewritten on the
    frame that was passed in, and that same frame is returned.
    """
    for column in ('subject', 'property', 'object'):
        df[column] = df[column].apply(lambda cell: cell.replace(' ', '_'))
    return df

In [None]:
# Make the triple columns usable inside URIs (spaces become underscores).
df1, df2, df3, df4, df5, df6 = map(process, (df1, df2, df3, df4, df5, df6))

## Creating Our Ontology

In [None]:
def create_ontology(df, name):
    """Add the triples of one frame to the module-level graph ``g``.

    For every row, five triples are added:

    * the subject node typed as the FOAF class selected by ``name``;
    * the subject node linked to the row's subject, property and object
      values as ``xsd:string`` literals (predicates ``schema:subject``,
      ``schema:property`` and ``schema:object``);
    * the object node linked to the row's property value via
      ``schema:subject``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string columns 'subject', 'property' and 'object'.
    name : str
        One of 'Person', 'Document', 'Agent', 'Image', 'Organization' or
        'Project'. Any other value adds nothing, as before.

    Returns
    -------
    None
    """
    supported = ('Person', 'Document', 'Agent', 'Image', 'Organization', 'Project')
    if name not in supported:
        # The original if/elif chain silently ignored unknown labels.
        return

    # Every branch of the original chain differed only in this class, so it
    # is looked up once instead of repeating five statements six times.
    rdf_class = getattr(FOAF, name)

    def as_string_literal(value):
        """Wrap a value as an xsd:string literal."""
        return Literal(value, datatype=XSD.string)

    for _index, row in df.iterrows():
        # Percent-encode the subject exactly like the object, so characters
        # that are not legal in a URI cannot produce an unserialisable node.
        subject_node = URIRef(ppl + urllib.parse.quote(row['subject']))
        object_node = URIRef(loc + urllib.parse.quote(row['object']))

        g.add((subject_node, RDF.type, rdf_class))
        g.add((subject_node, URIRef(schema + 'subject'), as_string_literal(row['subject'])))
        g.add((subject_node, URIRef(schema + 'property'), as_string_literal(row['property'])))
        g.add((subject_node, URIRef(schema + 'object'), as_string_literal(row['object'])))
        g.add((object_node, URIRef(schema + 'subject'), as_string_literal(row['property'])))

In [None]:
# Feed each frame into the graph under its own FOAF class label.
for _frame, _label in (
    (df1, 'Person'),
    (df2, 'Document'),
    (df3, 'Agent'),
    (df4, 'Image'),
    (df5, 'Organization'),
    (df6, 'Project'),
):
    create_ontology(_frame, _label)

In [None]:
# Write the accumulated graph to disk.
# NOTE(review): the content is Turtle syntax although the file extension is
# .owl — confirm that downstream tools expect this.
g.serialize(destination='ontoD.owl', format='turtle')

![caption](files/giphy.gif)