In [1]:
import re
import pandas as pd
import bs4
import requests
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')

from spacy.matcher import Matcher 
from spacy.tokens import Span 

import networkx as nx

import matplotlib.pyplot as plt
from tqdm import tqdm

pd.set_option('display.max_colwidth', 200)
%matplotlib inline

In [2]:
# Read the Wikipedia sentences CSV into a DataFrame and show its (rows, columns) shape
candidate_sentences = pd.read_csv(filepath_or_buffer="wiki_sentences_v2.csv")
candidate_sentences.shape

(4318, 1)

In [5]:
# Peek at five randomly chosen rows of the 'sentence' column
candidate_sentences.loc[:, 'sentence'].sample(n=5)

1488                                                                                                  g. george, sathyan anthikad, priyadarshan,
424                                                                   i is a 2015 indian tamil-language film written and directed by s. shankar.
1022    main examples include: the morning after , shattered , the long kiss goodnight , memento , the bourne film series , and shutter island .
775                                                                              color photography became more common from the mid-20th century.
953                                                                                winsor mccay's little nemo  showcased very detailed drawings.
Name: sentence, dtype: object

In [6]:
# Let’s check the subject and object of one of these sentences. Ideally, there should be one subject and one object in the sentence
# Parse a single sentence and list each token next to its dependency label
doc = nlp("the drawdown process is governed by astm standard d823")

for tok in doc:
    print(f"{tok.text} ... {tok.dep_}")

the ... det
drawdown ... amod
process ... nsubjpass
is ... auxpass
governed ... ROOT
by ... agent
astm ... compound
standard ... amod
d823 ... pobj


In [9]:
"""
The main idea is to go through a sentence and extract the subject and the object as they are encountered.
However, there are a few challenges: an entity can span multiple words, e.g., “red wine”, while the dependency
parser tags only individual words as subjects or objects.
"""

def _join_phrase(*parts):
    """Join the non-empty parts with single spaces (no doubled or edge spaces)."""
    return " ".join(part for part in parts if part)


def get_entities(sent):
    """
    Extract one subject phrase and one object phrase from a sentence.

    The sentence is parsed with the module-level spaCy pipeline ``nlp`` and its
    tokens are scanned left to right. Because the parser labels only single
    words as subject/object, the words that belong with them are collected on
    the way:

    * ``prefix``   - the run of consecutive ``compound`` tokens seen most recently
    * ``modifier`` - the most recent token whose label ends in ``mod``
      (e.g. ``amod``, ``nummod``), joined with a directly preceding compound

    When a token whose label contains ``subj`` / ``obj`` is reached, the
    collected modifier and prefix are glued in front of it.

    Parameters
    ----------
    sent : str
        Raw sentence text.

    Returns
    -------
    list of str
        ``[subject, object]``. An entry is ``""`` when no matching token was
        found; if several subjects or objects occur, the last one wins.
    """
    ent1 = ent2 = ""
    prv_tok_dep = prv_tok_text = ""
    prefix = modifier = ""

    for tok in nlp(sent):
        # Punctuation carries no entity information and must not become the
        # "previous token", so skip it before any bookkeeping.
        if tok.dep_ == "punct":
            continue

        # Compound words ("football stadium"): extend the running prefix so
        # chains of any length are kept, not just the last two words.
        if tok.dep_ == "compound":
            if prv_tok_dep == "compound":
                prefix = prefix + " " + tok.text
            else:
                prefix = tok.text

        # Modifiers ("red" in "red wine"); absorb a directly preceding compound.
        if tok.dep_.endswith("mod"):
            modifier = tok.text
            if prv_tok_dep == "compound":
                modifier = prv_tok_text + " " + tok.text

        # Subject: substring membership, not `find(...) == True`, which is only
        # true when the match happens to start at index 1.
        if "subj" in tok.dep_:
            ent1 = _join_phrase(modifier, prefix, tok.text)
            # Start collecting afresh for the object.
            prefix = modifier = ""

        # Object: same membership test, so a bare "obj" label is recognised too.
        if "obj" in tok.dep_:
            ent2 = _join_phrase(modifier, prefix, tok.text)

        # Remember this token for the next iteration.
        prv_tok_dep = tok.dep_
        prv_tok_text = tok.text

    return [ent1.strip(), ent2.strip()]

# Smoke test: get_entities returns a [subject, object] pair for this sentence.
# NOTE(review): the saved output below (ValueError about unpacking 6 values) looks stale —
# nothing in get_entities as written unpacks a sequence; re-run the cell to refresh it.
get_entities("the film had 200 patents")

ValueError: not enough values to unpack (expected 6, got 0)