In [1]:
#!pip install spacy
!python -m spacy download en_core_web_sm

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [2]:
import spacy
# Load the small English pipeline that the setup cell downloads.
# The 'en' shortcut name is not created by that download command, and
# shortcut links were removed entirely in spaCy v3, so the full package
# name is the only spelling that works on a fresh environment.
spacy_en = spacy.load('en_core_web_sm')

# 1: Extract a path of dependency relations from the ROOT to a token



In [8]:
def get_dep_relations(sent):
  """Map each token's text to its dependency path from the root.

  Args:
    sent: the sentence to parse, as a string.

  Returns:
    A dict keyed by token text. Each value is a list of
    "text (dep)" strings ordered from the root down to the token
    itself. Keys are plain token texts, so when the same word occurs
    several times only the path of its last occurrence is kept.
  """
  #Turn it into a Doc
  doc = spacy_en(sent)

  paths = {}
  #Look at each token
  for tok in doc:
    node = tok
    path = [node.text+" ("+node.dep_+")"]
    #Climb head by head until the root is reached
    while node.dep_ != "ROOT":
      parent = node.head
      #A token that is its own head is a root even without the "ROOT"
      #label (e.g. when the doc carries no parse); stopping here keeps
      #the walk from looping forever
      if parent.i == node.i:
        break
      node = parent
      path.append(node.text+" ("+node.dep_+")")
    #The path was collected bottom-up; present it root-first
    path.reverse()
    paths[tok.text] = path
  return paths

# 2: Extract the subtree of a dependent, given a token

In [4]:
def get_subtrees(sent):
  """Collect the dependency subtree of every token in a sentence.

  Args:
    sent: the sentence to parse, as a string.

  Returns:
    A dict mapping each token's text to the list of tokens yielded by
    that token's `subtree`. Keys are plain texts, so a repeated word
    keeps only the subtree of its last occurrence.
  """
  #Parse once, then build the text -> subtree mapping in a single pass
  parsed = spacy_en(sent)
  return {node.text: list(node.subtree) for node in parsed}

# 3: Check if a given list of tokens (segment of a sentence) forms a subtree

In [5]:
#Inputs are: the sentence (string) and the words/tokens (list of strings)
def is_subtree(sent, list_words):
  """Tell whether a list of words matches some token's subtree.

  Args:
    sent: the sentence to parse, as a string.
    list_words: the candidate words, as a list of strings.

  Returns:
    True if at least one token of the parsed sentence has a subtree
    whose token texts are exactly the given words (same words, same
    number of occurrences, order ignored); False otherwise.
  """
  parsed = spacy_en(sent)
  wanted_size = len(list_words)

  for candidate in parsed:
    #Only a token that is itself one of the words can head the segment
    if candidate.text not in list_words:
      continue

    #The subtree must have exactly as many tokens as there are words
    members = list(candidate.subtree)
    if len(members) != wanted_size:
      continue

    #Tick every subtree token off a working copy of the words, so each
    #word can be matched once and only once
    unmatched = list_words.copy()
    for member in members:
      if member.text not in unmatched:
        #Mismatch: give up on this candidate and try the next token
        break
      unmatched.remove(member.text)
    else:
      #Every subtree token was matched, so all the words were used up
      return True
  return False


# 4: Identify head of a span, given its tokens

In [6]:
def find_head_in_span(list_tok):
  """Return the syntactic head of a span given as a list of token strings.

  The strings are joined with single spaces and parsed as a standalone
  text, so the head is determined from these tokens alone.

  Args:
    list_tok: the tokens of the span, as a list of strings.

  Returns:
    The `root` of the span covering the whole parsed text.
  """
  #Rebuild the text of the span and parse it
  text = ' '.join(list_tok)
  #Slice the full Doc to obtain a Span, whose root is the head
  whole_span = spacy_en(text)[:]
  return whole_span.root

# 5: Extract sentence subject, direct object and indirect object spans

In [7]:
def get_subj_dirobj_indobj(sent):
  """Extract the subject, direct object and indirect object spans.

  Args:
    sent: the sentence to parse, as a string.

  Returns:
    A dict with the keys "Subject", "Direct object" and
    "Indirect object". Each value is a list of slices of the parsed
    Doc, one per matching token, covering that token's whole subtree.
    A list is empty when the sentence has no such constituent.
  """
  #Turn it into a Doc
  doc = spacy_en(sent)

  #Dependency label -> key of the result it belongs to
  #Subject: csubj, csubjpass, nsubj, nsubjpass
  #Direct object: dobj
  #Indirect object: dative
  role_of_dep = {
    "csubj": "Subject",
    "csubjpass": "Subject",
    "nsubj": "Subject",
    "nsubjpass": "Subject",
    "dobj": "Direct object",
    "dative": "Indirect object",
  }

  result = {
    "Subject": [],
    "Direct object": [],
    "Indirect object": []
  }

  for tok in doc:
    role = role_of_dep.get(tok.dep_)
    if role is None:
      continue
    #The span runs from the leftmost to the rightmost token of the
    #subtree. n_lefts/n_rights only count *immediate* children, so using
    #them would cut off nested modifiers ("the friend of Bob" would
    #become "the friend of")
    span = doc[tok.left_edge.i:tok.right_edge.i+1]
    result[role].append(span)
  return result