# Multilingual Digital Story Grammar

In this notebook, we aim to implement a version of Digital Story Grammar (DSG; Bastholm Andrade & Andersen; [link](https://www.tandfonline.com/doi/abs/10.1080/13645579.2020.1723205)) that works with multiple languages (in particular Dutch, German, Danish, and English). The code interfaces with the spaCy NLP library, which provides pretrained, easy-to-use pipelines for many languages.

In [279]:
""" Multilingual Digital Story Grammar """

import os
import subprocess
import sys
import warnings

import numpy as np
import pandas as pd
import spacy
from spacy import displacy
from spacy.lang.en.examples import sentences # Change this to use different language as example; see spacy.io/models
from spacy.matcher import DependencyMatcher

def _version_below(installed, minimum):
    """Return True when the version string ``installed`` is older than ``minimum``.

    ``minimum`` is a tuple of integers such as ``(3, 0)``. Only the leading
    digits of each dot-separated component of ``installed`` are used, so
    suffixes such as ``"0rc1"`` or ``"0a2"`` are tolerated. Comparing the raw
    strings instead would order them lexicographically (``"10.0" < "3"``).
    """
    numbers = []
    for part in installed.split(".")[:len(minimum)]:
        # Count the leading digits of this component; anything after them
        # (e.g. "rc1") is ignored.
        lead = len(part) - len(part.lstrip("0123456789"))
        numbers.append(int(part[:lead] or 0))
    # Pad short versions such as "3" so that they compare like "3.0".
    numbers.extend([0] * (len(minimum) - len(numbers)))
    return tuple(numbers) < tuple(minimum)


if _version_below(spacy.__version__, (3, 0)):
    warnings.warn("Module 'spacy' should be version >= 3.0 to run this notebook without errors")
if _version_below(pd.__version__, (1, 0)):
    warnings.warn("Module 'pandas' should be version >= 1.0 to run this notebook without errors")

In [280]:
# Name of the pretrained spaCy pipeline to load; change it to analyse another
# language (available pipelines are listed at spacy.io/models).
SPACY_PIPELINE = "en_core_web_sm"

In [432]:
# Show the example sentences imported from spacy.lang.en.examples.
sentences

['Apple is looking at buying U.K. startup for $1 billion',
 'Autonomous cars shift insurance liability toward manufacturers',
 'San Francisco considers banning sidewalk delivery robots',
 'London is a big city in the United Kingdom.',
 'Where are you?',
 'Who is the president of France?',
 'What is the capital of the United States?',
 'When was Barack Obama born?']

In [740]:
# Hand-written test documents (one string per document), each exercising a
# different sentence construction for the subject-verb-object extraction.
examples = [
    "The bird flew over the roof.",  # no direct object
    "The cow ate the grass. The goat watched the cow.",  # two sentences in one document
    "The cow ate the grass while the goat watched the cow.",  # clause introduced by "while"
    "The goat watched the cow which was eating grass.",  # relative clause
    "The goat attempted eating the cow's grass.",  # object attached to a second verb
    "The cow ate grass, the cow ate butter.",  # two clauses joined by a comma
    "The grass was eaten by the cow and the goat."  # passive voice
]

In [725]:
def load_spacy_pipeline(name):
    """Load the spaCy pipeline ``name``, downloading it first if necessary.

    Parameters
    ----------
    name : str
        Name of a spaCy pipeline package, e.g. ``"en_core_web_sm"``.

    Returns
    -------
    The object returned by ``spacy.load(name)``.

    Raises
    ------
    subprocess.CalledProcessError
        If the download command exits with a non-zero status.
    OSError
        If the pipeline still cannot be loaded after the download.
    """
    try:
        return spacy.load(name)
    except OSError:
        # spaCy reports a pipeline that is not installed with an OSError.
        # Only that case triggers a download; any other error propagates
        # instead of being hidden behind a (possibly pointless) download.
        # The command is passed as an argument list (no shell involved) and
        # run with the current interpreter, so the package is installed into
        # the environment this kernel uses. ``check=True`` surfaces a failed
        # download right here.
        subprocess.run([sys.executable, "-m", "spacy", "download", name], check=True)
        return spacy.load(name)

In [726]:
nlp = load_spacy_pipeline(SPACY_PIPELINE)

In [727]:
# Dependency-tree matcher bound to the pipeline's vocabulary. validate=True is
# spaCy's option for validating patterns when they are added.
matcher = DependencyMatcher(nlp.vocab, validate=True)

def _verb_node():
    """Anchor node of every pattern: a token tagged as verb or auxiliary."""
    return {
        "RIGHT_ID": "verb",
        "RIGHT_ATTRS": {"POS": {"IN": ["VERB", "AUX"]}},
    }


def _linked_node(head, rel_op, node_id, dep):
    """Node ``node_id`` attached to ``head`` via ``rel_op`` with dependency ``dep``."""
    return {
        "LEFT_ID": head,
        "REL_OP": rel_op,
        "RIGHT_ID": node_id,
        "RIGHT_ATTRS": {"DEP": dep},
    }


# Dependency patterns, tried in this order. In spaCy's operator notation
# "A > B" means A is the immediate head of B and "A >> B" means A heads a
# chain of dependencies leading to B.
patterns = [
    # 0: verb with a subject and a direct object.
    [
        _verb_node(),
        _linked_node("verb", ">", "subj", "nsubj"),
        _linked_node("verb", ">", "obj", "dobj"),
    ],
    # 1: verb with a subject whose object hangs off a complement or
    #    adverbial clause somewhere below the verb.
    [
        _verb_node(),
        _linked_node("verb", ">", "subj", "nsubj"),
        _linked_node("verb", ">>", "comp", {"IN": ["ccomp", "xcomp", "pcomp", "advcl"]}),
        _linked_node("comp", ">", "obj", "dobj"),
    ],
    # 2: verb with a subject only.
    [
        _verb_node(),
        _linked_node("verb", ">", "subj", "nsubj"),
    ],
]

# Register each pattern under its position in `patterns` as the match key.
for pattern_id, pattern in enumerate(patterns):
    matcher.add(pattern_id, [pattern])


In [728]:
def check_dict_in_list(dict_obj, dict_list):
    """Tell whether the match ``dict_obj`` is already covered by ``dict_list``.

    Parameters
    ----------
    dict_obj : dict
        Candidate match mapping a role name to a token (an object with an
        ``.i`` index attribute) or to ``""`` when the role is unfilled.
    dict_list : list of dict
        Matches accepted so far; each must offer the same role keys (extra
        keys such as ``"match_id"`` are ignored).

    Returns
    -------
    bool
        True if ``dict_obj`` is literally contained in ``dict_list``, or if
        every filled role of ``dict_obj`` has the same token index as that
        role in at least one entry of ``dict_list``. Unfilled roles always
        count as covered.

    Notes
    -----
    Roles are checked independently of each other: the entries that cover the
    individual roles need not be the same entry.
    NOTE(review): this looks deliberate -- it drops composite matches whose
    parts were each reported before -- confirm before tightening it.
    """
    if dict_obj in dict_list:
        return True

    for key, token in dict_obj.items():
        if str(token) == "":
            # Unfilled role: nothing to compare.
            continue
        # An earlier match may hold the "" placeholder (or lack the key); it
        # has no ``.i`` and simply cannot cover this role. Accessing ``.i``
        # on it directly would raise AttributeError.
        covered = any(
            getattr(ref_dict.get(key, ""), "i", None) == token.i
            for ref_dict in dict_list
        )
        if not covered:
            return False

    return True

In [729]:
def extract_matches(doc, matches, matcher, keys):
    """Turn raw matcher output into a de-duplicated list of role dictionaries.

    Parameters
    ----------
    doc : indexable
        Object the token ids in ``matches`` refer to; ``doc[token_id]`` must
        return the token.
    matches : iterable of (match_id, token_ids)
        Output of calling ``matcher`` on ``doc``.
    matcher : object
        Matcher whose ``get(match_id)[1][0]`` is the pattern (a list of node
        dicts with a ``"RIGHT_ID"``) registered under ``match_id``.
    keys : iterable of str
        Role names (pattern ``RIGHT_ID`` values) to extract.

    Returns
    -------
    list of dict
        One dict per retained match, mapping each role in ``keys`` to its
        token (``""`` if the pattern has no such node) plus ``"match_id"``.
        Matches that ``check_dict_in_list`` reports as already covered are
        left out.
    """
    collected = []

    for match_id, token_ids in matches:
        # Every requested role starts out unfilled.
        roles = dict.fromkeys(keys, "")

        # token_ids are aligned with the nodes of the pattern that fired.
        for position, token_id in enumerate(token_ids):
            registered_patterns = matcher.get(match_id)[1]
            role = registered_patterns[0][position]["RIGHT_ID"]
            if role in roles:
                roles[role] = doc[token_id]

        if check_dict_in_list(roles, collected):
            continue

        # Added only after the duplicate check, so it is never compared.
        roles["match_id"] = match_id
        collected.append(roles)

    return collected

In [733]:
def get_subject_object_verb_table(docs, nlp, matcher):
    """Extract subject-verb-object triples from texts into a DataFrame.

    Parameters
    ----------
    docs : str or iterable of str
        Text(s) to analyse. A single string is treated as one document.
    nlp : object
        Pipeline whose ``pipe(texts)`` yields parsed documents exposing
        ``.sents``.
    matcher : callable
        Matcher applied to each sentence; also handed to ``extract_matches``.

    Returns
    -------
    pandas.DataFrame
        One row per retained match with the string columns ``doc_id``
        (position of the document in ``docs``), ``sent_id`` (position of the
        sentence in its document), ``sent`` (sentence text), ``token_id``
        (position of the match within its sentence), ``subj``, ``verb`` and
        ``obj`` (token texts, ``""`` where the role is unfilled). The columns
        are present even when there are no matches.
    """
    keys = ["verb", "subj", "obj"]

    # nlp.pipe iterates over its argument, so a bare string would be handled
    # one character at a time; treat it as a single document instead.
    if isinstance(docs, str):
        docs = [docs]

    columns = ("doc_id", "sent_id", "sent", "token_id", "subj", "verb", "obj")
    table_dict = {column: [] for column in columns}

    for doc_no, doc in enumerate(nlp.pipe(docs)):
        for sent_no, sent in enumerate(doc.sents):
            matches_list = extract_matches(sent, matcher(sent), matcher, keys=keys)

            for match_no, match in enumerate(matches_list):
                table_dict["doc_id"].append(str(doc_no))
                table_dict["sent_id"].append(str(sent_no))
                table_dict["sent"].append(sent.text)
                table_dict["token_id"].append(str(match_no))

                for key in keys:
                    token = match[key]
                    # Unfilled roles are stored as "" by extract_matches.
                    table_dict[key].append(token.text if str(token) != "" else "")

    return pd.DataFrame(table_dict)
            

In [748]:
# Show the hand-written example documents.
examples

['The bird flew over the roof.',
 'The cow ate the grass. The goat watched the cow.',
 'The cow ate the grass while the goat watched the cow.',
 'The goat watched the cow which was eating grass.',
 "The goat attempted eating the cow's grass.",
 'The cow ate grass, the cow ate butter.',
 'The grass was eaten by the cow and the goat.']

In [745]:
# Wrap the single sentence in a list so that it is handled as one document
# (the pipeline expects an iterable of texts).
get_subject_object_verb_table([examples[-1]], nlp, matcher)

Unnamed: 0,doc_id,sent_id,sent,token_id,subj,verb,obj


In [723]:
# Run the extraction on the example sentences imported from spaCy.
get_subject_object_verb_table(sentences, nlp, matcher)

Unnamed: 0,doc_id,sent_id,sent,token_id,subj,verb,obj
0,0,0,Apple is looking at buying U.K. startup for $1...,0,Apple,looking,U.K.
1,1,0,Autonomous cars shift insurance liability towa...,0,cars,shift,liability
2,2,0,San Francisco considers banning sidewalk deliv...,0,Francisco,considers,robots
3,3,0,London is a big city in the United Kingdom.,0,London,is,
4,4,0,Where are you?,0,you,are,
5,6,0,What is the capital of the United States?,0,capital,is,
6,7,0,When was Barack Obama born?,0,Obama,born,


In [627]:
# Visualise the dependency parse of the third example document.
displacy.render(nlp(examples[2]), style='dep')

In [721]:
def check_subject_object_verb_table(docs, nlp, matcher):
    """Regression check for ``get_subject_object_verb_table``.

    Runs the extraction on ``docs`` and compares the resulting table with a
    stored reference.

    Parameters
    ----------
    docs, nlp, matcher
        Passed through unchanged to ``get_subject_object_verb_table``.

    Raises
    ------
    AssertionError
        If the extracted table differs from the reference.
    """
    columns = ("doc_id", "sent_id", "sent", "token_id", "subj", "verb", "obj")

    # Reference table, one tuple per expected row, in `columns` order.
    expected_rows = [
        ("0", "0", "The bird flew over the roof.", "0", "bird", "flew", ""),
        ("1", "0", "The cow ate the grass.", "0", "cow", "ate", "grass"),
        ("1", "1", "The goat watched the cow.", "0", "goat", "watched", "cow"),
        ("2", "0", "The cow ate the grass while the goat watched the cow.", "0", "cow", "ate", "grass"),
        ("2", "0", "The cow ate the grass while the goat watched the cow.", "1", "goat", "watched", "cow"),
        ("3", "0", "The goat watched the cow which was eating grass.", "0", "goat", "watched", "cow"),
        ("3", "0", "The goat watched the cow which was eating grass.", "1", "which", "eating", "grass"),
        ("4", "0", "The goat attempted eating the cow's grass.", "0", "goat", "attempted", "grass"),
        ("5", "0", "The cow ate grass, the cow ate butter.", "0", "cow", "ate", "grass"),
        ("5", "0", "The cow ate grass, the cow ate butter.", "1", "cow", "ate", "butter"),
    ]

    # Pivot the rows into the {column: {row_index: value}} layout that
    # DataFrame.to_dict() produces.
    test_ref = {
        column: {row_no: row[col_no] for row_no, row in enumerate(expected_rows)}
        for col_no, column in enumerate(columns)
    }

    assert get_subject_object_verb_table(docs, nlp, matcher).to_dict() == test_ref

In [722]:
# Regression check: raises AssertionError if the table extracted from
# `examples` differs from the stored reference.
check_subject_object_verb_table(examples, nlp, matcher)