# Multilingual Digital Story Grammar

In this notebook, we implement a version of Digital Story Grammar (DSG; Bastholm Andrade & Andersen; [link](https://www.tandfonline.com/doi/abs/10.1080/13645579.2020.1723205)) that works with multiple languages (in particular Dutch, German, Danish, and English). The code interfaces with the spaCy NLP library, which provides pretrained, easy-to-use pipelines for many languages.

In [271]:
""" Multilingual Digital Story Grammar """

import os
import json
import spacy
import numpy as np
import pandas as pd
import warnings
# Change this to use different language as example; see spacy.io/models
from spacy.lang.nl.examples import sentences
from spacy.matcher import DependencyMatcher
from spacy import displacy

def _major_version(version):
    """Return the leading integer component of a version string ("3.4.1" -> 3).

    Falls back to 0 when the first component is not purely numeric, so an
    unparseable version triggers the warning instead of raising.
    """
    head = version.split(".")[0]
    return int(head) if head.isdigit() else 0


# Compare numeric major versions: a plain string comparison is lexicographic,
# so e.g. "10.0" < "3" would be True and produce a spurious warning.
if _major_version(spacy.__version__) < 3:
    warnings.warn(
        "Module 'spacy' should be version >= 3.0 to run this notebook without errors")
if _major_version(pd.__version__) < 1:
    warnings.warn(
        "Module 'pandas' should be version >= 1.0 to run this notebook without errors")


In [272]:
# Name of the spaCy pipeline package passed to `load_spacy_pipeline`
# (the trailing comment shows the English alternative).
SPACY_PIPELINE = "nl_core_news_sm"  # "en_core_web_sm"
# Path of the JSON file with dependency patterns read by `create_matcher`.
# NOTE(review): the `_nl` suffix suggests the patterns are Dutch-specific;
# presumably this file must be swapped together with SPACY_PIPELINE - confirm.
DEPENDENCY_PATTERN_FILE = "multilingual_dsg_patterns_nl.json"


In [273]:
# Define a few English test sentences covering single clauses, multiple
# sentences per document, subordinate/relative clauses, and conjunctions.
# NOTE(review): these sentences are English, while SPACY_PIPELINE is set to a
# Dutch pipeline by default - presumably they are meant to be run with the
# English pipeline and a matching pattern file; confirm.

examples = [
    "The bird flew over the roof.",
    "The cow ate the grass. The goat watched the cow.",
    "The cow ate the grass while the goat watched the cow.",
    "The goat watched the cow which was eating grass.",
    "The goat attempted eating the cow's grass.",
    "The cow ate grass, the cow ate butter.",
    "The cow and the goat ate grass.",
    "The grass was eaten by the cow, the goat and the bird."
]


In [274]:
def load_spacy_pipeline(name):
    """Load a spaCy pipeline, downloading it first if it is not installed.

    Args:
        name (str): Name of the spaCy pipeline package (e.g. "nl_core_news_sm").

    Returns:
        spacy.language.Language: The loaded spaCy language pipeline.

    Raises:
        OSError: If the pipeline still cannot be loaded after the download
            attempt (raised by `spacy.load`).
    """
    import subprocess
    import sys

    try:
        return spacy.load(name)
    except OSError:
        # `spacy.load` raises OSError when the package cannot be found. Only
        # that case triggers a download; other errors (and KeyboardInterrupt)
        # propagate instead of being swallowed by a bare `except`.
        # Running the download through the current interpreter with an
        # argument list avoids shell interpolation of `name` and installs into
        # the same environment the kernel is using.
        # The return code is deliberately not checked: a failed download
        # surfaces as the OSError from the second `spacy.load` call below.
        subprocess.run(
            [sys.executable, "-m", "spacy", "download", name], check=False)
        return spacy.load(name)


In [275]:
def check_dict_in_list(dict_obj, dict_list):
    """Check if a dictionary (partially) matches a list of dictionaries.

    Values are expected to be token-like objects exposing an integer `.i`
    (position in the document). A value whose string form is "_" is treated
    as an empty slot.

    Args:
        dict_obj (dict): A dictionary object mapping keys to tokens.
        dict_list (list): A list of dictionary objects with (at least) the
            same keys as `dict_obj`.

    Returns:
        bool: True if `dict_obj` is contained in `dict_list`, or if every
            non-empty item in `dict_obj` has the same token index as the
            non-empty item under the same key in any dictionary of
            `dict_list`; otherwise False.

    Raises:
        KeyError: If a reference dictionary lacks a key of `dict_obj`.
    """
    if dict_obj in dict_list:
        return True

    # NOTE(review): each key is checked independently, so different keys may
    # be satisfied by different reference dictionaries - confirm this is the
    # intended notion of a "partial match".
    for key, token in dict_obj.items():
        if str(token) == "_":
            # Empty slot: nothing to compare for this key.
            continue

        # Empty reference slots are skipped as well: the placeholder token
        # sits at index 0 of its own one-token document and would otherwise
        # spuriously match any real token at document index 0.
        found = any(
            str(ref_dict[key]) != "_" and token.i == ref_dict[key].i
            for ref_dict in dict_list
        )
        if not found:
            return False

    return True


In [276]:
def extract_matches(doc, matches, matcher, nlp, keys):
    """Convert raw dependency-matcher output into de-duplicated dictionaries.

    Each match is turned into a dictionary that maps every entry of `keys` to
    the matched token whose pattern node has that `RIGHT_ID`. Matches that
    `check_dict_in_list` reports as already covered by an earlier match are
    dropped.

    Args:
        doc (spacy.tokens.Doc): A spacy doc (or span) object that the token ids in `matches` index into.
        matches (list): A list of (match_id, token_ids) tuples as returned by a spacy dependency matcher.
        matcher (spacy.matcher.DependencyMatcher): A spacy dependency matcher object.
        nlp (spacy.language.Language): A spacy language pipeline.
        keys (list): A list of keys to which the dependency matches are assigned.

    Returns:
        list: A list of dictionaries that each contain a match of the dependency matcher.
            Has the same keys as the `keys` argument plus "match_id". Empty keys contain a spacy token with text='_'.
    """
    matches_list = []
    if not matches:
        return matches_list

    # The placeholder is only ever read, so one token can be shared by all
    # empty slots instead of running the pipeline once per key and match.
    placeholder = nlp("_")[0]

    for match_id, token_ids in matches:
        match_dict = {key: placeholder for key in keys}

        # token_ids[k] belongs to the k-th node of the first pattern
        # registered under this match id.
        pattern = matcher.get(match_id)[1][0]
        for node, token_id in zip(pattern, token_ids):
            key = node["RIGHT_ID"]
            if key in match_dict:
                match_dict[key] = doc[token_id]

        # "match_id" is added only after the duplicate check so that it does
        # not take part in the comparison.
        if not check_dict_in_list(match_dict, matches_list):
            match_dict["match_id"] = match_id
            matches_list.append(match_dict)

    return matches_list


In [277]:
def create_matcher(nlp, pattern_file):
    """Create a spacy dependency matcher from a JSON pattern file.

    The file is expected to contain a JSON list of dependency patterns; each
    pattern is registered under its position in that list as match key.

    Args:
        nlp (spacy.language.Language): A spacy language pipeline.
        pattern_file (str): The path to the dependency pattern .json file for the matcher.

    Returns:
        spacy.matcher.DependencyMatcher: A spacy dependency matcher object.
    """
    matcher = DependencyMatcher(nlp.vocab, validate=True)

    # Read with an explicit encoding: the platform default may not be UTF-8
    # and would garble non-ASCII characters in the patterns.
    with open(pattern_file, "r", encoding="utf-8") as file:
        patterns = json.load(file)

    for i, pattern in enumerate(patterns):
        matcher.add(i, [pattern])

    return matcher


In [278]:
# Load the configured language pipeline (downloading it if necessary).
nlp = load_spacy_pipeline(SPACY_PIPELINE)

# Build the dependency matcher from the configured pattern file.
matcher = create_matcher(nlp, DEPENDENCY_PATTERN_FILE)


In [279]:
def append_children_deps(token, doc, children_deps):
    """Expand a token to a span that also covers selected children.

    Args:
        token (spacy.tokens.Token): A spacy token object.
        doc (spacy.tokens.Doc): A spacy doc object that includes the token.
        children_deps (list): Dependency tags of the children to include.

    Returns:
        spacy.tokens.Span or spacy.tokens.Token: The contiguous slice of `doc`
            from the left-most to the right-most of the token and its children
            with one of the given dependency tags. An empty token (text '_')
            is returned unchanged.
    """
    # Empty placeholder tokens are passed through untouched.
    if str(token) == "_":
        return token

    positions = [token.i]
    for child in token.children:
        if child.dep_ in children_deps:
            positions.append(child.i)

    # Slice from the first to the last collected position (inclusive).
    first, last = min(positions), max(positions)
    return doc[first:last + 1]


In [280]:
def get_subject_object_verb_table(docs, nlp, matcher, keys=["verb", "subj", "obj"]):
    """Construct a pandas dataframe with subjects, verbs, and objects per sentence of documents.

    Every match yields one row. For each conjunct of a matched token an
    additional row is emitted in which that token is replaced by the conjunct
    and all other columns are copied from the match's own row.

    Args:
        docs (list): A list of text strings.
        nlp (spacy.language.Language): A spacy language pipeline.
        matcher (spacy.matcher.DependencyMatcher): A spacy dependency matcher object.
        keys (list): A list of keys to which the dependency matches are assigned.
            Defaults to subjects, verbs, and objects.

    Returns:
        pandas.DataFrame: A dataframe with a row for each match of the dependency matcher and cols:
            doc_id (str): Index of the document in the document list.
            sent_id (str): Index of the sentence in the document.
            sent (str): Text of the sentence.
            match_id (str): Index of the match in the sentence.

            For each key in the `keys` argument:
            key (str): Text of the matched token, extended by its "compound"
                and "flat" children; '_' if the key was not matched. Conjunct
                rows contain the plain text of the conjunct token.
    """
    # Work on a de-duplicated copy so the (shared default) list is never touched.
    keys = list(dict.fromkeys(keys))

    id_columns = ["doc_id", "sent_id", "sent", "match_id"]
    # Keep the established column order (subj, verb, obj) for the default
    # keys; any other keys follow in the order given by the caller.
    canonical = ["subj", "verb", "obj"]
    key_columns = [key for key in canonical if key in keys] + \
        [key for key in keys if key not in canonical]

    records = []

    for i, doc in enumerate(nlp.pipe(docs)):  # i: doc index
        for j, sent in enumerate(doc.sents):  # j: sent index
            matches_list = extract_matches(
                sent, matcher(sent), matcher, nlp, keys=keys)

            for l, match in enumerate(matches_list):  # l: match index
                # Build the complete row first. Appending conjunct rows while
                # the main row is still being filled would shift values
                # between the main row and its conjunct rows.
                row = {
                    "doc_id": str(i),
                    "sent_id": str(j),
                    "sent": sent.text,
                    "match_id": str(l),
                }
                for key in keys:
                    row[key] = append_children_deps(
                        match[key], doc, ["compound", "flat"]).text
                records.append(row)

                # Check for conjuncts, and add a table row for each.
                for key in keys:
                    for conj in match[key].conjuncts:
                        records.append({**row, key: conj.text})

    return pd.DataFrame(records, columns=id_columns + key_columns)


In [281]:
# Display the English test sentences.
examples


['The bird flew over the roof.',
 'The cow ate the grass. The goat watched the cow.',
 'The cow ate the grass while the goat watched the cow.',
 'The goat watched the cow which was eating grass.',
 "The goat attempted eating the cow's grass.",
 'The cow ate grass, the cow ate butter.',
 'The cow and the goat ate grass.',
 'The grass was eaten by the cow, the goat and the bird.']

In [282]:
# Extract the subject/verb/object table for the English test sentences.
# NOTE(review): the recorded output is empty - presumably because the Dutch
# pipeline and Dutch patterns are configured; confirm with the English setup.
get_subject_object_verb_table(examples, nlp, matcher)


Unnamed: 0,doc_id,sent_id,sent,match_id,subj,verb,obj


In [283]:
# Display the example sentences imported from spacy.lang.nl.examples.
sentences


['Apple overweegt om voor 1 miljard een U.K. startup te kopen',
 "Autonome auto's verschuiven de verzekeringverantwoordelijkheid naar producenten",
 'San Francisco overweegt robots op voetpaden te verbieden',
 'Londen is een grote stad in het Verenigd Koninkrijk']

In [284]:
# Extract the subject/verb/object table for the Dutch example sentences.
get_subject_object_verb_table(sentences, nlp, matcher)


Unnamed: 0,doc_id,sent_id,sent,match_id,subj,verb,obj
0,0,0,Apple overweegt om voor 1 miljard een U.K. sta...,0,Apple,overweegt,U.K.
1,0,0,Apple overweegt om voor 1 miljard een U.K. sta...,1,Apple,overweegt,miljard
2,1,0,Autonome auto's verschuiven de verzekeringvera...,0,Autonome,verschuiven,verzekeringverantwoordelijkheid
3,1,0,Autonome auto's verschuiven de verzekeringvera...,1,Autonome,verschuiven,producenten
4,2,0,San Francisco overweegt robots op voetpaden te...,0,San,overweegt,voetpaden


In [288]:
# Render the dependency parse of the third example sentence for inspection.
displacy.render(nlp(sentences[2]), style='dep')


In [286]:
def check_subject_object_verb_table(docs, nlp, matcher):
    """Compare the extracted subject/verb/object table with a stored reference.

    The reference holds the expected rows for the first six English test
    sentences (documents 0-5). Rows of documents without reference data are
    ignored, so further documents may be passed in `docs`.

    Args:
        docs (list): A list of text strings; the reference assumes the English
            test sentences in their original order.
        nlp (spacy.language.Language): A spacy language pipeline.
        matcher (spacy.matcher.DependencyMatcher): A spacy dependency matcher object.

    Raises:
        AssertionError: If the extracted rows differ from the reference; the
            message describes the first difference found.
    """
    columns = ["doc_id", "sent_id", "sent", "match_id", "subj", "verb", "obj"]
    # The index column is named `match_id` (not `token_id`) to match the table
    # under test, and an unmatched slot is '_' - the text of the placeholder
    # token - rather than an empty string.
    rows = [
        ("0", "0", "The bird flew over the roof.",
         "0", "bird", "flew", "_"),
        ("1", "0", "The cow ate the grass.",
         "0", "cow", "ate", "grass"),
        ("1", "1", "The goat watched the cow.",
         "0", "goat", "watched", "cow"),
        ("2", "0", "The cow ate the grass while the goat watched the cow.",
         "0", "cow", "ate", "grass"),
        ("2", "0", "The cow ate the grass while the goat watched the cow.",
         "1", "goat", "watched", "cow"),
        ("3", "0", "The goat watched the cow which was eating grass.",
         "0", "goat", "watched", "cow"),
        ("3", "0", "The goat watched the cow which was eating grass.",
         "1", "which", "eating", "grass"),
        ("4", "0", "The goat attempted eating the cow's grass.",
         "0", "goat", "attempted", "grass"),
        ("5", "0", "The cow ate grass, the cow ate butter.",
         "0", "cow", "ate", "grass"),
        ("5", "0", "The cow ate grass, the cow ate butter.",
         "1", "cow", "ate", "butter"),
    ]
    test_ref = pd.DataFrame(rows, columns=columns)

    test_table = get_subject_object_verb_table(docs, nlp, matcher)

    # Only documents covered by the reference can be checked.
    covered = test_table["doc_id"].isin(test_ref["doc_id"])
    test_table = test_table.loc[covered, columns].reset_index(drop=True)

    # Raises AssertionError with a readable diff instead of an empty message.
    pd.testing.assert_frame_equal(test_table, test_ref)


In [287]:
# Compare the table for the English test sentences with the stored reference;
# raises AssertionError on any mismatch.
check_subject_object_verb_table(examples, nlp, matcher)


AssertionError: 