# Exploration of spaCy and NLP

* Using previous sentences for coreference and time/place 
* Time ... during, before/after, on anniversary of, after EVENT, ...
* Adverbial clause modifying noun
* Causal ... when, because, because of, caused, caused by, as a result, resulted, resulted in, affected, since, due to, had effect of, therefore, so, ...

In [1]:
# Imports and set up
import copy
import spacy
from spacy.matcher import DependencyMatcher
from spacy.tokens import Doc, Span, Token

# Load spaCy's 'en_core_web_trf' English pipeline once; the module-level `nlp` object is
# what the functions below call whenever they need to (re-)parse a piece of text
nlp = spacy.load('en_core_web_trf')
# Register the 'sentencizer' component with default placement
# NOTE(review): the loaded pipeline presumably already sets sentence boundaries via its
# parser - confirm that adding the sentencizer on top of it is intended
nlp.add_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x13dcf9dc0>

In [18]:
# Words that introduce a 'causal' clause, where the main clause is the effect
# (compared against lower-cased token text, so keep every entry lower case)
cause_connectors = ['when', 'because', 'since', 'as']
# Prepositions that introduce a cause in the form of a noun phrase
# ('[of]' marks an optional trailing word; these phrases are not matched by any code yet)
cause_prepositions = ['because of', 'due to', 'as a result [of]', 'as a consequence [of]']
# Words that introduce a 'causal' effect in the main clause, where other clause is the cause
# Entries are compared against individual token texts, which never carry trailing
# whitespace, so they must not contain any padding
effect_connectors = ['so', 'therefore', 'consequently']
# If - then is only cause-effect when the tenses of the main and other clause are the same
# TODO: not used by the splitting logic yet
cause_effect_pairs = [('if', 'then')]

# Sample narrative used to exercise the clause splitting; adjacent literals are
# concatenated by the parser, so each piece keeps its own trailing space
text = (
    "In 1940, the Soviet Union occupied Bukovina. A year later, when Romania joined Nazi Germany "
    "in the war against the Soviet Union, the Soviets were driven from Stanesti. Mobs then carried out "
    "bloody attacks on the town's Jews. During the violence, I and my family fled to Czernowitz "
    "with the aid of the local police chief. In fall of 1941, my family were forced to settle in the "
    "Czernowitz ghetto, where living conditions were poor and they were subject to deportation "
    "to Transnistria. In 1943, I and Beatrice escaped from the ghetto using false papers that their "
    "father had obtained. After escaping to the Soviet Union, I and Beatrice returned to Czechoslovakia "
    "after World War II, where they were eventually reunited with their parents. "
    "My family was together, so I was happy."
)
# Parse the whole narrative once; its sentences are split into clauses further down
nlp_text = nlp(text)

In [27]:
# From nlp.py 
def check_subject_in_clause(cls_sentence: Span) -> (list, list):
    """Find clausal verbs hanging off the sentence root and the words that introduce them.

    Args:
        cls_sentence: Parsed sentence; only ``root.children`` and each child's
            ``dep_`` / ``children`` are inspected.

    Returns:
        A ``(clausal_verbs, connectors)`` pair of token lists. ``clausal_verbs`` holds the
        root's ``advcl`` children followed by its ``ccomp`` children. ``connectors`` holds
        the ``advmod`` / ``mark`` children of the first ``advcl`` verb, followed by those of
        the first ``ccomp`` verb (or, when that verb has none, those attached to the root).
        Both lists are empty when the root has no such clause.
    """
    connector_deps = ('advmod', 'mark')
    # Token.children is produced afresh on every access, so materialize it once
    root_children = list(cls_sentence.root.children)

    # First check adverbial clauses, which will have the advmod in the advcl
    # Compare with == : `dep_ in ('advcl')` is a *substring* test against the string 'advcl'
    clausal_verbs = [child for child in root_children if child.dep_ == 'advcl']
    connectors = []
    if clausal_verbs:
        connectors = [conn for conn in clausal_verbs[0].children if conn.dep_ in connector_deps]

    # Also check for clausal complement, which may have the advmod in the original clause
    # (exact match matters here: 'cc' is a substring of 'ccomp', so a substring test would
    # report coordinating conjunctions as clausal verbs)
    new_verbs = [child for child in root_children if child.dep_ == 'ccomp']
    if new_verbs:
        clausal_verbs.extend(new_verbs)
        new_connectors = [conn for conn in new_verbs[0].children if conn.dep_ in connector_deps]
        if not new_connectors:
            # Fall back to modifiers/markers attached to the root itself
            new_connectors = [conn for conn in root_children if conn.dep_ in connector_deps]
        connectors.extend(new_connectors)
    return clausal_verbs, connectors
    
    
    
def remove_startswith(chunk: str, connector: str) -> str:
    """Strip a leading ``connector`` word (plus the space after it) from ``chunk``.

    The connector is only removed when it is followed by a space, optionally preceded by
    a single leading space; otherwise ``chunk`` is returned unchanged.
    """
    # The un-padded form is tried first, then the form with one leading space
    for prefix in (f'{connector} ', f' {connector} '):
        if chunk.startswith(prefix):
            return chunk[len(prefix):]
    return chunk


def get_chunks(verb: Token, connector: list, chunk_sentence: Span, is_conj: bool) -> list:
    """Split a sentence in two around the clause governed by ``verb``.

    Args:
        verb: Verb of the 'other' (conjoined or subordinate) clause.
        connector: Tokens that link the two clauses (e.g. 'and', 'when'); may be empty.
        chunk_sentence: The sentence that contains ``verb``.
        is_conj: True when splitting on a coordinating conjunction, False when splitting
            on an adverbial clause / clausal complement.

    Returns:
        A list of strings. When ``verb`` has no subject (or expletive 'there') the sentence
        is not split and the list is ``[chunk_sentence.text]``. Otherwise it holds two
        period-terminated strings: the words in ``verb``'s subtree (the 'seen' clause) and
        the remaining words (the 'unseen' clause). The seen clause comes first only when it
        contains one of the ``cause_connectors``, so that a cause precedes its effect.
    """
    # Is there a subject of the other clause's verb? That is required to split the sentence
    # An 'expl' is the word, 'there'
    has_subject = any('subj' in child.dep_ or child.dep_ == 'expl' for child in verb.children)
    if not has_subject:
        return [chunk_sentence.text]

    def _may_strip(tokens: list) -> bool:
        # A connector is kept when the clause only has an auxiliary verb, because the
        # advmod is then needed
        # For example, 'my daughter is home' (is = auxiliary verb, home = adverbial modifier)
        has_aux = any(ww.pos_ == 'AUX' for ww in tokens)
        has_verb = any(ww.pos_ == 'VERB' for ww in tokens)
        return not has_aux or has_verb

    # Get the tokens in sentence related to the 'other' verb's subtree
    seen = [ww for ww in verb.subtree if ww.pos_ != 'PUNCT']
    # Both token lists come from the same Doc, so Token.i identifies a token uniquely
    seen_positions = {ww.i for ww in seen}
    unseen = [ww for ww in chunk_sentence if ww.i not in seen_positions and ww.pos_ != 'PUNCT']
    seen_chunk = ' '.join(ww.text for ww in seen).strip() + '.'
    unseen_chunk = ' '.join(ww.text for ww in unseen).strip() + '.'

    # These do not depend on the connector, so evaluate them once
    strip_from_seen = _may_strip(seen)
    strip_from_unseen = _may_strip(unseen)

    seen_first = False
    for conn in connector:
        word = conn.text
        if is_conj:
            # Remove the connector words (to prevent cycles and sentences such as
            # 'Mary biked to the market and')
            # NOTE(review): this removes every occurrence of the word, not only the one
            # joining the two clauses
            unseen_chunk = unseen_chunk.replace(f' {word} ', ' ').replace(f' {word}.', '.')
            continue
        # Connector in the seen words and should seen words be first because they are a
        # cause related to an effect? Lower-case the word so a sentence-initial 'When' /
        # 'Because' is recognized, matching how split_clauses detects cause connectors
        if not seen_first and word in seen_chunk and word.lower() in cause_connectors:
            seen_first = True
        # Remove the connector words (to prevent cycles)
        if strip_from_seen:
            seen_chunk = remove_startswith(seen_chunk, word)
            seen_chunk = seen_chunk.replace(f' {word} ', ' ').replace(f' {word}.', '.')
        if strip_from_unseen:
            unseen_chunk = remove_startswith(unseen_chunk, word)

    # Store the seen and unseen clauses as separate sentences
    if seen_first:
        return [seen_chunk, unseen_chunk]
    return [unseen_chunk, seen_chunk]
    

def split_by_conjunctions(conj_sentence: Span) -> (list, str):
    """Split a sentence on the first verb conjoined to its root.

    Returns:
        A pair ``(sentences, connector)``: the resulting sentence strings (just the
        original text when the root has no 'conj' child) and the text of the first 'cc'
        child of the root, or '' when there is none.
    """
    root_children = list(conj_sentence.root.children)
    coordinators = [tok for tok in root_children if tok.dep_ == 'cc']
    conjoined = [tok for tok in root_children if tok.dep_ == 'conj']
    splitting_word = coordinators[0].text if coordinators else ''

    # Nothing is conjoined to the root, so the sentence is returned whole
    if not conjoined:
        return [conj_sentence.text], splitting_word

    # Only the first conjoined verb is split off here
    pieces = [str(piece) for piece in get_chunks(conjoined[0], coordinators, conj_sentence, True)]
    return pieces, splitting_word
    
    
def split_clauses(sentence: Span, ce_dict: dict) -> list:
    """Repeatedly split a sentence into simpler clause-level sentences.

    Each pass splits on '; ', then on conjunctions attached to the root verb, then on
    adverbial clauses / clausal complements. Passes repeat until the number of sentences
    stops changing.

    Args:
        sentence: The parsed sentence to decompose.
        ce_dict: Updated in place. When a split is introduced by a cause or effect
            connector, the cause text is stored as key and the effect text as value.

    Returns:
        A list of sentence strings (not Spans). For a cause/effect split only the effect
        clause is kept in the list; the cause clause is recorded in ``ce_dict`` only.
    """
    terminators = ('.', '!', '?')
    orig_sents = [sentence.text]
    # Iterate until the sentences cannot be further decomposed/split
    while True:
        new_sents = []
        # Go through each sentence in the array
        for orig_sent in orig_sents:
            # Semi-colons automatically split sentences (one semi-colon per pass)
            if '; ' in orig_sent:
                # Add to the results of this pass instead of replacing them, so sentences
                # handled earlier in the pass are kept and later ones are still visited
                for part in orig_sent.split('; ', 1):
                    # Avoid doubling the final period of the second half
                    new_sents.append(part if part.endswith(terminators) else f'{part}.')
                continue
            # First split by words such as 'and', 'but', 'or', ... related to the 'root' verb
            # TODO: Deal with 'or', 'nor' as the splitting word below
            # nlp(...) 're-tokenizes' in order to create a new Document
            # This allows .sents to get individual sentences, since sentences have 'root' verbs
            for chunk_sent in nlp(orig_sent).sents:
                intermed_sents, _ = split_by_conjunctions(chunk_sent)
                # Now check resulting sentences to further split by adverbial clauses and
                # clausal complements; only split if there is a subject for a related verb
                # Example: 'When I went to the store, I met George.'
                # ('when ...' is an adverbial modifier in the clause)
                for intermed_sent in intermed_sents:
                    for chunk_sent2 in nlp(intermed_sent).sents:
                        clausal_verbs, connectors = check_subject_in_clause(chunk_sent2)
                        if not clausal_verbs:
                            # Keep only this sub-sentence; appending the whole
                            # intermed_sent here would duplicate it once per sub-sentence
                            new_sents.append(chunk_sent2.text)
                            continue
                        chunks = get_chunks(clausal_verbs[0], connectors, chunk_sent2, False)
                        if len(chunks) < 2:
                            # The clause had no subject, so nothing was split and there
                            # is no cause/effect pair to record
                            new_sents.extend(chunks)
                            continue
                        # Is this a cause-effect connector?
                        found_cause = any(conn.text.lower() in cause_connectors
                                          for conn in connectors)
                        found_effect = any(ww.text.lower() in effect_connectors
                                           for ww in chunk_sent2)
                        if found_cause:
                            # Save cause-effect
                            ce_dict[chunks[0]] = chunks[1]
                            new_sents.append(chunks[1])
                        elif found_effect:
                            # Save cause-effect
                            ce_dict[chunks[1]] = chunks[0]
                            new_sents.append(chunks[0])
                        else:
                            new_sents.extend(chunks)

        # Check if the processing has resulted in new sentence clauses
        if len(orig_sents) == len(new_sents):
            # If not, break out of the while loop
            break
        # New clauses, so keep processing (new_sents is rebuilt from scratch on each pass)
        orig_sents = new_sents
    return new_sents

In [28]:
# Parsed clause-level sentences, with entities and noun chunks merged into single tokens
nlp_sentences = []
# Filled in by split_clauses: cause text -> effect text
cause_effect_dict = dict()
for sentence in nlp_text.sents:
    # Sentence splitting
    for sent in split_clauses(sentence, cause_effect_dict):
        sent_nlp = nlp(sent)
        # Determine the spans of individual nouns and noun chunks
        spans = list(sent_nlp.ents) + list(sent_nlp.noun_chunks)
        # Entities and noun chunks can overlap; filter_spans removes the overlaps so that
        # every token is merged at most once
        spans = spacy.util.filter_spans(spans)
        # Reset the sentence parse to maintain chunks
        with sent_nlp.retokenize() as retokenizer:
            # A plain loop, since merge() is called only for its side effect
            for span in spans:
                # The merged token takes the tag and dependency of the span's root
                retokenizer.merge(span, attrs={'tag': span.root.tag,
                                               'dep': span.root.dep})
        # Store new sentence details
        nlp_sentences.append(sent_nlp)


In [29]:
# Show the clause-level Docs, a blank separator line, then the cause -> effect mapping
print(nlp_sentences, end='\n\n')
print(cause_effect_dict)

[In 1940, the Soviet Union occupied Bukovina., A year later the Soviets were driven from Stanesti., Mobs then carried out bloody attacks on the town's Jews., During the violence, I and my family fled to Czernowitz with the aid of the local police chief., In fall of 1941, my family were forced to settle in the Czernowitz ghetto, where living conditions were poor and they were subject to deportation to Transnistria., In 1943, I and Beatrice escaped from the ghetto using false papers that their father had obtained., After escaping to the Soviet Union I and Beatrice returned to Czechoslovakia after World War II., they were reunited with their parents., so I was happy.]

{'Romania joined Nazi Germany in the war against the Soviet Union.': 'A year later the Soviets were driven from Stanesti.', 'My mother was home.': 'so I was happy.'}
