<a href="https://colab.research.google.com/github/paslaski/009-IntroDH/blob/main/dh_gendered_lang_parser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!python -m spacy download en_core_web_sm

import spacy
from nltk import Tree
from collections import Counter

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [None]:
# Load the pipeline by its full package name -- the same name the download
# cell installs. The bare 'en' shortcut is not installed by that command and
# is not resolvable in spaCy v3+.
en_nlp = spacy.load('en_core_web_sm')

# Parse a sample sentence; `doc` is reused below to build the NLTK trees.
doc = en_nlp("The quick brown fox jumps over the lazy dog.")

def to_nltk_tree(node):
    """Recursively convert a parsed token into an NLTK ``Tree``.

    accepts:
        node: token exposing ``n_lefts``, ``n_rights``, ``orth_`` and ``children``
    returns:
        ``node.orth_`` itself when the token has no left/right children,
        otherwise a ``Tree`` labelled with ``node.orth_`` whose children are
        the converted child tokens.
    """
    child_count = node.n_lefts + node.n_rights
    if child_count <= 0:
        # Leaf: represented by its bare text rather than a one-node Tree.
        return node.orth_

    converted_children = [to_nltk_tree(child) for child in node.children]
    return Tree(node.orth_, converted_children)


# One converted tree per sentence in `doc`, each built from the sentence root.
my_tree = list(map(to_nltk_tree, (sentence_span.root for sentence_span in doc.sents)))

In [None]:
# Print each tree as ASCII art. A plain loop is used because pretty_print()
# is called only for its printed output; collecting its return values in a
# list comprehension just builds a list that is thrown away.
for tree in my_tree:
    tree.pretty_print()

# Last expression of the cell: show the raw list of trees as well.
my_tree

                jumps                  
  ________________|____________         
 |    |     |     |    |      over     
 |    |     |     |    |       |        
 |    |     |     |    |      dog      
 |    |     |     |    |    ___|____    
The quick brown  fox   .  the      lazy



[Tree('jumps', ['The', 'quick', 'brown', 'fox', Tree('over', [Tree('dog', ['the', 'lazy'])]), '.'])]

In [None]:
from __future__ import unicode_literals, print_function
from spacy.lang.en import English # updated

# from spacy.pipeline.tagger import DEFAULT_TAGGER_MODEL
# from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL

# Example text made of several short sentences.
# (An earlier variant starting 'The fast woman jumped over the hoop.' was
# assigned and then immediately overwritten, so it was never parsed; only the
# text below is used.)
raw_text = 'The fast woman is a doctor. They saw me last Saturday. Hello, world. Here are two sentences.'

# Pretrained pipeline; the printout below reads the tag_ and dep_ it fills in.
nlp = spacy.load("en_core_web_sm")

doc = nlp(raw_text)

# One line per token: text, tag, text of the head token, dependency label.
for token in doc:
    print(token.text, token.tag_, token.head.text, token.dep_)

# doc[2] is 'woman' in the text above.
she_token = doc[2]
ancestors = list(she_token.ancestors)
print(ancestors)  # for 'woman' this prints [is]

# Displayed value: the subtree of the first ancestor ('is'), which covers the
# whole first sentence -- [The, fast, woman, is, a, doctor, .]
# (The subtree of 'woman' itself would only be [The, fast, woman].)
list(ancestors[0].subtree)

The DT woman det
fast JJ woman amod
woman NN is nsubj
is VBZ is ROOT
a DT doctor det
doctor NN is attr
. . is punct
They PRP saw nsubj
saw VBD saw ROOT
me PRP saw dobj
last JJ Saturday amod
Saturday NNP saw npadvmod
. . saw punct
Hello UH Hello ROOT
, , Hello punct
world NN Hello npadvmod
. . Hello punct
Here RB are advmod
are VBP are ROOT
two CD sentences nummod
sentences NNS are nsubj
. . are punct
[is]


[The, fast, woman, is, a, doctor, .]

In [21]:
# Lowercase lookup sets used to classify a token as feminine, gender-neutral
# or masculine. Tokens are lowercased before lookup, so every entry here must
# be lowercase. The three sets are kept disjoint.
fem_words = {
    "her", "hers", "herself", "she",
    "woman", "women", "mother", "mothers", "mom", "moms",
    "sister", "sisters", "wife", "wives", "aunt", "aunts",
    "womanly", "goddess", "goddesses", "girl", "girls",
    "queen", "queens",
}

# "them" / "themselves" added so the object and reflexive pronouns are
# covered, matching "her" / "herself" in fem_words.
gn_words = {
    "their", "zir", "theirs", "zirs", "themself", "theirself",
    "they", "them", "themselves", "ze",
    "people", "persons", "person", "parent", "parents", "spouse",
}

# The original list contained "he" twice and no object pronoun; the duplicate
# is replaced by "him", and the standard reflexive "himself" is added next to
# the dialectal "hisself".
masc_words = {
    "he", "him", "his", "himself", "hisself",
    "man", "men", "father", "fathers", "dad", "dads",
    "brother", "brothers", "husband", "husbands", "uncle", "uncles",
    "manly", "god", "gods", "boy", "boys", "boi", "bruh",
    "king", "kings",
}

In [10]:
def extract_gendered_deps(sentence, nlp, fem_words, gn_words, masc_words):
    """Tally the words related to each gendered token found in `sentence`.

    accepts:
        sentence: text handed to `nlp` unchanged
        nlp: callable that parses `sentence` into an iterable of tokens
            (a spaCy pipeline in this notebook)
        fem_words, gn_words, masc_words: collections of lowercase words that
            mark a token as feminine, gender-neutral or masculine
    returns:
        tuple of Counter objects (fem, gn, masc); each one is filled by
        extract_ancestors for every token whose lowercased text appears in the
        corresponding word collection
    """
    fem, gn, masc = Counter(), Counter(), Counter()

    # Checked in this order; a token is credited to the first collection that
    # contains it and to no other.
    lookup_order = (
        (fem_words, fem),
        (gn_words, gn),
        (masc_words, masc),
    )

    for token in nlp(sentence):
        lowered = token.text.lower()
        for vocabulary, tally in lookup_order:
            if lowered in vocabulary:
                extract_ancestors(token, tally)
                break

    return (fem, gn, masc)

In [13]:
def extract_ancestors(token, ancestor_ctr):
    """Count the adjectives, nouns, verbs and auxiliaries related to `token`.

    accepts:
        token: parsed token exposing `.ancestors` and `.subtree`
        ancestor_ctr: Counter() object mapping lowercased word -> count
    returns:
        the same `ancestor_ctr`, after adding 1 for every distinct related
        word found (each word counts at most once per call)

    Related words are drawn from the token's ancestors, the subtree of every
    ancestor, and the token's own subtree (which includes the token itself).
    """
    ## TODO: double check we get all possible associated words (expand ancestors prolly ok?)

    wanted_pos = ('ADJ', 'NOUN', 'VERB', 'AUX')

    def _related_tokens():
        # Each ancestor, then everything beneath that ancestor...
        for ancestor in token.ancestors:
            yield ancestor
            yield from ancestor.subtree
        # ...and finally everything beneath the token itself.
        yield from token.subtree

    # A set, so a word reached by several routes is still counted only once.
    distinct_words = {
        candidate.text.lower()
        for candidate in _related_tokens()
        if candidate.pos_ in wanted_pos
    }

    for related_word in distinct_words:
        ancestor_ctr[related_word] += 1

    return ancestor_ctr

In [23]:
# Pipeline used for the dependency parse.
nlp = spacy.load("en_core_web_sm")

# Mixed sample: sentences with feminine, masculine and neutral cue words,
# plus two sentences containing none.
raw_text = 'The fast woman jumped over the hoop. You go girl! Hello, world. Here are two sentences. He swang the bat. Are they on your team?'

# Run the extraction once, keep the individual counters under their usual
# names, and display all three as the cell output.
gendered_counts = extract_gendered_deps(raw_text, nlp, fem_words, gn_words, masc_words)
fem, gn, masc = gendered_counts
gendered_counts

(Counter({'fast': 1, 'girl': 1, 'go': 1, 'hoop': 1, 'jumped': 1, 'woman': 1}),
 Counter({'are': 1, 'team': 1}),
 Counter({'bat': 1, 'swang': 1}))

In [None]:
def append_list_for_me(sentence, input_list):
    """Append the space-separated pieces of `sentence` to `input_list`.

    The list is modified in place and the same list object is returned.
    Splitting is on single spaces only, so consecutive spaces produce empty
    strings in the result.
    """
    input_list.extend(sentence.split(' '))
    return input_list

sentence = 'the quick brown fox jumped over the lazy dog'
input_list = []

# Feed the same sentence through three times: because the helper mutates the
# list it is given, the words accumulate across calls.
for _round in range(3):
    append_list_for_me(sentence, input_list)

input_list

['the',
 'quick',
 'brown',
 'fox',
 'jumped',
 'over',
 'the',
 'lazy',
 'dog',
 'the',
 'quick',
 'brown',
 'fox',
 'jumped',
 'over',
 'the',
 'lazy',
 'dog',
 'the',
 'quick',
 'brown',
 'fox',
 'jumped',
 'over',
 'the',
 'lazy',
 'dog']