In [2]:
import pandas as pd
import numpy as np
import csv
import json
from allennlp_models.common.ontonotes import Ontonotes

In [3]:
onto = Ontonotes()
ds = onto.dataset_iterator('../code_base/ontonotes_data/train/data/english/annotations')
all_sentences = []
all_sentences_entities = []
all_sentences_pos = []
for x in ds:
    sentences = x.words
    pos_tags = x.pos_tags
    all_sentences.append(sentences)
    all_sentences_pos.append(pos_tags)

In [4]:
len(all_sentences)

115812

In [5]:
# Extract sentences that contain gender pronouns
gendered_sentences = []
pronoun_list = ["he" "him", "his", "her", "she"]
for idx, sentence in enumerate(all_sentences):
    for word in sentence:
        if word in pronoun_list:
            # gendered pronoun present
            gendered_sentences.append((idx, sentence))
            break

In [6]:
len(gendered_sentences)

8289

In [9]:
# Extract the POS tags for these gendered sentences
gendered_sentences_pos = []
for idx, sent in gendered_sentences:
    gendered_sentences_pos.append(all_sentences_pos[idx])

In [10]:
# load male to female mapping for gender-specific words
word_data = None
with open('../code_base/cda_default_pairs.json', 'r') as f:
    word_data = json.load(f)

In [11]:
# convert to a dict for easier mapping
word_mapping_male_2_female = {}
for mapping in word_data:
    word_mapping_male_2_female[mapping[0]] = mapping[1]
word_mapping_female_2_male = {value: key for key, value in word_mapping_male_2_female.items()}

In [32]:
word_mapping_male_2_female

{'gods': 'goddesses',
 'manager': 'manageress',
 'barons': 'baronesses',
 'nephew': 'niece',
 'prince': 'princess',
 'boars': 'sows',
 'baron': 'baroness',
 'stepfathers': 'stepmothers',
 'wizard': 'witch',
 'father': 'mother',
 'stepsons': 'stepdaughters',
 'sons-in-law': 'daughters-in-law',
 'dukes': 'duchesses',
 'boyfriend': 'girlfriend',
 'fiances': 'fiancees',
 'dad': 'mom',
 'shepherd': 'shepherdess',
 'uncles': 'aunts',
 'beau': 'belle',
 'males': 'females',
 'hunter': 'huntress',
 'beaus': 'belles',
 'grandfathers': 'grandmothers',
 'lads': 'lasses',
 'daddies': 'mummies',
 'step-son': 'step-daughter',
 'masters': 'mistresses',
 'policeman': 'policewoman',
 'nephews': 'nieces',
 'brother': 'sister',
 'grandfather': 'grandmother',
 'priest': 'priestess',
 'hosts': 'hostesses',
 'landlord': 'landlady',
 'husband': 'wife',
 'poet': 'poetess',
 'landlords': 'landladies',
 'fathers': 'mothers',
 'masseur': 'masseuse',
 'monks': 'nuns',
 'usher': 'usherette',
 'hero': 'heroine',
 's

In [14]:
def swap_gendered_words(sentence, pos_tags):
    flipped_sentence = []
    for idx, word in enumerate(sentence):
        pos_tag = pos_tags[idx]
        if word == "him":
            flipped_sentence.append("her")
        elif word == "his" and pos_tag == "NNS":
            flipped_sentence.append("hers")
        elif word == "his" and (pos_tag == "PRP" or pos_tag == "PRP$"):
            flipped_sentence.append("her")
        elif word == "her" and pos_tag == "PRP$":
            flipped_sentence.append("his")
        elif word == "her" and pos_tag == "PRP":
            flipped_sentence.append("him")
        elif word == "hers":
            flipped_sentence.append("his")
        elif word in word_mapping_male_2_female.keys():
            flipped_sentence.append(word_mapping_male_2_female[word])
        elif word in word_mapping_female_2_male.keys():
            flipped_sentence.append(word_mapping_female_2_male[word])
        elif pos_tag == "NNP":
            # replace with a mask
            flipped_sentence.append("NAME")
        else:
            flipped_sentence.append(word)
    
    return flipped_sentence

In [18]:
def lower_case_sentences(sentence):
    sentence = [word.lower() for word in sentence]
    return sentence

def swap_sentences():
    swapped_sentences = []
    for idx, sentence in enumerate(gendered_sentences):
        sentence = lower_case_sentences(sentence[1])
        swapped_sent = swap_gendered_words(sentence, gendered_sentences_pos[idx])
        swapped_sentences.append(swapped_sent)
    return swapped_sentences

In [19]:
swapped_sentences = swap_sentences()

In [20]:
len(swapped_sentences)

8289

In [24]:
# check swapping for first 5 sentences
for idx in range(5):
    print("Original sentence: ", gendered_sentences[idx][1])
    print("Swapped sentence: ", swapped_sentences[idx])
    print("\n")

Original sentence:  ['Lee', 'relates', 'that', 'as', 'a', 'youth', 'he', 'studied', 'because', 'he', 'was', 'forced', 'to', ',', 'but', 'now', 'his', 'motivations', 'for', 'learning', 'are', 'far', 'greater', '-', 'so', 'great', 'in', 'fact', 'that', 'he', 'registered', 'for', 'three', 'classes', 'at', 'the', 'start', 'of', 'the', 'semester', ',', 'a', 'decision', 'he', 'has', 'come', 'to', 'regret', 'for', 'lack', 'of', 'time', '.']
Swapped sentence:  ['NAME', 'relates', 'that', 'as', 'a', 'youth', 'she', 'studied', 'because', 'she', 'was', 'forced', 'to', ',', 'but', 'now', 'her', 'motivations', 'for', 'learning', 'are', 'far', 'greater', '-', 'so', 'great', 'in', 'fact', 'that', 'she', 'registered', 'for', 'three', 'classes', 'at', 'the', 'start', 'of', 'the', 'semester', ',', 'a', 'decision', 'she', 'has', 'come', 'to', 'regret', 'for', 'lack', 'of', 'time', '.']


Original sentence:  ['Ever', 'since', 'graduating', 'with', 'a', 'degree', 'in', 'mechanical', 'engineering', 'from', 