In [44]:
import pandas as pd
import numpy as np
import csv
import json
import re
from allennlp_models.common.ontonotes import Ontonotes

In [3]:
onto = Ontonotes()
ds = onto.dataset_iterator('../code_base/ontonotes_data/train/data/english/annotations')
all_sentences = []
all_sentences_entities = []
all_sentences_pos = []
for x in ds:
    sentences = x.words
    pos_tags = x.pos_tags
    all_sentences.append(sentences)
    all_sentences_pos.append(pos_tags)

In [4]:
len(all_sentences)

115812

In [95]:
class PrepareDataForAugmentation():
    def __init__(self, data, data_pos_tags):
        self.data = data
        self.pos_tags = data_pos_tags
        self.gendered_sentences = []
        self.gendered_sentences_pos = []
        self.word_mapping_male_2_female = {}
        self.word_mapping_female_2_male = {}
        self.swapped_sentences = []
        self.masked_orig_sentences = []
        self.masked_swapped_sentences = []
        
    def extract_gendered_sentences(self):
        pronoun_list = ["he" "him", "his", "her", "she"]
        for idx, sentence in enumerate(self.data):
            for word in sentence:
                if word in pronoun_list:
                    # gendered pronoun present
                    self.gendered_sentences.append((idx, sentence))
                    break
    
    def extract_gendered_sent_pos_tags(self):
        for idx, sent in self.gendered_sentences:
            self.gendered_sentences_pos.append(self.pos_tags[idx])
            
    def create_word_male_female_mappings(self):
        # load male to female mapping for gender-specific words
        word_data = None
        with open('../code_base/cda_default_pairs.json', 'r') as f:
            word_data = json.load(f)
        
        # convert to a dict for easier mapping
        for mapping in word_data:
            self.word_mapping_male_2_female[mapping[0]] = mapping[1]
        self.word_mapping_female_2_male = \
            {value: key for key, value in self.word_mapping_male_2_female.items()}
        
    def swap_gendered_words(self, sentence, pos_tags):
        flipped_sentence = []
        for idx, word in enumerate(sentence):
            pos_tag = pos_tags[idx]
            if word == "him":
                flipped_sentence.append("her")
            elif word == "his" and pos_tag == "NNS":
                flipped_sentence.append("hers")
            elif word == "his" and (pos_tag == "PRP" or pos_tag == "PRP$"):
                flipped_sentence.append("her")
            elif word == "her" and pos_tag == "PRP$":
                flipped_sentence.append("his")
            elif word == "her" and pos_tag == "PRP":
                flipped_sentence.append("him")
            elif word == "hers":
                flipped_sentence.append("his")
            elif word in word_mapping_male_2_female.keys():
                flipped_sentence.append(word_mapping_male_2_female[word])
            elif word in word_mapping_female_2_male.keys():
                flipped_sentence.append(word_mapping_female_2_male[word])
            elif pos_tag == "NNP":
                # replace with a mask
                flipped_sentence.append("NAME")
            else:
                flipped_sentence.append(word)

        return flipped_sentence
    
    def lower_case_sentences(self, sentence):
        sentence = [word.lower() for word in sentence]
        return sentence

    def swap_sentences(self):
        swapped_sentences = []
        for idx, sentence in enumerate(self.gendered_sentences):
            sentence = self.lower_case_sentences(sentence[1])
            swapped_sent = \
                self.swap_gendered_words(sentence, self.gendered_sentences_pos[idx])
            swapped_sentences.append(swapped_sent)
        return swapped_sentences
    
    def generate_swapped_sentences(self):
        self.extract_gendered_sentences()
        self.extract_gendered_sent_pos_tags()
        self.create_word_male_female_mappings()
        self.swapped_sentences = self.swap_sentences()
        return self.swapped_sentences
    
    def combine_gendered_swapped_sentences(self):
        combined_original_sentences = []
        combined_swapped_sentences = []
        for sent in self.gendered_sentences:
            combined_original_sentences.append(' '.join(sent[1]))
        for sent in self.swapped_sentences:
            combined_swapped_sentences.append(' '.join(sent))
        return combined_original_sentences, combined_swapped_sentences
    
    # Used to mask the gender pronouns, so that we can predict these
    # using our model
    def replace_pronouns(self, data):
        masked_sentences = []
        for sent in data:
            sent = sent.lower()
            pronoun_regex = r"\bhis\b|\bher\b|\bhim\b|\bshe\b|\bhe\b|\bhers\b"
            matched_strings = re.findall(pronoun_regex, sent)
            for gender_pronoun in matched_strings:
                pronoun = gender_pronoun
                masked_sent = re.sub(pronoun_regex, "<MASK>", sent)
                masked_sentences.append([masked_sent, pronoun])
        return masked_sentences
    
    def mask_gender_pronouns(self):
        orig_sentences, swapped_sentences = \
            self.combine_gendered_swapped_sentences()
        self.masked_original_sentences = self.replace_pronouns(orig_sentences)
        self.masked_swapped_sentences = self.replace_pronouns(swapped_sentences)

In [96]:
processed_data = PrepareDataForAugmentation(all_sentences, all_sentences_pos)
processed_data.generate_swapped_sentences()

[['NAME',
  'relates',
  'that',
  'as',
  'a',
  'youth',
  'she',
  'studied',
  'because',
  'she',
  'was',
  'forced',
  'to',
  ',',
  'but',
  'now',
  'her',
  'motivations',
  'for',
  'learning',
  'are',
  'far',
  'greater',
  '-',
  'so',
  'great',
  'in',
  'fact',
  'that',
  'she',
  'registered',
  'for',
  'three',
  'classes',
  'at',
  'the',
  'start',
  'of',
  'the',
  'semester',
  ',',
  'a',
  'decision',
  'she',
  'has',
  'come',
  'to',
  'regret',
  'for',
  'lack',
  'of',
  'time',
  '.'],
 ['ever',
  'since',
  'graduating',
  'with',
  'a',
  'degree',
  'in',
  'mechanical',
  'engineering',
  'from',
  'NAME',
  'and',
  'setting',
  'out',
  'to',
  'start',
  'her',
  'own',
  'business',
  'over',
  '20',
  'years',
  'ago',
  ',',
  'that',
  'degree',
  'has',
  'sat',
  'in',
  'a',
  'drawer',
  'somewhere',
  ',',
  'relegated',
  'to',
  'mere',
  'symbolic',
  'significance',
  '.'],
 ['NAME',
  'NAME',
  '-',
  'NAME',
  ',',
  'a',
  'r

In [97]:
processed_data.mask_gender_pronouns()

In [102]:
processed_data.masked_original_sentences[:5]

[['lee relates that as a youth <MASK> studied because <MASK> was forced to , but now <MASK> motivations for learning are far greater - so great in fact that <MASK> registered for three classes at the start of the semester , a decision <MASK> has come to regret for lack of time .',
  'he'],
 ['lee relates that as a youth <MASK> studied because <MASK> was forced to , but now <MASK> motivations for learning are far greater - so great in fact that <MASK> registered for three classes at the start of the semester , a decision <MASK> has come to regret for lack of time .',
  'he'],
 ['lee relates that as a youth <MASK> studied because <MASK> was forced to , but now <MASK> motivations for learning are far greater - so great in fact that <MASK> registered for three classes at the start of the semester , a decision <MASK> has come to regret for lack of time .',
  'his'],
 ['lee relates that as a youth <MASK> studied because <MASK> was forced to , but now <MASK> motivations for learning are far g

In [103]:
processed_data.masked_swapped_sentences[:5]

[['name relates that as a youth <MASK> studied because <MASK> was forced to , but now <MASK> motivations for learning are far greater - so great in fact that <MASK> registered for three classes at the start of the semester , a decision <MASK> has come to regret for lack of time .',
  'she'],
 ['name relates that as a youth <MASK> studied because <MASK> was forced to , but now <MASK> motivations for learning are far greater - so great in fact that <MASK> registered for three classes at the start of the semester , a decision <MASK> has come to regret for lack of time .',
  'she'],
 ['name relates that as a youth <MASK> studied because <MASK> was forced to , but now <MASK> motivations for learning are far greater - so great in fact that <MASK> registered for three classes at the start of the semester , a decision <MASK> has come to regret for lack of time .',
  'her'],
 ['name relates that as a youth <MASK> studied because <MASK> was forced to , but now <MASK> motivations for learning are

In [100]:
len(processed_data.masked_swapped_sentences)

13915

In [101]:
len(processed_data.masked_original_sentences)

13915