## Imports

In [6]:
import re
from tqdm import tqdm
from collections import defaultdict
import os
from nltk.tag import pos_tag
import langid
import json
import traceback
import sys

## Utility Functions

In [7]:
def is_english(word):
    """Tell whether langid's best guess for the language of ``word`` is English ('en')."""
    language_code, _score = langid.classify(word)
    return language_code == 'en'

In [8]:
def sort_dict(d):
    """Return the items of ``d`` as a list of (key, value) tuples,
    ordered from the largest value down to the smallest."""
    pairs = list(d.items())
    # the sort is stable, so keys with equal values keep their dictionary order
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs

In [9]:
def get_words_from_file(file, cat):
    """Load the saved word counts of one category from a text file.

    Every non-blank line of ``file`` is expected to read
    ``word, count, category``. Blank lines are skipped, and the line is
    split from the right so that only the last two commas act as separators.

    Args:
        file: path of the UTF-8 text file to read.
        cat: category label to keep (e.g. 'man'); lines of other
            categories are ignored.

    Returns:
        defaultdict(int) mapping word -> count for the lines whose category
        equals ``cat``.

    Raises:
        ValueError: if a non-blank line does not have three fields or its
            count is not an integer.
    """
    d = defaultdict(int)
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # tolerate empty lines, e.g. a trailing newline left by manual editing
                continue
            word, count, file_cat = [x.strip() for x in line.rsplit(',', 2)]
            count = int(count)
            if file_cat == cat:
                d[word] = count
    return d

## Corpus import

In [14]:
# directory whose files make up the corpus (listed in the next cell)
data_dir = '../data/openwebtext-200M/openwebtext/'

In [15]:
# Collect the path of every regular file directly inside data_dir.
# os.listdir returns entries in arbitrary order, so the paths are sorted:
# later cells resume work by position in `filenames` (start_doc, slices),
# which only works if the order is the same on every run.
# NOTE: positions recorded before this sort was introduced refer to the old order.
filenames = sorted(
    path
    for path in (os.path.join(data_dir, name) for name in os.listdir(data_dir))
    if os.path.isfile(path)
)

In [16]:
# number of corpus files found in data_dir
len(filenames)

2767

### Corpus Stats

In [None]:
# Count the whitespace-separated tokens of every corpus file
# (the tqdm bar also shows how long one pass over the corpus takes).
import numpy as np
num_words = 0   # total number of tokens over all files
doc_lens = []   # number of tokens per file, in `filenames` order

for path in tqdm(filenames):
    # iterate over the open file lazily instead of loading it whole with readlines();
    # the handle gets its own name so it no longer shadows the path variable
    with open(path, 'r', encoding='utf-8') as doc:
        doc_num_words = sum(len(line.split()) for line in doc)
    num_words += doc_num_words
    doc_lens.append(doc_num_words)

100%|██████████| 2767/2767 [01:48<00:00, 25.43it/s]


In [None]:
# report the corpus statistics; '{:,}' inserts thousands separators for readability
mean_doc_len = np.round(np.mean(doc_lens), 2)
print('overall # of words:\t{:,}'.format(num_words))
print('# of docs:\t\t{:,}'.format(len(doc_lens)))
print('mean # words per doc:\t{:,}'.format(mean_doc_len))

overall # of words:	551,813,278
# of docs:		2,767
mean # words per doc:	199,426.56


## Regex

### Suffixes

In [18]:
# Suffix patterns: one or more ASCII letters, an optional hyphen, then the gendered ending.
# man_suffix accepts "-man"/"-men"; the lookbehind (?<!wo|hu) rejects endings whose "m"
# is directly preceded by "wo" or "hu" (e.g. "spokeswoman", "human"). A hyphenated form
# such as "wo-man" is NOT rejected, because the two characters before "m" are then "o-".
man_suffix = re.compile(r'^[A-Za-z]+-?(?<!wo|hu)m[ae]n$')
# "-woman" / "-women"
woman_suffix = re.compile(r'^[A-Za-z]+-?wom[ae]n$')
# "-boy" / "-boys"
boy_suffix = re.compile(r'^[A-Za-z]+-?boys?$')
# "-girl" / "-girls"
girl_suffix = re.compile(r'^[A-Za-z]+-?girls?$')

In [19]:
# sanity check: a hyphenated compound ending in "man" should match
print(man_suffix.search('line-man'))

<re.Match object; span=(0, 8), match='line-man'>


### Prefixes

In [20]:
# Prefix patterns: the gendered word, a hyphen, then at least two ASCII letters.
# The hyphen is mandatory for "man-" but optional for the other three prefixes
# (presumably to keep words like "manager" out of the man- matches -- TODO confirm).
# The patterns are case-sensitive: only lower-case prefixes match.
man_prefix = re.compile(r'^man-[A-Za-z]{2,}$')
woman_prefix = re.compile(r'^woman-?[A-Za-z]{2,}$')
boy_prefix = re.compile(r'^boy-?[A-Za-z]{2,}$')
girl_prefix = re.compile(r'^girl-?[A-Za-z]{2,}$')

In [21]:
# sanity check: a hyphenated "man-" compound should match
print(man_prefix.search('man-bun'))

<re.Match object; span=(0, 7), match='man-bun'>


## Prefix search

In [23]:
# JSON file that the prefix counts are written to by prefix_search and the cells below
prfx_file = '../words/prefixes.json'

In [24]:
# set up dictionaries for counting words with gendered prefixes
man_pre_words = defaultdict(int)
boy_pre_words = defaultdict(int)
woman_pre_words = defaultdict(int)
girl_pre_words = defaultdict(int)

prefixes = {'man': man_pre_words,
            'woman': woman_pre_words,
            'girl': girl_pre_words,
            'boy': boy_pre_words}

In [16]:
# # load already processed prefixed words
# prefixes = json.load(open(prfx_file, 'r'))
# prefixes = {k: defaultdict(int, v) for k, v in prefixes.items()} # turn dicts into defaultdicts
# print(prefixes.keys())

dict_keys(['man', 'woman', 'girl', 'boy'])


In [17]:
def prefix_search(sample_filenames:list, start_doc:int, prefixes:dict):
    """Count nouns that start with a gendered prefix in the given documents.

    Every line of every document from index ``start_doc`` onwards is split on
    whitespace and POS-tagged with ``pos_tag``. A token is counted when it is
    tagged 'NN', matches one of the module-level prefix patterns
    (``man_prefix``, ``woman_prefix``, ``boy_prefix``, ``girl_prefix``) and is
    classified as English by ``is_english``. Capitalised tokens that are not
    the first token of their line are skipped as likely proper names.

    The counts are written to ``prfx_file`` after every 20th document and when
    the run is interrupted from the keyboard. Any other exception is printed
    as a traceback and the counts gathered so far are returned.

    Args:
        sample_filenames: paths of the documents to read.
        start_doc: index in ``sample_filenames`` of the first document to process.
        prefixes: dict with the keys 'man', 'woman', 'boy' and 'girl', each
            mapping to a word -> count mapping; it is updated in place.

    Returns:
        Tuple ``(prefixes, index)`` where ``index`` is the position in
        ``sample_filenames`` of the last document the loop reached (the one
        being processed when the run stopped, or the final one after a
        complete run).
    """
    # checked in this order; the first pattern that matches an English word wins
    patterns = (('man', man_prefix),
                ('woman', woman_prefix),
                ('boy', boy_prefix),
                ('girl', girl_prefix))
    docs = sample_filenames[start_doc:]
    i = 0
    try:
        for i, path in enumerate(tqdm(docs, total=len(docs))):
            # tag the whole document first, so counts are only added once it was read completely
            with open(path, 'r', encoding='utf-8') as doc:
                sents = [pos_tag(line.split()) for line in doc]

            for sentence in sents:
                for idx, (word, tag) in enumerate(sentence):
                    # a capitalised word that is not at the start of a sentence
                    # is taken to be a proper name and excluded from the analysis
                    if idx > 0 and word[0].lower() != word[0]:
                        continue
                    if tag != 'NN':
                        continue
                    for label, pattern in patterns:
                        if pattern.search(word) and is_english(word):
                            prefixes[label][word] += 1
                            break

            # checkpoint every 20 documents; `with` makes sure the file is flushed and closed
            if i % 20 == 0 and i > 0:
                with open(prfx_file, 'w') as out:
                    json.dump(prefixes, out)

    except KeyboardInterrupt as e:
        print('Interrupted at doc', start_doc + i)
        print('Error message: ', e)
        with open(prfx_file, 'w') as out:
            json.dump(prefixes, out)
        print('prefixes saved to file')

    except Exception as e:
        # best effort: report the error and still hand back what was counted so far
        print(traceback.format_exc())

    return prefixes, start_doc + i

In [18]:

# Run (or resume) the prefix search. The second return value is start_doc plus the
# index of the last document the loop reached.
# NOTE(review): 1292+1165 presumably adds up the documents covered by earlier runs -- confirm.
prefixes, next_doc = prefix_search(sample_filenames=filenames, 
                                   start_doc=1292+1165, # index of the first document to process (resume point)
                                   prefixes=prefixes)
print(next_doc)

100%|██████████| 310/310 [40:57<00:00,  7.93s/it]

2766





In [None]:
# persist the final prefix counts; the context manager flushes and closes the file
with open(prfx_file, 'w') as out:
    json.dump(prefixes, out)

## Suffix Search

In [None]:
# text file holding the suffix counts, one "word, count, category" line per word
suffix_file = '../words/suffixes.txt'

In [None]:
# if starting from scratch: one empty counter per suffix category
man_words, boy_words, woman_words, girl_words = (
    defaultdict(int) for _ in range(4)
)

In [19]:
# # # if working from an already saved file:
# man_words = get_words_from_file(suffix_file, 'man')
# boy_words = get_words_from_file(suffix_file, 'boy')
# woman_words = get_words_from_file(suffix_file, 'woman')
# girl_words = get_words_from_file(suffix_file, 'girl')

In [20]:
# Words that match the man-suffix pattern but are not "man" compounds
# (names, loan words, typos, handles, ...). The list was extended by hand
# while traversing the files.
# Fixed: commas were missing after 'portiman', 'speciman' and 'be-man' at the
# end of three lines, so Python silently concatenated each of them with the next
# literal (e.g. 'be-man' 'livinlowcarbman' -> 'be-manlivinlowcarbman') and the
# affected words were not excluded.
not_man_words = ['semen', 'abdomen','specimen', 'regimen', 'aman', 'ramen', 'german',
                 'acumen', 'omen', 'amen', 'pullman', 'lumen', 'hymen',
                 'iman', 'hooman', 'oman', 'pacman',
                 'praenomen','agnomen', 'maman', 'brahman', 'hooman',
                 'bitumen', 'roman', 'shaman','talisman', 'summan', 'caiman',
                 'ebubman', 'demoman', 'bman', 'afirman', 'gaiman', 'liesman',
                 'wedeman', 'wruckman', 'pemahaman', 'atman', 'ottoman', 'silberman',
                 'hdman', 'krugman', 'leatherman', 'shuvman', 'fineman',
                 'mehlman', 'herman', 'fman', 'norman', 'cuthman', 'streamingfeynman',
                 'queman', 'zodman', 'camerman', 'tishman', 'bynorman', 'toman', 'chariman',
                 'letterman', 'bruinbirdman', 'progman', 'hedman', 'portiman',
                 'zaman','musulman','askmen','euthman','informan', 'yoman',
                 'mnkiteman', 'prguitarman', 'aseman','femailman', 'brockman',
                 'metroaseman', 'hieman', 'spokesoman', 'udaman','willyloman',
                 'foeman','theman', 'birman', 'dragoman', 'desmen','duramen',
                 'energumen','entertainmen', 'moeman', 'chapman', 'brakeman',
                 'bateman','terman', 'hanuman', 'oknoman', 'raman', 'd-man',
                 'bachman', 'idaman', 'libman', 'naman', 'kclightman', 'forman',
                 'scottbateman', 'rodman', 'teman', 'confirman','teman','pman',
                 'subsman', 'emkman', 'bettman', 'programan', 'speciman',
                 'mrfredman', 'newman', 'hofman', 'mechman', 'panchoman','sherman',
                 'ragouman', 'reddsman', 'shortman', 'redman', 'goodman',
                 'heythereman', 'soreloserman', 'tedman', 'hoffman', 'grossman',
                 'husbandman', 'draftman', 'himan', 'apeman', 'suman', 'pixman',
                 'bonussumman', 'inman', 'disman', 'acloudman', 'languagespokesman',
                 'ackerman', 'sopkesman', 'thaman', 'dbman', 'exceptbatman',
                 'yacman', 'anonoman','pakman', 'harman', 'rashoman', 'saman',
                 'z-man', 'blakeman', 'wo-man', 'gaman', 'fiterman', 'vuvman',
                 'wooman', 'wasserman', 'mrfredman', 'funnyman', 'anatman',
                 'picman', 'zaman','repeatdoteatingsoundpacman',
                 'repeatdoteatingsoundmspacman', 'zimmerman', 'rgriman', 'be-man',
                 'livinlowcarbman', 'desman','finoman', 'ilman', 'booman',
                 'thethinkingman', 'hu-man', 'chaiman', 'timen', 'minuman', 'ousman',
                 'pathman','stantheman', 'doman', 'rman','evilman', 'grypsman',
                 'ruckman', 'perfectman', 'tredman', 'carasulieman', 'legitiman',
                 'bokuman', 'tannerfriedman','friedman', 'iseman', 'daleeman',
                 'mgviperman', 'dabookerman', 'gworkman', 'truman', 'halaman',
                 'rothman', 'gilman', 'replaymoman', 'bananaman', 'tallman',
                 'insta-man', 'be-man', 'pac-man', 'kitman', 'jpiceman',
                 'formerspokesman', 'diocesanspokesman', 'albumen', 'estiman',
                 'weigman', 'gladman', 'akman', 'lindeman', 'nocman','croman',
                 'moman', 'agilman', 'heyman', 'hyneman','pleaseletmewritebatman',
                 'smkngman', 'berman', 'asuman', 'tkman', 'enigman', 'traneman',
                 'direman', 'tilman', 'portiman', 'speciman', 'toughman', 'stillman',
                 'herreman', 'stillman', 'hardman', 'magman', 'willman', 'brightman',
                 'tfsherman', 'usman', 'enterdaveman']

# remove every excluded word that was already counted and report it
for mw in not_man_words:
    if mw in man_words:
        man_words.pop(mw)
        print(f'{mw} is not a man-word!')

magman is not a man-word!
willman is not a man-word!
brightman is not a man-word!
tfsherman is not a man-word!
usman is not a man-word!


In [21]:
# exclusion list for the woman-suffix counts, extended by hand while traversing the files
not_woman_words = [
    'thiswoman', 'bigotedwoman', 'smartwoman', 'spokestypewoman',
    'compwoman', 'lookforthewoman', 'playfulwoman',
]

# drop every excluded word that was already counted (absent words are ignored)
for mw in not_woman_words:
    woman_words.pop(mw, None)

In [22]:
# exclusion list for the boy-suffix counts, extended by hand while traversing the files
not_boy_words = [
    'carboy','gameboy','flyboy', 'gaymormonboy', 'ohboy', 'pipboy',
    'room101bellboy', 'rentboy', 'cogboy', 'bioboy', 'burna_boy',
    'hoooboy', 'jcowboy', 'sorosboy','bboy','grrlboy', 'speedycowboy',
    'volkanoboy', 'skyboy', 'poorboy', 'wakeboy', 'blondboy', 'eboy',
    'oboy', 'fatboy', 'b-boys', 'spamcowboy', 'blonboy', 'weboy',
    'dshaboy', 'bellybuttonboy', 'prettyboy','sickboy', 'speckyboy',
    'morayboy', 'tallboy', 'big-boy', 'tuartboy', 'annaplayboy',
    'dylanboy',
]

# drop every excluded word that was already counted (absent words are ignored)
for mw in not_boy_words:
    boy_words.pop(mw, None)

In [23]:
# exclusion list for the girl-suffix counts, extended by hand while traversing the files
not_girl_words = [
    'daboardergirl', 'thosewilsongirls', 'hotandnaughtylittlegirl',
    'biogirl', 'hotnsxygirl', 'tgirl', 'scoopgirl', 'lonelygirl',
    'pygirl', 'agirl', 'sweetbbgirl', 'thewantedgirl', 'garciagirl',
]

# drop every excluded word that was already counted (absent words are ignored)
for mw in not_girl_words:
    girl_words.pop(mw, None)

### Write suffix dicts to file

In [27]:
# write one "word, count, category" line per counted word: category by category,
# and within a category from the most to the least frequent word
with open(suffix_file, 'w', encoding='utf-8') as f:
    for label, counts in (('man', man_words),
                          ('woman', woman_words),
                          ('boy', boy_words),
                          ('girl', girl_words)):
        for k, v in sort_dict(counts):
            f.write(f'{k}, {v}, {label}\n')

### Traverse corpus for words with suffixes

In [None]:
# The slice bounds (here 1080-1100) are edited by hand to traverse the corpus
# 20 files at a time. tqdm takes the total from the slice itself, so the bar
# stays correct when the slice is shorter than 20 files.
excluded_man_words = set(not_man_words)  # set: constant-time membership test per token
# checked in this order; the first pattern that matches an English word wins
suffix_counters = ((man_suffix, man_words),
                   (woman_suffix, woman_words),
                   (boy_suffix, boy_words),
                   (girl_suffix, girl_words))

for i, path in enumerate(tqdm(filenames[1080:1100])):
    print(f'Doc #{i}')
    # the handle gets its own name so it does not shadow the path variable
    with open(path, 'r', encoding='utf-8') as doc:
        sents = [pos_tag(line.split()) for line in doc]
    for sentence in sents:
        for idx, (word, tag) in enumerate(sentence):
            # a capitalised word that is not at the start of a sentence
            # is taken to be a proper name and excluded from the analysis
            if idx > 0 and word[0].lower() != word[0]:
                continue
            word = word.lower()
            # only nouns are counted; the man exclusion list gates all four categories
            if tag != 'NN' or word in excluded_man_words:
                continue
            for pattern, counter in suffix_counters:
                if pattern.search(word) and is_english(word):
                    counter[word] += 1
                    break


  0%|          | 0/20 [00:00<?, ?it/s]

Doc #0


  5%|▌         | 1/20 [00:08<02:38,  8.36s/it]

Doc #1


 10%|█         | 2/20 [00:17<02:34,  8.56s/it]

Doc #2


 15%|█▌        | 3/20 [00:25<02:28,  8.73s/it]

Doc #3


 20%|██        | 4/20 [00:34<02:19,  8.73s/it]

Doc #4


 25%|██▌       | 5/20 [00:43<02:11,  8.77s/it]

Doc #5


 30%|███       | 6/20 [00:52<02:03,  8.84s/it]

Doc #6


 35%|███▌      | 7/20 [01:00<01:51,  8.60s/it]

Doc #7


 40%|████      | 8/20 [01:09<01:43,  8.65s/it]

Doc #8


 45%|████▌     | 9/20 [01:17<01:33,  8.51s/it]

Doc #9


 50%|█████     | 10/20 [01:26<01:25,  8.57s/it]

Doc #10


 55%|█████▌    | 11/20 [01:34<01:16,  8.52s/it]

Doc #11


 60%|██████    | 12/20 [01:43<01:09,  8.68s/it]

Doc #12


 65%|██████▌   | 13/20 [01:52<01:00,  8.65s/it]

Doc #13


 70%|███████   | 14/20 [02:00<00:51,  8.55s/it]

Doc #14


 75%|███████▌  | 15/20 [02:09<00:43,  8.68s/it]

Doc #15


 80%|████████  | 16/20 [02:18<00:34,  8.75s/it]

Doc #16


 85%|████████▌ | 17/20 [02:27<00:26,  8.83s/it]

Doc #17


 90%|█████████ | 18/20 [02:36<00:17,  8.87s/it]

Doc #18


 95%|█████████▌| 19/20 [02:45<00:08,  8.89s/it]

Doc #19


100%|██████████| 20/20 [02:54<00:00,  8.71s/it]


## -(wo)manship words

In [25]:
def manships_search(sample_filenames:list, start_doc:int):
    """Count the tokens that end in 'manship' in the given documents.

    Every line of every document from index ``start_doc`` onwards is split on
    whitespace. The ending is compared case-insensitively, but each token is
    counted under its original spelling. Tokens ending in 'womanship' also
    end in 'manship' and are therefore counted here as well.

    A keyboard interrupt prints the partial counts; any other exception is
    printed as a traceback. In both cases the counts gathered so far are
    returned. Nothing is written to disk.

    Args:
        sample_filenames: paths of the documents to read.
        start_doc: index in ``sample_filenames`` of the first document to process.

    Returns:
        Tuple ``(manships, index)``: a defaultdict(int) mapping token -> count,
        and the position in ``sample_filenames`` of the last document the
        loop reached.
    """
    docs = sample_filenames[start_doc:]
    i = 0
    manships = defaultdict(int)
    try:
        for i, path in enumerate(tqdm(docs, total=len(docs))):
            # read the file line by line; the handle no longer shadows the path variable
            with open(path, 'r', encoding='utf-8') as doc:
                for line in doc:
                    for word in line.split():
                        if word.lower().endswith('manship'):
                            manships[word] += 1

    except KeyboardInterrupt as e:
        print('Interrupted at doc', start_doc + i)
        print('Error message: ', e)
        print(manships)
        # the old message claimed that prefixes were saved, but this function saves nothing
        print('partial counts returned (nothing was saved to file)')

    except Exception as e:
        # best effort: report the error and still hand back what was counted so far
        print(traceback.format_exc())

    return manships, start_doc + i

In [26]:
# scan the whole corpus (start at document 0); `index` is the position of the last document reached
manship_raw, index = manships_search(filenames, 0)

100%|██████████| 2767/2767 [04:49<00:00,  9.57it/s]


In [27]:
# Merge case variants: add up the counts of all purely alphabetic tokens under
# their lower-cased form. Looking the lower-cased key up in manship_raw instead
# would ignore the counts of capitalised spellings (and, manship_raw being a
# defaultdict, insert zero entries for keys that only exist capitalised).
manship_words = {}
for token, count in manship_raw.items():
    if token.isalpha():
        key = token.lower()
        manship_words[key] = manship_words.get(key, 0) + count

In [28]:
# number of distinct lower-cased manship words
print(len(manship_words))

53


In [31]:
# order the words by frequency, most frequent first, as a list of (word, count) tuples
sorted_manship = sort_dict(manship_words)

In [32]:
def womanships_search(sample_filenames:list, start_doc:int):
    """Count the tokens that end in 'womanship' in the given documents.

    Every line of every document from index ``start_doc`` onwards is split on
    whitespace. The ending is compared case-insensitively, but each token is
    counted under its original spelling.

    A keyboard interrupt prints the partial counts; any other exception is
    printed as a traceback. In both cases the counts gathered so far are
    returned. Nothing is written to disk.

    Args:
        sample_filenames: paths of the documents to read.
        start_doc: index in ``sample_filenames`` of the first document to process.

    Returns:
        Tuple ``(womanships, index)``: a defaultdict(int) mapping token -> count,
        and the position in ``sample_filenames`` of the last document the
        loop reached.
    """
    docs = sample_filenames[start_doc:]
    i = 0
    womanships = defaultdict(int)
    try:
        for i, path in enumerate(tqdm(docs, total=len(docs))):
            # read the file line by line; the handle no longer shadows the path variable
            with open(path, 'r', encoding='utf-8') as doc:
                for line in doc:
                    for word in line.split():
                        if word.lower().endswith('womanship'):
                            womanships[word] += 1

    except KeyboardInterrupt as e:
        print('Interrupted at doc', start_doc + i)
        print('Error message: ', e)
        print(womanships)
        # the old message claimed that prefixes were saved, but this function saves nothing
        print('partial counts returned (nothing was saved to file)')

    except Exception as e:
        # best effort: report the error and still hand back what was counted so far
        print(traceback.format_exc())

    return womanships, start_doc + i

In [33]:
# scan the whole corpus (start at document 0); `index` is the position of the last document reached
womanship_raw, index = womanships_search(filenames, 0)

100%|██████████| 2767/2767 [03:36<00:00, 12.75it/s]


In [34]:
# raw counts, keyed by the tokens exactly as they were spelled in the corpus
womanship_raw

defaultdict(int, {'stateswomanship': 2, 'workwomanship': 2})

In [42]:
# Merge case variants: add up the counts of all purely alphabetic tokens under
# their lower-cased form. Looking the lower-cased key up in womanship_raw instead
# would ignore the counts of capitalised spellings (and, womanship_raw being a
# defaultdict, insert zero entries for keys that only exist capitalised).
womanship_words = {}
for token, count in womanship_raw.items():
    if token.isalpha():
        key = token.lower()
        womanship_words[key] = womanship_words.get(key, 0) + count

In [43]:
# number of distinct lower-cased womanship words
len(womanship_words)

2

In [44]:
# lower-cased womanship words with their counts
womanship_words

{'workwomanship': 2, 'stateswomanship': 2}

### Save words to file

In [46]:
# bundle both word -> count dicts and save them as one JSON file;
# the context manager flushes and closes the file
ships_dict = {'womanship': womanship_words, 'manship': manship_words}
with open('../words/wo_and_manships.json', 'w') as out:
    json.dump(ships_dict, out)