In [275]:
from collections import defaultdict, Counter
from tqdm import tqdm
import csv
import os
import pandas as pd
import numpy as np
import random

# Gender inference

Many of the conversations in the manosphere focus on relationships between men and women. 

In [107]:
ROOT = '/mnt/data0/lucy/manosphere/'
ANN_FILE = ROOT + 'data/ann_sig_entities.csv'
COREF_RESULTS = ROOT + 'logs/coref_results/'

In [188]:
def load_vocabulary(): 
    words = []
    with open(ANN_FILE, 'r') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader: 
            if row['keep'] == 'Y': 
                if row['entity'].lower() == 'she' or row['entity'].lower() == 'he': 
                    continue
                words.append(row['entity'].lower())
    return words

Check that categories are all neat and clean with no NaNs. 

In [376]:
reddit_df = pd.read_csv(COREF_RESULTS + 'coref_reddit_df.csv')
cats = set(reddit_df.community.unique()) | set(forum_df.community.unique())
assert len(cats) == len(set(reddit_df.community.unique())) + len(set(forum_df.community.unique()))
forum_df = pd.read_csv(COREF_RESULTS + 'coref_forum_df.csv')
df = pd.concat([reddit_df, forum_df])
control_df = pd.read_csv(COREF_RESULTS + 'coref_CONTROL_df.csv')
all_df = pd.concat([reddit_df, forum_df, control_df])
cats = set(all_df.community.unique())
print(cats)

{'Femcels', 'the_attraction', 'TRP', 'pua_forum', 'MRA', 'red_pill_talk', 'Incels', 'MGTOW', 'rooshv', 'mgtow', 'avfm', 'incels', 'FDS', 'PUA', 'CONTROL'}


In [368]:
fem_total = sum(df['fem'].to_list())
masc_total = sum(df['masc'].to_list())
print("% fem:", fem_total / (fem_total + masc_total))
print("% masc:", masc_total / (fem_total + masc_total))

% fem: 0.5852356166540743
% masc: 0.4147643833459257


### Word coverage

We want to get a sense of how many words are gendered. Words that are not gendered likely don't show up often enough to matter much, but there is a long tail that could be important to consider. 

First, we get a sense of how many words actually have coref labels (e.g. more than 20 labels): 

In [414]:
def get_dataframe_coverage(this_df, cutoff, marked_set): 
    '''
    @inputs: 
    - this_df: input dataframe
    - cutoff: int indicating frequency cutoff
    - marked_set: set of words gendered by definition
    @outputs: 
    - missing: words missing masc/fem gender signal
    - solid_labels: words with gender coref fem/masc signal
    '''
    totals = this_df.groupby('word').sum()
    totals['total'] = totals['fem'] + totals['masc']
    totals = totals[totals['total'] > cutoff]
    totals.sort_values(by=['total'])
    solid_labels = set(totals.index.to_list())
    vocab = load_vocabulary()
    missing = set(vocab) - solid_labels - marked_set
    print("NO GENDER SIGNAL:", len(missing) / len(vocab))
    print("COREF SIGNAL:",len(solid_labels)/ len(vocab))
    print("MARKED SIGNAL:", len(marked_set) / len(vocab))
    print()
    return missing, solid_labels

In [415]:
mano_missing, mano_solid_labels = get_dataframe_coverage(df, 20, set())
control_missing, control_solid_labels = get_dataframe_coverage(control_df, 20, set())

NO GENDER SIGNAL: 0.688549387384662
COREF SIGNAL: 0.3114506126153381
MARKED SIGNAL: 0.0

NO GENDER SIGNAL: 0.8534261080018152
COREF SIGNAL: 0.14657389199818485
MARKED SIGNAL: 0.0



The number of words without coref signals is pretty low! It seems like we need **_multiple ways_** to infer the gender of an entity. 

**Step 1**: semantically gendered nouns, or nouns gendered by definition

We left out nouns that are socially gendered, e.g. "nurse". We use singular and plural forms. 

Hoyle et al. (2019) - man, men, boy, boys, father, fathers, son, sons, brother, bothers, husband, husbands, uncle, uncles, nephew, nephews, emperor, emperors, king, kings, prince, princes, duke, dukes, lord, lords, knight, knights, waiter, waiters, actor, actors, god, gods, policeman, policemen, postman, postmen, hero, heros, wizard, wizards, steward, stewards, woman, women, girl, girls, mother, mothers, daughter, daughters, sister, sisters, wife, wives, aunt, aunts, niece, nieces, empress, empresses, queen, queens, princess, princesses, duchess, duchesses, lady, ladies, dame, dames, waitress, waitresses, actress, actresses, goddess, goddesses, policewoman, policewomen, postwoman, postwomen, heroine, heroines, witch, witches, stewardess, stewardesses

Additional - male, males, dude, dudes, guy, guys, boyfriend, boyfriends, bf, female, females, chick, chicks, girlfriend, girlfriends, gf, gal, gals

In [398]:
men_markers = 'man, men, boy, boys, father, fathers, son, sons, brother, bothers, husband, husbands, uncle, uncles, nephew, nephews, emperor, emperors, king, kings, prince, princes, duke, dukes, lord, lords, knight, knights, waiter, waiters, actor, actors, god, gods, policeman, policemen, postman, postmen, hero, heros, wizard, wizards, steward, stewards, '
men_markers += 'male, males, dude, dudes, boyfriend, boyfriends, bf, guy, guys'
men_markers = set(men_markers.split(', '))
women_markers = 'woman, women, girl, girls, mother, mothers, daughter, daugheters, sister, sisters, wife, wives, aunt, aunts, niece, nieces, empress, empresses, queen, queens, princess, princesses, duchess, duchesses, lady, ladies, dame, dames, waitress, waitresses, actress, actresses, goddess, goddesses, policewoman, policewomen, postwoman, postwomen, heroine, heroines, witch, witches, stewardess, stewardesses, '
women_markers += 'female, females, chick, chicks, girlfriend, girlfriends, gf, gal, gals'
women_markers = set(women_markers.split(', '))
print(men_markers)
print(women_markers)

{'fathers', 'emperors', 'actors', 'boy', 'kings', 'dude', 'dudes', 'postmen', 'nephew', 'hero', 'male', 'steward', 'sons', 'postman', 'lord', 'bf', 'king', 'waiter', 'boyfriend', 'bothers', 'knights', 'wizards', 'brother', 'boyfriends', 'males', 'waiters', 'princes', 'gods', 'prince', 'uncle', 'guys', 'emperor', 'dukes', 'actor', 'nephews', 'boys', 'duke', 'father', 'husbands', 'knight', 'heros', 'policemen', 'wizard', 'stewards', 'men', 'son', 'husband', 'god', 'policeman', 'man', 'uncles', 'lords', 'guy'}
{'girls', 'wives', 'chicks', 'empresses', 'princess', 'lady', 'queen', 'actresses', 'women', 'witches', 'princesses', 'gf', 'dame', 'stewardess', 'females', 'wife', 'actress', 'duchess', 'gal', 'heroines', 'nieces', 'girlfriend', 'ladies', 'aunts', 'waitress', 'girl', 'queens', 'policewoman', 'duchesses', 'policewomen', 'waitresses', 'goddesses', 'female', 'daughter', 'mothers', 'postwomen', 'aunt', 'heroine', 'gals', 'woman', 'sisters', 'daugheters', 'empress', 'chick', 'goddess', 

In [399]:
marked_vocab_men = set()
marked_vocab_women = set()
vocab = load_vocabulary()
for w in vocab: 
    w_tokens = set(w.split())
    if w_tokens & men_markers: 
        marked_vocab_men.add(w)
    elif w_tokens & women_markers:
        marked_vocab_women.add(w)
marked_vocab = marked_vocab_men | marked_vocab_women
print("Count, fraction of vocab, examples")
print("marked men:".upper(), len(marked_vocab_men), round(len(marked_vocab_men) / len(vocab), 3), 
      random.sample(marked_vocab_men, 10))
print("marked women:".upper(), len(marked_vocab_women), round(len(marked_vocab_women) / len(vocab), 3), 
      random.sample(marked_vocab_women, 10))

Count, fraction of vocab, examples
MARKED MEN: 956 0.145 ['abusive boyfriend', 'male soldiers', 'new bf', 'dude', 'white guys', 'autistic men', 'male characters', 'homosexual men', 'many males', 'male children']
MARKED WOMEN: 1011 0.153 ['several women', 'young girls', 'confident women', 'sane woman', 'polish women', 'raped women', 'gamer girl', 'good girl', 'mother', 'more woman']


**Step 2:** coreference resolution for singular nouns 

First, we filter the dataframes to words that are not explicitly marked, and we again calculate coverage.

In [400]:
unmarked_df = df[~df['word'].isin(marked_vocab)]
unmarked_control_df = control_df[~control_df['word'].isin(marked_vocab)]
unmarked_all_df = pd.concat([unmarked_df, unmarked_control_df])

In [404]:
mano_missing, mano_solid_label = get_dataframe_coverage(unmarked_df, 20, marked_vocab)
control_missing, control_solid_label = get_dataframe_coverage(unmarked_control_df, 20, marked_vocab)

NO GENDER SIGNAL: 0.5162607774920587
COREF SIGNAL: 0.18620481016487672
MARKED SIGNAL: 0.2975344123430646

NO GENDER SIGNAL: 0.5991529269399486
COREF SIGNAL: 0.10331266071698685
MARKED SIGNAL: 0.2975344123430646



**Step 3**: plural nouns take on gender of singular nouns

First, we try just simple *-s* matching. 

**DIVYA?**
A couple of things you could play around with:
- are there marked words that we should include in our lexicon? 

In [412]:
mano_found_plural = set()
for w in mano_missing: 
    if w.endswith('s'): 
        other_w = w[:-1]
        # if singular form has coref signal or is marked
        if other_w in mano_solid_label or other_w in marked_vocab: 
            mano_found_plural.add(w)
print(len(mano_found_plural))

control_found_plural = set()
for w in control_missing: 
    if w.endswith('s'): 
        other_w = w[:-1]
        if other_w in mano_solid_label or other_w in marked_vocab: 
            control_found_plural.add(w)
            
print(len(control_found_plural))

548
606


Recalculate coverage after including plural matches. 

In [422]:
mano_found_total = mano_found_plural | marked_vocab
mano_still_missing = set(vocab) - mano_solid_labels - mano_found_total
print("NO GENDER SIGNAL:", len(missing) / len(vocab))
print(random.sample(missing, 30))
control_found_total = control_found_plural | marked_vocab
control_still_missing = set(vocab) - control_solid_labels - control_found_total
print("NO GENDER SIGNAL:", len(missing) / len(vocab))
print(random.sample(missing, 30))

NO GENDER SIGNAL: 0.5074875207986689
['caregiver', 'pleb', 'cohorts', 'swingers', 'gangster', 'nationalists', 'gamers', 'oppressors', 'hoodrats', 'person involved', 'consumers', 'fan club', 'loving partner', 'sidekick', 'fatherless children', 'supporters', 'gambler', 'multimillionaire', 'sugar babies', 'potential threat', 'white incel', 'gaming community', 'white population', 'single feminist', 'gangbanger', 'little faggot', 'normal human', 'good goy', 'annarchist', 'extreme feminists']
NO GENDER SIGNAL: 0.5074875207986689
['antagonists', 'fat friend', 'fanbase', 'transgendered people', 'attractive partner', 'basic bitch', 'legislators', 'many partners', 'currycel', 'you loser', 'jailbait', 'little child', 'other incels', 'recruits', 'such people', 'mainstream society', 'nutjob', 'religious zealots', 'internet stranger', 'serfs', 'lonely person', 'reviewers', 'educated people', 'third wheel', 'drunk person', '2 friends', 'other parties', 'fucking morons', 'few chads', 'total bitch']


**Step 4:** bigrams take on unigram gender if modifier does not change semantic gender

First, can take a glance at what unigrams tend to be commonly missing: 

In [423]:
common_missing = Counter()
for w in mano_still_missing: 
    unigram = w.split()[-1]
    common_missing[unigram] += 1
print(common_missing.most_common())

[('people', 201), ('person', 63), ('group', 36), ('children', 32), ('friends', 32), ('family', 28), ('feminists', 28), ('community', 24), ('ones', 23), ('kids', 21), ('parents', 19), ('bitches', 18), ('population', 18), ('one', 18), ('victims', 16), ('here', 14), ('members', 14), ('groups', 14), ('pussy', 14), ('partner', 14), ('generation', 13), ('feminist', 12), ('asshole', 11), ('folks', 10), ('couples', 10), ('americans', 10), ('loser', 9), ('students', 9), ('users', 9), ('team', 9), ('player', 9), ('bitch', 9), ('couple', 8), ('class', 8), ('student', 8), ('involved', 8), ('partners', 8), ('others', 7), ('parent', 7), ('gender', 7), ('victim', 7), ('virgin', 7), ('adults', 7), ('dick', 6), ('chads', 6), ('sluts', 6), ('troll', 6), ('minority', 6), ('cunt', 6), ('society', 6), ('generations', 6), ('else', 6), ('families', 6), ('party', 6), ('losers', 6), ('virgins', 6), ('fuck', 6), ('humans', 5), ('bro', 5), ('human', 5), ('asians', 5), ('slut', 5), ('alpha', 5), ('child', 5), ('b

**DIVYA?**
- get a sense of what modifiers might change the gender of a word? Feel free to print out examples if needed
- or if it's too much we could just label the modifiers that appear at least 5 times 

In [424]:
common_modifiers = Counter()
for w in mano_still_missing: 
    tokens = w.split()
    if len(tokens) > 1: 
        modifier = tokens[0]
        common_modifiers[modifier] += 1
print(common_modifiers.most_common())

[('other', 69), ('most', 38), ('many', 35), ('fucking', 31), ('good', 28), ('white', 19), ('you', 19), ('new', 17), ('one', 15), ('little', 14), ('two', 13), ('social', 13), ('great', 13), ('young', 12), ('sex', 12), ('feminist', 12), ('ugly', 11), ('potential', 11), ('whole', 11), ('total', 11), ('entire', 11), ('someone', 11), ('black', 11), ('complete', 10), ('real', 10), ('more', 10), ('actual', 9), ('beta', 9), ('average', 9), ('single', 9), ('older', 8), ('big', 8), ('few', 7), ('all', 7), ('stupid', 7), ('american', 7), ('rich', 7), ('old', 7), ('religious', 6), ('hot', 6), ('true', 6), ('poor', 6), ('only', 6), ('best', 6), ('bad', 6), ('college', 6), ('better', 6), ('crazy', 6), ('younger', 6), ('happy', 5), ('sexual', 5), ('worst', 5), ('cool', 5), ('alpha', 5), ('non', 5), ('middle', 5), ('huge', 5), ('internet', 5), ('fellow', 5), ('4', 4), ('first', 4), ('public', 4), ('perfect', 4), ('selfish', 4), ('multiple', 4), ('kind', 4), ('keyboard', 4), ('certain', 4), ('small', 4

### Popular fem words in manosphere++

In [377]:
def show_top_fem(cat, this_df): 
    cat_df = this_df[this_df.community == cat]
    cat_totals = cat_df.groupby('word').sum()
    cat_totals['total'] = cat_totals['fem'] + cat_totals['masc'] 
    # filter to only those that appear at least 10 times as she or he
    cat_totals = cat_totals[cat_totals['total'] > 10] 
    cat_totals['fem_frac'] = cat_totals['fem'] / (cat_totals['fem'] + cat_totals['masc'])
    cat_fem = cat_totals[cat_totals['fem_frac'] == 1]
    return cat_fem.sort_values(by=['total'])

In [378]:
for cat in cats: 
    print('------' + cat + '-------')
    print(show_top_fem(cat, unmarked_all_df).head())

------Femcels-------
Empty DataFrame
Columns: [year, fem, masc, they, it, you, total, fem_frac]
Index: []
------the_attraction-------
             year  fem  masc  they  it  you  total  fem_frac
word                                                        
duff         6021   11     0     0   1    0     11       1.0
escort      16083   11     0     1   2    0     11       1.0
playmate    14070   12     0     0   0    0     12       1.0
hot blonde  12064   12     0     0   1    0     12       1.0
stacy       10040   12     0     0   0    1     12       1.0
------TRP-------
                year  fem  masc  they  it  you  total  fem_frac
word                                                           
hot blonde     10080   11     0     0   0    0     11       1.0
old bitch      14112   11     0     0   2    0     11       1.0
potential ltr  12099   12     0     6   8    0     12       1.0
milf           12099   12     0     1   9    0     12       1.0
housekeeper    12093   12     0     0 

### Popular masc words in reddit

In [379]:
def show_top_masc(cat, this_df): 
    cat_df = this_df[this_df.community == cat]
    cat_totals = cat_df.groupby('word').sum()
    cat_totals['total'] = cat_totals['fem'] + cat_totals['masc'] 
    # filter to only those that appear at least 10 times as she or he
    cat_totals = cat_totals[cat_totals['total'] > 10] 
    cat_totals['masc_frac'] = cat_totals['masc'] / (cat_totals['fem'] + cat_totals['masc'])
    cat_masc = cat_totals[cat_totals['masc_frac'] == 1]
    return cat_masc.sort_values(by=['total'])

In [380]:
for cat in cats: 
    print('------' + cat + '-------')
    print(show_top_masc(cat, unmarked_df).head())

------Femcels-------
Empty DataFrame
Columns: [year, fem, masc, they, it, you, total, masc_frac]
Index: []
------the_attraction-------
             year  fem  masc  they  it  you  total  masc_frac
word                                                         
cabbie      10040    0    11     0   1    0     11        1.0
hitman      10047    0    11     0   0    1     11        1.0
one kid     16079    0    13     0   0    0     13        1.0
best buddy  18095    0    14     0   0    0     14        1.0
captain     16078    0    15     0   0    0     15        1.0
------TRP-------
                    year  fem  masc  they  it  you  total  masc_frac
word                                                                
beginner           14112    0    11     2  13    0     11        1.0
physicist          12099    0    11     0   1    0     11        1.0
mailman            12093    0    11     0   2    0     11        1.0
college professor  12099    0    11     0   0    0     11        1.0


### Popular neut words in reddit

In [381]:
def show_top_neut(cat, this_df): 
    cat_df = this_df[this_df.community == cat]
    cat_totals = cat_df.groupby('word').sum()
    cat_totals['total'] = cat_totals['fem'] + cat_totals['masc'] 
    # filter to only those that appear at least 10 times as she or he
    cat_totals = cat_totals[cat_totals['total'] > 10] 
    cat_totals['fem_frac'] = cat_totals['fem'] / (cat_totals['fem'] + cat_totals['masc'])
    cat_neut = cat_totals[cat_totals.fem_frac.between(0.48, 0.52)]
    return cat_neut.sort_values(by=['total'], ascending = False)

In [382]:
for cat in cats: 
    print('------' + cat + '-------')
    print(show_top_neut(cat, unmarked_df).head())

------Femcels-------
            year  fem  masc  they  it  you  total  fem_frac
word                                                       
partner     4037   58    58    33  31    0    116  0.500000
child       4037   30    32     3  16    0     62  0.483871
teacher     4037   26    25     0   0    0     51  0.509804
looksmatch  4037   12    13     1  12    0     25  0.480000
classmate   4037    6     6     0   0    0     12  0.500000
------the_attraction-------
              year  fem  masc  they  it  you  total  fem_frac
word                                                         
cousin       28161  247   239     2   5    4    486  0.508230
friend here  16068   28    28     1   0    0     56  0.500000
colleague    22122   25    26     0   0    0     51  0.490196
caveman      18090   11    11     0   9    0     22  0.500000
counselor    18090    9     9     0   0    0     18  0.500000
------TRP-------
           year  fem  masc  they   it  you  total  fem_frac
word                

### Words with "it" 

In [383]:
df.sort_values(by=['it'], ascending = False).head(15)

Unnamed: 0,year,community,word,fem,masc,they,it,you
27024,2018,MGTOW,mgtow,354,657,431,3230,6
45598,2019,MGTOW,mgtow,309,443,365,2513,2
33362,2017,MGTOW,mgtow,236,484,327,2164,2
26988,2018,Incels,incels,140,179,3019,1097,4
15892,2016,MGTOW,mgtow,82,199,170,989,3
27124,2018,Incels,incel,170,1167,153,947,1
1722,2013,MRA,mrm,15,22,54,933,0
52965,2015,mgtow,mgtow,93,189,123,887,4
49822,2017,mgtow,mgtow,89,198,127,878,5
50430,2016,mgtow,mgtow,58,151,93,767,1


### Gender over time 

In [347]:
# These are the words whose % fem ranges over time are the largest,
# maybe top three words with biggest % range in each community, e.g. “cat	10% - 50%”, only calculate fraction 
# if there are more than 10 occurrences in each community and month. 

def show_top_fem_range(cat):
    # filtering for the argument category
    cat_df = df[df.community == cat]
    totals = cat_df.groupby(['year', 'word'], as_index = False).sum()
    totals['total'] = totals['fem'] + totals['masc']
    totals = totals[totals['total'] > 10]
    totals['fem_frac'] = totals['fem'] / (totals['fem'] + totals['masc'])
    
    # filter for words that show up in more than 1 of the months/time periods 
    # (initially picked 85 to be more than half of all the months but idk if needed)
    is_multi = totals["word"].value_counts() > 1
    filtered = totals[totals["word"].isin(is_multi[is_multi].index)]
    
    # get the max and min fem_frac for each word
    word_keys = filtered['word'].unique().tolist()
    max_fems = []
    min_fems = []
    max_months = []
    min_months = []
    for word in word_keys: 
        df_subset = filtered[filtered['word'] == word]
        max_fem = df_subset['fem_frac'].max()
        min_fem = df_subset['fem_frac'].min()
        max_month = df_subset[df_subset['fem_frac'] == max_fem]['year'].max()
        min_month = df_subset[df_subset['fem_frac'] == min_fem]['year'].min()
        
        max_fems.append(max_fem)
        min_fems.append(min_fem)
        max_months.append(max_month)
        min_months.append(min_month)
    
    
    d = {'word': [], 'min month': [], 'min': [], 'max month': [], 'max':[], 'diff': []}
    for i in range(len(word_keys)):
        d['word'].append(word_keys[i])
        d['min month'].append(min_months[i])
        d['min'].append(min_fems[i])
        d['max month'].append(max_months[i])
        d['max'].append(max_fems[i])
        d['diff'].append(max_fems[i] - min_fems[i])
    
    diffs = pd.DataFrame(data=d)
    return diffs.sort_values(by = ['diff'], ascending = False)


In [348]:
# words in each month in Incels that appear more than 10 times in that month
for cat in cats: 
    print('------' + cat + '-------')
    print(show_top_fem_range(cat).head())

------Femcels-------
       word  min month       min  max month       max      diff
11   doctor       2018  0.153846       2019  0.392857  0.239011
35  partner       2018  0.320000       2019  0.549451  0.229451
15   female       2018  0.705882       2019  0.894737  0.188854
17  femcels       2018  0.312500       2019  0.461538  0.149038
37      sis       2019  0.606061       2018  0.733333  0.127273
------the_attraction-------
           word  min month       min  max month       max      diff
136      waiter       2007  0.071429       2008  0.562500  0.491071
22      asshole       2006  0.421053       2010  0.909091  0.488038
147  one friend       2008  0.166667       2009  0.615385  0.448718
128    stranger       2006  0.285714       2009  0.727273  0.441558
64          gal       2006  0.500000       2007  0.923077  0.423077
------TRP-------
            word  min month       min  max month       max      diff
398       spouse       2019  0.200000       2016  0.816901  0.616901
442 

### Gender differences

Lucy hasn't edited this section

In [54]:
# len(df.word.unique()
df = df.groupby('word').sum()
df['total'] = df['fem'] + df['masc']
df = df[df['total'] > 10] 
df['fem_frac'] = df['fem'] / (df['fem'] + df['masc'])
df = df.sort_values(by=['fem'], ascending = False)
df


Unnamed: 0_level_0,fem,masc,neut,total,fem_frac
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
girl,308205,2567,226,310772,0.991740
woman,248514,1024,838,249538,0.995896
wife,74069,576,55,74645,0.992283
mother,31271,275,139,31546,0.991283
mom,23830,893,186,24723,0.963880
...,...,...,...,...,...
physicist,0,29,1,29,0.000000
tall man,0,59,1,59,0.000000
soccer player,0,11,0,11,0.000000
great leader,0,12,0,12,0.000000


In [56]:
control_df = control_df.groupby('word').sum()
control_df['total'] = control_df['fem'] + control_df['masc']
control_df = control_df[control_df['total'] > 10] 
control_df['fem_frac'] = control_df['fem'] / (control_df['fem'] + control_df['masc'])
control_df = control_df.sort_values(by=['fem'], ascending = False)
control_df


Unnamed: 0_level_0,fem,masc,neut,total,fem_frac
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
mom,22693,764,200,23457,0.967430
wife,22389,165,18,22554,0.992684
girl,22147,269,29,22416,0.988000
woman,17238,131,56,17369,0.992458
mother,13385,107,53,13492,0.992069
...,...,...,...,...,...
composer,0,29,7,29,0.000000
common man,0,34,1,34,0.000000
colonel,0,25,0,25,0.000000
college kid,0,13,1,13,0.000000


In [57]:
merged_df = df.merge(control_df, how='inner', left_index=True, right_index=True)
merged_df

Unnamed: 0_level_0,fem_x,masc_x,neut_x,total_x,fem_frac_x,fem_y,masc_y,neut_y,total_y,fem_frac_y
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
girl,308205,2567,226,310772,0.991740,22147,269,29,22416,0.988000
woman,248514,1024,838,249538,0.995896,17238,131,56,17369,0.992458
wife,74069,576,55,74645,0.992283,22389,165,18,22554,0.992684
mother,31271,275,139,31546,0.991283,13385,107,53,13492,0.992069
mom,23830,893,186,24723,0.963880,22693,764,200,23457,0.967430
...,...,...,...,...,...,...,...,...,...,...
layman,0,11,0,11,0.000000,1,11,0,12,0.083333
first man,0,69,0,69,0.000000,0,31,0,31,0.000000
physicist,0,29,1,29,0.000000,1,22,2,23,0.043478
tall man,0,59,1,59,0.000000,0,17,0,17,0.000000


In [59]:
merged_df['difference'] = (merged_df['fem_frac_x'] - merged_df['fem_frac_y']).abs()
merged_df = merged_df.sort_values(by=['difference'], ascending = False)

In [61]:
merged_df

Unnamed: 0_level_0,fem_x,masc_x,neut_x,total_x,fem_frac_x,fem_y,masc_y,neut_y,total_y,fem_frac_y,difference
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
brat,21,5,1,26,0.807692,2,10,1,12,0.166667,0.641026
expert,68,35,25,103,0.660194,8,48,31,56,0.142857,0.517337
band,15,2,382,17,0.882353,8,13,1799,21,0.380952,0.501401
cunt,789,205,60,994,0.793763,35,77,12,112,0.312500,0.481263
sitter,11,3,1,14,0.785714,7,14,0,21,0.333333,0.452381
...,...,...,...,...,...,...,...,...,...,...,...
trans woman,131,0,1,131,1.000000,73,0,4,73,1.000000,0.000000
grown woman,120,0,5,120,1.000000,32,0,0,32,1.000000,0.000000
own daughter,112,0,0,112,1.000000,48,0,0,48,1.000000,0.000000
great woman,103,0,0,103,1.000000,12,0,0,12,1.000000,0.000000


### Pronoun sparsity

Lucy hasn't edited this section

In [84]:
df = pd.read_csv('pronoun_df.csv')
# df = df.groupby('word').sum()
# df.shape[0]

# total vocab words that show up in reddit with masc/fem/neut pronouns is 6373

df_totals = df.groupby('word').sum()
# df_totals['total'] = df_totals['fem'] + df_totals['masc'] 
df_totals['total'] = df_totals['fem'] + df_totals['masc'] + df_totals['neut']

df_totals['neut_frac'] = df_totals['neut'] / (df_totals['fem'] + df_totals['masc'] + df_totals['neut'])
df_totals 

df_neut = df_totals[df_totals['neut_frac'] >= 0.5].sort_values(by = ['neut_frac'], ascending = False)
# df_neut.head(20)
df_neut

# df_sparse = df_totals[df_totals['total'] <= 10]
# df_sparse



# 4300 words have less than 10 occurrences with masc or fem pronouns => ~70%
# 1438 words have less than 10 occurrences with masc or fem or neut pronouns => ~23%

# 3716 words are mostly "they" words => ~60%



Unnamed: 0_level_0,fem,masc,neut,total,neut_frac
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
zombies,0,0,97,97,1.0
monarchs,0,0,18,18,1.0
mobile users,0,0,1,1,1.0
mockingbird,0,0,1,1,1.0
moderate feminists,0,0,86,86,1.0
...,...,...,...,...,...
walking wallet,2,0,2,4,0.5
total loser,0,2,2,4,0.5
changs,0,2,2,4,0.5
chick magnet,1,0,1,2,0.5


### Evaluation 

This code compares booknlp coref vs. spacy coref on hand-labeled data.

In [212]:
gold_masc = set()
gold_fem = set()
with open(ROOT + 'logs/gender_gold_labels.csv', 'r') as infile: 
    reader = csv.DictReader(infile)
    for row in reader: 
        if row['gendered?'] == 'm':
            gold_masc.add(row['word (singular)'].lower())
        if row['gendered?'] == 'f': 
            gold_fem.add(row['word (singular)'].lower())

In [213]:
david_labels = Counter()
with open(ROOT + 'logs/temp_gender.txt', 'r') as infile: 
    reader = csv.DictReader(infile, delimiter='\t')
    for row in reader: 
        if row['proper'] != 'nom': continue
        if (float(row['he/him/his']) + float(row['she/her'])) < 3: continue
        david_labels[row['term']] = float(row['she/her']) / (float(row['he/him/his']) + float(row['she/her']))

In [214]:
df = pd.read_csv(COREF_RESULTS + 'coref_reddit_df.csv')

In [215]:
df = df.groupby('word').sum()
df['fem_frac'] = df['fem'] / (df['fem'] + df['masc'])
df = df[['fem_frac']].dropna()
df = df.to_dict()

In [216]:
spacy_labels = df['fem_frac']

In [219]:
# average score for m words
spacy_scores = []
david_scores = []
for w in gold_masc: 
    if w in spacy_labels and w in david_labels: 
        spacy_scores.append(spacy_labels[w])
        david_scores.append(david_labels[w])
print("masc words")
print("SPACY:", np.mean(spacy_scores), "BOOKNLP:", np.mean(david_scores))

masc words
SPACY: 0.20069466570475966 BOOKNLP: 0.31480068170482933


In [221]:
# average score for f words
spacy_scores = []
david_scores = []
for w in gold_fem: 
    if w in spacy_labels and w in david_labels: 
        spacy_scores.append(spacy_labels[w])
        david_scores.append(david_labels[w])
print("fem words")
print("SPACY:", np.mean(spacy_scores), "BOOKNLP:", np.mean(david_scores))

fem words
SPACY: 0.8515987971507715 BOOKNLP: 0.7529589257733309
