In [38]:
import numpy as np
import os
from collections import defaultdict, Counter
import csv
import pandas as pd

In [44]:
# Filesystem locations used throughout the notebook. Directory paths keep a
# trailing slash because later cells append file names by string concatenation.
ROOT = '/mnt/data0/lucy/manosphere/'
COREF_LOGS = '/mnt/data0/dtadimeti/manosphere/logs/'
# Joining with a final empty component preserves the trailing separator.
COREF_REDDIT = os.path.join(COREF_LOGS, 'coref_reddit', '')
SUB_META = os.path.join(ROOT, 'data', 'subreddits.txt')
COREF_RESULTS = os.path.join(ROOT, 'logs', 'coref_results', '')

In [3]:
# Build two lookups from the subreddit metadata CSV:
#   categories:     normalized subreddit name -> category label
#   categories_rev: category label -> list of normalized subreddit names
# Names are lowercased and stripped of a leading '/r/' or 'r/' and a trailing '/'.
categories = defaultdict(str)
categories_rev = defaultdict(list)
with open(SUB_META, 'r') as infile:
    for row in csv.DictReader(infile):
        name = row['Subreddit'].strip().lower()
        # Both prefixes are tried in sequence, so '/r/foo' and 'r/foo' both become 'foo'.
        for prefix in ('/r/', 'r/'):
            if name.startswith(prefix):
                name = name[len(prefix):]
        if name.endswith('/'):
            name = name[:-1]
        label = row['Category after majority agreement']
        categories[name] = label
        categories_rev[label].append(name)

In [4]:
# Pronoun groups, each keyed by the label it maps to in `pronoun_map`.
fem = {'she', 'her', 'hers', 'herself'}
masc = {'he', 'him', 'his', 'himself'}
they = {'they', 'them', 'their', 'theirs', 'themself', 'themselves'}
it = {'it', 'its', 'itself'}
you = {'you', 'your', 'yours', 'yourself', 'yourselves'}

# pronoun_map: lowercase pronoun -> group label ('fem', 'masc', 'they', 'it', 'you').
pronoun_map = {}
for label, group in (('fem', fem), ('masc', masc), ('they', they), ('it', it), ('you', you)):
    for p in group:
        pronoun_map[p] = label

In [97]:
# For every coref log file in COREF_REDDIT, find coreference clusters that
# mention 'boyfriend' and count how many of them also contain a feminine vs. a
# masculine pronoun. Each log line is tab-separated: the first field is looked
# up in `categories` as a subreddit name, and every field is scanned as a
# candidate cluster whose mentions are separated by '$'.
ratios = {} # {year-month : fem frac}
all_fem_count = 0 # total
all_masc_count = 0 # total
boyfriend_count = 0
# Bound before the loop so the summary print at the bottom cannot raise a
# NameError when COREF_REDDIT contains no files.
boyfriend_lines = []
for year_month in os.listdir(COREF_REDDIT):
    boyfriend_lines = [] # clusters that contain 'boyfriend' substring
    with open(os.path.join(COREF_REDDIT, year_month), 'r') as infile:
        for line in infile:
            # very crude check
            contents = line.strip().lower().split('\t')
            if len(contents) <= 1: continue
            # Use .get() rather than indexing: `categories` is a defaultdict, so
            # indexing would silently insert an empty entry for every subreddit
            # that is missing from the metadata file.
            if categories.get(contents[0], '') in ('Health', 'Criticism'): continue
            boyfriend_lines.extend(clust for clust in contents if 'boyfriend' in clust)

    pronoun_count = Counter()
    for clust in boyfriend_lines:
        items = set(clust.split('$'))
        # A cluster counts as a 'boyfriend' cluster when at least one mention is
        # at most three tokens long and ends in 'boyfriend', e.g. 'boyfriend',
        # 'the boyfriend' or 'the hot boyfriend'.
        is_boyfriend_cluster = False
        pronouns = set()
        for item in items:
            w_tokens = item.split(' ')
            if len(w_tokens) > 3: continue
            if w_tokens[-1] == 'boyfriend':
                is_boyfriend_cluster = True
            if item in pronoun_map:
                pronouns.add(pronoun_map[item])
        if is_boyfriend_cluster:
            boyfriend_count += 1
            # `pronouns` is a set, so each pronoun group is counted at most once
            # per cluster regardless of how many times it is mentioned.
            for p in pronouns:
                pronoun_count[p] += 1

    fem_count = pronoun_count['fem']
    masc_count = pronoun_count['masc']
    # Files with no fem/masc evidence get no entry in `ratios`.
    if (fem_count + masc_count) != 0:
        ratios[year_month] = fem_count / (fem_count + masc_count)
    all_fem_count += fem_count
    all_masc_count += masc_count

# Total number of 'boyfriend' clusters across all files.
print(boyfriend_count)
# NOTE: `boyfriend_lines` is reset for each file, so this is the number of
# candidate clusters in the last file listed only, not a total across files.
print(len(boyfriend_lines))

17576
11


In [98]:
# Per-file feminine fractions in ascending order.
sorted(ratios.values())

[0.0,
 0.0,
 0.0,
 0.1,
 0.16666666666666666,
 0.25,
 0.25,
 0.26785714285714285,
 0.2727272727272727,
 0.2773722627737226,
 0.27848101265822783,
 0.28169014084507044,
 0.2916666666666667,
 0.2926829268292683,
 0.29545454545454547,
 0.2956989247311828,
 0.29931972789115646,
 0.30357142857142855,
 0.31092436974789917,
 0.3111111111111111,
 0.3125,
 0.31693989071038253,
 0.3175355450236967,
 0.32298136645962733,
 0.3271604938271605,
 0.33004926108374383,
 0.3310810810810811,
 0.3333333333333333,
 0.3333333333333333,
 0.3375,
 0.3392857142857143,
 0.3416666666666667,
 0.3431952662721893,
 0.34375,
 0.34415584415584416,
 0.3445378151260504,
 0.3448275862068966,
 0.34523809523809523,
 0.34782608695652173,
 0.3489583333333333,
 0.35119047619047616,
 0.35119047619047616,
 0.35135135135135137,
 0.3516483516483517,
 0.3522727272727273,
 0.3523809523809524,
 0.3532934131736527,
 0.35348837209302325,
 0.35751295336787564,
 0.3582089552238806,
 0.3588235294117647,
 0.3588235294117647,
 0.360294117

In [99]:
# Overall masculine share: the complement of the feminine fraction pooled over all files.
1 - all_fem_count / (all_fem_count + all_masc_count)

0.6223021582733813

In [100]:
# Number of 'boyfriend' clusters containing at least one feminine pronoun, summed over all files.
all_fem_count

5145

### Playing around with evaluation

In [42]:
# Gold-standard gendered vocabulary: lowercase singular and plural forms of
# every word labelled 'm' (gold_masc) or 'f' (gold_fem) in the annotation file.
gold_masc = set()
gold_fem = set()
gold_by_label = {'m': gold_masc, 'f': gold_fem}
with open(ROOT + 'logs/gender_gold_labels.csv', 'r') as infile:
    for row in csv.DictReader(infile):
        target = gold_by_label.get(row['gendered?'])
        # Rows with any other label are ignored.
        if target is None:
            continue
        target.update(row[col].lower() for col in ('word (plural)', 'word (singular)'))

In [43]:
# david_labels: term -> she/her share of (he/him/his + she/her) counts, read
# from the tab-separated temp_gender.txt. Only rows whose 'proper' column is
# 'nom' and whose combined gendered count is not below 3 are kept.
david_labels = Counter()
with open(ROOT + 'logs/temp_gender.txt', 'r') as infile:
    for row in csv.DictReader(infile, delimiter='\t'):
        if row['proper'] != 'nom':
            continue
        masc_n = float(row['he/him/his'])
        fem_n = float(row['she/her'])
        gendered_n = masc_n + fem_n
        if gendered_n < 3:
            continue
        david_labels[row['term']] = fem_n / gendered_n

In [35]:
# Sanity check: she/her share recorded for 'boyfriend' in temp_gender.txt
# (david_labels is a Counter, so a missing term would print 0).
print(david_labels['boyfriend'])

0.11035301645113263


In [62]:
# Load the per-word pronoun counts produced by the coref pipeline.
df = pd.read_csv(os.path.join(COREF_RESULTS, 'coref_reddit_df.csv'))

In [63]:
# Pool pronoun counts per word and turn them into a feminine fraction.
# Only the 'fem' and 'masc' columns are summed: a bare groupby(...).sum() also
# aggregates every other column, which on recent pandas concatenates or raises
# on non-numeric columns.
word_totals = df.groupby('word')[['fem', 'masc']].sum()
word_totals['fem_frac'] = word_totals['fem'] / (word_totals['fem'] + word_totals['masc'])
# Words with no fem/masc counts yield NaN (0/0) and are dropped.
# NOTE: `df` is rebound from a DataFrame to a plain dict of the form
# {'fem_frac': {word: fraction}}, which the following cell reads. Re-run the
# read_csv cell before re-running this one.
df = word_totals[['fem_frac']].dropna().to_dict()

In [65]:
# spacy_labels: word -> feminine fraction (at this point `df` is a dict, not a DataFrame).
spacy_labels = df['fem_frac']

In [66]:
# Sanity check: feminine fraction for 'boyfriend' from coref_reddit_df.csv
# (raises KeyError if the word is absent).
print(spacy_labels['boyfriend'])

0.20835285848172447


In [79]:
# average score for m words
spacy_scores = []
david_scores = []
for w in gold_masc: 
    if w in spacy_labels and w in david_labels: 
        spacy_scores.append(spacy_labels[w])
        david_scores.append(david_labels[w])

In [81]:
# Mean feminine fraction over the gold masculine words, per label source.
print("masc words")
spacy_mean, booknlp_mean = np.mean(spacy_scores), np.mean(david_scores)
print("SPACY:", spacy_mean, "BOOKNLP:", booknlp_mean)

masc words
SPACY: 0.3444542404438327 BOOKNLP: 0.4178711174822399


In [82]:
# average score for f words
spacy_scores = []
david_scores = []
for w in gold_fem: 
    if w in spacy_labels and w in david_labels: 
        spacy_scores.append(spacy_labels[w])
        david_scores.append(david_labels[w])

In [83]:
# Mean feminine fraction over the gold feminine words, per label source.
print("fem words")
spacy_mean, booknlp_mean = np.mean(spacy_scores), np.mean(david_scores)
print("SPACY:", spacy_mean, "BOOKNLP:", booknlp_mean)

fem words
SPACY: 0.8207216951656394 BOOKNLP: 0.6975304808729251
