In [5]:
from collections import defaultdict, Counter
from tqdm import tqdm
import csv
import os
import pandas as pd

In [29]:
# --- Paths under the main project directory ---------------------------------
ROOT = '/mnt/data0/lucy/manosphere/'
ANN_FILE = os.path.join(ROOT, 'data/ann_sig_entities.csv')
SUB_META = os.path.join(ROOT, 'data/subreddits.txt')
COREF_RESULTS = os.path.join(ROOT, 'logs/coref_results/')

# --- Coreference logs (kept under a different directory tree) ----------------
COREF_LOGS = '/mnt/data0/dtadimeti/manosphere/logs/'
COREF_REDDIT = os.path.join(COREF_LOGS, 'coref_reddit/')
COREF_FORUMS = os.path.join(COREF_LOGS, 'coref_forums/')
COREF_CONTROL = os.path.join(COREF_LOGS, 'coref_control/')

In [26]:
# Get subreddit categories
# categories:     normalized subreddit name -> category label. Because this is a
#                 defaultdict(str), a name that is not in the file yields ''.
# categories_rev: category label -> list of normalized subreddit names.
categories = defaultdict(str)
categories_rev = defaultdict(list)
# newline='' lets the csv module do its own line-ending handling, which is what
# its documentation asks for (matters for quoted fields containing newlines).
with open(SUB_META, 'r', newline='') as infile:
    reader = csv.DictReader(infile)
    for row in reader:
        # Normalize e.g. '/r/Name/' or 'r/Name' down to 'name'.
        name = row['Subreddit'].strip().lower()
        if name.startswith('/r/'): name = name[3:]
        if name.startswith('r/'): name = name[2:]
        if name.endswith('/'): name = name[:-1]
        category = row['Category after majority agreement']
        categories[name] = category
        categories_rev[category].append(name)

In [30]:
# Read the reddit coreference counts and tag every row with the category of
# its community (communities missing from `categories` get its default value).
df = pd.read_csv(COREF_RESULTS + 'coref_reddit_df.csv').assign(
    category=lambda frame: frame['community'].map(categories)
)
# Remove the rows belonging to the 'Health' and 'Criticism' categories.
df = df[~df['category'].isin(['Health', 'Criticism'])]
df['category'].unique()

array(['PUA', 'MRA', 'Incels', 'TRP', 'MGTOW', 'Femcels', 'FDS', ''],
      dtype=object)

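Additional errors: the rows below have a malformed `community` value (it holds text such as `$the woman` rather than a subreddit name), so they were mapped to the empty category `''`.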

In [31]:
df[df['category'] == '']

Unnamed: 0,month,community,word,fem,masc,neut,it,you,category
159429,2010-03,$the woman,man,0,1,0,0,0,
176806,2017-02,$themselves$these people$they$they,someone,0,1,0,0,0,
285747,2010-06,$that child,woman,1,0,0,0,0,
292352,2016-08,express himself to her$himself$that man,new man,0,1,0,0,0,
292353,2016-08,express himself to her$himself$that man,man,0,1,0,0,0,
357428,2010-11,woman flirts and flirts$she$she$she$her$she$sh...,guy,0,2,0,0,0,
357429,2010-11,woman flirts and flirts$she$she$she$her$she$sh...,man,0,1,0,0,0,
357430,2010-11,woman flirts and flirts$she$she$she$her$she$sh...,men,0,0,0,0,0,
364128,2009-12,father,mother,1,0,0,0,0,
364129,2009-12,father,young children,0,0,0,0,0,


### Popular words in control

In [53]:
# Pronoun counts for the control data. The path is relative to the notebook's
# working directory, unlike the absolute paths configured at the top.
control_df = pd.read_csv('pronoun_control_df.csv')

def _control_gender_totals(data=None, min_total=10):
    """Sum the numeric count columns per word and keep the frequent words.

    Args:
        data: frame with 'word', 'fem' and 'masc' columns; defaults to the
            notebook-level ``control_df``.
        min_total: a word is kept only if fem + masc is strictly greater
            than this value.

    Returns:
        DataFrame indexed by word with the summed numeric columns plus a
        'total' column (fem + masc).
    """
    if data is None:
        data = control_df
    # numeric_only keeps string columns from being concatenated by the sum.
    totals = data.groupby('word').sum(numeric_only=True)
    # assign() builds new frames, so no column is ever set on a filtered slice.
    totals = totals.assign(total=totals['fem'] + totals['masc'])
    return totals[totals['total'] > min_total]


def show_top_fem_control(data=None, min_total=10):
    """Control words that only ever co-occur with she (fem fraction == 1).

    Only words appearing more than ``min_total`` times as she or he are
    considered. The result is sorted by 'total', most frequent first.
    """
    totals = _control_gender_totals(data, min_total)
    totals = totals.assign(fem_frac=totals['fem'] / totals['total'])
    control_fem = totals[totals['fem_frac'] == 1]
    return control_fem.sort_values(by=['total'], ascending=False)


def show_top_masc_control(data=None, min_total=10):
    """Control words that only ever co-occur with he (masc fraction == 1).

    Only words appearing more than ``min_total`` times as she or he are
    considered. The result is sorted by 'total', most frequent first.
    """
    totals = _control_gender_totals(data, min_total)
    totals = totals.assign(masc_frac=totals['masc'] / totals['total'])
    control_masc = totals[totals['masc_frac'] == 1]
    return control_masc.sort_values(by=['total'], ascending=False)


def show_top_neut_control(data=None, min_total=10):
    """Control words whose fem fraction lies between 0.45 and 0.55 (inclusive).

    Only words appearing more than ``min_total`` times as she or he are
    considered. The result is sorted by 'total', most frequent first.
    """
    totals = _control_gender_totals(data, min_total)
    totals = totals.assign(fem_frac=totals['fem'] / totals['total'])
    control_neut = totals[totals['fem_frac'].between(0.45, 0.55)]
    return control_neut.sort_values(by=['total'], ascending=False)

In [None]:
show_top_neut_control()

### Popular fem words in reddit

In [None]:
def show_top_fem(cat, data=None, min_total=10):
    """Words in category ``cat`` that only ever co-occur with she.

    Args:
        cat: category label to filter the 'category' column on.
        data: frame with 'word', 'fem', 'masc' and 'category' columns;
            defaults to the notebook-level ``df``.
        min_total: a word is kept only if it appears strictly more than this
            many times as she or he.

    Returns:
        DataFrame indexed by word with the summed numeric columns, 'total'
        (fem + masc) and 'fem_frac', restricted to fem_frac == 1 and sorted
        by 'total' with the most frequent words first (matching the control
        and neutral variants).
    """
    if data is None:
        data = df
    cat_df = data[data['category'] == cat]
    # numeric_only keeps string columns (month, community, ...) out of the sum.
    cat_totals = cat_df.groupby('word').sum(numeric_only=True)
    # assign() builds new frames, so no column is ever set on a filtered slice.
    cat_totals = cat_totals.assign(total=cat_totals['fem'] + cat_totals['masc'])
    cat_totals = cat_totals[cat_totals['total'] > min_total]
    cat_totals = cat_totals.assign(fem_frac=cat_totals['fem'] / cat_totals['total'])
    cat_fem = cat_totals[cat_totals['fem_frac'] == 1]
    return cat_fem.sort_values(by=['total'], ascending=False)

In [None]:
# The Red Pill
show_top_fem('TRP')

In [None]:
# Men's Rights Activists
show_top_fem('MRA')

In [None]:
# Pick Up Artists
show_top_fem('PUA')

In [None]:
# Incels
show_top_fem('Incels')

In [None]:
# Men Who Go Their Own Way
show_top_fem('MGTOW')

In [None]:
# Femcels
show_top_fem('Femcels')

In [None]:
# FDS
show_top_fem('FDS')

### Popular masc words in reddit

In [None]:
def show_top_masc(cat, data=None, min_total=10):
    """Words in category ``cat`` that only ever co-occur with he.

    Args:
        cat: category label to filter the 'category' column on.
        data: frame with 'word', 'fem', 'masc' and 'category' columns;
            defaults to the notebook-level ``df``.
        min_total: a word is kept only if it appears strictly more than this
            many times as she or he.

    Returns:
        DataFrame indexed by word with the summed numeric columns, 'total'
        (fem + masc) and 'masc_frac', restricted to masc_frac == 1 and sorted
        by 'total' with the most frequent words first (matching the control
        and neutral variants).
    """
    if data is None:
        data = df
    cat_df = data[data['category'] == cat]
    # numeric_only keeps string columns (month, community, ...) out of the sum.
    cat_totals = cat_df.groupby('word').sum(numeric_only=True)
    # assign() builds new frames, so no column is ever set on a filtered slice.
    cat_totals = cat_totals.assign(total=cat_totals['fem'] + cat_totals['masc'])
    cat_totals = cat_totals[cat_totals['total'] > min_total]
    cat_totals = cat_totals.assign(masc_frac=cat_totals['masc'] / cat_totals['total'])
    cat_masc = cat_totals[cat_totals['masc_frac'] == 1]
    return cat_masc.sort_values(by=['total'], ascending=False)

In [None]:
# The Red Pill
show_top_masc('TRP')

In [None]:
# MRA
show_top_masc('MRA')

In [None]:
# PUA
show_top_masc('PUA')

In [None]:
# Incels
show_top_masc('Incels')

In [None]:
# MGTOW
show_top_masc('MGTOW')

In [None]:
# Femcels
show_top_masc('Femcels')

In [None]:
# FDS
show_top_masc('FDS')

### Popular neut words in reddit

In [None]:
def show_top_neut(cat, data=None, min_total=10):
    """Words in category ``cat`` used about equally often with she and he.

    Args:
        cat: category label to filter the 'category' column on.
        data: frame with 'word', 'fem', 'masc' and 'category' columns;
            defaults to the notebook-level ``df``.
        min_total: a word is kept only if it appears strictly more than this
            many times as she or he.

    Returns:
        DataFrame indexed by word with the summed numeric columns, 'total'
        (fem + masc) and 'fem_frac', restricted to 0.45 <= fem_frac <= 0.55
        and sorted by 'total' with the most frequent words first.
    """
    if data is None:
        data = df
    cat_df = data[data['category'] == cat]
    # numeric_only keeps string columns (month, community, ...) out of the sum.
    cat_totals = cat_df.groupby('word').sum(numeric_only=True)
    # assign() builds new frames, so no column is ever set on a filtered slice.
    cat_totals = cat_totals.assign(total=cat_totals['fem'] + cat_totals['masc'])
    cat_totals = cat_totals[cat_totals['total'] > min_total]
    cat_totals = cat_totals.assign(fem_frac=cat_totals['fem'] / cat_totals['total'])
    cat_neut = cat_totals[cat_totals['fem_frac'].between(0.45, 0.55)]
    return cat_neut.sort_values(by=['total'], ascending=False)

In [None]:
# The Red Pill
show_top_neut('TRP')

In [None]:
show_top_neut('MRA')

In [None]:
show_top_neut('PUA')

In [None]:
show_top_neut('Incels')

In [None]:
show_top_neut('MGTOW')

In [None]:
show_top_neut('Femcels')

In [None]:
show_top_neut('FDS')

### Gender over time 

In [None]:
# Words whose % fem varies the most over time within one category,
# e.g. "cat  10% - 50%". A fraction is only calculated for a (month, word)
# pair when the word has more than 10 she/he occurrences in that category
# and month.

def show_top_fem_range(cat, data=None, min_total=10):
    """Rank the words of category ``cat`` by the spread of their monthly fem fraction.

    Args:
        cat: category label to filter the 'category' column on.
        data: frame with 'month', 'word', 'fem', 'masc' and 'category'
            columns; defaults to the notebook-level ``df``.
        min_total: a (month, word) pair is kept only if fem + masc is
            strictly greater than this value.

    Returns:
        DataFrame with columns 'word', 'min month', 'min', 'max month',
        'max' and 'diff' (max - min), one row per word that has a fraction
        in more than one month, sorted by 'diff' with the largest first.
        When several months tie, 'min month' is the earliest month at the
        minimum and 'max month' is the latest month at the maximum.
    """
    if data is None:
        # The previous body read a global named `df1`, which is not defined
        # anywhere in this notebook; `df` is the per-row frame that carries
        # the 'month' and 'category' columns used here.
        data = df

    # filtering for the argument category
    cat_df = data[data['category'] == cat]
    # numeric_only keeps string columns (community, category) out of the sum.
    totals = cat_df.groupby(['month', 'word'], as_index=False).sum(numeric_only=True)
    # assign() builds new frames, so no column is ever set on a filtered slice.
    totals = totals.assign(total=totals['fem'] + totals['masc'])
    totals = totals[totals['total'] > min_total]
    totals = totals.assign(fem_frac=totals['fem'] / totals['total'])

    # a range needs at least two data points: keep words present in more than one month
    months_per_word = totals['word'].value_counts()
    filtered = totals[totals['word'].isin(months_per_word[months_per_word > 1].index)]

    # one pass over the groups instead of re-filtering the frame for every word
    columns = ['word', 'min month', 'min', 'max month', 'max', 'diff']
    records = []
    for word, rows in filtered.groupby('word', sort=False):
        max_fem = rows['fem_frac'].max()
        min_fem = rows['fem_frac'].min()
        records.append({
            'word': word,
            'min month': rows.loc[rows['fem_frac'] == min_fem, 'month'].min(),
            'min': min_fem,
            'max month': rows.loc[rows['fem_frac'] == max_fem, 'month'].max(),
            'max': max_fem,
            'diff': max_fem - min_fem,
        })

    # passing columns explicitly keeps the schema even when there are no records
    diffs = pd.DataFrame(records, columns=columns)
    return diffs.sort_values(by=['diff'], ascending=False)


In [None]:
# words in each month in Incels that appear more than 10 times in that month
show_top_fem_range("Incels")

In [None]:
show_top_fem_range("TRP")

In [None]:
show_top_fem_range("MRA")

In [None]:
show_top_fem_range("PUA")

In [None]:
show_top_fem_range("MGTOW")

In [None]:
# NOTE(review): this repeats the call in the preceding cell (same "MGTOW" argument).
show_top_fem_range("MGTOW")

In [None]:
show_top_fem_range("Femcels")

In [None]:
show_top_fem_range("FDS")

### Gender differences

In [54]:
# Per-word totals over all rows of df: keep words seen more than 10 times as
# she or he, add the fem fraction, and rank by feminine count.
# NOTE: df is rebound to this aggregated frame, so cells that need the per-row
# frame (e.g. the ones filtering on df.category) have to run before this one.
df = df.groupby('word').sum(numeric_only=True)  # numeric_only: do not concatenate string columns
# assign() builds new frames, so no column is ever set on a filtered slice.
df = df.assign(total=df['fem'] + df['masc'])
df = df[df['total'] > 10]
df = df.assign(fem_frac=df['fem'] / df['total'])
df = df.sort_values(by=['fem'], ascending=False)
df


Unnamed: 0_level_0,fem,masc,neut,total,fem_frac
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
girl,308205,2567,226,310772,0.991740
woman,248514,1024,838,249538,0.995896
wife,74069,576,55,74645,0.992283
mother,31271,275,139,31546,0.991283
mom,23830,893,186,24723,0.963880
...,...,...,...,...,...
physicist,0,29,1,29,0.000000
tall man,0,59,1,59,0.000000
soccer player,0,11,0,11,0.000000
great leader,0,12,0,12,0.000000


In [56]:
# Per-word totals over all rows of control_df: keep words seen more than 10
# times as she or he, add the fem fraction, and rank by feminine count.
# NOTE: control_df is rebound to this aggregated frame from here on.
control_df = control_df.groupby('word').sum(numeric_only=True)  # numeric_only: do not concatenate string columns
# assign() builds new frames, so no column is ever set on a filtered slice.
control_df = control_df.assign(total=control_df['fem'] + control_df['masc'])
control_df = control_df[control_df['total'] > 10]
control_df = control_df.assign(fem_frac=control_df['fem'] / control_df['total'])
control_df = control_df.sort_values(by=['fem'], ascending=False)
control_df


Unnamed: 0_level_0,fem,masc,neut,total,fem_frac
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
mom,22693,764,200,23457,0.967430
wife,22389,165,18,22554,0.992684
girl,22147,269,29,22416,0.988000
woman,17238,131,56,17369,0.992458
mother,13385,107,53,13492,0.992069
...,...,...,...,...,...
composer,0,29,7,29,0.000000
common man,0,34,1,34,0.000000
colonel,0,25,0,25,0.000000
college kid,0,13,1,13,0.000000


In [57]:
# Inner-join the two per-word aggregates on their word index. Columns present
# in both frames get pandas' default suffixes: _x for df (left), _y for
# control_df (right).
merged_df = pd.merge(
    df,
    control_df,
    left_index=True,
    right_index=True,
    how='inner',
)
merged_df

Unnamed: 0_level_0,fem_x,masc_x,neut_x,total_x,fem_frac_x,fem_y,masc_y,neut_y,total_y,fem_frac_y
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
girl,308205,2567,226,310772,0.991740,22147,269,29,22416,0.988000
woman,248514,1024,838,249538,0.995896,17238,131,56,17369,0.992458
wife,74069,576,55,74645,0.992283,22389,165,18,22554,0.992684
mother,31271,275,139,31546,0.991283,13385,107,53,13492,0.992069
mom,23830,893,186,24723,0.963880,22693,764,200,23457,0.967430
...,...,...,...,...,...,...,...,...,...,...
layman,0,11,0,11,0.000000,1,11,0,12,0.083333
first man,0,69,0,69,0.000000,0,31,0,31,0.000000
physicist,0,29,1,29,0.000000,1,22,2,23,0.043478
tall man,0,59,1,59,0.000000,0,17,0,17,0.000000


In [59]:
# Absolute gap between the two fem fractions (_x vs _y), largest gap first.
merged_df = (
    merged_df
    .assign(difference=lambda m: (m['fem_frac_x'] - m['fem_frac_y']).abs())
    .sort_values(by=['difference'], ascending=False)
)

In [61]:
merged_df

Unnamed: 0_level_0,fem_x,masc_x,neut_x,total_x,fem_frac_x,fem_y,masc_y,neut_y,total_y,fem_frac_y,difference
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
brat,21,5,1,26,0.807692,2,10,1,12,0.166667,0.641026
expert,68,35,25,103,0.660194,8,48,31,56,0.142857,0.517337
band,15,2,382,17,0.882353,8,13,1799,21,0.380952,0.501401
cunt,789,205,60,994,0.793763,35,77,12,112,0.312500,0.481263
sitter,11,3,1,14,0.785714,7,14,0,21,0.333333,0.452381
...,...,...,...,...,...,...,...,...,...,...,...
trans woman,131,0,1,131,1.000000,73,0,4,73,1.000000,0.000000
grown woman,120,0,5,120,1.000000,32,0,0,32,1.000000,0.000000
own daughter,112,0,0,112,1.000000,48,0,0,48,1.000000,0.000000
great woman,103,0,0,103,1.000000,12,0,0,12,1.000000,0.000000


### Pronoun sparsity

In [84]:
# NOTE: this rebinds df to the contents of pronoun_df.csv (path relative to the
# notebook's working directory).
df = pd.read_csv('pronoun_df.csv')

# total vocab words that show up in reddit with masc/fem/neut pronouns is 6373

# Per-word totals; numeric_only keeps any string columns out of the sum.
df_totals = df.groupby('word').sum(numeric_only=True)
df_totals['total'] = df_totals['fem'] + df_totals['masc'] + df_totals['neut']
# Share of a word's pronoun occurrences that are neutral.
df_totals['neut_frac'] = df_totals['neut'] / df_totals['total']

# Words that take a neutral pronoun at least half of the time, most neutral first.
df_neut = df_totals[df_totals['neut_frac'] >= 0.5].sort_values(by=['neut_frac'], ascending=False)
df_neut

# Findings recorded from earlier runs of this cell.
# (presumably obtained with df_totals[df_totals['total'] <= 10], once with
#  total = fem + masc and once with total = fem + masc + neut -- TODO confirm)
# 4300 words have less than 10 occurrences with masc or fem pronouns => ~70%
# 1438 words have less than 10 occurrences with masc or fem or neut pronouns => ~23%

# 3716 words are mostly "they" words => ~60%



Unnamed: 0_level_0,fem,masc,neut,total,neut_frac
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
zombies,0,0,97,97,1.0
monarchs,0,0,18,18,1.0
mobile users,0,0,1,1,1.0
mockingbird,0,0,1,1,1.0
moderate feminists,0,0,86,86,1.0
...,...,...,...,...,...
walking wallet,2,0,2,4,0.5
total loser,0,2,2,4,0.5
changs,0,2,2,4,0.5
chick magnet,1,0,1,2,0.5
