In [21]:
import numpy as np
import json
from collections import defaultdict, Counter
from scipy import spatial, stats
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from adjustText import adjust_text

In [65]:
# Directory prefixes used throughout the notebook. Every value ends in '/'
# so that a file name can be appended to it directly.
ROOT = '/mnt/data0/lucy/manosphere/'

# Inputs live under data/, generated results under logs/.
DATA = f'{ROOT}data/'
LOGS = f'{ROOT}logs/'

GLOVE = f'{DATA}glove/'
AGG_EMBED_PATH = f'{LOGS}semantics_mano/agg_embed/'

# Variance within gender

In [80]:
# Load the three inputs of the within-gender variance analysis.
# The encoding is pinned to UTF-8 (the encoding JSON is specified in) instead
# of being inherited from whatever locale the kernel happens to run under.

# scores: keyed by pole; each value is indexed by position in vocab_order.
with open(LOGS + 'semantics_mano/results/scores.json', 'r', encoding='utf-8') as infile:
    scores = json.load(infile)

# vocab_order: one term per line. The trailing newlines are kept, so code that
# uses an entry as a key strips it first.
# NOTE(review): assumes this text file is UTF-8 like the JSON files — confirm.
with open(LOGS + 'semantics_mano/results/vocab_order.txt', 'r', encoding='utf-8') as infile:
    vocab_order = infile.readlines()

# gender_labels: term -> numeric label. The variance cell treats labels above
# 0.75 as feminine and labels below 0.25 as masculine.
with open(LOGS + 'coref_results/mano_gender_labels.json', 'r', encoding='utf-8') as infile:
    gender_labels = json.load(infile)

In [81]:
def _extreme_words(values, words, n):
    """Return ``(top, bottom)``: the words with the ``n`` largest / smallest values.

    ``values`` and ``words`` are parallel sequences. Inside each returned list
    the words follow ``np.argpartition`` order, i.e. they are NOT sorted by
    value. When there are ``n`` or fewer entries, every word is returned in
    both lists, because ``np.argpartition`` raises for an out-of-range ``kth``.
    """
    if len(values) <= n:
        return list(words), list(words)
    top = np.argpartition(values, -n)[-n:]
    bottom = np.argpartition(values, n)[:n]
    return [words[idx] for idx in top], [words[idx] for idx in bottom]


fem_variance = Counter()            # pole -> variance of its scores over feminine terms
fem_extremes = defaultdict(tuple)   # pole -> (top-N words, bottom-N words)
masc_variance = Counter()           # same two structures for masculine terms
masc_extremes = defaultdict(tuple)
N = 5

# Which vocabulary rows count as feminine / masculine depends only on the
# gender labels, never on the pole, so the split is done once up front.
# x_words / y_words are also read by later cells and must keep vocab order.
x_idx, x_words = [], []  # fem rows
y_idx, y_words = [], []  # masc rows
for i, term in enumerate(vocab_order):
    term = term.strip()  # entries still carry their trailing newline
    if term not in gender_labels:
        continue
    label = gender_labels[term]
    if label > 0.75:
        x_idx.append(i)
        x_words.append(term)
    elif label < 0.25:
        y_idx.append(i)
        y_words.append(term)

for pole in scores:
    s = scores[pole]
    x = [s[row] for row in x_idx]  # fem scores
    y = [s[row] for row in y_idx]  # masc scores

    fem_variance[pole] = np.var(x)
    fem_extremes[pole] = _extreme_words(x, x_words, N)

    masc_variance[pole] = np.var(y)
    masc_extremes[pole] = _extreme_words(y, y_words, N)

In [82]:
# Report the 20 poles with the largest variance over feminine terms, each
# followed by the words stored for it in fem_extremes.
for pole, variance in fem_variance.most_common(20):
    print((pole, variance))
    highest, lowest = fem_extremes[pole]
    print("TOP:", highest)
    print("BOTTOM:", lowest)
    print()

('womanly.a.01', 0.0207367739111438)
TOP: ['hambeasts', 'downer', 'hambeast', 'tomboys', 'tomboy']
BOTTOM: ['female', 'feminine woman', 'female gender', 'feminine women', 'female feminist']

('androgynous.a.02', 0.01048472108047411)
TOP: ['black females', 'female', 'females', 'white females', 'two females']
BOTTOM: ['manipulative bitch', 'nympho', 'noodlewhore', 'supermodel', 'noodlewhores']

('lovable.a.01', 0.010376138902053338)
TOP: ['stupid cunts', 'accusers', 'woman haters', 'women haters', 'degenerate whores']
BOTTOM: ['beautiful girl', 'little princess', 'sweet girl', 'cute friend', 'cute girl']

('reputable.a.01', 0.008525157638178259)
TOP: ['slutty women', 'slut', 'sluts', 'dumb sluts', 'dirty slut']
BOTTOM: ['great wife', 'great woman', 'great women', 'great dancer', 'good wives']

('wholesome.a.01', 0.00843521055871161)
TOP: ['manipulative bitch', 'dumb sluts', 'evil bitch', 'dirty slut', 'degenerate whores']
BOTTOM: ['homemaker', 'homemakers', 'healthy woman', 'healthy wome

In [83]:
# Report the 20 poles with the largest variance over masculine terms, each
# followed by the words stored for it in masc_extremes.
for pole, variance in masc_variance.most_common(20):
    print((pole, variance))
    highest, lowest = masc_extremes[pole]
    print("TOP:", highest)
    print("BOTTOM:", lowest)
    print()

('androgynous.a.02', 0.012188110400550777)
TOP: ['masculine male', 'dominant males', 'one male', 'male', 'males']
BOTTOM: ['manipulator', 'hypocrites', 'hypocrite', 'fucking hypocrites', 'fucking hypocrite']

('womanly.a.01', 0.01105276724502953)
TOP: ['faggot', 'buffoon', 'goober', 'wimp', 'wanker']
BOTTOM: ['male feminist', 'male gender', 'feminine men', 'feminist men', 'feminist leaders']

('reputable.a.01', 0.009900774126213017)
TOP: ['muggers', 'mugger', 'misogynists', 'wimp', 'misogynist']
BOTTOM: ['great men', 'great people', 'great leader', 'great man', 'great person']

('lovable.a.01', 0.009518692849910448)
TOP: ['misogynists', 'perpetrator', 'hater', 'traitors', 'haters']
BOTTOM: ['sweet guy', 'little bro', 'pretty boy', 'little angel', 'beautiful man']

('wholesome.a.01', 0.00925771584422821)
TOP: ['zealots', 'misogynist', 'fucking hypocrite', 'buffoon', 'fucking hypocrites']
BOTTOM: ['healthy male', 'healthy man', 'healthy men', 'farmers', 'intact men']

('intellectual.a.02

In [89]:
# Load the per-pole endpoints for the feminine and masculine analyses.
# Each file maps a pole name to a [left, right] pair of equal-length vectors
# (the cells below unpack the pair and subtract one side from the other).
# The encoding is pinned to UTF-8, the encoding JSON is specified in, rather
# than taken from the kernel's locale.
with open(LOGS + 'semantics_mano/pca_fem_poles.json', 'r', encoding='utf-8') as infile:
    fem_pca_poles = json.load(infile)
with open(LOGS + 'semantics_mano/pca_masc_poles.json', 'r', encoding='utf-8') as infile:
    masc_pca_poles = json.load(infile)

In [90]:
# Read the two saved arrays from the aggregated-embedding directory. Later
# cells index them column by column as pca_masc[:, dim] / pca_fem[:, dim].
pca_masc, pca_fem = (
    np.load(AGG_EMBED_PATH + filename)
    for filename in ('pca_mano_masc.npy', 'pca_mano_fem.npy')
)

In [103]:
# For every dimension, record how far apart the two ends of each feminine
# pole are, then list the five poles with the widest gap in that dimension.
dim_diffs = defaultdict(Counter) # {dimension : {pole : diff} }
for pole, (left, right) in fem_pca_poles.items():
    gaps = np.abs(np.array(left) - np.array(right))
    for i, gap in enumerate(gaps):
        dim_diffs[i][pole] = gap

for dim, gap_by_pole in dim_diffs.items():
    print(dim)
    print(gap_by_pole.most_common(5))

0
[('womanly.a.01', 39.70916761806507), ('androgynous.a.02', 16.757837121378916), ('convincing.a.01', 13.672964095309126), ('homemade.a.01', 11.971018907762058), ('developed.a.01', 11.81281415238901)]
1
[('beautiful.a.01', 11.968697928561195), ('optimistic.a.01', 11.91896463369236), ('lovable.a.01', 11.4815775851226), ('appealing.a.01', 9.978893650782908), ('heavy.a.04', 9.797800153776153)]
2
[('nice.a.01', 22.006922985945252), ('grateful.a.01', 18.585618714744683), ('lovable.a.01', 18.108438513182094), ('regenerate.a.01', 17.13215722101723), ('political.a.01', 16.77626718431917)]
3
[('grateful.a.01', 10.641417638707875), ('womanly.a.01', 10.304940202215407), ('spontaneous.a.01', 9.697149478540819), ('quiet.a.01', 9.632293939394955), ('partial.a.02', 9.098384434951956)]
4
[('national.a.02', 15.136571953488254), ('reciprocal.a.01', 12.188276903021219), ('free.a.02', 12.168535992332087), ('prospective.a.01', 11.758392827638431), ('partial.a.02', 11.608923897239164)]
5
[('lawful.a.01', 14

[('national.a.02', 3.3274614625472982), ('cacophonous.a.01', 3.102111968961296), ('finished.a.02', 2.8649663300875963), ('publicized.a.01', 2.576777902302642), ('frequent.a.01', 2.4568926531252857)]
647
[('holy.a.01', 2.902496327670592), ('cosmopolitan.a.02', 2.893138236766455), ('settled.a.02', 2.690121050289986), ('publicized.a.01', 2.665027510026528), ('inhabited.a.01', 2.477710771289238)]
648
[('nonproprietary.a.01', 3.5971872770025914), ('live.a.01', 3.322217049624919), ('convincing.a.01', 3.1309995567020783), ('encumbered.a.01', 2.7376187922680995), ('expressible.a.01', 2.7283090696274597)]
649
[('immediate.a.03', 4.743521056284994), ('innocent.a.01', 3.2536024616744554), ('desirable.a.01', 3.250693460719324), ('elective.a.01', 3.0245659672049046), ('live.a.01', 2.863365560150009)]
650
[('optimistic.a.01', 3.694211066025054), ('responsible.a.01', 3.507263584177511), ('encumbered.a.01', 3.0779882772178846), ('political.a.01', 3.0462266959802107), ('nascent.a.01', 2.689736872926369

1405
[('assisted.a.01', 1.9841979731710229), ('imperative.a.01', 1.513394642940242), ('articulate.a.01', 1.4469318559781446), ('productive.a.01', 1.424701308251576), ('trimmed.a.01', 1.419449029004099)]
1406
[('commensurate.a.01', 1.1695246457063946), ('long.a.01', 1.0424659721606766), ('articulate.a.01', 1.0175560923498517), ('anterior.a.01', 1.0051768226496924), ('credulous.a.01', 0.9970933285371898)]
1407
[('continuous.a.01', 2.0095416970511515), ('colorful.a.02', 1.5365988107114306), ('innocent.a.01', 1.4658916104973918), ('open.a.05', 1.4355758764351374), ('neutral.a.04', 1.414631791527468)]
1408
[('corrigible.a.01', 1.4289916456872396), ('injured.a.01', 1.183775205004166), ('potent.a.03', 1.178385734773277), ('glorious.a.01', 1.1381510367947603), ('all.a.01', 1.097554777257322)]
1409
[('afraid.a.01', 1.3391481810305108), ('true.a.01', 1.300937080706895), ('warm.a.01', 1.2423832910793764), ('clear.a.11', 1.2235313721452608), ('private.a.01', 1.183858568912623)]
1410
[('certain.a.0

In [102]:
N = 10
# For columns 0 through 6 of pca_fem, print the N words with the largest values
# and then the N words with the smallest values (neither list is sorted).
# x_words is the feminine term list built in the variance cell.
# NOTE(review): this assumes row i of pca_fem belongs to x_words[i] — confirm.
for dim in range(min(pca_fem.shape[1], 7)):
    column = pca_fem[:, dim]
    top_rows = np.argpartition(column, -N)[-N:]
    top_words = [x_words[row].strip() for row in top_rows]
    bottom_rows = np.argpartition(column, N)[:N]
    bottom_words = [x_words[row].strip() for row in bottom_rows]
    print('dim:', dim)
    print(top_words)
    print(bottom_words)

dim: 0
['attractive women', 'qualified women', 'females', 'masculine women', 'lesbian women', 'heterosexual women', 'feminist women', 'feminine women', 'women women', 'women']
['tomboy', 'wuss', 'babysitter', 'mother fucker', 'tinderella', 'fuckbuddy', 'noodlewhore', 'downer', 'hambeast', 'debbie downer']
dim: 1
['awesome girl', 'hot chick', 'cute chick', 'hot girlfriend', 'cute girl', 'hot girl', 'hottest chick', 'attractive girl', 'hotter girl', 'gorgeous girl']
['homemakers', 'gender feminists', 'liberal feminists', 'radical feminists', 'moderate feminists', 'feminists', 'modern feminists', 'militant feminists', 'matriarch', 'complainants']
dim: 2
['stepmother', 'grandma', 'mom', 'mother', 'aunt', 'granddaughter', 'young daughter', 'niece', 'grandmother', 'daughter']
['cunts', 'dirty slut', 'skanks', 'fucking sluts', 'total cunt', 'total slut', 'slut', 'cunt', 'dumb sluts', 'sluts']
dim: 3
['fhos', '9 girls', '12 girls', 'hbs', '4 girls', '5 girls', 'foids', 'other hbs', '8 girls', 

In [106]:
N = 10
# For columns 0 through 6 of pca_masc, print the N words with the largest values
# and then the N words with the smallest values (neither list is sorted).
# y_words is the masculine term list built in the variance cell.
# NOTE(review): this assumes row i of pca_masc belongs to y_words[i] — confirm.
for dim in range(min(pca_masc.shape[1], 7)):
    column = pca_masc[:, dim]
    top_rows = np.argpartition(column, -N)[-N:]
    top_words = [y_words[row].strip() for row in top_rows]
    bottom_rows = np.argpartition(column, N)[:N]
    bottom_words = [y_words[row].strip() for row in bottom_rows]
    print('dim:', dim)
    print(top_words)
    print(bottom_words)

dim: 0
['heterosexual men', 'handsome guys', 'masculine guys', 'handsome men', 'masculine men', 'attractive men', 'hot men', 'hot guys', 'attractive guys', 'attractive males']
['skeptic', 'manipulator', 'annarchist', 'inventor', 'executioner', 'conspiracy theorist', 'assassin', 'programmer', 'philosopher', 'tyrant']
dim: 1
['goober', 'schmuck', 'buddy boy', 'cool guy', 'cool dude', 'little fucker', 'fuckwit', 'douche', 'douche bag', 'shithead']
['mathematicians', 'accountants', 'laborers', 'politicians', 'scientists', 'monarchs', 'economists', 'bishops', 'surgeons', 'presidents']
dim: 2
['amogs', 'currycels', 'neets', 'you gents', 'blackcels', 'oldcels', 'truecels', 'bluepilled cucks', 'puas', 'sluthaters']
['salesman', 'footballer', 'accountant', 'investment banker', 'manager', 'banker', 'businessman', 'football player', 'business owner', 'entrepreneur']
dim: 3
['2 kids', 'younger brother', 'mgtow brother', 'grandson', 'young son', 'oldest son', 'nephew', 'first kid', 'son', 'little b

In [104]:
# For every dimension, record how far apart the two ends of each masculine
# pole are, then list the five poles with the widest gap in that dimension.
# This rebinds dim_diffs, replacing whatever an earlier cell stored in it.
dim_diffs = defaultdict(Counter) # {dimension : {pole : diff} }
for pole, (left, right) in masc_pca_poles.items():
    gaps = np.abs(np.array(left) - np.array(right))
    for i, gap in enumerate(gaps):
        dim_diffs[i][pole] = gap

for dim, gap_by_pole in dim_diffs.items():
    print(dim)
    print(gap_by_pole.most_common(5))

0
[('androgynous.a.02', 15.261003023293608), ('womanly.a.01', 14.203123799542151), ('prejudiced.a.02', 12.167954649107548), ('prospective.a.01', 9.750322365694963), ('active.a.03', 9.010417024095108)]
1
[('womanly.a.01', 24.050003937092946), ('homemade.a.01', 15.19668317829947), ('convincing.a.01', 14.778204431564077), ('responsible.a.01', 13.411091435698259), ('functioning.a.01', 13.046057072548797)]
2
[('commercial.a.01', 12.76119911549521), ('political.a.01', 12.141615758905388), ('enclosed.a.01', 11.984825910989263), ('educated.a.01', 11.798687475585908), ('holy.a.01', 11.5143582095064)]
3
[('grateful.a.01', 22.32039034668281), ('competent.a.01', 20.317765741703646), ('nice.a.01', 20.19844632181192), ('subordinate.a.02', 19.147807058409693), ('standard.a.01', 18.672736351354274)]
4
[('grateful.a.01', 10.788819966263738), ('shod.a.01', 10.42480892375532), ('enclosed.a.01', 8.337091282513779), ('appealing.a.01', 8.138517473743299), ('worthy.a.01', 8.106796602180534)]
5
[('womanly.a.0

[('anterior.a.01', 3.3988231851401993), ('covered.a.01', 3.0964985858945755), ('organized.a.01', 2.9347536168032584), ('dull.a.09', 2.72828208898017), ('free.a.02', 2.718218267445147)]
525
[('foreign.a.02', 4.09430188672904), ('corrigible.a.01', 3.8399077227603846), ('supported.a.01', 3.4222695239650367), ('exculpatory.a.01', 3.2852253208910485), ('profitable.a.01', 3.2123685061965874)]
526
[('absolute.a.01', 3.8376955592856343), ('all.a.01', 2.974639086022629), ('mindful.a.01', 2.937573331456685), ('actual.a.01', 2.8136912482989174), ('exempt.a.01', 2.8114992702960917)]
527
[('future.a.01', 3.1444463015045447), ('potent.a.03', 2.979482810985266), ('related.a.01', 2.790228396619745), ('middle.a.04', 2.74601073174869), ('altered.a.01', 2.7100756079169566)]
528
[('clear.a.04', 4.184316246358996), ('credulous.a.01', 3.786534109066763), ('no.a.01', 3.114634707962715), ('neutral.a.04', 2.973596883579435), ('potent.a.03', 2.8687756664269712)]
529
[('punctual.a.01', 3.5673057746154786), ('aut

[('live.a.01', 2.137882994477971), ('felicitous.a.01', 1.6865130193669173), ('backward.a.01', 1.5577829421488691), ('rural.a.01', 1.4951759872519808), ('limited.a.01', 1.4871716470027931)]
1070
[('efficient.a.01', 1.6626576454172788), ('commercial.a.01', 1.6054412833128198), ('prospective.a.01', 1.3839443573486043), ('shod.a.01', 1.3792844346462223), ('corrigible.a.01', 1.311738335807544)]
1071
[('corrigible.a.01', 2.1401993108096073), ('thinkable.a.01', 1.6840177937731131), ('accessible.a.01', 1.6261548419376086), ('detected.a.01', 1.5226707245059017), ('available.a.01', 1.401518338921775)]
1072
[('middle.a.04', 1.9273732523996083), ('exempt.a.01', 1.7373364519851002), ('demonstrative.a.01', 1.6928662598290491), ('clear.a.01', 1.6171161061064443), ('early.a.01', 1.5042137505443596)]
1073
[('potent.a.03', 1.6587039068705973), ('publicized.a.01', 1.4711803846267513), ('fallible.a.01', 1.4415592562673507), ('immediate.a.03', 1.3642809449938142), ('successful.a.01', 1.335194493843591)]
10

[('private.a.01', 1.382590469450261), ('assisted.a.01', 1.30293082159578), ('enclosed.a.01', 1.2923777920382546), ('union.a.02', 1.2649026089657065), ('occupied.a.01', 1.15590892797811)]
1495
[('expressible.a.01', 1.7971377064787215), ('calculable.a.01', 1.4590912672883554), ('anterior.a.01', 1.4468497670936746), ('responsible.a.01', 1.2864075637987464), ('punctual.a.01', 1.1566141660395162)]
1496
[('free.a.02', 1.4512962125590803), ('anterior.a.01', 1.3177327969042751), ('calculable.a.01', 1.1883873525766173), ('affected.a.01', 1.175844457935989), ('shared.a.01', 0.936485525143396)]
1497
[('credulous.a.01', 1.0601398798588486), ('certain.a.02', 1.0380189857123518), ('safe.a.01', 0.9794254942628287), ('congruous.a.01', 0.9230659048258556), ('private.a.01', 0.8657077741198319)]
1498
[('detected.a.01', 1.37712026286284), ('afraid.a.01', 1.240161658662553), ('proved.a.01', 1.0349267136903322), ('free.a.02', 1.0296926590013098), ('classified.a.01', 1.0111830645659465)]
1499
[('rural.a.01',

[('foreign.a.02', 0.9401959221313005), ('expressible.a.01', 0.9057446857582152), ('changed.a.01', 0.884840837859706), ('fragrant.a.01', 0.8358875196430979), ('anterior.a.01', 0.8121056778267621)]
1877
[('surface.a.01', 0.8999883027520503), ('consecrated.a.01', 0.8794816085973229), ('classified.a.01', 0.8426879497054345), ('neutral.a.04', 0.8083396926206314), ('subsurface.a.01', 0.7463204050592116)]
1878
[('clear.a.11', 0.8842874263249666), ('soluble.a.01', 0.8136675797981452), ('inclined.a.02', 0.7612036311515472), ('substantial.a.03', 0.7591978061474695), ('proved.a.01', 0.7584322992276763)]
1879
[('acknowledged.a.01', 0.9502668444875174), ('live.a.01', 0.748622358896077), ('operative.a.01', 0.7207713071687483), ('committed.a.01', 0.7090508912835087), ('usual.a.01', 0.7051128686932171)]
1880
[('elective.a.01', 1.0279133313116628), ('certain.a.02', 1.008334165253586), ('surface.a.01', 0.8455653261625126), ('subsurface.a.01', 0.8320047331118428), ('free.a.02', 0.6887707593202441)]
1881
