# Look at instances of top terms in corpora

In [21]:
# Load corpora/splits
import pickle
import os

# Directory holding the pickled corpus splits, and the split family to load.
data_dirpath = '/home/mamille3/hegemonic_hate/data'
# splits_fpath = os.path.join(data_dirpath, 'combined_identity_splits_kennedy2020+sbic+hatexplain+civilcomments_0.3hate.pkl')
split_type = 'power'
splits_fpath = os.path.join(
    data_dirpath,
    f'combined_{split_type}_kennedy2020+sbic+hatexplain+civilcomments_0.3hate.pkl',
)

# NOTE: unpickling can execute arbitrary code; only load pickles from a trusted source.
with open(splits_fpath, 'rb') as splits_file:
    corpora = pickle.load(splits_file)

# Show which splits this file contains.
corpora.keys()

dict_keys([('hegemonic',), ('marginalized',)])

In [22]:
import pandas as pd

import re  # imported here because this cell needs it to escape the search term

# Which split to inspect and which term to look up in its hate-labelled texts.
split = ('hegemonic',)
term = 'virgin'

# Pool train and test, keep only rows where `hate` is true, and add a lowercased
# copy of the text. `.assign` builds a new frame, so no column is written onto
# the filtered result of `.query` (which can raise SettingWithCopyWarning).
data = (
    pd.concat([corpora[split]['train'], corpora[split]['test']])
    .query('hate')
    .assign(text_lower=lambda df: df.text.str.lower())
)

# Match the term as a whole word. The term is lowercased because it is compared
# against lowercased text, and escaped so that regex metacharacters in a term
# are matched literally instead of changing the pattern.
term_pattern = r'\b{}\b'.format(re.escape(term.lower()))
selected = data[data.text_lower.str.contains(term_pattern)]

# Display up to 20 randomly chosen matches.
selected.sample(min(len(selected), 20)).loc[:, ['text_lower', 'hate']]

Unnamed: 0_level_0,text_lower,hate
text_id,Unnamed: 1_level_1,Unnamed: 2_level_1
6330793,"using mary, if such a human actually existed, and the having the idiocy of attempting altering biblical scripture to fit a narrative to protect and excuse a pedophile shows the level of ignorance of evangelicals, in a state that has the reputation of brothers marrying their sisters. \n\nmary was, per the bible, a virgin, meaning ""untouched by any man."" moore is a sick, twisted, perverted, liar, choosing the wrong time to not admit, to not step aside, to not demand a full investigation (which the innocent usually demand and deserve), and foolishly to use the donald ""sexual predator"" failing defense. wrong time, wrong place, wrong moment in history, because this is the moment, after decades of bullying, death threats, fear of reprisals such as jobloss, and more, women are coming forward, again after and i reiterate, decades of forced silence about abuse.\n\nanyone, male, female, christian, not christian, who stands with roy moore,is standing on the wrong side of history.",True
6268,did jesus die a virgin? of course not! he got nailed before he died.,True
25480,what do you call a white girl who runs faster than her brothers? a redneck virgin.,True
25479,what do you call a white girl that can run faster than her brothers? a redneck virgin,True
13753,you sound like that fattest fucking virgin i've ever seen. your cringe as fuck and will stay alone forever with your 4 inch chode that busts within 2 minutes of wanking with your 2 fingers.,True
43308,what do you call a white girl who’s faster than her brothers? a redneck virgin,True
34432,what do you call a white girl who can run faster than her brothers ? a redneck virgin .,True
27727,@faithfulgadolic @thehill fuck jesus. fuck the virgin mother. fuck the holy ghost. fuck your bronze age mumbo jumbo. you sad sack of shit.,True
9672,"ah, so now we're playing the repetition game, ay? you can repeat my ass when you're done with me, you antifa sympathizing, communist stalin-loving disgusting toe gargling overweight whore housing horribly obese neckbeard wielding fedora wearing card collecting pink-haired slut loving, virgin little cunt.",True
43308,what do you call a white girl who’s faster than her brothers? a redneck virgin,True


# Prepare corpora for SAGE
Then run SAGE from ~/SAGE/py-sage (repo fork at https://github.com/michaelmilleryoder/SAGE/tree/master/py-sage).  
See runSage_grid.py, which runs runSage.py over multiple parameter settings.

In [11]:
# Load corpora/splits
import pickle
import os
import pandas as pd
from tqdm.notebook import tqdm
import spacy
import re

# spaCy pipeline with every listed component excluded and a rule-based
# sentencizer added, so documents are only tokenized and split into sentences.
nlp = spacy.load('en_core_web_sm', exclude=['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner'])
nlp.add_pipe('sentencizer')

data_dirpath = '/home/mamille3/hegemonic_hate/data'
# splits_fpath = os.path.join(data_dirpath, 'combined_identity_splits_kennedy2020+sbic+hatexplain+civilcomments_0.3hate.pkl')

split_types = ['identity', 'categories', 'power']

# Readable names for tuple-valued split keys (used to build output filenames).
# Defined once here because it does not depend on the split type being processed.
split_transform = {
    ('race/ethnicity',): 'race/ethnicity',
    ('religion',): 'religion',
    ('gender', 'sexuality'): 'gender/sexuality',
    ('hegemonic',): 'hegemonic',
    ('marginalized',): 'marginalized',
}


def _split_label(split):
    """Return a string name for a split key.

    Keys listed in `split_transform` use their mapped name. Any other tuple key
    is joined with '/' (the same convention the mapping follows), so an unmapped
    tuple no longer makes the filename regex fail on a non-string. All other
    keys are returned unchanged.
    """
    if split in split_transform:
        return split_transform[split]
    if isinstance(split, tuple):
        return '/'.join(str(part) for part in split)
    return split


for split_type in split_types:
    splits_fpath = os.path.join(data_dirpath, f'combined_{split_type}_kennedy2020+sbic+hatexplain+civilcomments_0.3hate.pkl')
    # NOTE: unpickling can execute arbitrary code; only load pickles from a trusted source.
    with open(splits_fpath, 'rb') as f:
        corpora = pickle.load(f)

    texts = {}

    # One output directory per split type. makedirs(exist_ok=True) replaces the
    # separate exists-check + mkdir and also creates missing parent directories.
    out_dirpath = os.path.join(data_dirpath, split_type)
    os.makedirs(out_dirpath, exist_ok=True)

    # for split in list(corpora.keys())[:1]:
    for split in tqdm(corpora):
        # Pool train and test and keep only the rows where `hate` is true.
        data = pd.concat([corpora[split]['train'], corpora[split]['test']])
        hate = data.query('hate')
        texts[split] = hate.text.tolist()

        # Tokenize, split into sentences, lowercase
        processed = []
        inp = texts[split]
        # for doc in tqdm(nlp.pipe(inp), total=len(inp)):
        for doc in nlp.pipe(inp):
            for sent in doc.sents:
                # Skip whitespace tokens: texts can contain newlines, and joining
                # them into the sentence would spread it over several output lines.
                sent_text = ' '.join(tok.text for tok in sent if not tok.is_space).strip().lower()
                if sent_text:  # do not emit blank lines for whitespace-only sentences
                    processed.append(sent_text)

        # Save out, one sentence per line. The encoding is explicit because the
        # texts contain non-ASCII characters and the platform default may not be UTF-8.
        outpath = os.path.join(out_dirpath, f'{re.sub(r"[ /,]", "_", _split_label(split))}_sents.txt')
        with open(outpath, 'w', encoding='utf-8') as f:
            f.writelines(f'{sent}\n' for sent in processed)

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

# Read, format SAGE output

In [10]:
import os
import pandas as pd
from IPython.display import display

split_types = ['identity', 'categories', 'power']
vocab_sizes = [3000] # [1000, 1500, 2000, 3000, 5000, 10000]
smoothing_rates = [10, 20, 50, 100]
n_words = 10

# Show full term lists instead of truncating long cells. Set once up front
# rather than on every loop iteration.
pd.set_option('display.max_colwidth', None)


def summarize_top_terms(results, split_type):
    """Collapse a SAGE results table into one comma-joined term list per split.

    Parameters
    ----------
    results : pandas.DataFrame
        Table read from a SAGE output file. Must have a `source` column holding
        the input file path and a `word` column holding the term.
    split_type : str
        Name of the split family ('identity', 'categories' or 'power'); selects
        the label clean-up to apply and becomes the capitalized index name.

    Returns
    -------
    pandas.DataFrame
        One row per split, indexed by the display label, with a single
        'Top terms' column. The input frame is not modified.
    """
    # The split name is the input file's basename minus the '_sents.txt' suffix.
    suffix_len = len('_sents.txt')
    split_names = results.source.map(lambda path: os.path.basename(path)[:-suffix_len])

    top_terms = (
        results.assign(split=split_names)
        .groupby('split')
        .agg({'word': ', '.join})
        .rename(columns={'word': 'Top terms'})
    )

    if split_type == 'identity':
        # Leave 'people_of_color' out of the table; errors='ignore' keeps this
        # from raising when a results file has no such split.
        top_terms = top_terms.drop('people_of_color', errors='ignore')
        top_terms.index = top_terms.index.str.replace('_', ' ').str.capitalize()
        # capitalize() lowercases everything after the first letter, so restore
        # the acronym and shorten the longest label.
        top_terms = top_terms.rename(index={'Lgbtq+ people': 'LGBTQ+ people', 'Muslims and arabic middle eastern people': 'Muslims, Arabs'})
    elif split_type == 'categories':
        top_terms.index = top_terms.index.str.replace('_', '/')

    top_terms.index.name = split_type.capitalize()
    return top_terms


# Display one table per (vocab size, smoothing rate, split type) combination.
for vocab_size in vocab_sizes:
    print(f'{vocab_size} vocab')
    for smoothing_rate in smoothing_rates:
        print(f'{smoothing_rate} smoothing')
        for split_type in split_types:
            sage_outpath = f'/home/mamille3/SAGE/py-sage/output/{split_type}_{n_words}words_{vocab_size}vocab_{smoothing_rate}smoothing.csv'
            results = pd.read_csv(sage_outpath, sep='\t')
            top_terms = summarize_top_terms(results, split_type)
            display(top_terms)

3000 vocab
10 smoothing


Unnamed: 0_level_0,Top terms
Identity,Unnamed: 1_level_1
Asian people,"chinese, ching, chong, chinaman, china, asian, korean, japan, japanese, chink"
Black people,"niggas, black, nigga, africa, nfl, cotton, unarmed, negro, cop, nig"
Christians,"priest, vatican, christianity, jesus, catholic, catholics, bible, christians, christian, christ"
Jews,"zionists, jewish, zionist, oven, israeli, gas, palestine, jews, holocaust, israel"
Latinx people,"beaner, latino, mexico, latinos, spic, latin, mexicans, mexican, beaners, wetback"
LGBTQ+ people,"homosexual, homosexuality, gay, homosexuals, transgender, queers, transgendered, orientation, fag, queer"
Men,"molotov, male, divorce, penis, dollar, men, incel, soy, genders, truck"
"Muslims, Arabs","allahu, akbar, mohammed, islamist, islamic, quran, islam, muslim, arabic, jihadi"
White people,"redneck, supremacist, nationalists, trailer, supremacists, white, brothers, nationalism, nazi, devil"
Women,"feminism, hoes, sexist, feminists, chad, feminist, sluts, hoe, woman, women"


Unnamed: 0_level_0,Top terms
Categories,Unnamed: 1_level_1
gender/sexuality,"dyke, transgender, queer, bathroom, queers, lesbian, fag, feminine, gender, trans"
race/ethnicity,"black, blacks, ethiopian, chinese, cotton, asian, blm, chicago, neighborhoods, asians"
religion,"priest, catholic, catholics, evangelicals, chimney, christians, vatican, ash, auschwitz, chamber"


Unnamed: 0_level_0,Top terms
Power,Unnamed: 1_level_1
hegemonic,"priest, catholic, priests, jesus, catholics, bible, supremacists, christian, virgin, church"
marginalized,"black, nigger, muslims, muslim, blacks, woman, she, niggas, her, islam"


20 smoothing


Unnamed: 0_level_0,Top terms
Identity,Unnamed: 1_level_1
Asian people,"chinese, ching, chong, china, chinaman, asian, japanese, japan, korean, chink"
Black people,"niggas, black, nigga, africa, negro, nfl, cop, unarmed, chicago, cops"
Christians,"priest, christianity, vatican, jesus, catholic, catholics, bible, christians, christian, christ"
Jews,"jewish, zionists, jews, zionist, gas, holocaust, oven, israel, jew, israeli"
Latinx people,"beaner, latino, mexico, latinos, spic, mexicans, mexican, latin, beaners, hispanic"
LGBTQ+ people,"homosexual, gay, homosexuality, transgender, homosexuals, queers, fag, faggots, transgendered, queer"
Men,"molotov, male, penis, divorce, men, dollar, man, incel, soy, genders"
"Muslims, Arabs","allahu, akbar, islamic, muslim, islam, mohammed, isis, islamist, terrorist, quran"
White people,"redneck, supremacist, white, nationalists, supremacists, trailer, nazi, brothers, devil, nationalism"
Women,"hoes, sexist, feminism, feminists, feminist, hoe, chad, woman, sluts, women"


Unnamed: 0_level_0,Top terms
Categories,Unnamed: 1_level_1
gender/sexuality,"dyke, transgender, queer, fag, queers, lesbian, gender, bathroom, sexist, trans"
race/ethnicity,"black, blacks, chinese, asian, blm, chicago, asians, ethiopian, whites, africa"
religion,"priest, catholic, catholics, christians, evangelicals, chimney, ash, christian, christianity, vatican"


Unnamed: 0_level_0,Top terms
Power,Unnamed: 1_level_1
hegemonic,"priest, catholic, priests, jesus, catholics, bible, supremacists, christian, church, christ"
marginalized,"black, nigger, muslims, muslim, blacks, niggas, woman, islam, she, her"


50 smoothing


Unnamed: 0_level_0,Top terms
Identity,Unnamed: 1_level_1
Asian people,"chinese, china, asian, ching, chong, chinaman, japanese, asians, japan, chink"
Black people,"niggas, black, nigga, africa, nigger, cop, negro, blacks, cops, nfl"
Christians,"priest, jesus, christianity, catholic, catholics, christians, bible, christian, vatican, christ"
Jews,"jewish, jews, jew, israel, holocaust, gas, zionist, zionists, hitler, oven"
Latinx people,"latino, latinos, mexico, beaner, mexicans, mexican, spic, hispanic, latin, hispanics"
LGBTQ+ people,"gay, homosexual, transgender, homosexuals, homosexuality, queers, fag, faggots, faggot, gays"
Men,"male, men, man, penis, molotov, dollar, gender, males, divorce, incel"
"Muslims, Arabs","muslim, islam, islamic, allahu, muslims, terrorist, isis, akbar, bomb, iran"
White people,"redneck, supremacist, white, supremacists, nazi, nationalists, devil, brothers, trash, virgin"
Women,"hoes, sexist, feminists, woman, women, feminist, feminism, hoe, slut, female"


Unnamed: 0_level_0,Top terms
Categories,Unnamed: 1_level_1
gender/sexuality,"dyke, transgender, fag, gender, sexist, queer, queers, lesbian, faggot, hoes"
race/ethnicity,"black, blacks, chinese, whites, asian, asians, africa, nigga, blm, niggas"
religion,"catholic, priest, catholics, christians, christian, christianity, jesus, ash, evangelicals, chamber"


Unnamed: 0_level_0,Top terms
Power,Unnamed: 1_level_1
hegemonic,"priest, catholic, priests, jesus, catholics, bible, supremacists, christian, church, christ"
marginalized,"black, nigger, muslims, muslim, blacks, woman, she, her, islam, niggas"


100 smoothing


Unnamed: 0_level_0,Top terms
Identity,Unnamed: 1_level_1
Asian people,"chinese, china, asian, asians, ching, chong, japanese, chinaman, chink, japan"
Black people,"black, niggas, nigga, africa, nigger, blacks, cop, cops, negro, african"
Christians,"priest, jesus, catholic, christianity, christians, christian, catholics, bible, christ, vatican"
Jews,"jewish, jews, jew, israel, hitler, holocaust, gas, kike, zionist, oven"
Latinx people,"latinos, latino, mexico, mexican, mexicans, spic, beaner, hispanic, hispanics, latin"
LGBTQ+ people,"gay, homosexual, transgender, homosexuals, faggot, faggots, queers, gays, fag, homosexuality"
Men,"male, men, man, penis, gender, males, dollar, trans, boys, tranny"
"Muslims, Arabs","muslim, islam, islamic, muslims, terrorist, isis, terrorists, bomb, arab, iran"
White people,"redneck, supremacist, white, supremacists, nazi, trash, nationalists, devil, mudshark, brothers"
Women,"hoes, women, woman, sexist, feminists, feminist, hoe, female, her, slut"


Unnamed: 0_level_0,Top terms
Categories,Unnamed: 1_level_1
gender/sexuality,"dyke, transgender, faggot, fag, gender, sexist, sexual, hoes, female, faggots"
race/ethnicity,"black, blacks, whites, nigga, niggas, asians, nigger, chinese, africa, racist"
religion,"catholic, priest, christians, catholics, christian, christianity, jesus, islam, jewish, islamic"


Unnamed: 0_level_0,Top terms
Power,Unnamed: 1_level_1
hegemonic,"priest, catholic, priests, jesus, supremacists, christian, bible, catholics, church, boys"
marginalized,"black, nigger, muslims, muslim, blacks, she, her, woman, women, my"


In [13]:
# Just look at an individual run's output

import os
import pandas as pd
from IPython.display import display

split_types = ['identity', 'categories', 'power']
vocab_size = 3000
n_words = 10
smoothing = 10

# Show full term lists instead of truncating long cells (set once, not per iteration).
pd.set_option('display.max_colwidth', None)

for split_type in split_types:
    sage_outpath = f'/home/mamille3/SAGE/py-sage/output/{split_type}_{n_words}words_{vocab_size}vocab_{smoothing}smoothing.csv'
    results = pd.read_csv(sage_outpath, sep='\t')
    # The split name is the input file's basename minus the '_sents.txt' suffix.
    results['split'] = results.source.map(lambda x: os.path.basename(x)[:-1*(len('_sents.txt'))])

    # One row per split, with its terms joined into a single comma-separated string.
    top_terms = results.groupby('split').agg({'word': ', '.join})
    if split_type == 'identity':
        # errors='ignore' keeps this from raising when the split is absent.
        top_terms = top_terms.drop('people_of_color', errors='ignore')
        top_terms.index = top_terms.index.str.replace('_', ' ')
        top_terms.index = top_terms.index.str.capitalize()
    elif split_type == 'categories':
        top_terms.index = top_terms.index.str.replace('_', '/')
    top_terms.index.name = split_type.capitalize()
    top_terms.columns = ['Top terms']
    if split_type == 'identity':
        # capitalize() lowercased the acronym; restore it and shorten the longest label.
        top_terms = top_terms.rename(index={'Lgbtq+ people': 'LGBTQ+ people', 'Muslims and arabic middle eastern people': 'Muslims, Arabs'})
    display(top_terms)

    # Shorten the row labels for the LaTeX table only. This is done on the index
    # before rendering, not by string-replacing the rendered LaTeX, so the term
    # lists and caption cannot be altered (e.g. a top term "people" being deleted).
    latex_terms = top_terms.rename(index=lambda label: label.replace(' people', '').replace('hegemonic', 'dominant'))

    header = split_type.capitalize()
    latex = latex_terms.style.to_latex(hrules=True, label=f'{split_type}_sage', caption=f'Most representative terms in corpora divided by target identity {split_type} from SAGE', environment='table*')
    # Merge the two header rows the Styler emits (column names, then index name)
    # into a single "<Index name> & Top terms" row.
    latex = latex.replace(' & Top terms', f'{header} & Top terms').replace(f'{header} &  \\\\\n', '')
    print(latex)

Unnamed: 0_level_0,Top terms
Identity,Unnamed: 1_level_1
Asian people,"chinese, ching, chong, chinaman, china, asian, korean, japan, japanese, chink"
Black people,"niggas, black, nigga, africa, nfl, cotton, unarmed, negro, cop, nig"
Christians,"priest, vatican, christianity, jesus, catholic, catholics, bible, christians, christian, christ"
Jews,"zionists, jewish, zionist, oven, israeli, gas, palestine, jews, holocaust, israel"
Latinx people,"beaner, latino, mexico, latinos, spic, latin, mexicans, mexican, beaners, wetback"
LGBTQ+ people,"homosexual, homosexuality, gay, homosexuals, transgender, queers, transgendered, orientation, fag, queer"
Men,"molotov, male, divorce, penis, dollar, men, incel, soy, genders, truck"
"Muslims, Arabs","allahu, akbar, mohammed, islamist, islamic, quran, islam, muslim, arabic, jihadi"
White people,"redneck, supremacist, nationalists, trailer, supremacists, white, brothers, nationalism, nazi, devil"
Women,"feminism, hoes, sexist, feminists, chad, feminist, sluts, hoe, woman, women"


\begin{table*}
\caption{Most representative terms in corpora divided by target identity identity from SAGE}
\label{identity_sage}
\begin{tabular}{ll}
\toprule
Identity & Top terms \\
\midrule
Asian & chinese, ching, chong, chinaman, china, asian, korean, japan, japanese, chink \\
Black & niggas, black, nigga, africa, nfl, cotton, unarmed, negro, cop, nig \\
Christians & priest, vatican, christianity, jesus, catholic, catholics, bible, christians, christian, christ \\
Jews & zionists, jewish, zionist, oven, israeli, gas, palestine, jews, holocaust, israel \\
Latinx & beaner, latino, mexico, latinos, spic, latin, mexicans, mexican, beaners, wetback \\
LGBTQ+ & homosexual, homosexuality, gay, homosexuals, transgender, queers, transgendered, orientation, fag, queer \\
Men & molotov, male, divorce, penis, dollar, men, incel, soy, genders, truck \\
Muslims, Arabs & allahu, akbar, mohammed, islamist, islamic, quran, islam, muslim, arabic, jihadi \\
White & redneck, supremacist, nationalists, 

Unnamed: 0_level_0,Top terms
Categories,Unnamed: 1_level_1
gender/sexuality,"dyke, transgender, queer, bathroom, queers, lesbian, fag, feminine, gender, trans"
race/ethnicity,"black, blacks, ethiopian, chinese, cotton, asian, blm, chicago, neighborhoods, asians"
religion,"priest, catholic, catholics, evangelicals, chimney, christians, vatican, ash, auschwitz, chamber"


\begin{table*}
\caption{Most representative terms in corpora divided by target identity categories from SAGE}
\label{categories_sage}
\begin{tabular}{ll}
\toprule
Categories & Top terms \\
\midrule
gender/sexuality & dyke, transgender, queer, bathroom, queers, lesbian, fag, feminine, gender, trans \\
race/ethnicity & black, blacks, ethiopian, chinese, cotton, asian, blm, chicago, neighborhoods, asians \\
religion & priest, catholic, catholics, evangelicals, chimney, christians, vatican, ash, auschwitz, chamber \\
\bottomrule
\end{tabular}
\end{table*}



Unnamed: 0_level_0,Top terms
Power,Unnamed: 1_level_1
hegemonic,"priest, catholic, priests, jesus, catholics, bible, supremacists, christian, virgin, church"
marginalized,"black, nigger, muslims, muslim, blacks, woman, she, niggas, her, islam"


\begin{table*}
\caption{Most representative terms in corpora divided by target identity power from SAGE}
\label{power_sage}
\begin{tabular}{ll}
\toprule
Power & Top terms \\
\midrule
dominant & priest, catholic, priests, jesus, catholics, bible, supremacists, christian, virgin, church \\
marginalized & black, nigger, muslims, muslim, blacks, woman, she, niggas, her, islam \\
\bottomrule
\end{tabular}
\end{table*}



# Run SAGE with different settings
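Tried to run SAGE directly from Python by importing it, instead of through a shell command, but the SAGE code is written for Python 2.7, so the import fails under Python 3 (see the SyntaxError below).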

In [1]:
import os
from tqdm.notebook import tqdm
# Switch into the py-sage checkout before importing from it
# (presumably so that `runSage` resolves from the working directory — TODO confirm).
os.chdir('/home/mamille3/SAGE/py-sage/')
# NOTE: in the recorded run this import raised SyntaxError, because
# deltaIterator.py uses a Python 2 style `print` statement.
from runSage import runSage

SyntaxError: Missing parentheses in call to 'print'. Did you mean print(s.its,'/',s.max_its,change)? (deltaIterator.py, line 16)

In [None]:
vocab_settings = [1500, 3000, 5000]
for vocab_setting in tqdm(vocab_settings):
    cmd = 