# Look at instances of top terms in corpora

In [19]:
# Load corpora/splits
import pickle
import os

data_dirpath = '/home/mamille3/hegemonic_hate/data'
# splits_fpath = os.path.join(data_dirpath, 'combined_identity_splits_kennedy2020+sbic+hatexplain+civilcomments_0.3hate.pkl')
split_type = 'power'
splits_fpath = os.path.join(data_dirpath, f'combined_{split_type}_kennedy2020+sbic+hatexplain+civilcomments_0.3hate.pkl')
with open(splits_fpath, 'rb') as f:
    corpora = pickle.load(f)
    
corpora.keys()

dict_keys([('hegemonic',), ('marginalized',)])

In [20]:
import pandas as pd
pd.set_option('display.max_colwidth', None)

split = ('hegemonic',)
term = 'dress'
data = corpora[split]['all'].query('hate').copy()
data['text_lower'] = data.text.str.lower()
selected = data[data.text_lower.str.contains(r'\b{}\b'.format(term))]
selected.sample(min(len(selected), 20)).loc[:, ['text_lower', 'hate', 'target_groups', 'dataset', 'targetMinority']]

Unnamed: 0_level_0,text_lower,hate,target_groups,dataset,targetMinority
text_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
39532,how do you get a nun pregnant ? dress her as a little boy and send in a priest,True,[priests],sbic,"[""priests""]"
9131,how do you get a nun to have sex? dress her up as an altar boy.,True,[christians],sbic,"[""christian folks""]"
31598,how do you get a nun pregnant ? dress her up as an altar boy .,True,"[christians, catholics, priests]",sbic,"[""catholics"", ""christian folks"", ""priests""]"
36567,how do you get a nun pregnant? dress her as an alter boy.,True,"[assault victims, christians]",sbic,"[""assault victims"", ""christian folks""]"
31598,how do you get a nun pregnant ? dress her up as an altar boy .,True,"[christians, catholics, priests]",sbic,"[""catholics"", ""christian folks"", ""priests""]"
9128,how do you get a nun pregnant? dress her up as an altar boy,True,"[christians, catholics]",sbic,"[""catholics"", ""christian folks""]"
9131,how do you get a nun to have sex? dress her up as an altar boy.,True,[christians],sbic,"[""christian folks""]"
9130,how do you get a nun pregnant? dress her up like an alter boy,True,[christians],sbic,"[""christian folks""]"
9126,how do you get a nun pregnant dress her up as an altar boy,True,"[assault victims, christians]",sbic,"[""assault victims"", ""christian folks""]"
39532,how do you get a nun pregnant ? dress her as a little boy and send in a priest,True,[priests],sbic,"[""priests""]"


# Prepare corpora for SAGE
Then run it at ~/SAGE/py-sage (repo fork at https://github.com/michaelmilleryoder/SAGE/tree/master/py-sage)  
Take a look at runSage_grid.py, which runs runSage.py over multiple parameter settings

In [12]:
# Load corpora/splits
import pickle
import os
import pandas as pd
from tqdm.notebook import tqdm
import spacy
import re

nlp = spacy.load('en_core_web_sm', exclude=['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner'])
nlp.add_pipe('sentencizer')

data_dirpath = '/home/mamille3/hegemonic_hate/data'
# splits_fpath = os.path.join(data_dirpath, 'combined_identity_splits_kennedy2020+sbic+hatexplain+civilcomments_0.3hate.pkl')

split_types = ['identities', 'categories', 'power']
for split_type in split_types:
    splits_fpath = os.path.join(data_dirpath, f'combined_{split_type}_kennedy2020+sbic+hatexplain+civilcomments_0.3hate.pkl')
    with open(splits_fpath, 'rb') as f:
        corpora = pickle.load(f)

    texts = {}

    split_transform = {
        ('race/ethnicity',): 'race/ethnicity',
        ('religion',): 'religion',
        ('gender', 'sexuality'): 'gender/sexuality',
        ('hegemonic',): 'hegemonic',
        ('marginalized',): 'marginalized',
    }

    # for split in list(corpora.keys())[:1]:
    for split in tqdm(corpora):
        data = pd.concat([corpora[split]['train'], corpora[split]['test']])
        hate = data.query('hate')
        texts[split] = hate.text.tolist()

        # Process, save out data
        processed = []
        inp = texts[split]
        # for doc in tqdm(nlp.pipe(inp), total=len(inp)):
        for doc in nlp.pipe(inp):
            sents = [' '.join([tok.text for tok in sent]).strip().lower() for sent in doc.sents]
            processed.extend(sents)

        # Save out
        out_dirpath = os.path.join(data_dirpath, split_type)
        if not os.path.exists(out_dirpath):
            os.mkdir(out_dirpath)
        outpath = os.path.join(out_dirpath, f'{re.sub(r"[ /,]", "_", split_transform.get(split, split[0]))}_sents.txt')
        with open(outpath, 'w') as f:
            for sent in processed:
                f.write(f'{sent}\n')

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

# Read, format SAGE output

In [13]:
import os
import pandas as pd
from IPython.display import display

split_types = ['identities', 'categories', 'power']
vocab_sizes = [3000] # [1000, 1500, 2000, 3000, 5000, 10000]
smoothing_rates = [10, 20, 50, 100]
for vocab_size in vocab_sizes:
    print(f'{vocab_size} vocab')
    for smoothing_rate in smoothing_rates:
        print(f'{smoothing_rate} smoothing')
        for split_type in split_types:
            n_words = 10
            sage_outpath = f'/home/mamille3/SAGE/py-sage/output/{split_type}_{n_words}words_{vocab_size}vocab_{smoothing_rate}smoothing.csv'
            results = pd.read_csv(sage_outpath, sep='\t')
            # results['split'] = results.source.str.slice(len(f'input/{split_type}/'), -1*(len('_sents.txt')))
            results['split'] = results.source.map(lambda x: os.path.basename(x)[:-1*(len('_sents.txt'))])
            # results

            pd.set_option('display.max_colwidth', None)
            top_terms = results.groupby('split').agg({'word': ', '.join})
            if split_type == 'identity':
                top_terms.drop('people_of_color', inplace=True)
                top_terms.index = top_terms.index.str.replace('_', ' ')
                top_terms.index = top_terms.index.str.capitalize()
            elif split_type == 'categories':
                top_terms.index = top_terms.index.str.replace('_', '/')
            top_terms.index.name = split_type.capitalize()
            top_terms.columns = ['Top terms']
            if split_type == 'identity':
                top_terms = top_terms.rename(index={'Lgbtq+ people': 'LGBTQ+ people', 'Muslims and arabic middle eastern people': 'Muslims, Arabs'})
            display(top_terms)

3000 vocab
10 smoothing


Unnamed: 0_level_0,Top terms
Identities,Unnamed: 1_level_1
asian_people,"ching, chinese, chong, chinaman, china, asian, japanese, chink, asians, korean"
black_people,"ethiopian, niggas, black, nigga, blm, negro, africa, paint, nfl, nigger"
christians,"priest, catholic, priests, jesus, bible, christianity, catholics, christ, francis, church"
jews,"jewish, chamber, holocaust, oven, gas, zionists, jews, hitler, zionist, jew"
latinx_people,"beaner, latino, latinos, mexico, latin, mexican, spic, mexicans, beaners, puerto"
lgbtq+_people,"transgender, transgendered, transgenderism, trans, transgenders, bisexual, bathrooms, disorder, queers, bathroom"
men,"divorce, dudes, negative, movies, soy, professional, field, bloom, male, men"
muslims_and_arabic_middle_eastern_people,"allahu, akbar, muzrat, islam, islamic, mohammed, muslim, islamophobia, islamist, isis"
white_people,"redneck, supremacists, supremacist, supremacy, white, trailer, shootings, whitey, mudshark, fascist"
women,"hoes, feminism, sexist, hoe, feminist, feminists, chad, woman, slut, tits"


Unnamed: 0_level_0,Top terms
Categories,Unnamed: 1_level_1
gender/sexuality,"dyke, hoes, transgender, feminine, trans, dykes, fag, queer, lesbian, locker"
race/ethnicity,"ethiopian, chinese, asian, mexicans, black, hispanic, blacks, asians, nationalists, races"
religion,"priest, catholic, catholics, christians, evangelicals, rabbi, christianity, christian, clergy, bishops"


Unnamed: 0_level_0,Top terms
Power,Unnamed: 1_level_1
hegemonic,"priest, priests, catholic, jesus, allcaps, catholics, virgin, bloom, dudes, pope"
marginalized,"muslim, muslims, she, islam, woman, her, nigger, black, jews, women"


20 smoothing


Unnamed: 0_level_0,Top terms
Identities,Unnamed: 1_level_1
asian_people,"chinese, ching, chong, china, asian, chinaman, japanese, asians, chink, japan"
black_people,"niggas, ethiopian, black, nigga, blm, negro, africa, nigger, african, nfl"
christians,"priest, catholic, jesus, priests, bible, christianity, catholics, christians, church, christ"
jews,"jewish, holocaust, jews, chamber, oven, gas, jew, hitler, israel, zionist"
latinx_people,"beaner, latino, latinos, mexico, mexican, spic, mexicans, latin, beaners, hispanic"
lgbtq+_people,"transgender, transgendered, trans, transgenders, bisexual, transgenderism, bathrooms, queers, disorder, bathroom"
men,"divorce, dudes, negative, movies, soy, men, male, professional, bloom, field"
muslims_and_arabic_middle_eastern_people,"islam, islamic, muslim, allahu, akbar, isis, islamophobia, mohammed, muslims, muzrat"
white_people,"redneck, supremacists, supremacist, white, supremacy, mudshark, shootings, fascist, whitey, trailer"
women,"hoes, sexist, feminism, hoe, feminist, feminists, woman, slut, women, chad"


Unnamed: 0_level_0,Top terms
Categories,Unnamed: 1_level_1
gender/sexuality,"dyke, hoes, transgender, fag, dykes, lesbian, trans, sexist, queer, hoe"
race/ethnicity,"chinese, ethiopian, asian, black, mexicans, blacks, asians, hispanic, africa, supremacists"
religion,"catholic, priest, catholics, christians, christian, christianity, evangelicals, bishops, rabbi, clergy"


Unnamed: 0_level_0,Top terms
Power,Unnamed: 1_level_1
hegemonic,"priest, priests, catholic, jesus, allcaps, catholics, virgin, bloom, dress, devil"
marginalized,"muslim, muslims, islam, she, woman, her, nigger, black, jews, jew"


50 smoothing


Unnamed: 0_level_0,Top terms
Identities,Unnamed: 1_level_1
asian_people,"chinese, china, asian, ching, chong, asians, japanese, chinaman, chink, japan"
black_people,"niggas, black, nigga, nigger, africa, blm, negro, ethiopian, blacks, african"
christians,"priest, catholic, jesus, priests, bible, christians, christianity, christian, church, catholics"
jews,"jewish, jews, holocaust, jew, israel, hitler, gas, oven, zionist, kike"
latinx_people,"latinos, latino, mexico, mexican, mexicans, beaner, spic, latin, hispanic, beaners"
lgbtq+_people,"transgender, transgendered, trans, transgenders, bisexual, queers, bathroom, bathrooms, fag, gay"
men,"divorce, dudes, men, male, negative, movies, man, priests, soy, dad"
muslims_and_arabic_middle_eastern_people,"islam, muslim, islamic, muslims, isis, terrorist, terrorists, iran, bomb, radical"
white_people,"redneck, supremacists, white, supremacist, supremacy, mudshark, trash, fascist, whites, shootings"
women,"hoes, sexist, woman, hoe, feminist, women, feminists, feminism, slut, bitches"


Unnamed: 0_level_0,Top terms
Categories,Unnamed: 1_level_1
gender/sexuality,"hoes, dyke, transgender, faggot, fag, sexist, sexual, lesbian, hoe, dykes"
race/ethnicity,"chinese, black, blacks, asian, asians, mexicans, whites, africa, supremacists, supremacist"
religion,"catholic, priest, catholics, christians, christian, christianity, religion, church, jesus, koran"


Unnamed: 0_level_0,Top terms
Power,Unnamed: 1_level_1
hegemonic,"priest, catholic, priests, jesus, allcaps, catholics, virgin, church, devil, dress"
marginalized,"muslim, muslims, she, islam, her, woman, nigger, black, jews, women"


100 smoothing


Unnamed: 0_level_0,Top terms
Identities,Unnamed: 1_level_1
asian_people,"chinese, asian, china, asians, japanese, ching, chong, chink, chinaman, indian"
black_people,"black, niggas, nigga, nigger, africa, blacks, blm, african, negro, lives"
christians,"priest, catholic, jesus, christians, christian, church, bible, christianity, priests, catholics"
jews,"jewish, jews, jew, holocaust, israel, hitler, kike, gas, zionist, oven"
latinx_people,"mexican, mexicans, latinos, latino, mexico, spic, hispanic, beaner, hispanics, latin"
lgbtq+_people,"transgender, trans, transgendered, transgenders, gay, faggot, faggots, bisexual, queers, fag"
men,"men, male, man, divorce, negative, dudes, boys, priests, dad, movies"
muslims_and_arabic_middle_eastern_people,"muslim, islam, islamic, muslims, terrorist, isis, terrorists, bomb, iran, radical"
white_people,"redneck, supremacists, white, supremacist, trash, mudshark, whites, supremacy, nazi, nazis"
women,"hoes, woman, women, sexist, hoe, feminist, bitch, bitches, her, feminists"


Unnamed: 0_level_0,Top terms
Categories,Unnamed: 1_level_1
gender/sexuality,"hoes, faggot, dyke, woman, transgender, fag, sexual, sexist, women, female"
race/ethnicity,"black, blacks, chinese, whites, asian, asians, white, racism, africa, racist"
religion,"catholic, christians, christian, priest, catholics, christianity, religion, church, islam, jesus"


Unnamed: 0_level_0,Top terms
Power,Unnamed: 1_level_1
hegemonic,"priest, catholic, priests, jesus, allcaps, catholics, church, white, supremacists, virgin"
marginalized,"muslim, muslims, she, black, her, women, woman, nigger, in, to"


In [21]:
# Just look at an individual run's output
# Format for LaTeX

import os
import pandas as pd
from IPython.display import display

split_types = ['identities', 'categories', 'power']
vocab_size = 3000
n_words = 10
smoothing = 50
for split_type in split_types:
    sage_outpath = f'/home/mamille3/SAGE/py-sage/output/{split_type}_{n_words}words_{vocab_size}vocab_{smoothing}smoothing.csv'
    results = pd.read_csv(sage_outpath, sep='\t')
    # results['split'] = results.source.str.slice(len(f'input/{split_type}/'), -1*(len('_sents.txt')))
    results['split'] = results.source.map(lambda x: os.path.basename(x)[:-1*(len('_sents.txt'))])
    # results

    pd.set_option('display.max_colwidth', None)
    top_terms = results.groupby('split').agg({'word': ', '.join})
    if split_type == 'identities':
        if 'people of color' in top_terms.index:
            top_terms.drop('people_of_color', inplace=True)
        top_terms.index = top_terms.index.str.replace('_', ' ').str.replace(' people', '')
        top_terms.index = top_terms.index.str.capitalize()
    elif split_type == 'categories':
        top_terms.index = top_terms.index.str.replace('_', '/')
    top_terms.index.name = split_type.capitalize()
    top_terms.columns = ['Top terms']
    if split_type == 'identities':
        top_terms = top_terms.rename(index={'Lgbtq+': 'LGBTQ+', 'Muslims and arabic middle eastern': 'Muslims, Arabs'})
    display(top_terms)

    latex = top_terms.style.to_latex(hrules=True, label=f'{split_type}_sage', caption=f'Most representative terms in corpora divided by target identity {split_type} from SAGE', environment='table*')
    latex = latex.replace(' & Top terms', f'{split_type.capitalize()} & Top terms').replace(f'{split_type.capitalize()} &  \\\\\n', '').replace(' people', '').replace('hegemonic', 'dominant')
    print(latex)

Unnamed: 0_level_0,Top terms
Identities,Unnamed: 1_level_1
Asian,"chinese, china, asian, ching, chong, asians, japanese, chinaman, chink, japan"
Black,"niggas, black, nigga, nigger, africa, blm, negro, ethiopian, blacks, african"
Christians,"priest, catholic, jesus, priests, bible, christians, christianity, christian, church, catholics"
Jews,"jewish, jews, holocaust, jew, israel, hitler, gas, oven, zionist, kike"
Latinx,"latinos, latino, mexico, mexican, mexicans, beaner, spic, latin, hispanic, beaners"
LGBTQ+,"transgender, transgendered, trans, transgenders, bisexual, queers, bathroom, bathrooms, fag, gay"
Men,"divorce, dudes, men, male, negative, movies, man, priests, soy, dad"
"Muslims, Arabs","islam, muslim, islamic, muslims, isis, terrorist, terrorists, iran, bomb, radical"
White,"redneck, supremacists, white, supremacist, supremacy, mudshark, trash, fascist, whites, shootings"
Women,"hoes, sexist, woman, hoe, feminist, women, feminists, feminism, slut, bitches"


\begin{table*}
\caption{Most representative terms in corpora divided by target identity identities from SAGE}
\label{identities_sage}
\begin{tabular}{ll}
\toprule
Identities & Top terms \\
\midrule
Asian & chinese, china, asian, ching, chong, asians, japanese, chinaman, chink, japan \\
Black & niggas, black, nigga, nigger, africa, blm, negro, ethiopian, blacks, african \\
Christians & priest, catholic, jesus, priests, bible, christians, christianity, christian, church, catholics \\
Jews & jewish, jews, holocaust, jew, israel, hitler, gas, oven, zionist, kike \\
Latinx & latinos, latino, mexico, mexican, mexicans, beaner, spic, latin, hispanic, beaners \\
LGBTQ+ & transgender, transgendered, trans, transgenders, bisexual, queers, bathroom, bathrooms, fag, gay \\
Men & divorce, dudes, men, male, negative, movies, man, priests, soy, dad \\
Muslims, Arabs & islam, muslim, islamic, muslims, isis, terrorist, terrorists, iran, bomb, radical \\
White & redneck, supremacists, white, supremacist

Unnamed: 0_level_0,Top terms
Categories,Unnamed: 1_level_1
gender/sexuality,"hoes, dyke, transgender, faggot, fag, sexist, sexual, lesbian, hoe, dykes"
race/ethnicity,"chinese, black, blacks, asian, asians, mexicans, whites, africa, supremacists, supremacist"
religion,"catholic, priest, catholics, christians, christian, christianity, religion, church, jesus, koran"


\begin{table*}
\caption{Most representative terms in corpora divided by target identity categories from SAGE}
\label{categories_sage}
\begin{tabular}{ll}
\toprule
Categories & Top terms \\
\midrule
gender/sexuality & hoes, dyke, transgender, faggot, fag, sexist, sexual, lesbian, hoe, dykes \\
race/ethnicity & chinese, black, blacks, asian, asians, mexicans, whites, africa, supremacists, supremacist \\
religion & catholic, priest, catholics, christians, christian, christianity, religion, church, jesus, koran \\
\bottomrule
\end{tabular}
\end{table*}



Unnamed: 0_level_0,Top terms
Power,Unnamed: 1_level_1
hegemonic,"priest, catholic, priests, jesus, allcaps, catholics, virgin, church, devil, dress"
marginalized,"muslim, muslims, she, islam, her, woman, nigger, black, jews, women"


\begin{table*}
\caption{Most representative terms in corpora divided by target identity power from SAGE}
\label{power_sage}
\begin{tabular}{ll}
\toprule
Power & Top terms \\
\midrule
dominant & priest, catholic, priests, jesus, allcaps, catholics, virgin, church, devil, dress \\
marginalized & muslim, muslims, she, islam, her, woman, nigger, black, jews, women \\
\bottomrule
\end{tabular}
\end{table*}



# Run SAGE with different settings
Tried to run it directly from Python script instead of through a shell command, but it's Python 2.7

In [1]:
import os
from tqdm.notebook import tqdm
os.chdir('/home/mamille3/SAGE/py-sage/')
from runSage import runSage

SyntaxError: Missing parentheses in call to 'print'. Did you mean print(s.its,'/',s.max_its,change)? (deltaIterator.py, line 16)

In [None]:
vocab_settings = [1500, 3000, 5000]
for vocab_setting in tqdm(vocab_settings):
    cmd = 