# Look at instances of top terms in corpora

In [16]:
# Load corpora/splits
import pickle
import os

data_dirpath = '/home/mamille3/hegemonic_hate/data'
# splits_fpath = os.path.join(data_dirpath, 'combined_identity_splits_kennedy2020+sbic+hatexplain+civilcomments_0.3hate.pkl')
split_type = 'identities'
splits_fpath = os.path.join(data_dirpath, f'combined_{split_type}_kennedy2020+sbic+hatexplain+civilcomments_0.3hate.pkl')
with open(splits_fpath, 'rb') as f:
    corpora = pickle.load(f)
    
corpora.keys()

dict_keys([('men',), ('christians',), ('black people',), ('jews',), ('asian people',), ('women',), ('lgbtq+ people',), ('muslims and arabic/middle eastern people',), ('white people',), ('latinx people',)])

In [21]:
import pandas as pd

split = ('men',)
term = 'title'
data = pd.concat([corpora[split]['train'], corpora[split]['test']]).query('hate')
data['text_lower'] = data.text.str.lower()
selected = data[data.text_lower.str.contains(r'\b{}\b'.format(term))]
selected.sample(min(len(selected), 20)).loc[:, ['text_lower', 'hate', 'target_groups']]

Unnamed: 0_level_0,text_lower,hate,target_groups
text_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
30222,"you may think this title contains the word blаck, but it actually does not. autofag begone.",True,"[gay men, black people, transgender men, bisexual men]"
811644,some women do not have the aptitude to do a man's job.\n\ndoesn't sound fair? neither was your title.,True,"[men, women]"
30222,"you may think this title contains the word blаck, but it actually does not. autofag begone.",True,"[gay men, black people, transgender men, bisexual men]"
5931214,feinstein doesn't like catholics.\n\nhillary thinks president trump and all of his supporters are deplorable white nationalists.\n\nmaxine waters wants to impeach president trump and believes that his supporters are out to kill her because they are all white nationalists.\n\nchuck schumer still wants to take everyone's guns away as well as all politically-incorrect art/war memorials/statues.\n\ndemocratic sanctuary cities and many progressive judges don't believe that they have to follow federal immigration laws.\n\nstate democratic ags want to sue the department of education because secretary de vos is reviewing title ix policies which deny due process. they believe as most college administrators do that any man accused of sexual assault is always guilty.\n\nway to go democrats! please keep up your non-stop racist identity politics and indulgent victimhood! you are doing an excellent job!\n\nin 2020 maxine waters for president and chuck schumer for vice-president.\n\nhow could they possibly lose?,True,"[men, christians, white people]"
30222,"you may think this title contains the word blаck, but it actually does not. autofag begone.",True,"[gay men, black people, transgender men, bisexual men]"
7377,why do you fucking retards care his title is off one letter. eat cocks lmao. bunch of lame neckbeard chubby fucks,True,[men]
30222,"you may think this title contains the word blаck, but it actually does not. autofag begone.",True,"[gay men, black people, transgender men, bisexual men]"


# Prepare corpora for SAGE
Then run it at ~/SAGE/py-sage (repo fork at https://github.com/michaelmilleryoder/SAGE/tree/master/py-sage)  
Take a look at runSage_grid.py, which runs runSage.py over multiple parameter settings

In [5]:
# Load corpora/splits
import pickle
import os
import pandas as pd
from tqdm.notebook import tqdm
import spacy
import re

nlp = spacy.load('en_core_web_sm', exclude=['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner'])
nlp.add_pipe('sentencizer')

data_dirpath = '/home/mamille3/hegemonic_hate/data'
# splits_fpath = os.path.join(data_dirpath, 'combined_identity_splits_kennedy2020+sbic+hatexplain+civilcomments_0.3hate.pkl')

split_types = ['identities', 'categories', 'power']
for split_type in split_types:
    splits_fpath = os.path.join(data_dirpath, f'combined_{split_type}_kennedy2020+sbic+hatexplain+civilcomments_0.3hate.pkl')
    with open(splits_fpath, 'rb') as f:
        corpora = pickle.load(f)

    texts = {}

    split_transform = {
        ('race/ethnicity',): 'race/ethnicity',
        ('religion',): 'religion',
        ('gender', 'sexuality'): 'gender/sexuality',
        ('hegemonic',): 'hegemonic',
        ('marginalized',): 'marginalized',
    }

    # for split in list(corpora.keys())[:1]:
    for split in tqdm(corpora):
        data = pd.concat([corpora[split]['train'], corpora[split]['test']])
        hate = data.query('hate')
        texts[split] = hate.text.tolist()

        # Process, save out data
        processed = []
        inp = texts[split]
        # for doc in tqdm(nlp.pipe(inp), total=len(inp)):
        for doc in nlp.pipe(inp):
            sents = [' '.join([tok.text for tok in sent]).strip().lower() for sent in doc.sents]
            processed.extend(sents)

        # Save out
        out_dirpath = os.path.join(data_dirpath, split_type)
        if not os.path.exists(out_dirpath):
            os.mkdir(out_dirpath)
        outpath = os.path.join(out_dirpath, f'{re.sub(r"[ /,]", "_", split_transform.get(split, split[0]))}_sents.txt')
        with open(outpath, 'w') as f:
            for sent in processed:
                f.write(f'{sent}\n')

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

# Read, format SAGE output

In [7]:
import os
import pandas as pd
from IPython.display import display

split_types = ['identities', 'categories', 'power']
vocab_sizes = [3000] # [1000, 1500, 2000, 3000, 5000, 10000]
smoothing_rates = [10, 20, 50, 100]
for vocab_size in vocab_sizes:
    print(f'{vocab_size} vocab')
    for smoothing_rate in smoothing_rates:
        print(f'{smoothing_rate} smoothing')
        for split_type in split_types:
            n_words = 10
            sage_outpath = f'/home/mamille3/SAGE/py-sage/output/{split_type}_{n_words}words_{vocab_size}vocab_{smoothing_rate}smoothing.csv'
            results = pd.read_csv(sage_outpath, sep='\t')
            # results['split'] = results.source.str.slice(len(f'input/{split_type}/'), -1*(len('_sents.txt')))
            results['split'] = results.source.map(lambda x: os.path.basename(x)[:-1*(len('_sents.txt'))])
            # results

            pd.set_option('display.max_colwidth', None)
            top_terms = results.groupby('split').agg({'word': ', '.join})
            if split_type == 'identity':
                top_terms.drop('people_of_color', inplace=True)
                top_terms.index = top_terms.index.str.replace('_', ' ')
                top_terms.index = top_terms.index.str.capitalize()
            elif split_type == 'categories':
                top_terms.index = top_terms.index.str.replace('_', '/')
            top_terms.index.name = split_type.capitalize()
            top_terms.columns = ['Top terms']
            if split_type == 'identity':
                top_terms = top_terms.rename(index={'Lgbtq+ people': 'LGBTQ+ people', 'Muslims and arabic middle eastern people': 'Muslims, Arabs'})
            display(top_terms)

3000 vocab
10 smoothing


Unnamed: 0_level_0,Top terms
Identities,Unnamed: 1_level_1
asian_people,"chinese, ching, chong, chinaman, china, asian, japanese, japan, chink, asians"
black_people,"ethiopian, niggas, black, nigga, blm, negro, chicago, nigger, nfl, tree"
christians,"priest, catholic, bible, priests, vatican, christian, christianity, jesus, catholics, christians"
jews,"jewish, oven, holocaust, zionists, gas, chamber, jew, jews, hitler, israeli"
latinx_people,"latino, beaner, latinos, mexico, mexicans, mexican, spic, latin, beaners, puerto"
lgbtq+_people,"transgender, transgendered, transgenderism, trans, bisexual, disorder, transgenders, queers, bathrooms, bi"
men,"hunt, negative, boys, despicable, relationships, men, title, males, man, easier"
muslims_and_arabic_middle_eastern_people,"allahu, akbar, mohammed, islam, islamic, muslim, islamist, saudi, arabia, islamophobia"
white_people,"redneck, supremacists, snow, supremacist, jack, white, devil, fascist, rage, mail"
women,"hoes, sexist, feminists, feminist, feminism, hoe, chad, sluts, slut, woman"


Unnamed: 0_level_0,Top terms
Categories,Unnamed: 1_level_1
gender/sexuality,"dyke, hoes, transgender, feminine, trans, dykes, fag, queer, lesbian, locker"
race/ethnicity,"ethiopian, chinese, asian, mexicans, black, hispanic, blacks, asians, nationalists, races"
religion,"priest, catholic, catholics, christians, evangelicals, rabbi, christianity, christian, clergy, bishops"


Unnamed: 0_level_0,Top terms
Power,Unnamed: 1_level_1
hegemonic,"priest, catholic, priests, jesus, allcaps, christianity, virgin, dudes, pope, bishops"
marginalized,"muslims, muslim, woman, she, nigger, black, women, her, islam, jews"


20 smoothing


Unnamed: 0_level_0,Top terms
Identities,Unnamed: 1_level_1
asian_people,"chinese, ching, chong, china, chinaman, asian, japanese, asians, japan, chink"
black_people,"niggas, black, nigga, ethiopian, blm, negro, chicago, nigger, africa, nfl"
christians,"priest, catholic, bible, priests, christian, vatican, jesus, christianity, christians, catholics"
jews,"jewish, oven, holocaust, gas, jew, jews, hitler, zionists, chamber, zionist"
latinx_people,"latino, latinos, beaner, mexico, mexicans, mexican, spic, latin, beaners, hispanic"
lgbtq+_people,"transgender, transgendered, trans, transgenderism, bisexual, transgenders, disorder, queers, bathrooms, bathroom"
men,"hunt, negative, boys, men, man, males, despicable, relationships, title, easier"
muslims_and_arabic_middle_eastern_people,"islam, akbar, allahu, muslim, islamic, mohammed, saudi, terrorist, muslims, isis"
white_people,"redneck, supremacists, snow, supremacist, white, jack, devil, fascist, trash, rage"
women,"hoes, sexist, feminists, feminist, hoe, feminism, woman, slut, women, bitches"


Unnamed: 0_level_0,Top terms
Categories,Unnamed: 1_level_1
gender/sexuality,"dyke, hoes, transgender, fag, dykes, lesbian, trans, sexist, queer, hoe"
race/ethnicity,"chinese, ethiopian, asian, black, mexicans, blacks, asians, hispanic, africa, supremacists"
religion,"catholic, priest, catholics, christians, christian, christianity, evangelicals, bishops, rabbi, clergy"


Unnamed: 0_level_0,Top terms
Power,Unnamed: 1_level_1
hegemonic,"priest, catholic, priests, jesus, allcaps, christianity, virgin, christian, supremacist, pope"
marginalized,"muslims, muslim, woman, she, nigger, islam, black, her, women, jews"


50 smoothing


Unnamed: 0_level_0,Top terms
Identities,Unnamed: 1_level_1
asian_people,"chinese, asian, china, ching, chong, japanese, asians, chinaman, chink, japan"
black_people,"niggas, black, nigga, nigger, blm, negro, blacks, africa, african, ethiopian"
christians,"priest, catholic, bible, christian, priests, jesus, christians, christianity, catholics, pope"
jews,"jewish, jew, jews, holocaust, gas, hitler, oven, israel, kike, zionist"
latinx_people,"latino, latinos, mexicans, mexican, mexico, spic, beaner, hispanic, latin, beaners"
lgbtq+_people,"transgender, transgendered, trans, transgenders, bisexual, queers, disorder, bathroom, gay, queer"
men,"boys, men, man, negative, hunt, males, male, despicable, relationships, easier"
muslims_and_arabic_middle_eastern_people,"islam, muslim, islamic, muslims, terrorist, isis, saudi, akbar, terrorists, arabia"
white_people,"redneck, supremacists, supremacist, white, snow, trash, devil, jack, fascist, genocide"
women,"hoes, sexist, woman, women, feminist, feminists, hoe, bitches, slut, feminism"


Unnamed: 0_level_0,Top terms
Categories,Unnamed: 1_level_1
gender/sexuality,"hoes, dyke, transgender, faggot, fag, sexist, sexual, lesbian, hoe, dykes"
race/ethnicity,"chinese, black, blacks, asian, asians, mexicans, whites, africa, supremacists, supremacist"
religion,"catholic, priest, catholics, christians, christian, christianity, religion, church, jesus, koran"


Unnamed: 0_level_0,Top terms
Power,Unnamed: 1_level_1
hegemonic,"priest, catholic, priests, jesus, allcaps, christianity, christian, supremacist, virgin, church"
marginalized,"muslims, muslim, woman, she, black, nigger, women, her, islam, jews"


100 smoothing


Unnamed: 0_level_0,Top terms
Identities,Unnamed: 1_level_1
asian_people,"chinese, asian, china, asians, ching, japanese, chong, chinaman, chink, indian"
black_people,"black, niggas, nigga, nigger, blacks, blm, africa, african, negro, cops"
christians,"priest, catholic, christian, christians, jesus, bible, priests, christianity, catholics, church"
jews,"jewish, jews, jew, holocaust, kike, hitler, israel, gas, oven, zionist"
latinx_people,"mexicans, mexican, latino, latinos, mexico, spic, hispanic, hispanics, beaner, beaners"
lgbtq+_people,"transgender, trans, transgendered, gay, transgenders, faggot, faggots, queers, fag, bisexual"
men,"boys, men, man, males, male, negative, hunt, sex, don, self"
muslims_and_arabic_middle_eastern_people,"muslim, islam, muslims, islamic, terrorist, isis, terrorists, saudi, moslem, bomb"
white_people,"redneck, supremacists, white, supremacist, trash, snow, whites, genocide, devil, nazi"
women,"hoes, woman, women, sexist, her, bitches, feminist, feminists, hoe, bitch"


Unnamed: 0_level_0,Top terms
Categories,Unnamed: 1_level_1
gender/sexuality,"hoes, faggot, dyke, woman, transgender, fag, sexual, sexist, women, female"
race/ethnicity,"black, blacks, chinese, whites, asian, asians, white, racism, africa, racist"
religion,"catholic, christians, christian, priest, catholics, christianity, religion, church, islam, jesus"


Unnamed: 0_level_0,Top terms
Power,Unnamed: 1_level_1
hegemonic,"priest, catholic, priests, jesus, christian, allcaps, christianity, supremacist, church, white"
marginalized,"muslims, muslim, black, she, women, woman, her, nigger, in, to"


In [15]:
# Just look at an individual run's output
# Format for LaTeX

import os
import pandas as pd
from IPython.display import display

split_types = ['identities', 'categories', 'power']
vocab_size = 3000
n_words = 10
smoothing = 20
for split_type in split_types:
    sage_outpath = f'/home/mamille3/SAGE/py-sage/output/{split_type}_{n_words}words_{vocab_size}vocab_{smoothing}smoothing.csv'
    results = pd.read_csv(sage_outpath, sep='\t')
    # results['split'] = results.source.str.slice(len(f'input/{split_type}/'), -1*(len('_sents.txt')))
    results['split'] = results.source.map(lambda x: os.path.basename(x)[:-1*(len('_sents.txt'))])
    # results

    pd.set_option('display.max_colwidth', None)
    top_terms = results.groupby('split').agg({'word': ', '.join})
    if split_type == 'identities':
        if 'people of color' in top_terms.index:
            top_terms.drop('people_of_color', inplace=True)
        top_terms.index = top_terms.index.str.replace('_', ' ').str.replace(' people', '')
        top_terms.index = top_terms.index.str.capitalize()
    elif split_type == 'categories':
        top_terms.index = top_terms.index.str.replace('_', '/')
    top_terms.index.name = split_type.capitalize()
    top_terms.columns = ['Top terms']
    if split_type == 'identities':
        top_terms = top_terms.rename(index={'Lgbtq+': 'LGBTQ+', 'Muslims and arabic middle eastern': 'Muslims, Arabs'})
    display(top_terms)

    latex = top_terms.style.to_latex(hrules=True, label=f'{split_type}_sage', caption=f'Most representative terms in corpora divided by target identity {split_type} from SAGE', environment='table*')
    latex = latex.replace(' & Top terms', f'{split_type.capitalize()} & Top terms').replace(f'{split_type.capitalize()} &  \\\\\n', '').replace(' people', '').replace('hegemonic', 'dominant')
    print(latex)

Unnamed: 0_level_0,Top terms
Identities,Unnamed: 1_level_1
Asian,"chinese, ching, chong, china, chinaman, asian, japanese, asians, japan, chink"
Black,"niggas, black, nigga, ethiopian, blm, negro, chicago, nigger, africa, nfl"
Christians,"priest, catholic, bible, priests, christian, vatican, jesus, christianity, christians, catholics"
Jews,"jewish, oven, holocaust, gas, jew, jews, hitler, zionists, chamber, zionist"
Latinx,"latino, latinos, beaner, mexico, mexicans, mexican, spic, latin, beaners, hispanic"
LGBTQ+,"transgender, transgendered, trans, transgenderism, bisexual, transgenders, disorder, queers, bathrooms, bathroom"
Men,"hunt, negative, boys, men, man, males, despicable, relationships, title, easier"
"Muslims, Arabs","islam, akbar, allahu, muslim, islamic, mohammed, saudi, terrorist, muslims, isis"
White,"redneck, supremacists, snow, supremacist, white, jack, devil, fascist, trash, rage"
Women,"hoes, sexist, feminists, feminist, hoe, feminism, woman, slut, women, bitches"


\begin{table*}
\caption{Most representative terms in corpora divided by target identity identities from SAGE}
\label{identities_sage}
\begin{tabular}{ll}
\toprule
Identities & Top terms \\
\midrule
Asian & chinese, ching, chong, china, chinaman, asian, japanese, asians, japan, chink \\
Black & niggas, black, nigga, ethiopian, blm, negro, chicago, nigger, africa, nfl \\
Christians & priest, catholic, bible, priests, christian, vatican, jesus, christianity, christians, catholics \\
Jews & jewish, oven, holocaust, gas, jew, jews, hitler, zionists, chamber, zionist \\
Latinx & latino, latinos, beaner, mexico, mexicans, mexican, spic, latin, beaners, hispanic \\
LGBTQ+ & transgender, transgendered, trans, transgenderism, bisexual, transgenders, disorder, queers, bathrooms, bathroom \\
Men & hunt, negative, boys, men, man, males, despicable, relationships, title, easier \\
Muslims, Arabs & islam, akbar, allahu, muslim, islamic, mohammed, saudi, terrorist, muslims, isis \\
White & redneck, su

Unnamed: 0_level_0,Top terms
Categories,Unnamed: 1_level_1
gender/sexuality,"dyke, hoes, transgender, fag, dykes, lesbian, trans, sexist, queer, hoe"
race/ethnicity,"chinese, ethiopian, asian, black, mexicans, blacks, asians, hispanic, africa, supremacists"
religion,"catholic, priest, catholics, christians, christian, christianity, evangelicals, bishops, rabbi, clergy"


\begin{table*}
\caption{Most representative terms in corpora divided by target identity categories from SAGE}
\label{categories_sage}
\begin{tabular}{ll}
\toprule
Categories & Top terms \\
\midrule
gender/sexuality & dyke, hoes, transgender, fag, dykes, lesbian, trans, sexist, queer, hoe \\
race/ethnicity & chinese, ethiopian, asian, black, mexicans, blacks, asians, hispanic, africa, supremacists \\
religion & catholic, priest, catholics, christians, christian, christianity, evangelicals, bishops, rabbi, clergy \\
\bottomrule
\end{tabular}
\end{table*}



Unnamed: 0_level_0,Top terms
Power,Unnamed: 1_level_1
hegemonic,"priest, catholic, priests, jesus, allcaps, christianity, virgin, christian, supremacist, pope"
marginalized,"muslims, muslim, woman, she, nigger, islam, black, her, women, jews"


\begin{table*}
\caption{Most representative terms in corpora divided by target identity power from SAGE}
\label{power_sage}
\begin{tabular}{ll}
\toprule
Power & Top terms \\
\midrule
dominant & priest, catholic, priests, jesus, allcaps, christianity, virgin, christian, supremacist, pope \\
marginalized & muslims, muslim, woman, she, nigger, islam, black, her, women, jews \\
\bottomrule
\end{tabular}
\end{table*}



# Run SAGE with different settings
Tried to run it directly from Python script instead of through a shell command, but it's Python 2.7

In [1]:
import os
from tqdm.notebook import tqdm
os.chdir('/home/mamille3/SAGE/py-sage/')
from runSage import runSage

SyntaxError: Missing parentheses in call to 'print'. Did you mean print(s.its,'/',s.max_its,change)? (deltaIterator.py, line 16)

In [None]:
vocab_settings = [1500, 3000, 5000]
for vocab_setting in tqdm(vocab_settings):
    cmd = 