# Prepare corpora for SAGE
After writing the sentence files below, run SAGE from ~/SAGE/py-sage (repo fork at https://github.com/michaelmilleryoder/SAGE/tree/master/py-sage).

In [6]:
# Load corpora/splits
import pickle
import os

# Which split file to load; the value is also embedded in the pickle's filename
split_type = 'power'
data_dirpath = '/home/mamille3/hegemonic_hate/data'

# Alternative input kept for reference:
# splits_fpath = os.path.join(data_dirpath, 'combined_identity_splits_kennedy2020+sbic+hatexplain+civilcomments_0.3hate.pkl')
splits_fname = f'combined_{split_type}_kennedy2020+sbic+hatexplain+civilcomments_0.3hate.pkl'
splits_fpath = os.path.join(data_dirpath, splits_fname)

# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source
with open(splits_fpath, 'rb') as pkl_file:
    corpora = pickle.load(pkl_file)

# Show which splits were loaded
corpora.keys()

dict_keys([('hegemonic',), ('marginalized',)])

In [7]:
import pandas as pd
from tqdm.notebook import tqdm
import spacy
import re

texts = {}

# Every trained component is excluded and the rule-based sentencizer is added,
# so the pipeline only tokenizes and marks sentence boundaries.
nlp = spacy.load('en_core_web_sm', exclude=['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner'])
nlp.add_pipe('sentencizer')

# String names for the tuple keys of `corpora`
split_transform = {
    ('race/ethnicity',): 'race/ethnicity',
    ('religion',): 'religion',
    ('gender', 'sexuality'): 'gender/sexuality',
    ('hegemonic',): 'hegemonic',
    ('marginalized',): 'marginalized',
}


def _split_name(split):
    """Return a string name for a key of `corpora`.

    Keys listed in `split_transform` use the mapped name. An unlisted key is
    returned unchanged if it is already a string; otherwise its parts are
    joined with '/' so the result can always be passed to `re.sub`.
    """
    name = split_transform.get(split, split)
    if isinstance(name, str):
        return name
    return '/'.join(str(part) for part in name)


def _doc_sentences(doc):
    """Return the sentences of `doc` as lowercased, space-joined token strings.

    Whitespace-only tokens (e.g. newlines in the raw text) are dropped so each
    sentence stays on one line of the output file; sentences left empty after
    that are skipped.
    """
    sentences = []
    for sent in doc.sents:
        line = ' '.join(tok.text for tok in sent if not tok.is_space).strip().lower()
        if line:
            sentences.append(line)
    return sentences


# The output directory is the same for every split, so create it once.
# makedirs also creates missing parents and tolerates an existing directory.
out_dirpath = os.path.join(data_dirpath, split_type)
os.makedirs(out_dirpath, exist_ok=True)

for split in tqdm(corpora):
    # Train and test are pooled: every text of the split goes into one file
    data = pd.concat([corpora[split]['train'], corpora[split]['test']])
    texts[split] = data.text.tolist()

    # Sentence-split and normalise every document of this split
    processed = []
    for doc in nlp.pipe(texts[split]):
        processed.extend(_doc_sentences(doc))

    # Save out, one sentence per line; spaces, slashes and commas in the
    # split name are replaced with underscores for the filename
    fname = re.sub(r"[ /,]", "_", _split_name(split))
    outpath = os.path.join(out_dirpath, f'{fname}_sents.txt')
    with open(outpath, 'w', encoding='utf-8') as f:
        for sent in processed:
            f.write(f'{sent}\n')

  0%|          | 0/2 [00:00<?, ?it/s]

# Read, format SAGE output

In [9]:
import os
import pandas as pd

split_type = 'power'
n_words = 10
sage_outpath = f'/home/mamille3/SAGE/py-sage/output/{split_type}_{n_words}words.csv'

# The file is read as tab-separated even though its extension is .csv
results = pd.read_csv(sage_outpath, sep='\t')

# Split name = basename of the source path with the trailing '_sents.txt' sliced off
results['split'] = [os.path.basename(path)[:-len('_sents.txt')] for path in results.source]
results

Unnamed: 0,source,word,sage,base_count,base_rate,file_count,file_rate,split
0,/home/mamille3/hegemonic_hate/data/power/marginalized_sents.txt,nigger,0.023297,9905,0.001287,9614,0.001324,marginalized
1,/home/mamille3/hegemonic_hate/data/power/marginalized_sents.txt,black,0.019082,34071,0.004427,32834,0.004521,marginalized
2,/home/mamille3/hegemonic_hate/data/power/marginalized_sents.txt,muslims,0.011768,8948,0.001163,8632,0.001189,marginalized
3,/home/mamille3/hegemonic_hate/data/power/marginalized_sents.txt,she,0.001832,22809,0.002964,21754,0.002995,marginalized
4,/home/mamille3/hegemonic_hate/data/power/marginalized_sents.txt,her,0.001828,21290,0.002766,20317,0.002798,marginalized
5,/home/mamille3/hegemonic_hate/data/power/marginalized_sents.txt,muslim,0.001706,11673,0.001517,11209,0.001543,marginalized
6,/home/mamille3/hegemonic_hate/data/power/marginalized_sents.txt,my,0.001463,45124,0.005864,42804,0.005894,marginalized
7,/home/mamille3/hegemonic_hate/data/power/marginalized_sents.txt,woman,0.001315,10133,0.001317,9710,0.001337,marginalized
8,/home/mamille3/hegemonic_hate/data/power/marginalized_sents.txt,blacks,0.001166,8452,0.001098,8105,0.001116,marginalized
9,/home/mamille3/hegemonic_hate/data/power/marginalized_sents.txt,women,0.000803,28431,0.003694,26940,0.003709,marginalized


In [10]:
# Show the full comma-joined term lists instead of truncating them
pd.set_option('display.max_colwidth', None)

# One row per split: that split's words joined in row order
top_terms = results.groupby('split')['word'].agg(', '.join).to_frame('Top terms')

if split_type == 'identity':
    # Drop one split, then turn file-style names into display labels
    top_terms = top_terms.drop('people_of_color')
    top_terms.index = top_terms.index.str.replace('_', ' ').str.capitalize()
    # capitalize() lowercases everything after the first letter, so these two labels are set by hand
    top_terms = top_terms.rename(index={'Lgbtq+ people': 'LGBTQ+ people', 'Muslims and arabic middle eastern people': 'Muslims, Arabs'})
elif split_type == 'categories':
    # Underscores in these names stand in for '/'
    top_terms.index = top_terms.index.str.replace('_', '/')

top_terms.index.name = split_type.capitalize()
top_terms

Unnamed: 0_level_0,Top terms
Power,Unnamed: 1_level_1
hegemonic,"catholic, jesus, christian, christians, church, supremacist, christ, trash, males, boys"
marginalized,"nigger, black, muslims, she, her, muslim, my, woman, blacks, women"


In [11]:
# Render the table as LaTeX, then patch the header: the index name is put on the
# same line as the column header, and the line holding only the index name is removed.
index_label = split_type.capitalize()
latex = top_terms.style.to_latex(
    environment='table*',
    hrules=True,
    label=f'{split_type}_sage',
    caption=f'Most representative terms in corpora divided by target identity {split_type} from SAGE',
)
latex = latex.replace(' & Top terms', f'{index_label} & Top terms')
latex = latex.replace(f'{index_label} &  \\\\\n', '')
print(latex)

\begin{table*}
\caption{Most representative terms in corpora divided by target identity power from SAGE}
\label{power_sage}
\begin{tabular}{ll}
\toprule
Power & Top terms \\
\midrule
hegemonic & catholic, jesus, christian, christians, church, supremacist, christ, trash, males, boys \\
marginalized & nigger, black, muslims, she, her, muslim, my, woman, blacks, women \\
\bottomrule
\end{tabular}
\end{table*}

