# Prepare corpora for SAGE
After the corpora are prepared, run SAGE from `~/SAGE/py-sage` (repo fork at https://github.com/michaelmilleryoder/SAGE/tree/master/py-sage).

In [1]:
# Load corpora/splits
import pickle
import os

# Directory that holds the pickled corpus splits.
data_dirpath = '/home/mamille3/hegemonic_hate/data'

# Name of the split scheme to load; it selects the pickle file below.
# (Previously used file: 'combined_identity_splits_kennedy2020+sbic+hatexplain+civilcomments_0.3hate.pkl')
split_type = 'power'
splits_fpath = os.path.join(
    data_dirpath,
    f'combined_{split_type}_kennedy2020+sbic+hatexplain+civilcomments_0.3hate.pkl',
)

# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open(splits_fpath, 'rb') as splits_file:
    corpora = pickle.load(splits_file)

# Show which splits were loaded.
corpora.keys()

dict_keys([('hegemonic',), ('marginalized',)])

In [2]:
import pandas as pd
from tqdm.notebook import tqdm
import spacy
import re

# Raw texts per split (train + test combined), keyed by the split key from `corpora`.
texts = {}

# Every listed pipeline component is excluded, so only tokenization runs;
# the rule-based sentencizer is added to supply sentence boundaries.
nlp = spacy.load('en_core_web_sm', exclude=['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner'])
nlp.add_pipe('sentencizer')

# Maps split keys (tuples) to the human-readable name used for the output filename.
split_transform = {
    ('race/ethnicity',): 'race/ethnicity',
    ('religion',): 'religion',
    ('gender', 'sexuality'): 'gender/sexuality',
    ('hegemonic',): 'hegemonic',
    ('marginalized',): 'marginalized',
}

# The output directory is the same for every split, so create it once up front.
# makedirs(exist_ok=True) also creates missing parents and is safe to re-run.
out_dirpath = os.path.join(data_dirpath, split_type)
os.makedirs(out_dirpath, exist_ok=True)

for split in tqdm(corpora):
    data = pd.concat([corpora[split]['train'], corpora[split]['test']])
    texts[split] = data.text.tolist()

    # Tokenize and sentence-split; one lowercased, space-joined sentence per entry.
    processed = []
    inp = texts[split]
    for doc in nlp.pipe(inp):
        for sent in doc.sents:
            # Skip whitespace tokens (e.g. '\n') so a sentence never spans several
            # output lines, and drop sentences that end up empty.
            line = ' '.join(tok.text for tok in sent if not tok.is_space).lower()
            if line:
                processed.append(line)

    # Resolve a filename-safe name for the split. Keys missing from split_transform
    # may be tuples, which re.sub cannot handle, so join their parts first.
    split_name = split_transform.get(split)
    if split_name is None:
        split_name = split if isinstance(split, str) else '/'.join(map(str, split))
    outpath = os.path.join(out_dirpath, f'{re.sub(r"[ /,]", "_", split_name)}_sents.txt')

    # Explicit UTF-8 so the output does not depend on the platform default encoding.
    with open(outpath, 'w', encoding='utf-8') as f:
        for sent in processed:
            f.write(f'{sent}\n')

  0%|          | 0/2 [00:00<?, ?it/s]

# Read, format SAGE output

In [3]:
import os
import pandas as pd

# Which SAGE run to read: the split scheme and the number of top words per corpus.
split_type = 'power'
n_words = 10
sage_outpath = f'/home/mamille3/SAGE/py-sage/output/{split_type}_{n_words}words.csv'

# The SAGE output is tab-separated despite the .csv extension.
results = pd.read_csv(sage_outpath, sep='\t')

# Derive the split name from the source path by cutting off the
# 'input/<split_type>/' prefix and the '_sents.txt' suffix.
prefix_len = len(f'input/{split_type}/')
suffix_len = len('_sents.txt')
results['split'] = results.source.str[prefix_len:-suffix_len]
results

Unnamed: 0,source,word,sage,base_count,base_rate,file_count,file_rate,split
0,input/power/hegemonic_sents.txt,male,0.763573,583,0.000591,244,0.001278,hegemonic
1,input/power/hegemonic_sents.txt,men,0.728685,1999,0.002025,804,0.004212,hegemonic
2,input/power/hegemonic_sents.txt,males,0.586468,267,0.00027,95,0.000498,hegemonic
3,input/power/hegemonic_sents.txt,gender,0.556381,343,0.000347,118,0.000618,hegemonic
4,input/power/hegemonic_sents.txt,faggot,0.553751,519,0.000526,177,0.000927,hegemonic
5,input/power/hegemonic_sents.txt,sexual,0.548172,298,0.000302,102,0.000534,hegemonic
6,input/power/hegemonic_sents.txt,faggots,0.518086,193,0.000196,65,0.00034,hegemonic
7,input/power/hegemonic_sents.txt,gay,0.517563,1301,0.001318,425,0.002226,hegemonic
8,input/power/hegemonic_sents.txt,boys,0.510372,306,0.00031,101,0.000529,hegemonic
9,input/power/hegemonic_sents.txt,man,0.508329,2192,0.002221,708,0.003709,hegemonic


In [4]:
# Build a one-row-per-split table whose single column lists that split's words,
# comma-separated, in the order they appear in `results`.
pd.set_option('display.max_colwidth', None)  # show the full term list untruncated
top_terms = results.groupby('split').agg({'word': ', '.join})

if split_type == 'identity':
    # Drop the 'people_of_color' row if present; errors='ignore' keeps the cell
    # runnable when that group is absent from the SAGE output.
    top_terms = top_terms.drop(index='people_of_color', errors='ignore')
    # Turn filename-style names back into display names ("some_group" -> "Some group").
    top_terms.index = top_terms.index.str.replace('_', ' ', regex=False).str.capitalize()
elif split_type == 'categories':
    # Category filenames used '_' in place of '/'.
    top_terms.index = top_terms.index.str.replace('_', '/', regex=False)

top_terms.index.name = split_type.capitalize()
top_terms.columns = ['Top terms']

if split_type == 'identity':
    # Fix display names that plain capitalization gets wrong, and shorten the long one.
    top_terms = top_terms.rename(index={'Lgbtq+ people': 'LGBTQ+ people', 'Muslims and arabic middle eastern people': 'Muslims, Arabs'})
top_terms

Unnamed: 0_level_0,Top terms
Power,Unnamed: 1_level_1
hegemonic,"male, men, males, gender, faggot, sexual, faggots, gay, boys, man"
marginalized,"nigger, black, blacks, brown, whites, niggas, african, cops, niggers, africa"


In [5]:
# Render the top-terms table as a LaTeX `table*` and print the source.
index_label = split_type.capitalize()
latex = top_terms.style.to_latex(hrules=True, label='identity_sage', caption=f'Most representative terms in corpora divided by target identity {split_type} from SAGE', environment='table*')
# Styler emits the index name on its own header row. Move it into the empty first
# cell of the column-header row, then delete the now-redundant index-name row.
# The label comes from split_type (not a hardcoded 'Power') so other split types
# get the right header.
latex = latex.replace(' & Top terms', f'{index_label} & Top terms').replace(f'{index_label} &  \\\\\n', '')
print(latex)

\begin{table*}
\caption{Most representative terms in corpora divided by target identity power from SAGE}
\label{identity_sage}
\begin{tabular}{ll}
\toprule
Power & Top terms \\
\midrule
hegemonic & male, men, males, gender, faggot, sexual, faggots, gay, boys, man \\
marginalized & nigger, black, blacks, brown, whites, niggas, african, cops, niggers, africa \\
\bottomrule
\end{tabular}
\end{table*}

