# Prepare corpora for SAGE
After the per-split sentence files are written, run SAGE on them from ~/SAGE/py-sage (repo fork at https://github.com/michaelmilleryoder/SAGE/tree/master/py-sage).

In [74]:
# Load identity corpora
import pickle
import os

# Locate the pickled corpus splits for the chosen split type.
# NOTE: an earlier (commented-out) version of this cell pointed at
# 'combined_identity_splits_kennedy2020+sbic+hatexplain+civilcomments_0.3hate.pkl',
# a file name that the pattern below does not produce for split_type = 'identity'.
split_type = 'categories'
data_dirpath = '/home/mamille3/hegemonic_hate/data'
splits_fpath = os.path.join(
    data_dirpath,
    f'combined_{split_type}_kennedy2020+sbic+hatexplain+civilcomments_0.3hate.pkl',
)

# SECURITY: pickle.load can execute arbitrary code, so only open trusted files here.
with open(splits_fpath, mode='rb') as pkl_file:
    corpora = pickle.load(pkl_file)

# Display the split keys that were loaded.
corpora.keys()

dict_keys([('race/ethnicity',), ('religion',), ('gender', 'sexuality')])

In [76]:
import pandas as pd
from tqdm.notebook import tqdm
import spacy
import re

# Raw texts per split, keyed by the same keys as `corpora`.
texts = {}

# Load en_core_web_sm with the listed components excluded, then add the
# 'sentencizer' pipe, which supplies the `doc.sents` used below.
nlp = spacy.load('en_core_web_sm', exclude=['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner'])
nlp.add_pipe('sentencizer')

# Readable labels for split keys that are tuples of category names.
split_transform = {
    ('race/ethnicity',): 'race/ethnicity',
    ('religion',): 'religion',
    ('gender', 'sexuality'): 'gender/sexuality',
}


def split_file_stem(split):
    """Return a file-name stem for a corpus split key.

    Keys listed in ``split_transform`` use their mapped label. Any other key is
    used as-is when it is a string; a non-string key (e.g. a tuple of names) is
    joined with '/' so that ``re.sub`` always receives a string. Spaces,
    slashes and commas are then replaced with underscores.
    """
    label = split_transform.get(split, split)
    if not isinstance(label, str):
        label = '/'.join(str(part) for part in label)
    return re.sub(r"[ /,]", "_", label)


# Every split is written to the same directory, so create it once up front.
# makedirs(exist_ok=True) also copes with missing parents and repeated runs.
out_dirpath = os.path.join(data_dirpath, split_type)
os.makedirs(out_dirpath, exist_ok=True)

for split in tqdm(corpora):
    # Use the train and test folds together.
    data = pd.concat([corpora[split]['train'], corpora[split]['test']])
    texts[split] = data.text.tolist()

    # One lowercased, space-joined token string per detected sentence.
    # NOTE(review): tokens are joined verbatim, so a whitespace token that
    # contains a newline would spread a sentence over several output lines --
    # confirm whether that matters for SAGE.
    processed = []
    for doc in nlp.pipe(texts[split]):
        processed.extend(
            ' '.join(tok.text for tok in sent).strip().lower() for sent in doc.sents
        )

    # Write one sentence per line. The encoding is explicit so that non-ASCII
    # text does not depend on the locale's default encoding.
    outpath = os.path.join(out_dirpath, f'{split_file_stem(split)}_sents.txt')
    with open(outpath, 'w', encoding='utf-8') as f:
        f.writelines(f'{sent}\n' for sent in processed)

  0%|          | 0/3 [00:00<?, ?it/s]

# Read and format SAGE output

In [80]:
import os
import pandas as pd

# Select which SAGE output file to read; both values only feed the file name
# and the path prefix stripped below.
split_type = 'categories'
n_words = 10
sage_outpath = f'/home/mamille3/SAGE/py-sage/output/{split_type}_{n_words}words.csv'

# The file is tab-separated despite its .csv extension.
# NOTE(review): read_csv's default NA handling parses strings such as 'null' or
# 'nan' as missing values -- confirm no such term occurs in the `word` column.
results = pd.read_csv(sage_outpath, sep='\t')

# Derive the split name from each source path by cutting off as many characters
# as 'input/<split_type>/' and '_sents.txt' are long (by length, not by matching).
prefix_len = len(f'input/{split_type}/')
suffix_len = len('_sents.txt')
results['split'] = results['source'].str[prefix_len:-suffix_len]
results

Unnamed: 0,source,word,sage,base_count,base_rate,file_count,file_rate,split
0,input/categories/race_ethnicity_sents.txt,chinese,1.097288,1098,0.001064,989,0.00318,race_ethnicity
1,input/categories/race_ethnicity_sents.txt,japanese,1.02071,274,0.000266,230,0.00074,race_ethnicity
2,input/categories/race_ethnicity_sents.txt,asian,1.008822,914,0.000886,754,0.002424,race_ethnicity
3,input/categories/race_ethnicity_sents.txt,asians,0.982038,975,0.000945,783,0.002518,race_ethnicity
4,input/categories/race_ethnicity_sents.txt,china,0.855379,581,0.000563,412,0.001325,race_ethnicity
5,input/categories/race_ethnicity_sents.txt,indians,0.741722,227,0.00022,145,0.000466,race_ethnicity
6,input/categories/race_ethnicity_sents.txt,korea,0.740923,197,0.000191,126,0.000405,race_ethnicity
7,input/categories/race_ethnicity_sents.txt,indian,0.684041,353,0.000342,212,0.000682,race_ethnicity
8,input/categories/race_ethnicity_sents.txt,whites,0.651464,646,0.000626,374,0.001203,race_ethnicity
9,input/categories/race_ethnicity_sents.txt,dog,0.605835,236,0.000229,132,0.000424,race_ethnicity


In [86]:
# Show full cell contents so the joined term lists are not truncated.
pd.set_option('display.max_colwidth', None)

# Index labels whose capitalized form needs a manual correction (identity splits only).
identity_display_names = {
    'Lgbtq+ people': 'LGBTQ+ people',
    'Muslims and arabic middle eastern people': 'Muslims, Arabs',
}

# One row per split: its words joined into a single comma-separated string,
# in the order they appear in `results`.
top_terms = results.groupby('split').agg({'word': ', '.join})

if split_type == 'identity':
    # Leave out the 'people_of_color' split; errors='ignore' keeps the cell
    # working when that split is absent from the results.
    top_terms = top_terms.drop('people_of_color', errors='ignore')
    # Turn file-name style labels into display labels, then apply the corrections.
    top_terms.index = top_terms.index.str.replace('_', ' ', regex=False).str.capitalize()
    top_terms = top_terms.rename(index=identity_display_names)
elif split_type == 'categories':
    # Restore the '/' that was replaced with '_' in the file names.
    top_terms.index = top_terms.index.str.replace('_', '/', regex=False)

top_terms.index.name = split_type.capitalize()
top_terms.columns = ['Top terms']
top_terms

SyntaxError: invalid character in identifier (3743235516.py, line 9)

In [90]:
# Render the table as LaTeX inside a table* environment with horizontal rules.
table_caption = f'Most representative terms in corpora divided by target identity {split_type} from SAGE'
latex = top_terms.style.to_latex(
    environment='table*',
    caption=table_caption,
    label='identity_sage',
    hrules=True,
)

# Post-process the header: label the blank index column, then remove the
# separate row that carries only the index name.
index_name_row = f'{split_type.capitalize()} &  \\\\\n'
latex = latex.replace(' & Top terms', 'Identity category & Top terms')
latex = latex.replace(index_name_row, '')
print(latex)

\begin{table*}
\caption{Most representative terms in corpora divided by target identity categories from SAGE}
\label{identity_sage}
\begin{tabular}{ll}
\toprule
Identity category & Top terms \\
\midrule
gender/sexuality & males, male, faggot, men, man, dick, 128514, faggots, sexual, pathetic \\
race/ethnicity & chinese, japanese, asian, asians, china, indians, korea, indian, whites, dog \\
religion & isis, islam, islamic, sharia, muslims, muslim, terrorist, arabia, iran, saudi \\
\bottomrule
\end{tabular}
\end{table*}

