# Prepare corpora for SAGE
After preparing the corpora, run SAGE from ~/SAGE/py-sage (repo fork at https://github.com/michaelmilleryoder/SAGE/tree/master/py-sage).  
See runSage_grid.py, which runs runSage.py over multiple parameter settings.

## Create subsections for SAGE of identities within corpora

In [1]:
# Load data (tweet texts)
import os

# Name of the corpus split; it is also the sub-directory of the SAGE input
# directory that the section text files are read from.
split_type = '0_pro_anti_bot_human'
dirpath = os.path.join(
    '/home/huixiann/2022_socialbias_vaccine/michael/SAGE/py-sage/input/',
    split_type,
)
# Section name -> list of token lists (one token list per line of the file).
processed = dict()

def remove_tok(tok):
    """Return True if the token should be dropped.

    A token is dropped when it starts with 'http', starts with '#',
    or consists only of numeric characters.
    """
    if tok.startswith(('http', '#')):
        return True
    return tok.isnumeric()

# Read every section file into a list of token lists, keyed by the file name
# up to its first '.', dropping the tokens rejected by remove_tok.
for fname in sorted(os.listdir(dirpath)):
    if fname.startswith('.'):
        # Skip hidden entries, as process_tweet_dump does for its input
        # directory; they are not section files.
        continue
    fpath = os.path.join(dirpath, fname)
    # The section files hold tweet text that is encoded as UTF-8 when it is
    # written out, so decode explicitly instead of relying on the locale.
    with open(fpath, 'r', encoding='utf8') as f:
        processed[fname.split('.')[0]] = [
            [tok for tok in doc.split() if not remove_tok(tok)]
            for doc in f.read().splitlines()
        ]

processed.keys()

dict_keys(['0_anti-bot_sents', '0_anti-human_sents', '0_pro-bot_sents', '0_pro-human_sents'])

In [2]:
# Load identity terms (terms of interest)
import json

# JSON file mapping each identity category to its collection of terms.
identities_fpath = '../identities.json'
with open(identities_fpath) as fh:
    identities = json.loads(fh.read())
identities.keys()

dict_keys(['gender/sexuality', 'age', 'race/ethnicity/nationality', 'religion', 'class', 'medical'])

In [14]:
# Search for identities
from tqdm.notebook import tqdm

# term -> section -> tweets (re-joined into strings) whose tokens include the term
matches = {}
# Terms whose total number of matching tweets reaches the reporting threshold
matching_terms = []
for cat in tqdm(identities):
    for term in identities[cat]:
        # `term in tweet` is a membership test on the token list, so only
        # tweets containing the term as a whole token are kept.
        per_section = {
            section: [' '.join(tweet) for tweet in tweets if term in tweet]
            for section, tweets in processed.items()
        }
        matches[term] = per_section
        total_matches = sum(map(len, per_section.values()))
        if total_matches < 10000:
            continue
        print(f'{term}: {total_matches}')
        matching_terms.append(term)

  0%|          | 0/6 [00:00<?, ?it/s]

woman: 14683
women: 30963
man: 26631
men: 10200
dad: 11752
children: 40471
young: 25769
old: 25086
elderly: 13614
aged: 48255
indian: 33929
black: 34157
white: 40551
american: 73938
americans: 52698
african: 27791
rich: 11951
poor: 13573


In [13]:
# Save out: one directory per identity term, holding one text file per corpus
# section with the matching tweets, one tweet per line.
dirpath = '/home/huixiann/2022_socialbias_vaccine/michael/SAGE/py-sage/input/0_pro_anti_bot_human_byidentity'
# makedirs(exist_ok=True) creates missing parents and does not fail when the
# directory already exists, which avoids the check-then-create race.
os.makedirs(dirpath, exist_ok=True)

for term, sections in matches.items():
    out_dirpath = os.path.join(dirpath, term)
    os.makedirs(out_dirpath, exist_ok=True)
    for section, sents in sections.items():
        outpath = os.path.join(out_dirpath, '{}_{}.txt'.format(section, term))
        # Tweet text may contain non-ASCII characters; write UTF-8 explicitly
        # instead of depending on the platform's default encoding.
        with open(outpath, 'w', encoding='utf8') as f:
            f.writelines(sent + '\n' for sent in sents)

## From JSON -> text file input

In [1]:
import json
import os
from nltk.tokenize import TweetTokenizer
from tqdm import tqdm
from multiprocessing import Pool

In [10]:
def process_tweet_dump(section):
    """ Extract text and tokenize a set of tweet dumps

    Reads every non-hidden file in the ``section`` sub-directory of the
    ``1_bycontinent`` data directory, parses each line as JSON, tokenizes the
    record's ``text`` field with the module-level ``tokenizer``, lowercases
    the space-joined tokens and writes one tweet per line to
    ``../input/1_bycontinent/<section>_sents.txt``.

    Args:
        section: name of the sub-directory holding the tweet dump files;
            also used as the prefix of the output file name.

    Returns:
        None. The tokenized tweets are written to disk.

    Raises:
        KeyError: if a parsed JSON object has no ``text`` key.
    """

    texts = []
    # dirpath = os.path.join('/home/huixiann/2022_socialbias_vaccine/', section)
    dirpath = os.path.join('/home/huixiann/2022_socialbias_vaccine/1_bycontinent', section)
    for fname in tqdm(os.listdir(dirpath)):
        if fname.startswith('.'):
            # Hidden entries are not tweet dumps
            continue
        fpath = os.path.join(dirpath, fname)
        # Decode explicitly so the result does not depend on the locale
        with open(fpath, encoding='utf8') as f:
            lines = f.read().splitlines()
        for line in lines:
            try:
                tweet = json.loads(line)
            except ValueError:
                # Lines that are not valid JSON are skipped
                continue
            text = ' '.join(tokenizer.tokenize(tweet['text'])).lower()
            texts.append(text)

    # Save out
    out_dirpath = os.path.join('../input/1_bycontinent')
    # Also creates '../input' when missing and tolerates an existing directory
    os.makedirs(out_dirpath, exist_ok=True)
    outpath = os.path.join(out_dirpath, '{}_sents.txt'.format(section))
    # Encode at the file boundary: on Python 3, `sent.encode('utf8') + '\n'`
    # mixes bytes and str and raises TypeError.
    with open(outpath, 'w', encoding='utf8') as f:
        for sent in texts:
            f.write(sent + '\n')

In [11]:
# Explore json tweets (Lynnette files)
# The tokenizer is defined at module level because process_tweet_dump reads
# it as a global.
tokenizer = TweetTokenizer(strip_handles=True)

# Worker pool for the parallel run below; an earlier setting was Pool(4).
pool = Pool(8)

# Sections to process. Earlier alternatives:
# sections = ['0_anti-bot', '0_anti-human', '0_pro-bot', '0_pro-human']
# sections = sorted(os.listdir('/home/huixiann/2022_socialbias_vaccine/1_bycontinent/'))
sections = ['others']

# Parallel version:
# list(tqdm(pool.imap(process_tweet_dump, sections), total=len(sections)))
# Serial version, for debugging:
[process_tweet_dump(section) for section in sections]

100%|██████████| 56/56 [00:48<00:00,  1.15it/s]


[None]

In [None]:
# for section in ['0_anti-bot', '0_anti-human', '0_pro-bot', '0_pro-human']:
#     print(section)
#     texts = []
#     dirpath = os.path.join('/home/huixiann/2022_socialbias_vaccine/', section)
#     for fname in tqdm(os.listdir(dirpath)):
#         if fname.startswith('.'):
#             continue
#         fpath = os.path.join(dirpath, fname)
#         with open(fpath) as f:
#             lines = f.read().splitlines()
#             for line in lines:
#                 tweet = json.loads(line)
#                 text = ' '.join(tokenizer.tokenize(tweet['text'])).lower()
#                 texts.append(text)

#     # Save out
#     out_dirpath = os.path.join('../input')
#     if not os.path.exists(out_dirpath):
#         os.mkdir(out_dirpath)
#     outpath = os.path.join(out_dirpath, '{}_sents.txt'.format(section))
#     with open(outpath, 'w') as f:
#         for sent in texts:
#             f.write(sent.encode('utf8') + '\n')