# Use PMI to find the words most associated with identity terms in each corpus split

## Load, process data, calculate cooccurrences

In [2]:
# Load data (tweet texts)
import os

# Maps each input file's name (the part before its first '.') to that file's
# documents, each document being a list of kept tokens.
processed = dict()

# Sub-directory of the input folder holding the corpus split to analyse.
split_type = '0_pro_anti_bot_human'
dirpath = os.path.join(
    '/home/huixiann/2022_socialbias_vaccine/michael/SAGE/py-sage/input/',
    split_type,
)

def remove_tok(tok):
    """Tell whether a token should be discarded.

    A token is discarded when it begins with 'http', begins with '#',
    or consists solely of numeric characters.
    """
    if tok.startswith(('http', '#')):
        return True
    return tok.isnumeric()

# Read every file in the split directory: one document per line, tokens
# separated by whitespace, with unwanted tokens (see remove_tok) dropped.
for fname in sorted(os.listdir(dirpath)):
    fpath = os.path.join(dirpath, fname)
    # os.listdir also returns sub-directories (e.g. notebook checkpoint
    # folders); open() would fail on those, so only regular files are read.
    if not os.path.isfile(fpath):
        continue
    # Decode explicitly so the tokens do not depend on the machine's locale.
    # NOTE(review): assumes the input files are UTF-8 encoded -- confirm.
    with open(fpath, 'r', encoding='utf-8') as f:
        # Key is the file name up to its first '.'.
        processed[fname.split('.')[0]] = [
            [tok for tok in doc.split() if not remove_tok(tok)]
            for doc in f.read().splitlines()
        ]

processed.keys()

dict_keys(['0_anti-bot_sents', '0_anti-human_sents', '0_pro-bot_sents', '0_pro-human_sents'])

In [13]:
# Build dictionary of raw word co-occurrences (co-occur if the words occur in the same document)
from collections import defaultdict, Counter
import itertools
from multiprocessing import Pool
from tqdm.notebook import tqdm

# Minimum number of occurrences (within one section) a token needs in order
# to take part in the co-occurrence counts.
freq_threshold = 1000


def process_section(section_parts):
    """Count within-document word co-occurrences for one corpus section.

    Args:
        section_parts: a ``(name, section)`` pair, where ``section`` is an
            iterable of documents and each document is a list of tokens.
            It is iterated twice, so it must be re-iterable (e.g. a list).

    Returns:
        A 5-tuple ``(name, sec_cooccurrences, sec_word_freqs,
        sec_combination_word_freqs, sec_total_combinations)``:

        * ``sec_cooccurrences``: ``{(word1, word2): count}`` with the two
          words of each key in sorted order.
        * ``sec_word_freqs``: token counts, restricted to tokens occurring
          at least ``freq_threshold`` times in the section.
        * ``sec_combination_word_freqs``: for each word, the number of
          counted pairs it is a member of.
        * ``sec_total_combinations``: total number of counted pairs.
    """
    name, section = section_parts
    print(name)
    sec_cooccurrences = defaultdict(int)  # (word1, word2): n_times_co-occurs
    sec_combination_word_freqs = defaultdict(int)  # word1: n_times_occurs_in_combinations
    sec_total_combinations = 0

    # Feed Counter a generator: no need to build a list of every token in
    # the section just to count them.
    counts = Counter(w for doc in section for w in doc)
    sec_word_freqs = Counter({k: c for k, c in counts.items() if c >= freq_threshold})

    for doc in tqdm(section):
        doc_toks = [w for w in doc if w in sec_word_freqs]  # filters by freq

        # Iterate the combinations lazily; the number of pairs grows
        # quadratically with document length, so materialising them first
        # only costs memory.
        for pair in itertools.combinations(doc_toks, 2):
            # Sort so that (a, b) and (b, a) share a single key.
            sec_cooccurrences[tuple(sorted(pair))] += 1
            sec_combination_word_freqs[pair[0]] += 1
            sec_combination_word_freqs[pair[1]] += 1
            sec_total_combinations += 1

    return name, sec_cooccurrences, sec_word_freqs, sec_combination_word_freqs, sec_total_combinations

# Hand each corpus section to its own worker process (one worker per section).
with Pool(len(processed)) as p:
    results = list(
        tqdm(
            p.imap(process_section, sorted(processed.items())),
            total=len(processed),
        )
    )

# Per-section statistics, each keyed by section name.
cooccurrences, word_freqs, combination_word_freqs, total_combinations = {}, {}, {}, {}

for result in results:
    (
        name,
        sec_cooccurrences,
        sec_word_freqs,
        sec_combination_word_freqs,
        sec_total_combinations,
    ) = result
    total_combinations[name] = sec_total_combinations
    combination_word_freqs[name] = sec_combination_word_freqs
    word_freqs[name] = sec_word_freqs
    cooccurrences[name] = sec_cooccurrences

  0%|          | 0/4 [00:10<?, ?it/s]

0_anti-bot_sents


  0%|          | 0/1468918 [00:00<?, ?it/s]

0_anti-human_sents


  0%|          | 0/2018806 [00:00<?, ?it/s]

0_pro-bot_sents


  0%|          | 0/1921921 [00:00<?, ?it/s]

0_pro-human_sents


  0%|          | 0/3042223 [00:00<?, ?it/s]

In [14]:
from operator import itemgetter
import math

def pmi(words, word_freqs, cooccurrences, n):
    """Return the base-2 pointwise mutual information of a word pair.

    The value is ``log2(n * c / (f1 * f2))`` where ``c`` is the pair's
    co-occurrence count and ``f1``/``f2`` are the two words' frequencies.

    Args:
        words: 2-tuple of words; must match the key form used in
            ``cooccurrences``.
        word_freqs: mapping from word to its frequency.
        cooccurrences: mapping from word pair to co-occurrence count.
        n: normalising total (number of counted pairs).

    Returns:
        The PMI as a float, or ``0`` when the pair never co-occurs
        (or ``n`` is 0).
    """
    # Use .get() rather than indexing: indexing a defaultdict would insert
    # the missing pair as a side effect, and a plain dict would raise KeyError.
    numerator = n * cooccurrences.get(words, 0)
    if numerator == 0:
        return 0
    denominator = word_freqs[words[0]] * word_freqs[words[1]]
    # math.log2 is more accurate than math.log(x, 2).
    return math.log2(numerator / denominator)

def top_pmi(word, word_freqs, cooccurrences, n):
    """Rank the words that co-occur with ``word`` by PMI, highest first.

    Every key of ``cooccurrences`` that contains ``word`` is considered,
    except the self-pair ``(word, word)``. Returns a list of
    ``(other_word, pmi_score)`` tuples sorted by descending score.
    """
    self_pair = (word, word)

    # Collect the candidate pairs up front, before any scoring happens.
    candidates = []
    for key in cooccurrences.keys():
        if word in key and key != self_pair:
            candidates.append(key)

    scored = []
    for key in candidates:
        # The partner is whichever member of the pair is not `word`.
        partner = key[1] if key[0] == word else key[0]
        score = pmi(key, word_freqs, cooccurrences, n)
        scored.append((partner, score))

    scored.sort(key=itemgetter(1), reverse=True)
    return scored

## View top associated terms with terms of interest

In [5]:
# Load identity terms (terms of interest)
import json

identities_fpath = '../identities.json'
# Open with an explicit encoding rather than the locale default: JSON
# exchanged between systems is UTF-8 encoded.
with open(identities_fpath, encoding='utf-8') as f:
    identities = json.load(f)
identities.keys()

dict_keys(['gender/sexuality', 'age', 'race/ethnicity/nationality', 'religion', 'class', 'medical'])

In [15]:
# Look at PMI for high-frequency terms
high_freq_terms = [
    'american', 'americans',
    'old',
    'children',
    'indian', 'indians',
    'young',
    'white',
    'man',
    'black',
    'poor',
    'elderly',
    'rich',
    'rural', 'urban',
    'mom',
    'youth',
    'father',
]

# For every term, print one line per corpus section listing the five words
# with the highest PMI against that term.
for term in high_freq_terms:
    print(term)
    for name in processed:
        outstring = ', '.join(
            partner
            for partner, _score in top_pmi(
                term,
                combination_word_freqs[name],
                cooccurrences[name],
                total_combinations[name],
            )[:5]
        )
        print(f'{name}: {outstring}')
    print()

american
0_anti-bot_sents: neighbors, latin, yourself, ye, cuba
0_anti-human_sents: cuba's, praised, latin, cuba, neighbors
0_pro-bot_sents: rescue, weird, neighbors, exchange, หน
0_pro-human_sents: buildi, assets, reserves, weird, embassy

americans
0_anti-bot_sents: worried, missed, black, database, killed
0_anti-human_sents: extend, cards, worried, easier, obligation
0_pro-bot_sents: aug, racial, snapshot, miles, easier
0_pro-human_sents: aug, lagging, safer, hrs, racial

old
0_anti-bot_sents: mom, nurse, conditions, mother, healthy
0_anti-human_sents: club, student, volunteer, mom, nurse
0_pro-bot_sents: struggle, toronto, east, stealing, admits
0_pro-human_sents: 16-59, minutes, toronto, east, kagan

children
0_anti-bot_sents: zydus, ages, 🤬, st, started
0_anti-human_sents: embarrassing, inject, name, perspective, vector
0_pro-bot_sents: ages, decline, upcoming, young, services
0_pro-human_sents: zydus, aborted, lifesaving, decline, plea

indian
0_anti-bot_sents: troopers, 

In [16]:
# Top associated words with all terms
# Walk every identity category and, for each of its terms, print one line per
# corpus section listing the five words with the highest PMI against the term.
for cat in identities:
    print(cat)
    for term in identities[cat]:
        print(term)
        for name in processed:
            outstring = ', '.join(
                partner
                for partner, _score in top_pmi(
                    term,
                    combination_word_freqs[name],
                    cooccurrences[name],
                    total_combinations[name],
                )[:5]
            )
            print(f'{name}: {outstring}')
        print()

gender/sexuality
woman
0_anti-bot_sents: paused, dies, goes, condition, year-old
0_anti-human_sents: paused, suspected, condition, oxford-astrazeneca, goes
0_pro-bot_sents: significant, inside, role, pharmacy, designed
0_pro-human_sents: saving, phenomenal, significant, role, st

women
0_anti-bot_sents: pregnant, safely, warns, recommends, misinformation
0_anti-human_sents: brillian, fear-mongering, pregnant, breastfeeding, nursing
0_pro-bot_sents: pregnant, men, benefits, nor, raise
0_pro-human_sents: pregnant, determine, raise, benefits, space

man
0_anti-bot_sents: harm, arrested, police, propaganda, concerns
0_anti-human_sents: harm, arrested, police, speaks, closed
0_pro-bot_sents: staying, remarkable, transporting, year-old, young
0_pro-human_sents: midnight, appointmen, tw, unfortunately, staying

men
0_anti-bot_sents: billionaires, became, vi, industry, lockdowns
0_anti-human_sents: poverty, whic, became, billionaires, vi
0_pro-bot_sents: remarkable, gap, they're, poll, whole
0