# Run PMI to find most associated words with identity terms in each corpus split

## Load, process data, calculate cooccurrences

In [9]:
# Load data (tweet texts)
import os

split_type = '0_pro_anti_bot_human'
dirpath = os.path.join('/home/huixiann/2022_socialbias_vaccine/michael/SAGE/py-sage/input/', split_type)

# Map each split name (the file name up to its first '.') to its documents:
# one document per line, split on whitespace, with tokens that start with
# 'http' (URLs) dropped.
processed = {}
for fname in sorted(os.listdir(dirpath)):
    fpath = os.path.join(dirpath, fname)
    # Decode explicitly as UTF-8 instead of relying on the locale default; the
    # corpus contains non-ASCII tokens (e.g. emoji), so a non-UTF-8 locale
    # would otherwise fail or mis-decode. NOTE(review): assumes the input
    # files are UTF-8 encoded - confirm.
    with open(fpath, 'r', encoding='utf-8') as f:
        processed[fname.split('.')[0]] = [
            [tok for tok in doc.split() if not tok.startswith('http')]
            for doc in f.read().splitlines()
        ]

processed.keys()

dict_keys(['0_anti-bot_sents', '0_anti-human_sents', '0_pro-bot_sents', '0_pro-human_sents'])

In [12]:
# Build dictionary of raw word co-occurrences (co-occur if the words occur in the same document)
from collections import defaultdict, Counter
import itertools
from multiprocessing import Pool
from tqdm.notebook import tqdm

freq_threshold = 100

def process_section(section_parts):
    """Count word frequencies and pairwise co-occurrences for one corpus split.

    Args:
        section_parts: A ``(name, section)`` pair. ``section`` is a sequence
            of documents, each document being a sequence of tokens.

    Returns:
        A 4-tuple ``(cooccurrences, word_freqs, combination_word_freqs,
        total_combinations)`` where

        * ``cooccurrences`` maps a sorted ``(word1, word2)`` tuple to the
          number of token pairs counted for it,
        * ``word_freqs`` is a ``Counter`` of the words whose total count in
          the split is at least the module-level ``freq_threshold``,
        * ``combination_word_freqs`` maps a word to the number of counted
          pairs it takes part in,
        * ``total_combinations`` is the total number of counted pairs.
    """
    name, section = section_parts
    print(name)

    # Vocabulary: only words that reach the frequency threshold are kept.
    raw_counts = Counter(tok for doc in section for tok in doc)
    sec_word_freqs = Counter(
        {word: count for word, count in raw_counts.items() if count >= freq_threshold}
    )

    pair_counts = defaultdict(int)      # (word1, word2): n_times_co-occurs
    word_pair_counts = defaultdict(int)  # word1: n_times_occurs_in_combinations
    n_pairs = 0

    for doc in tqdm(section):
        kept = [tok for tok in doc if tok in sec_word_freqs]  # filters by freq

        # Every unordered pair of token positions in the document counts once.
        for first, second in itertools.combinations(kept, 2):
            pair_counts[tuple(sorted((first, second)))] += 1
            word_pair_counts[first] += 1
            word_pair_counts[second] += 1
            n_pairs += 1

    return pair_counts, sec_word_freqs, word_pair_counts, n_pairs

# Run process_section over the (name, docs) pairs, sorted by name, using one
# worker process per corpus split; the results are collected into a list.
with Pool(processes=len(processed)) as p:
    results = list(
        tqdm(
            p.imap(process_section, sorted(processed.items())),
            total=len(processed),
        )
    )

# Per-split containers keyed by split name; filled in from `results` below.
cooccurrences, word_freqs = {}, {}
combination_word_freqs, total_combinations = {}, {}

In [13]:
# `results` was computed over sorted(processed.items()), i.e. in sorted-name
# order. Pair each result with the names in that same order: the dict's own
# insertion order follows sorted *file* names, which can differ from sorted
# keys (e.g. 'a-b.txt' < 'a.txt' but 'a' < 'a-b') and would mislabel splits.
for name, result in zip(sorted(processed), results):
    sec_cooccurrences, sec_word_freqs, sec_combination_word_freqs, sec_total_combinations = result

    cooccurrences[name] = sec_cooccurrences
    word_freqs[name] = sec_word_freqs
    combination_word_freqs[name] = sec_combination_word_freqs
    total_combinations[name] = sec_total_combinations

# Old, without multiprocessing
# for name, section in processed.items():
#     print(name)
#     cooccurrences[name] = defaultdict(int) # (word1, word2): n_times_co-occurs
#     combination_word_freqs[name] = defaultdict(int) # word1: n_times_occurs_in_combinations
#     total_combinations[name] = 0

#     counts = Counter([w for doc in section for w in doc])
#     word_freqs[name] = Counter({k: c for k,c in counts.items() if c >= freq_threshold})
# #     word_freqs[name] = Counter({k: c for k,c in counts.items() if c >= cooccurrence_min_freq}) # must occur that many times by themselves
    
#     for doc in tqdm(section):
#         doc_toks = [w for w in doc if w in word_freqs[name]] # filters by freq

#         for pair in list(itertools.combinations(doc_toks, 2)):
#             cooccurrences[name][tuple(sorted(pair))] += 1
#             combination_word_freqs[name][pair[0]] += 1
#             combination_word_freqs[name][pair[1]] += 1
#             total_combinations[name] += 1

# #     cooccurrences[name] = {pair: count for pair, count in cooccurrences[name].items() if count >= cooccurrence_min_freq}
    
#     print(len(combination_word_freqs[name]))
#     print(len(cooccurrences[name]))
#     print()

In [14]:
from operator import itemgetter
import math

def pmi(words, word_freqs, cooccurrences, n):
    """Return the base-2 pointwise mutual information of a word pair.

    The value is ``log2(n * c / (f1 * f2))`` where ``c`` is the pair's
    co-occurrence count and ``f1``/``f2`` are the two words' frequencies.

    Args:
        words: 2-tuple ``(word1, word2)``, used as-is as the key into
            ``cooccurrences``.
        word_freqs: Mapping from word to its frequency.
        cooccurrences: Mapping from word-pair tuple to co-occurrence count.
        n: Total count used to normalise the ratio.

    Returns:
        The PMI in bits, or ``0`` when ``n * c`` is zero (including when the
        pair is absent from ``cooccurrences``).
    """
    # .get() keeps the lookup side-effect free: it neither inserts a zero
    # entry into a defaultdict nor raises KeyError on a plain dict.
    numerator = n * cooccurrences.get(words, 0)
    if numerator == 0:
        return 0
    denominator = word_freqs[words[0]] * word_freqs[words[1]]
    # log2 is usually more accurate than log(x, 2).
    return math.log2(numerator / denominator)

def top_pmi(word, word_freqs, cooccurrences, n):
    """Rank the words that co-occur with ``word`` by PMI, highest first.

    Args:
        word: The word of interest.
        word_freqs: Mapping from word to frequency, passed through to ``pmi``.
        cooccurrences: Mapping from word-pair tuple to co-occurrence count.
        n: Total count, passed through to ``pmi``.

    Returns:
        A list of ``(other_word, pmi_score)`` tuples sorted by descending
        score. The pair ``(word, word)`` is excluded.
    """
    # Every recorded pair that involves the word, except the word with itself.
    candidates = [
        pair
        for pair in cooccurrences.keys()
        if not (word not in pair or pair == (word, word))
    ]

    scored = [
        (
            next(w for w in pair if w != word),  # the partner word in the pair
            pmi(pair, word_freqs, cooccurrences, n),
        )
        for pair in candidates
    ]

    scored.sort(key=itemgetter(1), reverse=True)
    return scored

## View top associated terms with terms of interest

In [7]:
# Load identity terms (terms of interest)
import json

identities_fpath = '../identities.json'
# Read the identity-term lists as UTF-8 explicitly rather than with the
# locale's default encoding, so non-ASCII terms load the same on any machine.
with open(identities_fpath, encoding='utf-8') as f:
    identities = json.load(f)
identities.keys()

dict_keys(['gender/sexuality', 'age', 'race/ethnicity/nationality', 'religion', 'class', 'medical'])

In [15]:
# For every identity term, list the five highest-PMI co-occurring words in
# each corpus split, grouped under the term's category.
for cat in identities:
    print(cat)
    for term in identities[cat]:
        print(term)
        for name in processed:
            outstring = ', '.join(
                other_word
                for other_word, _score in top_pmi(
                    term,
                    combination_word_freqs[name],
                    cooccurrences[name],
                    total_combinations[name],
                )[:5]
            )
            print(f'{name}: {outstring}')
        print()

gender/sexuality
woman
0_anti-bot_sents: igbo, inspires, yogesh, serio, equ
0_anti-human_sents: igbo, serio, equ, nigerian, seizures
0_pro-bot_sents: serio, supermarket, immunologist, lakhs, peddling
0_pro-human_sents: seizures, pleads, 6am, trek, oakeshott

women
0_anti-bot_sents: malian, 🤥, #vaccinefor, brillian, #pmnarendramodi
0_anti-human_sents: childbearing, #womenshea, brillian, fear-mongering, post-menopausal
0_pro-bot_sents: consult, lactating, periods, insufficient, grannies
0_pro-human_sents: s-only, consult, kenyan, sexuall, #pregnant

man
0_anti-bot_sents: shepherd, missile, undergoing, burning, buddy
0_anti-human_sents: connaught, mulroney, capillary, missile, hesitated
0_pro-bot_sents: erroneously, laquitta, criticize, willis, appointmen
0_pro-human_sents: vara, laquitta, willis, deceased, #worldpraisepmikpolicies

men
0_anti-bot_sents: poverty, whic, prostate, ramai, bijwerkingen
0_anti-human_sents: unknowingly, poverty, whic, beaten, syphilis
0_pro-bot_sents: uss, serv