In [1]:
import pandas as pd
import numpy as np
import json
import re
import itertools

In [2]:
# Container for every bias word list built by this notebook: one initially
# empty dict per list name, filled in by the code that follows.
GLOBAL_DICT = {
    lexicon_name: {}
    for lexicon_name in (
        'NRC_unweighted',
        'agency_power_unweighted',
        'male_gaucher',
        'female_gaucher',
        'superlative',
        'gender-ladenness',
    )
}

In [3]:
# NRC
!mkdir -p bias_lists
!wget -q -nc -O nrc.zip https://saifmohammad.com/WebDocs/VAD/NRC-VAD-Lexicon-Aug2018Release.zip
!unzip -q -n nrc.zip -d bias_lists

df=pd.read_csv('bias_lists/NRC-VAD-Lexicon-Aug2018Release/NRC-VAD-Lexicon.txt',delimiter='\t',names=['valence','arousal','dominance'])
df=df.reset_index().rename(columns={'index':'word'})
for index, row in df.iterrows():
    # Add to unweighted word list if any value > 0.75; values are between 0, 1
    if row['valence'] > 0.75 or row['arousal'] > 0.75 or row['dominance'] > 0.75:
        GLOBAL_DICT['NRC_unweighted'][row['word']] = 1  

# Power agency
!wget -q -nc -O pa.zip https://homes.cs.washington.edu/~msap/movie-bias/data/FramesAgencyPower.zip
!unzip -q -n pa.zip -d bias_lists

df=pd.read_csv('bias_lists/agency_power.csv')

def recode_string(s):
    """Translate an agency/power label into a numeric code.

    Returns 1 for 'agency_pos' or 'power_agent', 0 for 'agency_equal' or
    'power_equal', -1 for 'agency_neg' or 'power_theme', and np.nan for
    any other value.
    """
    if s == 'agency_pos' or s == 'power_agent':
        return 1
    if s == 'agency_equal' or s == 'power_equal':
        return 0
    if s == 'agency_neg' or s == 'power_theme':
        return -1
    # Unrecognised label (including missing values).
    return np.nan

# Keep every verb whose 'agency' or 'power' label is recoded to 1 by
# recode_string. Each column is recoded in one pass with Series.map and the
# two masks are OR-ed, replacing a row-by-row iterrows() loop; the selection
# and the insertion order are the same.
has_agency_code_1 = df['agency'].map(recode_string) == 1
has_power_code_1 = df['power'].map(recode_string) == 1
GLOBAL_DICT['agency_power_unweighted'].update(
    dict.fromkeys(df.loc[has_agency_code_1 | has_power_code_1, 'verb'], 1)
)

# Gaucher male, female
# Manually copied from Gaucher, D., Friesen, J., & Kay, A. C. (2011). Evidence
# that gendered wording in job advertisements exists and sustains gender
# inequality. Journal of Personality and Social Psychology, 101(1), 109-128.
def _read_gaucher_words(path):
    """Read one word per line from ``path`` and return the cleaned words.

    Each line has trailing whitespace stripped, is lower-cased, and has every
    'ⴱ' character replaced by '-'. Lines that are empty after stripping are
    skipped so that no empty-string "word" ends up in the word list.
    """
    # Decode explicitly as UTF-8: the 'ⴱ' replacement only matches when that
    # non-ASCII character is decoded correctly, which the platform's default
    # encoding does not guarantee. NOTE(review): assumes the raw files are
    # stored as UTF-8 -- confirm.
    with open(path, encoding='utf-8') as f:
        stripped_lines = (line.rstrip() for line in f)
        return [line.lower().replace('ⴱ','-') for line in stripped_lines if line]

male = _read_gaucher_words('bias_lists/gaucher_2011_raw_male.txt')
female = _read_gaucher_words('bias_lists/gaucher_2011_raw_female.txt')

# Every listed word gets the unweighted marker value 1.
GLOBAL_DICT['male_gaucher'].update(dict.fromkeys(male, 1))
GLOBAL_DICT['female_gaucher'].update(dict.fromkeys(female, 1))

# Superlatives
!wget -q -nc -O bias_lists/superlatives.txt https://raw.githubusercontent.com/prosecconetwork/The-NOC-List/master/NOC/DATA/TSV%20Lists/superlatives.txt

df=pd.read_csv('bias_lists/superlatives.txt', sep='\t')
for w in df.Superlative:
     GLOBAL_DICT['superlative'][w]=1

# Clark genderedness
!wget -q -nc -O clark.zip https://static-content.springer.com/esm/art%3A10.3758%2FBF03195584/MediaObjects/Clark-BRM-2004.zip
!unzip -q -n clark.zip -d bias_lists

df = pd.read_csv('bias_lists/Clark-BRMIC-2004/cp2004a.txt', delim_whitespace=True)
df = df[['WORD','GEND']]
ddict = {}
for word, gend in list(df.itertuples(index=False)):
    ddict[word] = gend
GLOBAL_DICT['gender-ladenness'] = ddict

In [4]:
# Export
# Serialize all collected word lists into a single JSON file.
with open('bias_lists/all_bias_lists.json', mode='w') as out_handle:
    json.dump(obj=GLOBAL_DICT, fp=out_handle)