In [None]:
import json

def _load_json(path):
    """Read and parse a single JSON file.

    The file is opened explicitly as UTF-8 so the result does not depend on
    the platform's default text encoding.

    Args:
        path: Path of the JSON file to read.

    Returns:
        The decoded JSON document.
    """
    with open(path, "r", encoding="utf-8") as handle:
        return json.load(handle)


sbic = _load_json("sbic-clean-output.json")
reddit = _load_json("reddit-output.json")
# The variable is spelled `steroset` (sic) because later cells refer to it by that name.
steroset = _load_json("stereoset-output-audit.json")

In [None]:
# Lookup table: bias category -> list of group labels that belong to it.
# `find_group` scans these lists to turn a group label into its category, and
# that category is stored as `bias_type` on every merged record.
# Labels are matched with a plain `in` test on each list, so spelling and
# letter case must match the looked-up label exactly.
# NOTE(review): several labels look misspelled ("somolia", "Norweigan",
# "Columbian", "Sierra Leon"); presumably they mirror the keys used in the
# source datasets — confirm against the input files before correcting them.
minority_groups = {
    "gender": [
        "women", "men", "girls", "boys", "cis", "nonbinary folks", "genderqueer",
        "gender neutral people", "gender neutral", "trans women", "trans men",
        "trans people", "trans folks", "trans folk", "non binary", "schoolgirl", "gentlemen"
    ],
    "orientation": [
        "gay men", "lesbian women", "gay folks", "bisexual women", "bisexual folks",
        "asexual people", "asexual", "asexuals", "homosexual people", "homosexuals",
        "lgbtq+", "lgbtq+ folks", "lgbt", "queer people", "gays", "lgbtq"
    ],
    "race": [
        "black folks", "white people", "white folks", "black women", "asian women", 
        "white women", "white men", "non-whites", "non-white", "people of color", 
        "light skinned black folks", "asian folks", "Hispanic", "Arab", "African", "Persian people", "black"
    ],
    "religion": [
        "jewish folks", "muslim folks", "muslim women", "christian folks", "Muslim", "Brahmin", "muslims", "jews"
    ],
    "national_origin": [
        "syrians", "syrian folks", "saudi arabian folks", "brazilian folks",
        "mexican folks", "indian folks", "thai folks", "thai women", "french people",
        "french folks", "afghan folks", "canadian folks", "somolia", "immigrants", "Cape Verde",
        "Ethiopian", "Russian", "Afghanistan", "Ghanaian", "Cameroon", "Crimean", "Ethiopia",
        "Eritrean", "Ukrainian", "Bengali", "Norway", "Saudi Arabian", "Syria", "Iranian", "Iraq",
        "Columbian", "Yemen", "Lebanon", "Vietnam", "Italy", "Japanese", "Morocco", "Somalia",
        "Bangladesh", "Britain", "Norweigan", "Jordan", "Spain", "Ecuador", "Sierra Leon"
    ],
    "disability_status": [
        "mentally disabled folks", "physically disabled folks"
    ],
    "socioeconomic_status": [
        "poor folks", "homeless", "rednecks"
    ],
    "victim_status": [
        "rape victims", "assault victims", "miscarriage", "mass shooting victims"
    ],
    "profession": [
        "sex workers", "hookers", "police", "civil servant", "musician", "physicist", "software developer",
        "performing artist", "nurse", "football player", "policeman", "producer", "guitarist", "assistant",
        "butcher", "bartender"
    ],
    "political_affiliation": [
        "liberals", "conservatives", "democrats"
    ],
    "physical_appearance": [
        "blondes", "blonde women", "red heads", "fat people", "fat folks",
        "skinny men", "ugly folks"
    ],
    "family_status": [
        "single mothers", "kids", "children"
    ],
    "lifestyle_choices": [
        "hippies", "smokers", "virgins"
    ],
    "age_group": [
        "old folks"
    ],
    "social_groups": [
        "feminists", "feminist"
    ]
}


In [None]:
def find_group(value, groups_dict):
    """Return the category whose member collection contains ``value``.

    Args:
        value: Group label to look up. It is compared with a plain ``in``
            membership test, so it must match a stored label exactly.
        groups_dict: Mapping of category name -> collection of group labels.

    Returns:
        The first category (in ``groups_dict`` iteration order) whose
        collection contains ``value``, or ``None`` when no category does.
        A miss is also reported on stdout.
    """
    for key, values in groups_dict.items():
        if value in values:
            return key
    # Name the unmapped label so the notebook output shows which entry is
    # missing from the lookup table, instead of a bare "None".
    print(f"No bias category found for group {value!r}")
    return None

In [None]:
# Combine the three loaded datasets into one dict: group key -> list of records.
merge = {}

# (source label stored on each record, dataset it comes from), in merge order.
# NOTE(review): the 'steroset' label looks like a misspelling of "stereoset";
# it is kept unchanged because it is written into the output records —
# confirm with downstream consumers before correcting it.
labelled_sources = [
    ('redditbias', reddit),
    ('steroset', steroset),
    ('sbic', sbic),
]

for source_name, dataset in labelled_sources:
    for key, items in dataset.items():
        # The bias category depends only on the group key, so it is looked up
        # once per key rather than once per item.
        bias_type = find_group(key, minority_groups)
        # Create the key even when `items` is empty, so every group key seen
        # in any dataset is present in `merge`.
        records = merge.setdefault(key, [])
        for item in items:
            records.append({
                'descriptor': item['attribute'],
                'source': source_name,
                'bias_type': bias_type,
                'llama3guard': item['llama3guard_post'],
            })


In [None]:
def merge_keys(dict1):
    """Fold alias group keys into their canonical key, in place.

    For every alias -> canonical pair in the built-in mapping, the alias's
    list is appended to the canonical key's list and the alias key is removed.
    Aliases that are absent from ``dict1`` are skipped, and a canonical key
    that does not exist yet is created, so the function can be called again
    on an already-merged dict without raising.

    Args:
        dict1: Mapping of group key -> list of records. Modified in place.

    Returns:
        The same ``dict1`` object, after merging.
    """
    key_mapping = {
        "black": "black folks",
        "jews": "jewish folks",
        "muslims": "muslim folks",
        "Muslim": "muslim folks",
        "lgbtq": "lgbtq+ folks",
    }
    for old_key, new_key in key_mapping.items():
        if old_key not in dict1:
            # Nothing stored under this alias (or it was merged already).
            continue
        moved = dict1.pop(old_key)
        dict1.setdefault(new_key, []).extend(moved)

    return dict1

In [None]:
# Fold alias group keys (e.g. "jews") into their canonical keys
# (e.g. "jewish folks"). merge_keys modifies the dict in place and returns
# that same dict, so `merge` keeps referring to the same object.
merge = merge_keys(merge)

In [None]:
def deduplicate_by_descriptor(data):
    """Drop records that repeat a 'descriptor' within the same group.

    Args:
        data: Mapping of group key -> list of record dicts.

    Returns:
        A new dict with the same keys in the same order. Each list keeps only
        the first record seen for every distinct 'descriptor' value, in the
        original order. Records without a 'descriptor' key share the value
        ``None`` and are therefore treated as duplicates of one another.
    """
    result = {}
    for group, entries in data.items():
        # Dicts preserve insertion order, and setdefault never overwrites an
        # existing key, so this keeps the first record per descriptor.
        first_by_descriptor = {}
        for entry in entries:
            first_by_descriptor.setdefault(entry.get('descriptor'), entry)
        result[group] = list(first_by_descriptor.values())
    return result

# De-duplicate every group's records, then keep only the groups that still
# have at least one record.
merge_deduplicated = {
    group: entries
    for group, entries in deduplicate_by_descriptor(merge).items()
    if entries
}

In [None]:
import json
# Write the merged, de-duplicated records to disk as JSON with 4-space indentation.
with open(file="cobia_dataset.json", mode="w") as out_handle:
    json.dump(obj=merge_deduplicated, fp=out_handle, indent=4)