In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from pathlib import Path
import random
import re

def get_identities(train_path, test_path):
    '''
        input: paths to the training and the test identity files
               (both are parsed with pd.read_csv)
        output: returns the tuple (train_identities, test_identities)
            train_identities: plain Python list holding the first column
                              value of every data row in the training file
            test_identities:  the test file as the DataFrame produced by
                              pd.read_csv (it is NOT flattened to a list)

        pd.read_csv is used with its defaults, so the first line of each
        file is taken as the header row and is not part of the data.
    '''
    train_frame = pd.read_csv(train_path)
    test_identities = pd.read_csv(test_path)

    # Keep only the leading cell of each training row.
    train_identities = [record[0] for record in train_frame.to_numpy().tolist()]

    return train_identities, test_identities

In [2]:
def generate_synthetic_data(toxic, path, identity_list):
    '''
    input: 
        boolean toxic (true if we want the rows labelled "BAD",
                       false if we want the rows labelled "NOT_BAD")
        path to data set (CSV with a 'Text' and a 'Label' column)
        identity list we want to use (test or train)
    output: returns a new dataframe with one row per selected comment:
        column 'comment_text' holds the original comment,
        columns 0..len(identity_list)-1 hold the comment with the detected
        identity replaced by identity_list[0], identity_list[1], ...

    This is specific to synthetic data sets (very small difference with 
    original datasets labels and toxicity measurements).

    A comment is selected when its label matches and at least one of its
    whitespace-separated tokens is in identity_list. When several tokens
    qualify, the first one in the comment is used, so the output does not
    depend on set iteration order. Rows whose 'Text' is not a string
    (e.g. an empty CSV cell read as NaN) are skipped.

    Note: the substitution uses str.replace, so every occurrence of the
    identity substring in the comment is replaced, including occurrences
    inside longer words.
    '''
    df = pd.read_csv(path)

    wanted_label = "BAD" if toxic else "NOT_BAD"
    # Built once: membership tests against a set are O(1) per token.
    identity_set = set(identity_list)

    sentences = []
    variants = []

    for text, label in tqdm(zip(df['Text'], df['Label']), total=len(df)):
        if label != wanted_label or not isinstance(text, str):
            continue

        # First identity token in reading order; None when the comment has none.
        identity = next((token for token in text.split() if token in identity_set), None)
        if identity is None:
            continue

        sentences.append(text)
        variants.append([text.replace(identity, diff_identity) for diff_identity in identity_list])

    # One row per comment, one column per substituted identity.
    return_df = pd.DataFrame(variants)
    return_df.insert(0, column='comment_text', value=sentences)

    return return_df

In [5]:

def main():
    '''
    Generate the synthetic toxic / non-toxic data sets and save them as CSV.

    input: none (all paths are fixed, relative to the working directory)
    output: none; for each of the two madlibs source files this writes
        ../data/synthetic/synthetic_toxic_df_<n>.csv
        ../data/synthetic/synthetic_nontoxic_df_<n>.csv
    with n = 1 for the 89k file and n = 2 for the 77k file.

    Only the training identities are used for the substitutions.
    '''
    # The test identities are loaded by get_identities but not needed here.
    train_ids, _test_ids = get_identities("../data/train_identities.txt", "../data/test_identities.txt")

    # DataFrame.to_csv does not create missing directories, so make sure the
    # target exists before spending time on generation.
    output_dir = Path("../data/synthetic")
    output_dir.mkdir(parents=True, exist_ok=True)

    sources = {
        1: "../data/bias_madlibs_89k.csv",
        2: "../data/bias_madlibs_77k.csv",
    }

    for suffix, source_path in sources.items():
        synthetic_toxic = generate_synthetic_data(True, source_path, train_ids)
        synthetic_nontoxic = generate_synthetic_data(False, source_path, train_ids)

        synthetic_toxic.to_csv(output_dir / f"synthetic_toxic_df_{suffix}.csv")
        synthetic_nontoxic.to_csv(output_dir / f"synthetic_nontoxic_df_{suffix}.csv")

In [6]:
# Run the whole pipeline: builds the four synthetic data sets and writes them as CSV files under ../data/synthetic/.
main()

100%|██████████| 89483/89483 [00:20<00:00, 4282.64it/s]
100%|██████████| 89483/89483 [00:20<00:00, 4360.88it/s]
100%|██████████| 76564/76564 [00:17<00:00, 4396.20it/s]
100%|██████████| 76564/76564 [00:17<00:00, 4273.89it/s]
