In [10]:
# =============================================
# Written by Shea Brown & .....
# Copyright (C) 2018 BABL AI Inc.
# =============================================
import csv
import numpy as np
import pandas as pd

def add_name_corpus(equity_df,names,ethnicity,gender):
    """ 
    This function will produce a block of the Equity Evaluation Corpus
    (EEC: http://saifmohammad.com/WebPages/Biases-SA.html) in the form of a
    pandas data frame, and append it onto the original EEC
    
    Parameters
    -----------
    equity_df: dataframe
        A dataframe of the original (or previously modified) EEC
    names: list
        A list of strings that are the names you'd like to generate scentences 
        for, e.g., ['Ali','Ahmad','Amir']
    ethnicity: str
        A string with the ethnicity associated with the names. This need not be
        an actual ethnicity, but it will be placed in the 'Race' column from the EEC
    gender: str
        A string with the gender you'd like to associate with the list of names. 
        It must be either 'male' or 'female' (see add_pronoun_corpus() for nonbinary options)
        
    Returns
    --------
    dataframe
        The input EEC plus 144 x (number of names) additional rows.  
    """
    
    # Create a single male or female block of the corpus to modify
    # ------------------------------------------------------------
    if gender == 'female':
        new_equity_df = equity_df.copy()
        ebony = new_equity_df['Person'] == 'Ebony'
        block = new_equity_df[ebony]
        
    elif gender == 'male':
        new_equity_df = equity_df.copy()
        alonzo = new_equity_df['Person'] == 'Alonzo'
        block = new_equity_df[alonzo]

    for name in names:
        for index, row in block.iterrows():
            scent = row['Sentence']
            person = row['Person']
            scent = scent.replace(person,name)
            row['Sentence'] = scent
            row['Person'] = name
            row['Race'] = ethnicity
        equity_df = pd.concat([equity_df, block], ignore_index=True)   
    return equity_df


def add_pronoun_corpus(equity_df,pronouns,gender='gender-neutral'):
    """ 
    This function will produce a block of the Equity Evaluation Corpus
    (EEC: http://saifmohammad.com/WebPages/Biases-SA.html) in the form of a
    pandas data frame. 
    
    Parameters
    -----------
    equity_df: dataframe
        A dataframe of the original (or previously modified) EEC
    pronouns: list
        A list of strings that are the pronouns you'd like to generate scentences 
        for, just subjective and objective, e.g., ['They','them']. Note that to fit
        well in the EEC, the subjective should start with a capital letter. 
    gender: str
        A string with the gender you'd like to associate with the pronouns. 
        Since the original EEC has both male and female pronouns, this will most likely
        be 'gender-neutral'
        
    Returns
    --------
    dataframe
        The input EEC plus 144 additional rows. 
    """
    
    # Create a two pronoun blocks of the corpus to modify, one each 
    # for the subjective and objective form. 
    # ------------------------------------------------------------
    male = equity_df['Person'] == 'he'
    subjective_pronoun_block = equity_df[male]
    male2 = equity_df['Person'] == 'him'
    objective_pronoun_block = equity_df[male2]
    
    for index, row in subjective_pronoun_block.iterrows():
        scent = row['Sentence']
        person = 'He'
        scent = scent.replace(person,pronouns[0])
        row['Sentence'] = scent
        row['Person'] = pronouns[0]
        row['Gender'] = 'gender-neutral'
    equity_df = pd.concat([equity_df, subjective_pronoun_block], ignore_index=True)

    for index, row in objective_pronoun_block.iterrows():
        scent = row['Sentence']
        person = 'him'
        scent = scent.replace(person,pronouns[1])
        row['Sentence'] = scent
        row['Person'] = pronouns[1]
        row['Gender'] = 'gender-neutral'
    equity_df = pd.concat([equity_df, objective_pronoun_block], ignore_index=True)
    return equity_df


def add_positive_emotions(equity_df, people_list, emotions_list, base_emotion='joy'):
    """
    Generate new rows for each person with new positive emotions, based on the structure of an existing emotion block.
    """
    base_block = equity_df[equity_df['Emotion'] == base_emotion]

    new_rows = []
    for emotion in emotions_list:
        for _, row in base_block.iterrows():
            emotion_word = row['Emotion word']
            new_row = row.copy()
            new_row['Sentence'] = row['Sentence'].replace(emotion_word, emotion)
            new_row['Emotion'] = "joy"
            new_rows.append(new_row)

    new_df = pd.DataFrame(new_rows)

    # Remove exact duplicates (based on Sentence and Person, Emotion word, etc.)
    new_df = new_df.drop_duplicates(subset=["Sentence", "Person", "Emotion word"])

    equity_df = pd.concat([equity_df, new_df], ignore_index=True)
    print(f"Added {len(new_df)} new rows.")
    print(f"Total rows: {len(equity_df)}")
    return equity_df


# EXAMPLE: Let's add some more groups to the original EEC
# First define the names you'd like to add, e.g.,
# ---------------------------------------------------------
hispanic_women = ['Sofia','Isabella','Valentina','Camila','Valeria','Luciana','Ximena','Mariana','Maria', 'Victoria']
hispanic_men = ['Mateo','Santiago','Matias','Sebastian','Benjamin','Martin','Nicolas','Alejandro','Lucas','Diego']

arabic_men = ['Muhammad','Ali','Amir','Omar','Ahmad','Ibrahim','Yousef','Khalil','Hassan','Khalid']
arabic_women = ['Nur','Laila','Maryam','Fatima','Lena','Amira','Aisha','Zahra','Alia','Salma']

# Load the standard Equity Evaluation Corpus (https://www.svkir.com/resources.html#EEC)
# --------------------------------------------
infile = 'Equity-Evaluation-Corpus.csv'
equity_df = pd.read_csv(infile)
    
equity_df = add_name_corpus(equity_df,hispanic_men,'Hispanic','male')
equity_df = add_name_corpus(equity_df,hispanic_women,'Hispanic','female')
equity_df = add_name_corpus(equity_df,arabic_men,'Arabic','male')
equity_df = add_name_corpus(equity_df,arabic_women,'Arabic','female')


new_positive_emotions = [
    'joyful', 'content', 'satisfied', 'hopeful', 
    'cheerful', 'grateful', 'enthusiastic', 
    'peaceful', 'optimistic', 'uplifted'
]
print(f"current rows: {len(equity_df)}")

equity_df = add_positive_emotions(equity_df, equity_df['Person'].unique(), new_positive_emotions)



Added 35000 new rows.
Total rows: 49400


In [9]:


equity_df.to_csv('expanded_equity_corpus.csv',index=False)