In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from utils import bin_compas_data, binarize_compas_data

In [2]:
#randomly selects indexes for injecting error
#frac is the fraction of the disadvantaged subpopulation selected; a (1-frac) fraction of all remaining rows is selected as well
#currently defined for Compas dataset, Blacks being the disadvantaged subpopulation

def get_index(df, frac=0.7, group_col='Race', group_value='African-American'):
    """Randomly pick row positions at which errors will be injected.

    A ``frac`` share of the rows belonging to the chosen subpopulation and a
    ``1 - frac`` share of all remaining rows are drawn without replacement.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``group_col`` column.
    frac : float, default 0.7
        Share of the subpopulation to select; must lie in [0, 1].
    group_col, group_value : str
        Column and value identifying the subpopulation. The defaults match
        the Compas dataset (``Race == 'African-American'``).

    Returns
    -------
    list of int
        Zero-based row positions (subpopulation picks first, then the rest).
        Positions rather than index labels are returned because the callers
        feed the result to ``.iloc``.

    Raises
    ------
    ValueError
        If ``frac`` is outside [0, 1].
    """
    if not 0 <= frac <= 1:
        raise ValueError("frac must be between 0 and 1")

    # Split row positions into the subpopulation and everything else.
    membership = (df[group_col] == group_value).tolist()
    in_group = [pos for pos, hit in enumerate(membership) if hit]
    rest = [pos for pos, hit in enumerate(membership) if not hit]

    # random.sample needs a sequence: sampling from a set raises TypeError
    # on Python >= 3.11, so both populations are kept as lists.
    index = random.sample(in_group, int(frac * len(in_group)))
    index += random.sample(rest, int((1 - frac) * len(rest)))
    return index

#swaps values between two columns at chosen indices
def swap(df, col1, col2, index):
    """Exchange the values of ``col1`` and ``col2`` on the given rows.

    ``index`` holds row positions (used with ``.iloc``). ``df`` is modified
    in place and also returned.
    """
    first_pos = df.columns.get_loc(col1)
    second_pos = df.columns.get_loc(col2)

    # Remember the col1 values before they are overwritten.
    saved_first = df.iloc[index, first_pos]
    df.iloc[index, first_pos] = df.iloc[index, second_pos]
    df.iloc[index, second_pos] = saved_first
    return df

#randomly scales values in column at chosen indices
def scale(df, col, index):
    """Multiply ``col`` on the given rows by one random factor.

    A single factor is drawn from {10, 100, 1000} with ``np.random.choice``
    and applied to every row position in ``index``. ``df`` is modified in
    place and also returned.
    """
    factor = np.random.choice([10, 100, 1000])
    col_pos = df.columns.get_loc(col)
    df.iloc[index, col_pos] = df.iloc[index, col_pos] * factor
    return df

#corrupts column value with gaussian noise at chosen indices
def corrupt(df, col, index):
    """Add zero-mean Gaussian noise to ``col`` on the given rows.

    The noise standard deviation is the column's standard deviation (as
    computed by ``np.std``) times a random multiplier drawn uniformly from
    [1, 5]. ``index`` holds row positions (used with ``.iloc``). ``df`` is
    modified in place and also returned; a non-float column is converted to
    float so it can hold the noisy values.
    """
    col_pos = df.columns.get_loc(col)
    stddev = np.std(df[col])
    # Named noise_scale (not "scale") to avoid shadowing the scale() helper.
    noise_scale = random.uniform(1, 5)
    noise = np.random.normal(0, noise_scale * stddev, size=len(index))

    # Writing floats into an integer column through .iloc is deprecated
    # (pandas PDEP-6); cast explicitly up front instead of relying on an
    # implicit upcast.
    if df[col].dtype.kind != 'f':
        df[col] = df[col].astype(float)

    df.iloc[index, col_pos] += noise
    return df

#overwrites values at chosen indices with the constant 1 (the NaN + SimpleImputer path is currently disabled)
def missing_val(df, col, index, attrib):
    """Overwrite ``col`` on the given rows with the constant 1.

    ``index`` holds row positions (used with ``.iloc``). ``df`` is modified
    in place and also returned. ``attrib`` ('numeric' or anything else) is
    accepted but currently unused: it only matters for the disabled
    imputation path kept below for reference.
    """
    col_pos = df.columns.get_loc(col)
    df.iloc[index, col_pos] = 1

    # Disabled: impute the missing values instead of writing a constant.
    # if (attrib == 'numeric'):
    #     imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    # else:
    #     imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    # df[col] = pd.DataFrame(imp.fit_transform(df[[col]]))
    return df
    


In [3]:
# Seed every random source used in this notebook (random, np.random and the
# train/test split) so "Restart Kernel -> Run All" regenerates the same files.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Load the Compas data and hold out 30% of the rows as the test split.
file = "dataset/Compas.csv"
df = pd.read_csv(file)
train, test = train_test_split(df, test_size=0.3, random_state=SEED)



In [16]:
#inject error: scale 'Age' and add Gaussian noise to 'Prior' on a fresh copy of the training split
df = train.copy().reset_index(drop=True)
idx = get_index(df, 0.8)
df = scale(df, 'Age', idx)
idx = get_index(df, 0.8)
df = corrupt(df, 'Prior', idx)

#save the corrupted training split (raw, plus the bin_compas_data / binarize_compas_data variants)
df.to_csv("corrupt_data/Compas_train_scale_corrupt.csv", index=False)
temp = bin_compas_data(df.copy())
temp.to_csv("corrupt_data/Compas_bin_train_scale_corrupt.csv", index=False)
temp = binarize_compas_data(df.copy())
temp.to_csv("corrupt_data/Compas_binarized_train_scale_corrupt.csv", index=False)

#add the untouched test split and save the combined data
#(DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call)
df = pd.concat([df, test])
df.to_csv("corrupt_data/Compas_scale_corrupt.csv", index=False)
temp = bin_compas_data(df.copy())
temp.to_csv("corrupt_data/Compas_bin_scale_corrupt.csv", index=False)
temp = binarize_compas_data(df.copy())
temp.to_csv("corrupt_data/Compas_binarized_scale_corrupt.csv", index=False)

In [4]:
#inject error: swap 'Age' with 'Prior', then overwrite 'Prior' via missing_val, on a fresh copy of the training split
df = train.copy().reset_index(drop=True)
idx = get_index(df, 0.5)
df = swap(df, 'Age', 'Prior', idx)
idx = get_index(df, 0.5)
df = missing_val(df, 'Prior', idx, 'numeric')

#save the corrupted training split (raw, plus the bin_compas_data / binarize_compas_data variants)
df.to_csv("corrupt_data/Compas_train_swap_miss.csv", index=False)
temp = bin_compas_data(df.copy())
temp.to_csv("corrupt_data/Compas_bin_train_swap_miss.csv", index=False)
temp = binarize_compas_data(df.copy())
temp.to_csv("corrupt_data/Compas_binarized_train_swap_miss.csv", index=False)

#add the untouched test split and save the combined data
#(DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call)
df = pd.concat([df, test])
df.to_csv("corrupt_data/Compas_swap_miss.csv", index=False)
temp = bin_compas_data(df.copy())
temp.to_csv("corrupt_data/Compas_bin_swap_miss.csv", index=False)
temp = binarize_compas_data(df.copy())
temp.to_csv("corrupt_data/Compas_binarized_swap_miss.csv", index=False)

In [5]:
#inject error: overwrite the 'two_year_recid' outcome via missing_val, then swap 'Age' with 'Prior', on a fresh copy of the training split
df = train.copy().reset_index(drop=True)
idx = get_index(df, 0.9)
df = missing_val(df, 'two_year_recid', idx, 'cat')
#idx = get_index(df, 0.7)
#df = missing_val(df, 'Race', idx, 'cat')
idx = get_index(df, 0.2)
df = swap(df, 'Age', 'Prior', idx)

#save the corrupted training split (raw, plus the bin_compas_data / binarize_compas_data variants)
df.to_csv("corrupt_data/Compas_train_miss_outcome.csv", index=False)
temp = bin_compas_data(df.copy())
temp.to_csv("corrupt_data/Compas_bin_train_miss_outcome.csv", index=False)
temp = binarize_compas_data(df.copy())
temp.to_csv("corrupt_data/Compas_binarized_train_miss_outcome.csv", index=False)

#add the untouched test split and save the combined data
#(DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call)
df = pd.concat([df, test])
df.to_csv("corrupt_data/Compas_miss_outcome.csv", index=False)
temp = bin_compas_data(df.copy())
temp.to_csv("corrupt_data/Compas_bin_miss_outcome.csv", index=False)
temp = binarize_compas_data(df.copy())
temp.to_csv("corrupt_data/Compas_binarized_miss_outcome.csv", index=False)