This file is basically the same as random_perturbations_logdi except generalized to multiple p's. 

In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import model_bias_analysis
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'retina' # comment this out if higher resolution is not needed

In [2]:
# Remember to always check that these files look the way you want before you run this! see data-di-with-gay to regenerate

# joined_tox is loaded here but is not referenced again by the cells below.
joined_tox = pd.read_csv('joined_tox.csv')
# Training set: later cells read its 'comment' and 'binary_tox' columns, plus one column per madlibs term.
train_comments = pd.read_csv('train_comments.csv')
# Test set: later cells read its 'comment' and 'binary_tox' columns, plus one column per madlibs term.
test_comments = pd.read_csv('test_comments.csv')
# Terms whose per-term columns are compared pairwise by logDI.
madlibs_terms = ['gay', 'homosexual', 'straight', 'black', 'white', 'american', 'jewish', 'old']

In [3]:
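# Generates perturbed copies of the training labels.
# Creates num_perturbations arrays, each the same length as train_comments.
# Each item in an array is 0/1. The true binary_tox label is kept with
# probability probability_flip; otherwise (probability 1 - probability_flip)
# it is replaced by a uniformly random 0/1 label, which may equal the original.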

def generate_perturbation_on_training(train_comments, num_perturbations, probability_flip):
    """Build randomly perturbed copies of the training labels.

    NOTE: despite its name, ``probability_flip`` is the probability that a
    label is KEPT. A label is replaced when a uniform draw in [0, 1) is
    ``>= probability_flip``, i.e. with probability ``1 - probability_flip``.
    The replacement is a uniformly random 0/1 value, so roughly half of the
    replaced labels end up equal to the original.

    Args:
        train_comments: DataFrame with a ``binary_tox`` label column.
        num_perturbations: number of perturbed label arrays to generate.
        probability_flip: keep-threshold described above.

    Returns:
        A list of ``num_perturbations`` arrays, each a perturbed copy of
        ``train_comments.binary_tox.values``. The input frame is not modified.

    Side effects:
        Prints the threshold, then the total number of labels that were
        replaced across all perturbations (replaced, not necessarily changed).
    """
    print("probflip", probability_flip)

    true_labels = train_comments.binary_tox.values
    length = len(true_labels)
    list_perturbation = []
    num_replaced = 0

    for _ in range(num_perturbations):
        # One uniform draw per comment decides whether its label is replaced.
        replace_mask = np.random.random(length) >= probability_flip
        n_replace = int(replace_mask.sum())

        # Copy so that neither the DataFrame nor earlier perturbations are touched.
        tox_tmp = np.copy(true_labels)
        # Draw all replacement labels in a single vectorised call.
        tox_tmp[replace_mask] = np.random.randint(2, size=n_replace)

        num_replaced += n_replace
        list_perturbation.append(tox_tmp)

    # each item in list_perturbation is an array of 0s and 1s holding the new binary_tox of each comment
    print(num_replaced)
    return list_perturbation

In [4]:
def train_and_predict(train_comments, list_perturbations_training, test_comments, NUM_PERTURBATIONS):
    """Train one classifier per perturbed label set and predict on the test set.

    For each of the first ``NUM_PERTURBATIONS`` entries of
    ``list_perturbations_training`` a fresh pipeline (uni/bi-gram counts capped
    at 10000 features -> L2-normalised TF-IDF -> logistic regression) is fitted
    on ``train_comments['comment']`` using that entry as the labels.

    Args:
        train_comments: DataFrame with a ``comment`` column.
        list_perturbations_training: sequence of label arrays aligned with
            ``train_comments``.
        test_comments: DataFrame with ``comment`` and ``binary_tox`` columns.
        NUM_PERTURBATIONS: how many label arrays to train on.

    Returns:
        A list with one array per perturbation: the class predicted by that
        perturbation's classifier for every test comment.

    Side effects:
        Prints the test ROC AUC of every classifier, measured against the
        unperturbed ``test_comments['binary_tox']``.
    """
    # list, each item is an array of predictions; element 0 comes from the 0th perturbation
    perturbed_predictions = []

    for x in range(NUM_PERTURBATIONS):
        clf = Pipeline([
            ('vect', CountVectorizer(max_features = 10000, ngram_range = (1,2))),
            ('tfidf', TfidfTransformer(norm = 'l2')),
            ('clf', LogisticRegression()),
        ])
        clf.fit(train_comments['comment'], list_perturbations_training[x])

        # Score with the probability of the positive class (second column).
        auc = roc_auc_score(test_comments['binary_tox'],
                            clf.predict_proba(test_comments['comment'])[:, 1])
        print('x Test ROC AUC: %.5f' % auc)

        # Predict right away so the fitted pipeline can be released before
        # the next one is trained, instead of keeping all of them alive.
        perturbed_predictions.append(clf.predict(test_comments['comment']))

    return perturbed_predictions

In [5]:
def logDI(df, labels_col, terms):
    """Sum of squared log-rate differences over all unordered pairs of terms.

    For every term ``t`` the positive rate is
    ``(# rows where df[t] and df[labels_col] are both True) / (# rows where df[t] is True)``.
    The result is the sum over unordered term pairs of
    ``(log(rate_i) - log(rate_j)) ** 2``.

    Args:
        df: DataFrame holding one column per term plus ``labels_col``.
        labels_col: name of the column with the (predicted or perturbed) labels.
        terms: names of the term columns to compare.

    Returns:
        The pairwise score as a float; 0.0 for zero or one term.

    Raises:
        ZeroDivisionError: if some term has no rows where it is True.
        ValueError: if some term has no positively-labelled rows (log of 0).
    """
    # `== True` is kept on purpose: it mirrors the original filter exactly
    # (e.g. NaN and values other than True/1 do not count).
    label_is_true = df[labels_col] == True

    # The rate of a term does not depend on which term it is paired with,
    # so compute each log rate once instead of once per pair.
    log_rates = []
    for term in terms:
        has_term = df[term] == True
        n_with_term = int(has_term.sum())
        n_with_term_and_label = int((has_term & label_is_true).sum())
        log_rates.append(math.log(n_with_term_and_label / n_with_term))

    log_rates = np.array(log_rates, dtype=float)
    # Full symmetric matrix of squared differences; every unordered pair
    # appears twice (and the diagonal is zero), hence the division by 2.
    squared_diffs = (log_rates[:, None] - log_rates[None, :]) ** 2
    return squared_diffs.sum() / 2

In [6]:
def find_avg_logDI(list_perturbations, df, madlibs_terms):
    """Compute logDI once per label array in ``list_perturbations``.

    Despite the name, no averaging happens here: the caller receives the
    list of individual scores, in the same order as ``list_perturbations``.

    Side effect: every label array is written onto ``df`` as a column named
    ``perturbation<i>`` (overwriting any column of that name from an earlier
    call) so that logDI can read it by column name.
    """
    scores = []
    for position, labels in enumerate(list_perturbations):
        column_name = 'perturbation' + str(position)
        df[column_name] = labels
        score = logDI(df, column_name, madlibs_terms)
        scores.append(score)
    return scores

In [None]:
# Thresholds handed to generate_perturbation_on_training as probability_flip.
p_vals = [
    0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5,
    0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95,
]
NUM_PERTURBATIONS = 10 # This is the number of perturbations we're doing on EACH p-val below

# One entry per p value.
logs_training, logs_test = [], []   # mean logDI over the perturbations
list_training, list_test = [], []   # the perturbed labels / the predictions themselves

# One entry per p value, each holding NUM_PERTURBATIONS items.
xs_all_logs = []                    # the p value repeated, to pair with the lists below
all_logs_training, all_logs_test = [], []   # individual (non-averaged) logDI values

# Scratch names, overwritten on every iteration.
logs_training_list, logs_test_list = [], []

for probability_flip in p_vals:
    # Training side: perturb the labels and measure logDI of the perturbed labels.
    list_perturbations_training = generate_perturbation_on_training(
        train_comments, NUM_PERTURBATIONS, probability_flip)
    list_training.append(list_perturbations_training)

    logs_training_list = find_avg_logDI(list_perturbations_training, train_comments, madlibs_terms)
    all_logs_training.append(logs_training_list)
    logs_training.append(np.mean(logs_training_list))
    xs_all_logs.append([probability_flip] * NUM_PERTURBATIONS)

    # Test side: train on each perturbed label set and measure logDI of the predictions.
    perturbed_predictions = train_and_predict(
        train_comments, list_perturbations_training, test_comments, NUM_PERTURBATIONS)
    list_test.append(perturbed_predictions)

    logs_test_list = find_avg_logDI(perturbed_predictions, test_comments, madlibs_terms)
    all_logs_test.append(logs_test_list)
    logs_test.append(np.mean(logs_test_list))
    

probflip 0.05
908637
x Test ROC AUC: 0.57471
x Test ROC AUC: 0.57982


In [None]:
# One red dot per p value: mean logDI of the perturbed training labels (x)
# against mean logDI of the resulting test predictions (y).
# NOTE(review): the axis labels say "two terms", but logDI is called with the
# full madlibs_terms list — confirm the intended wording.
plt.plot(logs_training, logs_test, "ro")
plt.title("Relationship between Training and Test Data log DI for various ps")
plt.xlabel("Training Data log DI of two terms")
plt.ylabel("Test Data log DI of two terms")

In [None]:
# Replace every p by 1 - p, in place. The experiment loop used p as the
# probability of keeping a label, so 1 - p is the share of replaced labels.
# NOTE: running this cell a second time flips the values back.
p_vals[:] = [1 - p for p in p_vals]
print(p_vals)

In [None]:
# Mean training logDI for each p, drawn as red dots.
# NOTE: assumes the cell that rewrites p_vals to 1 - p has already been run (once).
plt.plot(p_vals, logs_training, "ro")
plt.title("Map of ps to training logs")
plt.xlabel("p's")
plt.ylabel("log DI of training")

In [None]:
# Same data as the red-dot training plot, drawn as a connected line
# (no format string, so matplotlib's default line style is used).
plt.plot(p_vals, logs_training)
plt.title("Map of ps to training logs")
plt.xlabel("p's")
plt.ylabel("log DI of training")

In [None]:
# Mean logDI of the test predictions for each p, drawn as red dots.
# NOTE: assumes the cell that rewrites p_vals to 1 - p has already been run (once).
plt.plot(p_vals, logs_test, "ro")
plt.title("Map of ps to test logs")
plt.xlabel("p's")
plt.ylabel("log DI of test")

In [None]:
# Same data as the red-dot test plot, drawn as a connected line
# (no format string, so matplotlib's default line style is used).
plt.plot(p_vals, logs_test)
plt.title("Map of ps to test logs")
plt.xlabel("p's")
plt.ylabel("log DI of test")

In [None]:
# xs_all_logs is a list of per-p lists (each p repeated NUM_PERTURBATIONS
# times); flatten it into a single 1-D array so it lines up with the
# flattened per-perturbation logDI values.
xs_all_logs = np.concatenate(xs_all_logs, axis=None)
print(xs_all_logs)

# Apply the same p -> 1 - p conversion that was applied to p_vals.
xs_all_logs = 1 - xs_all_logs
print(xs_all_logs)

In [None]:
# Flatten the per-p lists of individual logDI values into 1-D arrays, in the
# same order as xs_all_logs, so they can be scattered against it.
all_logs_training = np.concatenate(all_logs_training, axis = None)
print(all_logs_training)
print(xs_all_logs)
all_logs_test = np.concatenate(all_logs_test, axis = None)

In [None]:
plt.figure(figsize=(15,10))

# Styling is the same for both datasets: individual per-perturbation values
# as dots, per-p mean as a translucent line. The labels now match what is
# actually drawn (the training labels used to be swapped).

# blue is training
plt.plot(xs_all_logs, all_logs_training, "bo", label = "Training")
plt.plot(p_vals, logs_training, "blue", alpha = .3, label = "Training Mean Value")

# red is test
plt.plot(p_vals, logs_test, "red", alpha = .3, label = "Test Mean Value")
plt.plot(xs_all_logs, all_logs_test, "ro", label = "Test")

plt.title("Disparate Impact of Training Data Falls as p Increases", fontsize=20)
plt.xlabel("Percentage (p) of comments with synthetic labels", fontsize=20)
plt.ylabel("LogDI of training data", fontsize=20)

# The legend must be added before saving, otherwise the PNG is written without it.
plt.legend()
plt.savefig('DI_tr_data_p_incr.png', bbox_inches='tight')

In [None]:
# plt.plot(ys_all_logs, all_logs_test)
# plt.title("Map of not-avg ps to test logs")
# plt.xlabel("p's")
# plt.ylabel("log DI of test")

In [None]:
# Export the perturbed training labels and the test predictions, one column
# per p value. The column labels are whatever p_vals holds when this cell
# runs (1 - p if the conversion cell above has been executed).
df_perturbed_training = pd.DataFrame(list_training).T
df_perturbed_training.columns = p_vals
df_perturbed_training.to_csv("perturbed_training2.csv", index=False)

df_perturbed_test = pd.DataFrame(list_test).T
df_perturbed_test.columns = p_vals
# Write the TEST frame here; previously the training frame was written to
# this file as well, so the test predictions were never saved.
df_perturbed_test.to_csv("perturbed_test2.csv", index=False)

In [None]:
# Quick visual check of the exported training frame (first rows only).
df_perturbed_training.head()

In [None]:
# Overlay of the per-p mean logDI for the test predictions (red dots) and the
# perturbed training labels (default-colour dots) on one figure.
# NOTE: assumes the cell that rewrites p_vals to 1 - p has already been run (once).
plt.figure(figsize=(12, 8))
plt.plot(p_vals, logs_test, "ro", label='test Dataset')
plt.plot(p_vals, logs_training, "o", label='Training Dataset')
plt.legend()