Note: Much of this code was lifted from [the Conversation AI project](https://conversationai.github.io/). In this file, instead of using the max DI, I want to compute the DI of each identity term against "gay" and plot several lines on the same axes: the DI of the input and the DI of the output, with one line for "white", one line for "black", one line for "jewish", etc.

In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import model_bias_analysis
import matplotlib.pyplot as plt

Read TSVs from file. These are the original data from *Conversation AI*.

* *grouped_annotations* takes the mean of all toxicity ratings of a comment.
* *joined_tox* joins *grouped_annotations* and *comments*.
* We also add a column *binary_tox* to the dataframe *joined_tox*. Here we assign a toxicity rating of 0 or 1 based on whether the mean toxicity rating is above or below 0.5 (or another value determined by *threshold_binary_toxicity*).

Equation for DI:
DI(t1, t2) = (probability that a comment containing term t1 is labeled toxic) / (probability that a comment containing term t2 is labeled toxic)
= a/b

a = (# comments containing t1 AND labeled toxic) / (# comments containing t1)
= alpha/beta

b is defined the same way for t2.

In [2]:
def calculate_pairwise_di(df, madlibs_terms, colname):
    """Return the per-term toxic rate used as the building block for DI.

    Despite the name, no pairwise ratio is computed here. For every term the
    result holds

        (# rows where df[term] == True and df[colname] == 1)
        / (# rows where df[term] == True)

    Args:
        df: DataFrame with one column per entry of ``madlibs_terms`` (rows
            whose value compares equal to ``True`` count as containing the
            term) plus the label column ``colname``.
        madlibs_terms: Sequence of column names, one per identity term.
        colname: Name of the column whose value ``1`` marks a toxic row.

    Returns:
        A numpy float array aligned with ``madlibs_terms``. A term with no
        matching rows gets the placeholder ``-1`` instead of a rate.
    """
    rates = np.zeros(len(madlibs_terms))

    for position, term in enumerate(madlibs_terms):
        # `== True` is kept on purpose so the same rows are selected as
        # before, whatever the column's dtype is.
        contains_term = df[term] == True  # noqa: E712
        marked_toxic = df[colname] == 1

        rows_with_term = int(contains_term.sum())
        if rows_with_term == 0:
            # No examples for this term: store the "no data" placeholder.
            rates[position] = -1
            continue

        toxic_rows_with_term = int((contains_term & marked_toxic).sum())
        rates[position] = float(toxic_rows_with_term) / float(rows_with_term)

    return rates

# term_toxicity = calculate_pairwise_di(train_comments, madlibs_terms, 'binary_tox')
# print(term_toxicity)
# print(len(term_toxicity))


In [3]:
# This computes the DI of each identity term relative to 'gay' (assumed to be at index 0 of madlibs_terms),
# rather than the max DI between all pairs of identity terms that one of our other experiments uses.
# Possibly something we'll take out or come back to once we try more versions of our code. (Neha's working on this!)

def gay_ratio(term_toxicity, madlibs_terms, reference_index=0):
    """Return the DI of every term relative to a reference term.

    Each entry is ``term_toxicity[i] / term_toxicity[reference_index]``.
    The neutral value ``1`` is used instead of a ratio when:

    * ``i`` is the reference term itself,
    * either rate is the ``-1`` "no data" placeholder, or
    * the reference rate is ``0`` (the ratio would be undefined).

    Args:
        term_toxicity: Per-term toxic rates, indexable by position and
            aligned with ``madlibs_terms``; ``-1`` marks a term with no data.
        madlibs_terms: Sequence of identity terms; only its length is used.
        reference_index: Non-negative position of the reference term. The
            default ``0`` keeps the original behaviour, which assumed 'gay'
            is the first term.

    Returns:
        A list with one ratio (or ``1``) per entry of ``madlibs_terms``.
    """
    if len(madlibs_terms) == 0:
        # Nothing to compare; also avoids indexing an empty rate array.
        return []

    # The reference rate does not change inside the loop, so check it once.
    reference_rate = term_toxicity[reference_index]
    reference_unusable = reference_rate == -1 or reference_rate == 0

    term_vs_gay = []
    for i in range(len(madlibs_terms)):
        if i == reference_index or reference_unusable or term_toxicity[i] == -1:
            term_vs_gay.append(1)
        else:
            term_vs_gay.append(term_toxicity[i] / reference_rate)

    return term_vs_gay


In [4]:
# Calculate the DI of each identity term against 'gay' for a single dataset
def find_gay_dis(df_comments, madlibs_terms, colname):
    """Compute the DI of every identity term against the first term.

    This chains the two helpers defined earlier in the file:
    ``calculate_pairwise_di`` produces one toxic rate per term from
    ``df_comments[colname]``, and ``gay_ratio`` divides each rate by the
    rate of the term at index 0 ('gay' in this notebook's term list).

    Args:
        df_comments: DataFrame holding one column per identity term and the
            label column ``colname``.
        madlibs_terms: Identity terms, in the order used for the result.
        colname: Label column to evaluate (the notebook passes the training
            label 'binary_tox' or the model output 'predicted').

    Returns:
        A list of DIs, one per term: t0/gay, t1/gay, t2/gay, ...
    """
    per_term_rates = calculate_pairwise_di(df_comments, madlibs_terms, colname)
    return gay_ratio(per_term_rates, madlibs_terms)

In [None]:
def train_and_predict(train_comments, test_comments):
    """Train a bag-of-n-grams classifier and label the test set in place.

    A CountVectorizer -> TfidfTransformer -> LogisticRegression pipeline is
    fitted on ``train_comments['comment']`` against
    ``train_comments['binary_tox']``. The ROC AUC on the test set is printed,
    and the hard predictions are stored in a new ``"predicted"`` column of
    ``test_comments``.

    Args:
        train_comments: DataFrame with 'comment' and 'binary_tox' columns.
        test_comments: DataFrame with 'comment' and 'binary_tox' columns;
            it is modified in place (a 'predicted' column is added or
            overwritten).

    Returns:
        None. The result is delivered through ``test_comments["predicted"]``.
    """
    steps = [
        ('vect', CountVectorizer(max_features=10000, ngram_range=(1, 2))),
        ('tfidf', TfidfTransformer(norm='l2')),
        ('clf', LogisticRegression()),
    ]
    model = Pipeline(steps)
    model.fit(train_comments['comment'], train_comments['binary_tox'])

    test_text = test_comments['comment']

    # Column 1 of predict_proba is used as the score for the AUC
    # (presumably the probability of the toxic class -- confirm via classes_).
    scores = model.predict_proba(test_text)[:, 1]
    auc = roc_auc_score(test_comments['binary_tox'], scores)
    print('Test ROC AUC: %.3f' % auc)

    # Callers read this column afterwards, so the mutation is intentional.
    test_comments["predicted"] = model.predict(test_text)

In [None]:
# Identity terms to compare. 'gay' stays first because gay_ratio treats
# index 0 as the reference term.
madlibs_terms = ['gay', 'homosexual', 'straight', 'black', 'white', 'american', 'jewish', 'old']

# Each entry is a (file suffix, list of per-term DIs) tuple.
array_of_train_dis = []
array_of_test_dis = []

# The suffix i runs over 30, 35, ..., 90
# (presumably the binary-toxicity threshold in percent -- confirm).
for i in range(30, 95, 5):
    joined_tox = pd.read_csv('joined_tox{0}.csv'.format(i))
    train_comments = pd.read_csv('train_comments{0}.csv'.format(i))
    # The test set is re-read on every pass, so each iteration starts from a
    # frame without the "predicted" column that train_and_predict adds.
    test_comments = pd.read_csv('test_comments.csv')
    # madlibs_terms = model_bias_analysis.read_identity_terms('test_comments'+str(i)+'.csv')

    # DI of the input: computed from the training labels.
    training_data_dis = find_gay_dis(train_comments, madlibs_terms, 'binary_tox')
    print(i, training_data_dis)
    array_of_train_dis.append((i, training_data_dis))

    # Fits the model and writes test_comments["predicted"].
    train_and_predict(train_comments, test_comments)

    # DI of the output: computed from the model's predictions.
    test_data_dis = find_gay_dis(test_comments, madlibs_terms, 'predicted')
    print(i, test_data_dis)
    array_of_test_dis.append((i, test_data_dis))

30 [1, 0.8863430810973771, 0.37764638565706665, 0.4076357128653808, 0.3315111973196967, 0.2503615469544529, 0.3551835548713056, 0.2745952193094558]
Test ROC AUC: 0.953
30 [1, 0.8425760286225403, 0.3087512291052114, 0.27799911465250116, 0.24236605293874017, 0.17018970189701899, 0.24326485729994501, 0.18063773833004604]
35 [1, 0.7782363345405542, 0.29839612085042894, 0.3733125937447919, 0.30149268436868815, 0.19097848233796585, 0.2713950797525943, 0.23209966998134135]


In [None]:
# Plot (training DI - test DI) on the y axis against the file suffix
# (30, 35, ..., 90) on the x axis, for three terms measured against 'gay'.
# With the madlibs_terms list defined above, index 1 is 'homosexual',
# index 2 is 'straight' and index 5 is 'american'.
x_axis = list(range(30, 95, 5))

y_axis_t1 = []
y_axis_t2 = []
y_axis_t5 = []
for position, test_entry in enumerate(array_of_test_dis):
    # Entries are (suffix, list of DIs); element [1] is the DI list.
    train_dis = array_of_train_dis[position][1]
    test_dis = test_entry[1]
    y_axis_t1.append(train_dis[1] - test_dis[1])
    y_axis_t2.append(train_dis[2] - test_dis[2])
    y_axis_t5.append(train_dis[5] - test_dis[5])

# red = index 1, blue = index 2, green = index 5
plt.plot(x_axis, y_axis_t1, "ro")
plt.plot(x_axis, y_axis_t2, "bo")
plt.plot(x_axis, y_axis_t5, "go")

In [None]:
# DI of the model output (test predictions) for the terms at index 1 and
# index 5 of madlibs_terms, each measured against 'gay'.
# Entries of array_of_test_dis are (suffix, list of DIs) tuples.
y_axis_t1 = [dis[1] for _suffix, dis in array_of_test_dis]
y_axis_t5 = [dis[5] for _suffix, dis in array_of_test_dis]

# red = index 1, green = index 5
plt.plot(x_axis, y_axis_t1, "ro")
plt.plot(x_axis, y_axis_t5, "go")


In [None]:
# DI of the input (training labels) for the terms at index 1 and index 5 of
# madlibs_terms, each measured against 'gay'.
# The number of points follows array_of_test_dis, while the values are read
# from array_of_train_dis.
point_count = len(array_of_test_dis)
y_axis_t1 = [array_of_train_dis[k][1][1] for k in range(point_count)]
y_axis_t5 = [array_of_train_dis[k][1][5] for k in range(point_count)]

# red = index 1, green = index 5
plt.plot(x_axis, y_axis_t1, "ro")
plt.plot(x_axis, y_axis_t5, "go")