In [134]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import utils

In [135]:
def confusion_matrix(true, pred):
    # Assumes numpy arrays(
    try:
        tpr = sum([1 if t == p and p == 1 else 0 for t,
                  p in zip(true, pred)])/(sum(true))
    except:
        tpr = 0
        #print("true", sum(true))
        #print("pred", sum(pred))

    try:
        tnr = sum([1 if t == p and p == 0 else 0 for t,
                  p in zip(true, pred)])/(len(true)-sum(true))
    except:
        tnr = 0
        #print("true", sum(true))
        #print("pred", sum(pred))
    fpr = 1-tnr
    fnr = 1-tpr
    #Old return structure. Converted to vanilla dict for json compatibility
    #return pd.DataFrame({"Predicted true": [tpr, fpr],
    #                     "Predicted false": [fnr, tnr]}, index=["Is true", "Is false"])
    return pd.DataFrame({"Predicted true": [tpr, fpr],
                "Predicted false": [fnr, tnr]}, index = ["Is true", "Is false"])

In [136]:
def compas_cleaning(df):
    new_df = df.dropna()
    new_df = new_df[(new_df["days_b_screening_arrest"]<=30)&
                    (new_df["days_b_screening_arrest"]>=-30)&
                    (new_df["is_recid"]!=-1)&
                    (new_df["c_charge_degree"]!="O")]
    new_df["length_of_stay"] = ((pd.to_datetime(df["c_jail_out"])-pd.to_datetime(df["c_jail_in"])).dt.days)
    new_df["length_of_stay"] = new_df["length_of_stay"].astype(int)
    
    #Perhaps limit dataset to only black and white participants
    new_df["is_Caucasian"] = new_df["race"].apply(lambda x: 1 if x=="Caucasian" else 0)
    new_df.drop(labels = ["c_jail_out", "c_jail_in", "days_b_screening_arrest", "is_recid", "race"],axis = 1, inplace = True)
    if "v_score_text" in new_df.columns:
        new_df.columns = ["score_text" if col == "v_score_text" else col for col in new_df.columns]
    new_df["score_text"] = new_df["score_text"].apply(lambda x: 0 if x=="Low" else 1)
    new_df["sex"] = new_df["sex"].apply(lambda x: 0 if x == "Male" else 1)
    new_df["c_charge_degree"] = new_df["c_charge_degree"].apply(lambda x: 0 if x == "F" else 1)
    new_df = pd.get_dummies(new_df, 
                            columns = ["age_cat"],
                            drop_first=True)
    return new_df

In [137]:
def splitter(data, response= "score_text"):
    x = data.drop(response, axis = 1)
    y = data[response]
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)
    x_train[response]=y_train
    x_test[response]=y_test
    return {"train":x_train, "test": x_test}


In [138]:
cols = ["days_b_screening_arrest",
"is_recid",
"c_charge_degree",
"c_jail_out",
"c_jail_in",
"age_cat",
"race",
"sex",
"two_year_recid",
]
recid = pd.read_csv("./compas_recid.csv", usecols=cols+["score_text"])
print(recid.nunique())
violent_recid = pd.read_csv("./compas_violent_recid.csv", usecols=cols+["v_score_text"])

sex                           2
age_cat                       3
race                          6
days_b_screening_arrest     423
c_jail_in                  6907
c_jail_out                 6880
c_charge_degree               2
is_recid                      2
score_text                    3
two_year_recid                2
dtype: int64


In [139]:
recid = splitter(compas_cleaning(recid))
violent_recid = splitter(compas_cleaning(violent_recid))

In [140]:
model = LogisticRegression(random_state=0, max_iter=500)
model = model.fit(recid["train"].drop("score_text", axis = 1), recid["train"]["score_text"])
preds = model.predict(recid["test"].drop("score_text", axis = 1))
print(confusion_matrix(recid["test"]["score_text"], preds))

          Predicted true  Predicted false
Is true         0.646995         0.353005
Is false        0.282531         0.717469


In [141]:
new_data = pd.read_csv("./propublica_data_for_fairml.csv")

In [142]:
new_data = splitter(new_data, response = "score_factor")

In [143]:
model = LogisticRegression(random_state=0, max_iter=500)
model = model.fit(new_data["train"].drop("score_factor", axis = 1), new_data["train"]["score_factor"])
preds = model.predict(new_data["test"].drop("score_factor", axis = 1))
print(confusion_matrix(new_data["test"]["score_factor"], preds))

          Predicted true  Predicted false
Is true         0.683060         0.316940
Is false        0.194296         0.805704


In [144]:
formatted = utils.load_compas_alt()

In [145]:
len(formatted)

6172

In [146]:
formatted.std()

priors_count                  4.743770
two_year_recid                0.498022
crime_factor                  0.479086
gender_factor                 0.392629
score_factor                  0.497086
is_Caucasian                  0.473994
age_factor_Greater than 45    0.406981
age_factor_Less than 25       0.413087
dtype: float64

In [147]:
formatted.mean()

priors_count                  3.246436
two_year_recid                0.455120
crime_factor                  0.356773
gender_factor                 0.809624
score_factor                  0.445723
is_Caucasian                  0.340732
age_factor_Greater than 45    0.209494
age_factor_Less than 25       0.218244
dtype: float64

In [148]:
def sigmoid(x, alpha):
    z = np.exp(-x+alpha)
    sig = 1 / (1 + z)
    return sig
size = 6000
priors_count = np.round(np.abs(np.random.uniform(3.246, 4.743, size = size)))
two_year_recid = np.random.binomial(1,0.455, size = size)
is_Caucasian = np.random.binomial(1,0.34, size = size)
crime_factor = np.random.binomial(1,0.3567, size = size)
age_greater_than_45 = np.random.binomial(1,0.209, size = size)
age_less_than_25  = np.random.binomial(1,0.218, size = size)
sex_male = np.random.binomial(1,0.809, size = size)
score_text = np.around(sigmoid((priors_count*(0.1)+two_year_recid+is_Caucasian+crime_factor+
age_greater_than_45+age_less_than_25+sex_male), alpha = 3)).astype(int)
synth_cat = pd.DataFrame({"score_factor": score_text, "priors_count":priors_count,
    "two_year_recid":two_year_recid, "is_Caucasian":is_Caucasian, "crime_factor":crime_factor,
    "age_factor_greater_than_45":age_greater_than_45, "age_factor_less_than_25":age_less_than_25, "sex_Male":sex_male})

In [149]:
print(synth_cat["score_factor"].sum()/len(synth_cat["score_factor"]))

0.44033333333333335


In [150]:
formatted = splitter(formatted, response = "score_factor")

In [151]:
model = LogisticRegression(random_state=0, max_iter=500)
model = model.fit(formatted["train"].drop("score_factor", axis = 1), formatted["train"]["score_factor"])
preds = model.predict(formatted["test"].drop("score_factor", axis = 1))
print(confusion_matrix(formatted["test"]["score_factor"], preds))

          Predicted true  Predicted false
Is true         0.692896         0.307104
Is false        0.207665         0.792335


In [160]:
#Test to check if MCAR performance is still high even in the presence of missing data'
formatted["train"]["bin"] = np.random.binomial(n=1, p=0.40, size = len(formatted["train"]))
new_test = formatted.copy()
new_test["train"] = new_test["train"][new_test["train"]["bin"] == 1]
new_test["train"] = new_test["train"].drop("bin", axis = 1)

In [161]:
model = LogisticRegression(random_state=0, max_iter=500)
model = model.fit(new_test["train"].drop("score_factor", axis = 1), new_test["train"]["score_factor"])
preds = model.predict(new_test["test"].drop("score_factor", axis = 1))
print(confusion_matrix(new_test["test"]["score_factor"], preds))

          Predicted true  Predicted false
Is true         0.719126         0.280874
Is false        0.221925         0.778075


In [174]:
mcar = np.random.binomial(n=1, p=0.5, size=len(formatted["train"]))
formatted["train"]["missing"] = [1 if m == 1 else 0 for m in mcar]
formatted["train"]["gender_factor"] = formatted["train"]["gender_factor"].mask(formatted["train"]["missing"] == 1,
                                            other=np.nan)
formatted["train"] = formatted["train"].drop("missing", axis=1)

In [170]:
formatted["train"]["missing"] = [1 if m == 1 else 0 for m in mcar]
formatted["train"]["gender_factor"].mask(formatted["train"]["missing"] == 1,
                                            other=np.nan)

5025    1.0
3098    0.0
1187    NaN
5238    NaN
4270    NaN
       ... 
3772    1.0
5191    NaN
5226    NaN
5390    NaN
860     1.0
Name: gender_factor, Length: 4135, dtype: float64

In [175]:
formatted["train"].isna().sum()

priors_count                     0
two_year_recid                   0
crime_factor                     0
gender_factor                 2044
is_Caucasian                     0
age_factor_Greater than 45       0
age_factor_Less than 25          0
score_factor                     0
bin                              0
dtype: int64

In [176]:
len(formatted["train"])-len(formatted["train"].dropna())

2044

In [None]:
a