In [1]:
import numpy as np
import pandas as pd
import utils
from sklearn.model_selection import train_test_split

In [2]:
cols = ["days_b_screening_arrest",
    "is_recid",
    "c_charge_degree",
    "c_jail_out",
    "c_jail_in",
    "age_cat",
    "race",
    "sex",
    "two_year_recid",
    ]
recid = pd.read_csv("./compas_recid.csv", usecols=cols+["score_text"])
violent_recid = pd.read_csv("./compas_violent_recid.csv", usecols=cols+["v_score_text"])

In [3]:
def compas_cleaning(df, score_factor):
    new_df = df.dropna()
    new_df = new_df[(new_df["days_b_screening_arrest"]<=30)&
                    (new_df["days_b_screening_arrest"]>=-30)&
                    (new_df["is_recid"]!=-1)&
                    (new_df["c_charge_degree"]!="O")]
    new_df["length_of_stay"] = ((pd.to_datetime(df["c_jail_out"])-pd.to_datetime(df["c_jail_in"])).dt.days)
    new_df["length_of_stay"] = new_df["length_of_stay"].astype(int)
    
    #Perhaps limit dataset to only black and white participants
    new_df["is_Caucasian"] = new_df["race"].apply(lambda x: 1 if x=="Caucasian" else 0)
    new_df.drop(labels = ["c_jail_out", "c_jail_in", "days_b_screening_arrest", "is_recid", "race"],axis = 1, inplace = True)
    new_df[score_factor] = new_df[score_factor].apply(lambda x: 0 if x=="High" else 1)
    
    new_df = pd.get_dummies(new_df, 
                            columns = ["c_charge_degree",
                                        "age_cat",
                                        "sex"],
                            drop_first=True)
    return new_df

    

In [4]:
recid = compas_cleaning(recid, "score_text")
violent_recid = compas_cleaning(violent_recid, "v_score_text")

In [5]:
def splitter(x,y):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)
    return {"x_train":x_train, "x_test": x_test, "y_train":y_train, "y_test":y_test}

In [6]:
print("AAA")

AAA


In [7]:
#Missingness in the sensitive variable Race, and also in charge degree and length of stay
RESPONSE = "score_text"
a = splitter(recid.drop(RESPONSE, axis = 1), recid[RESPONSE])
x = a["x_train"]
x[RESPONSE] = a["y_train"]
print(x.head())

      two_year_recid  length_of_stay  is_Caucasian  c_charge_degree_M  \
5866               1               0             1                  1   
3629               0               1             0                  0   
1402               1               0             0                  0   
6117               1              33             0                  0   
4995               1               0             0                  1   

      age_cat_Greater than 45  age_cat_Less than 25  sex_Male  score_text  
5866                        0                     1         1           1  
3629                        0                     1         0           1  
1402                        0                     0         1           0  
6117                        1                     0         1           0  
4995                        1                     0         1           1  


In [8]:

percentiles = [i for i in range(1, 16)]+[j for j in range(20, 60, 10)]
for i, frames in enumerate([recid]):#, violent_recid]):
    if i == 0:
        RESPONSE = "score_text"
    else:
        RESPONSE = "v_score_text"
    for sensitive in ["is_Caucasian", "sex_Male"]:
        for miss in ["c_charge_degree_M", "is_Caucasian", "sex_Male", "two_year_recid"]:
            data = splitter(frames.drop(RESPONSE, axis = 1), frames[RESPONSE])
            x = data["x_train"]
            x[RESPONSE] = data["y_train"]

            y = data["x_test"]
            y[RESPONSE] = data["y_test"]
            try:
                res = utils.test_bench(train = x,test = y, pred = RESPONSE, missing = miss, sensitive=sensitive,
                    percentiles = percentiles)
            except:
                print(data.head())
                print("Responses:", x[RESPONSE].sum(), y[RESPONSE].sum())

            try:
                utils.plotting_cf(["log_reg", "rf_cat", "svm", "knn"],["cca", "mice_def", "mean"], res)
                utils.plotting_others(res)
            except Exception as e:
                print(e)

            

100%|██████████| 4/4 [02:16<00:00, 34.08s/it]
100%|██████████| 4/4 [02:16<00:00, 34.12s/it]
100%|██████████| 4/4 [02:12<00:00, 33.17s/it]
100%|██████████| 4/4 [02:20<00:00, 35.12s/it]
100%|██████████| 4/4 [02:13<00:00, 33.35s/it]
100%|██████████| 4/4 [02:19<00:00, 34.97s/it]
100%|██████████| 4/4 [02:17<00:00, 34.50s/it]
100%|██████████| 4/4 [02:22<00:00, 35.70s/it]
100%|██████████| 4/4 [01:50<00:00, 27.61s/it]
100%|██████████| 4/4 [02:12<00:00, 33.00s/it]
100%|██████████| 4/4 [02:36<00:00, 39.04s/it]
  0%|          | 0/4 [00:38<?, ?it/s]


ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 1

<Figure size 432x288 with 0 Axes>

In [None]:
#test_bench(train, test, pred: str, missing: str, sensitive: str, pred_var_type: str = "cat"):
#res = utils.test_bench(synth_cat_train, synth_cat_test, "y", "x_2", "x_2")