In [40]:
import pandas as pd
import numpy as np

import utils

In [41]:
# Load the raw COMPAS two-year recidivism table from disk.
df = pd.read_csv(filepath_or_buffer="data/compas-scores-two-years/data.csv")

## Dataset: compas-scores-two-years


In [42]:
from utils import display_dataframe_info
# Show the shape of the raw frame and a preview of its first rows.
display_dataframe_info(df)

DataFrame shape: (7214, 53) (rows, columns)
--------------------------------------------------
First 5 rows of the DataFrame:


Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,juv_fel_count,decile_score,juv_misd_count,juv_other_count,priors_count,days_b_screening_arrest,c_jail_in,c_jail_out,c_case_number,c_offense_date,c_arrest_date,c_days_from_compas,c_charge_degree,c_charge_desc,is_recid,r_case_number,r_charge_degree,r_days_from_arrest,r_offense_date,r_charge_desc,r_jail_in,r_jail_out,violent_recid,is_violent_recid,vr_case_number,vr_charge_degree,vr_offense_date,vr_charge_desc,type_of_assessment,decile_score.1,score_text,screening_date,v_type_of_assessment,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,start,end,event,two_year_recid
0,1,miguel hernandez,miguel,hernandez,2013-08-14,Male,1947-04-18,69,Greater than 45,Other,0,1,0,0,0,-1.0,2013-08-13 06:03:42,2013-08-14 05:41:20,13011352CF10A,2013-08-13,,1.0,F,Aggravated Assault w/Firearm,0,,,,,,,,,0,,,,,Risk of Recidivism,1,Low,2013-08-14,Risk of Violence,1,Low,2013-08-14,2014-07-07,2014-07-14,0,0,327,0,0
1,3,kevon dixon,kevon,dixon,2013-01-27,Male,1982-01-22,34,25 - 45,African-American,0,3,0,0,0,-1.0,2013-01-26 03:45:27,2013-02-05 05:36:53,13001275CF10A,2013-01-26,,1.0,F,Felony Battery w/Prior Convict,1,13009779CF10A,(F3),,2013-07-05,Felony Battery (Dom Strang),,,,1,13009779CF10A,(F3),2013-07-05,Felony Battery (Dom Strang),Risk of Recidivism,3,Low,2013-01-27,Risk of Violence,1,Low,2013-01-27,2013-01-26,2013-02-05,0,9,159,1,1
2,4,ed philo,ed,philo,2013-04-14,Male,1991-05-14,24,Less than 25,African-American,0,4,0,1,4,-1.0,2013-04-13 04:58:34,2013-04-14 07:02:04,13005330CF10A,2013-04-13,,1.0,F,Possession of Cocaine,1,13011511MM10A,(M1),0.0,2013-06-16,Driving Under The Influence,2013-06-16,2013-06-16,,0,,,,,Risk of Recidivism,4,Low,2013-04-14,Risk of Violence,3,Low,2013-04-14,2013-06-16,2013-06-16,4,0,63,0,1
3,5,marcu brown,marcu,brown,2013-01-13,Male,1993-01-21,23,Less than 25,African-American,0,8,1,0,1,,,,13000570CF10A,2013-01-12,,1.0,F,Possession of Cannabis,0,,,,,,,,,0,,,,,Risk of Recidivism,8,High,2013-01-13,Risk of Violence,6,Medium,2013-01-13,,,1,0,1174,0,0
4,6,bouthy pierrelouis,bouthy,pierrelouis,2013-03-26,Male,1973-01-22,43,25 - 45,Other,0,1,0,0,2,,,,12014130CF10A,,2013-01-09,76.0,F,arrest case no charge,0,,,,,,,,,0,,,,,Risk of Recidivism,1,Low,2013-03-26,Risk of Violence,1,Low,2013-03-26,,,2,0,1102,0,0


### Handle missing values & irrelevant features

In [43]:
from utils import calculate_nan_percentage_of_grouped_features

# Per-feature missing-value counts and percentages for the raw data.
# The feature categories are presumably read from features.yaml — confirm in utils.
missing_stat = calculate_nan_percentage_of_grouped_features(df, "data/compas-scores-two-years/features.yaml")

In [44]:
# Render the missing-value table computed above, before any rows/columns are removed.
print("Missing values before preprocessing:")
display(missing_stat)

Missing values before preprocessing:


Unnamed: 0,Category,Feature,Missing_Count,Missing_Percentage
36,Covariate,vr_offense_date,6395,88.647075
28,Covariate,r_offense_date,3743,51.885223
19,Covariate,c_offense_date,1159,16.065983
21,Covariate,c_days_from_compas,22,0.304963
14,Covariate,priors_count,0,0.0
13,Covariate,juv_other_count,0,0.0
10,Covariate,juv_fel_count,0,0.0
6,Covariate,dob,0,0.0
12,Covariate,juv_misd_count,0,0.0
48,Covariate,priors_count.1,0,0.0


Remove data entries that are inconsistent, following the filtering rules used in the [ProPublica notebook](https://github.com/propublica/compas-analysis/blob/master/Compas%20Analysis.ipynb).

In [45]:
# Row filters taken from the ProPublica COMPAS analysis:
#   * screening happened within 30 days (either side) of the arrest,
#   * the recidivism flag is known (not -1),
#   * the charge degree is not 'O',
#   * a COMPAS score text is present,
#   * `end` is strictly greater than `start`.
# Rows whose `days_b_screening_arrest` is NaN fail the range test and are removed too.
#
# `pd.read_csv` turns the literal string "N/A" into NaN by default, so comparing
# against 'N/A' alone never removes missing scores; `.notna()` covers that case,
# and the string comparison is kept for data loaded with NaN parsing disabled.
#
# `.copy()` detaches the result from the original frame so the in-place `drop`
# calls that follow act on an independent DataFrame (no SettingWithCopyWarning).
df = df[
    df["days_b_screening_arrest"].between(-30, 30)
    & (df["is_recid"] != -1)
    & (df["c_charge_degree"] != "O")
    & df["score_text"].notna()
    & (df["score_text"] != "N/A")
    & (df["end"] > df["start"])
].copy()

Remove duplicated features

In [46]:
# "priors_count.1" duplicates "priors_count", so it is removed.
df.drop(columns=["priors_count.1"], inplace=True)

In [47]:
# "decile_score.1" duplicates "decile_score", so it is removed.
df.drop(columns=["decile_score.1"], inplace=True)

In [48]:
# "screening_date" duplicates "compas_screening_date", so it is removed.
df.drop(columns=["screening_date"], inplace=True)

Remove irrelevant features

In [49]:
# Remove the features associated with violent recidivism.
df.drop(
    columns=[
        "vr_offense_date",
        "violent_recid",
        "is_violent_recid",
        "vr_charge_degree",
        "vr_charge_desc",
        "v_type_of_assessment",
        "v_score_text",
        "v_decile_score",
    ],
    inplace=True,
)

In [50]:
# Another violent-recidivism feature: the associated case number.
df.drop(columns=["vr_case_number"], inplace=True)

In [51]:
# Identifier-like features with unique values; they are not useful for prediction.
df.drop(columns=["r_case_number", "c_case_number", "id"], inplace=True)

In [52]:
# Constant feature (always "Risk of Recidivism"), so it is removed.
df.drop(columns=["type_of_assessment"], inplace=True)

Drop features that have less than 20% non-NA values.

In [53]:
# Keep only the columns in which at least 20% of the rows are non-missing.
# `thresh` is documented as an integer count of required non-NA values, so the
# fractional row count is rounded up; this selects exactly the same columns as
# comparing the (integer) non-NA counts against the unrounded float.
min_non_na = int(np.ceil(df.shape[0] * 0.2))
df.dropna(inplace=True, axis=1, thresh=min_non_na)

Remove features that we are not going to use for prediction.

In [54]:
# Columns that will not be used for prediction.
df.drop(
    columns=["start", "end", "event", "name", "last", "first"],
    inplace=True,
)

In [55]:
# Recompute the missing-value table on the filtered frame, so it can be compared
# with the "before preprocessing" table shown earlier.
missing_stat_after = calculate_nan_percentage_of_grouped_features(df, "data/compas-scores-two-years/features.yaml")
print("Missing values after preprocessing:")
display(missing_stat_after)

Missing values after preprocessing:


Unnamed: 0,Category,Feature,Missing_Count,Missing_Percentage
21,Covariate,r_offense_date,3182,51.622323
14,Covariate,c_offense_date,781,12.670344
2,Covariate,dob,0,0.0
10,Covariate,priors_count,0,0.0
9,Covariate,juv_other_count,0,0.0
8,Covariate,juv_misd_count,0,0.0
6,Covariate,juv_fel_count,0,0.0
15,Covariate,c_days_from_compas,0,0.0
0,Other,compas_screening_date,0,0.0
1,Sensitive,sex,0,0.0


References mentioned below:
- [1] [Notebook](https://github.com/fair-preprocessing/nips2017/blob/master/compas/code/Generate_Compas_Data.ipynb) supplementing [Optimized Data Pre-Processing for Discrimination Prevention](https://arxiv.org/abs/1704.03354) *NeurIPS 2017, 1084 citations by now*  
- [2] Data analysis by [ProPublica](https://github.com/propublica/compas-analysis/blob/master/Compas%20Analysis.ipynb) (authors of the dataset)

In [56]:
# Working copy restricted to the columns needed for feature engineering; the
# copy keeps later assignments on `df1` from touching `df`.
df1 = df.loc[
    :,
    [
        'age', 'c_charge_degree', 'race', 'age_cat', 'score_text', 'sex',
        'priors_count', 'days_b_screening_arrest', 'decile_score', 'is_recid',
        'two_year_recid', 'c_jail_in', 'c_jail_out', "c_offense_date", "dob",
        "juv_other_count", "juv_misd_count", "juv_fel_count",
    ],
].copy()

In [57]:
# Length of the jail stay for the current charge in whole days: jail-out
# timestamp minus jail-in timestamp.  The vectorised `.dt.days` accessor
# replaces the per-element Python lambda.
df1["c_length_of_stay"] = (
    pd.to_datetime(df1["c_jail_out"]) - pd.to_datetime(df1["c_jail_in"])
).dt.days

Add the age at the time the current crime was committed, since `df["age"]` is sometimes inconsistent.

In [58]:
# Age (in years) when the current crime was committed: days between date of
# birth and offense date, divided by 365.25 and rounded to a whole number.
# NOTE(review): rounding to the nearest year (rather than truncating) records
# e.g. 24.6 years as 25 — confirm this is intended.
offense_dates = pd.to_datetime(df["c_offense_date"])
birth_dates = pd.to_datetime(df["dob"])
df1["c_age"] = ((offense_dates - birth_dates).dt.days / 365.25).round(0)
# "c_offense_date" has missing values; those rows fall back to the "age" column.
df1["c_age"] = df1["c_age"].fillna(df["age"]).astype(int)

In [59]:
def categorize_age(age):
    """Bucket an age in years into an ordinal code.

    Returns 0 for ages below 25, 1 for ages from 25 to 45 inclusive, and 2 for
    anything else (ages above 45, and values such as NaN that fail both tests).
    """
    if age < 25:
        return 0
    return 1 if 25 <= age <= 45 else 2

# Ordinal age bucket derived from the age at the time of the current offense.
df1["c_age_cat"] = df1["c_age"].map(categorize_age)

In `race`, [1] keeps only African-American and Caucasian and drops the others, while [2] keeps all races. We keep African-American and Caucasian and group the rest into an "Other" category.

In [60]:
def group_races(race):
    """Return `race` unchanged for the two retained groups, 'Other' for anything else."""
    retained = ('African-American', 'Caucasian')
    return race if race in retained else 'Other'

# Collapse every race other than African-American / Caucasian into "Other".
df1["race"] = df["race"].map(group_races)

Remove features not useful for training.

In [61]:
# Raw columns that were only needed to derive other features (or are replaced
# by the derived `c_age` / `c_age_cat` / `c_length_of_stay`) are removed here.
df1.drop(
    columns=[
        "age",
        "age_cat",
        "c_jail_in",
        "c_jail_out",
        "c_offense_date",
        "dob",
        "days_b_screening_arrest",
    ],
    inplace=True,
)

In [62]:
# Shape and first rows of the working frame after feature engineering.
display_dataframe_info(df1)

DataFrame shape: (6164, 14) (rows, columns)
--------------------------------------------------
First 5 rows of the DataFrame:


Unnamed: 0,c_charge_degree,race,score_text,sex,priors_count,decile_score,is_recid,two_year_recid,juv_other_count,juv_misd_count,juv_fel_count,c_length_of_stay,c_age,c_age_cat
0,F,Other,Low,Male,0,1,0,0,0,0,0,0,66,2
1,F,African-American,Low,Male,0,3,1,1,0,0,0,10,31,1
2,F,African-American,Low,Male,4,4,1,1,1,0,0,1,22,0
5,M,Other,Low,Male,0,1,0,0,0,0,0,1,42,1
6,F,Caucasian,Medium,Male,14,6,1,1,0,0,0,6,40,1


### Process categorical features

In [63]:
# Names of the categorical columns that the following cells encode.
cat_features_df1 = [
    "c_charge_degree",
    "race",
    "score_text",
    "sex",
]

**Race**  
Race has three categories. To avoid imposing any order, use One-Hot Encoding.

In [64]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode `race`.  "Other" is the dropped level, so a row with both
# indicator columns equal to 0 represents the "Other" category.
race_levels = ["African-American", "Caucasian", "Other"]
ohe = OneHotEncoder(categories=[race_levels], drop=['Other'], sparse_output=False)
race_encoded = ohe.fit_transform(df1[["race"]]).astype(int)

# Re-attach the indicators under the original row index, replacing the text column.
race_df1 = pd.DataFrame(
    race_encoded,
    index=df1.index,
    columns=['race_African-American', 'race_Caucasian'],
)
df1 = pd.concat([df1.drop(columns="race"), race_df1], axis=1)

**COMPAS score**

Both [1] and [2] make the `score_text` feature binary; we'll do the same. Note that this feature is derived from the `decile_score` variable.

In [65]:
# Binarise the COMPAS score text: 0 for "Low", 1 for every other value
# (i.e. the merged "MediumHigh" class); the text column is then discarded.
df1["score_cat"] = (df1["score_text"] != 'Low').astype(int)
df1.drop(columns="score_text", inplace=True)

**Sex and `c_charge_degree`**   
Encode into binary.

In [66]:
# Binary encoding of sex: 1 for 'Male', 0 otherwise.
df1["sex"] = df1["sex"].eq('Male').astype(int)

In [67]:
# Binary encoding of the charge degree: 1 for "F", 0 otherwise.
df1["c_charge_degree"] = df1["c_charge_degree"].eq("F").astype(int)

### Process numerical variables
Apply min-max scaling to numerical features.

In [68]:
from sklearn.preprocessing import MinMaxScaler

# Numerical features that are rescaled to the [0, 1] range.
columns = [
    "priors_count",
    "decile_score",
    "juv_other_count",
    "juv_misd_count",
    "juv_fel_count",
    "c_length_of_stay",
    "c_age",
]

# NOTE(review): the scaler is fitted on the whole dataset; if a train/test split
# is made downstream, consider fitting on the training part only.
minmax_scaler = MinMaxScaler()
minmax_scaler.fit(df1[columns])
df1[columns] = minmax_scaler.transform(df1[columns])

### Final steps

In [69]:
# Shape and first rows of the frame once encoding and scaling are done.
print("Dataset after preprocessing \n")
display_dataframe_info(df1)

Dataset after preprocessing 

DataFrame shape: (6164, 15) (rows, columns)
--------------------------------------------------
First 5 rows of the DataFrame:


Unnamed: 0,c_charge_degree,sex,priors_count,decile_score,is_recid,two_year_recid,juv_other_count,juv_misd_count,juv_fel_count,c_length_of_stay,c_age,c_age_cat,race_African-American,race_Caucasian,score_cat
0,1,1,0.0,0.0,0,0,0.0,0.0,0.0,0.00125,0.64557,2,0,0,0
1,1,1,0.0,0.222222,1,1,0.0,0.0,0.0,0.01375,0.202532,1,1,0,0
2,1,1,0.105263,0.333333,1,1,0.111111,0.0,0.0,0.0025,0.088608,0,1,0,0
5,0,1,0.0,0.0,0,0,0.0,0.0,0.0,0.0025,0.341772,1,0,0,0
6,1,1,0.368421,0.555556,1,1,0.0,0.0,0.0,0.00875,0.316456,1,0,1,1


Before exporting final dataset, group features to sensitive (s1, s2, ...), covariate (x1, x2, ...), treatment (z1, z2, ...), and target (y1, y2, ...).

In [70]:
# Assignment of the final columns to the four feature groups.
# Features excluded:
#   - c_age: c_age_cat is used instead
#   - score_cat: decile_score is used instead
features_dict = dict(
    sensitive=["sex", "c_age_cat", "race_African-American", "race_Caucasian"],
    covariate=["priors_count", "juv_other_count", "juv_misd_count", "juv_fel_count"],
    treatment=["c_charge_degree", "decile_score", "c_length_of_stay"],
    target=["is_recid", "two_year_recid"],
)

In [71]:
from utils import group_and_rename_columns

# Keep the columns listed in `features_dict` and rename them by group
# (the output below shows s1..s4, x1..x4, z1..z3, y1..y2).
df1_final = group_and_rename_columns(df1, features_dict)

In [72]:
# Shape and first rows of the grouped and renamed dataset that gets exported.
print("Final dataset \n")
display_dataframe_info(df1_final)

Final dataset 

DataFrame shape: (6164, 13) (rows, columns)
--------------------------------------------------
First 5 rows of the DataFrame:


Unnamed: 0,s1,s2,s3,s4,x1,x2,x3,x4,z1,z2,z3,y1,y2
0,1,2,0,0,0.0,0.0,0.0,0.0,1,0.0,0.00125,0,0
1,1,1,1,0,0.0,0.0,0.0,0.0,1,0.222222,0.01375,1,1
2,1,0,1,0,0.105263,0.111111,0.0,0.0,1,0.333333,0.0025,1,1
5,1,1,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0025,0,0
6,1,1,0,1,0.368421,0.0,0.0,0.0,1,0.555556,0.00875,1,1


In [73]:
# Write the final dataset to CSV; the row index is not written.
df1_final.to_csv(path_or_buf="data/compas-scores-two-years/data_preprocessed.csv", index=False)