In [63]:
%load_ext autoreload
%autoreload 2

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

sns.set_style("darkgrid")
from tqdm import tqdm_notebook as tqdm

### Attributes description(adopted from: https://arxiv.org/pdf/2110.00530.pdf)
| Variable                | Type         | Description                                    |
|-------------------------|--------------|------------------------------------------------|
| sex                     | Binary       | Sex                                            |
| age                     | Numerical    | Age in years                                   |
| race                    | Categorical  | Race                                           |
| juv fel count           | Numerical    | The juvenile felony count                      |
| juv misd count          | Numerical    | The juvenile misdemeanor count                 |
| juv other count         | Numerical    | The juvenile other offenses count              |
| priors count            | Numerical    | The prior offenses count                       |
| c charge degree         | Binary       | Charge degree of original crime                |
| score text              | Categorical  | ProPublica-defined category of decile score    |
| v score text            | Categorical  | ProPublica-defined category of v decile score  |
| two year recid          | Binary       | Whether the defendant is rearrested within two years |
| decile score            | Numerical    | The COMPAS Risk of Recidivism score            |
| days b screening arrest | Numerical    | Days between COMPAS screening and arrest       |
| c jail in               | Categorical  | Jail entry date for original crime             |
| c jail out              | Categorical  | Jail exit date for original crime              |
| c case number           | Categorical  | Case number for original crime                 |
| c offense date          | Categorical  | Offense date of original crime                 |
| c arrest date           | Categorical  | Arrest date for original crime                 |
| c days from compas      | Numerical    | Days between COMPAS screening and offense date |
| c charge desc           | Categorical  | Description of charge for original crime       |
| is recid                | Binary       | Indicator of recidivation                      |
| r charge degree         | Categorical  | Charge degree of follow-up crime               |
| r days from arrest      | Numerical    | Days between follow-up crime and arrest        |
| r offense date          | Categorical  | Date of follow-up crime                        |
| r charge desc           | Categorical  | Description of charge for follow-up crime      |
| r jail in               | Categorical  | Jail entry date for follow-up crime            |
| r jail out              | Categorical  | Jail exit date for follow-up crime             |
| is violent recid        | Binary       | Indicator of violent follow-up crime           |
| vr case number          | Categorical  | Case number for violent follow-up crime        |
| vr charge degree        | Categorical  | Charge degree for violent follow-up crime      |
| vr offense date         | Categorical  | Date of offense for violent follow-up crime    |
| vr charge desc          | Categorical  | Description of charge for violent follow-up    |
| type of assessment      | Categorical  | Type of COMPAS score for decile score          |
| v type of assessment    | Categorical  | Type of COMPAS score for v decile score        |
| v decile score          | Numerical    | COMPAS Risk of Violence score                  |
| v screening date        | Categorical  | Date on which v decile score was given         |
| in custody              | Categorical  | Custody entry date                             |
| out custody             | Categorical  | Custody exit date                              |

The types of charge degrees: F - felony - more serious crimes - heavy penalties, including improsonment for more than one year, M - misdemeanor - less serious crimes - fines or imprisonment for less than one year.  


In [28]:
data = pd.read_csv("../row_data/compas-scores-two-years.csv")

In [19]:
data = data.drop(columns=["sex"])


Index(['id', 'name', 'first', 'last', 'compas_screening_date', 'sex', 'dob',
       'age', 'age_cat', 'race', 'juv_fel_count', 'decile_score',
       'juv_misd_count', 'juv_other_count', 'priors_count',
       'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 'c_case_number',
       'c_offense_date', 'c_arrest_date', 'c_days_from_compas',
       'c_charge_degree', 'c_charge_desc', 'is_recid', 'r_case_number',
       'r_charge_degree', 'r_days_from_arrest', 'r_offense_date',
       'r_charge_desc', 'r_jail_in', 'r_jail_out', 'violent_recid',
       'is_violent_recid', 'vr_case_number', 'vr_charge_degree',
       'vr_offense_date', 'vr_charge_desc', 'type_of_assessment',
       'decile_score.1', 'score_text', 'screening_date',
       'v_type_of_assessment', 'v_decile_score', 'v_score_text',
       'v_screening_date', 'in_custody', 'out_custody', 'priors_count.1',
       'start', 'end', 'event', 'two_year_recid'],
      dtype='object')

### Handling missing values / removing unusable columns

In [20]:
# Remove personal info
data = data.drop(columns=["id", "name", "first", "last", "compas_screening_date", "dob", "r_case_number", "c_case_number",  "vr_case_number"])

In [21]:
# Remove duplicated columns 
data = data.drop(columns=["priors_count.1", "decile_score.1", "screening_date"])

In [22]:
# Remove unknown columns 
data = data.drop(columns=["start", "end", "event"])

In [23]:
# Remove screening data 
data = data.drop(columns=["days_b_screening_arrest", "v_screening_date"])

In [24]:
# Remove charge description
data = data.drop(columns=["c_charge_desc", "r_charge_desc", "vr_charge_desc"])

In [25]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7214 entries, 0 to 7213
Data columns (total 33 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   sex                   7214 non-null   object 
 1   age                   7214 non-null   int64  
 2   age_cat               7214 non-null   object 
 3   race                  7214 non-null   object 
 4   juv_fel_count         7214 non-null   int64  
 5   decile_score          7214 non-null   int64  
 6   juv_misd_count        7214 non-null   int64  
 7   juv_other_count       7214 non-null   int64  
 8   priors_count          7214 non-null   int64  
 9   c_jail_in             6907 non-null   object 
 10  c_jail_out            6907 non-null   object 
 11  c_offense_date        6055 non-null   object 
 12  c_arrest_date         1137 non-null   object 
 13  c_days_from_compas    7192 non-null   float64
 14  c_charge_degree       7214 non-null   object 
 15  is_recid             

In [26]:
# Dates processing 
data["custody_days"] = (pd.to_datetime(data['out_custody']) - pd.to_datetime(data['in_custody'])).dt.days
data["r_jail_days"] = (pd.to_datetime(data['r_jail_out']) - pd.to_datetime(data['r_jail_in'])).dt.days
data["c_jail_days"] = (pd.to_datetime(data['c_jail_out']) - pd.to_datetime(data['c_jail_in'])).dt.days
data = data.drop(columns=["in_custody", "out_custody", "r_jail_in", "r_jail_out", "c_jail_in", "c_jail_out"])

In [27]:
# Processing of missing and < 1 day values
data.loc[data["custody_days"].isna(), "custody_days"] = -1
data["custody_days"] += 1
data.loc[data["r_jail_days"].isna(), "r_jail_days"] = -1
data["r_jail_days"] += 1
data.loc[data["c_jail_days"].isna(), "c_jail_days"] = -1
data["c_jail_days"] += 1

In [28]:
data[[column for column in data.columns if "date" in column]]

Unnamed: 0,c_offense_date,c_arrest_date,r_offense_date,vr_offense_date
0,2013-08-13,,,
1,2013-01-26,,2013-07-05,2013-07-05
2,2013-04-13,,2013-06-16,
3,2013-01-12,,,
4,,2013-01-09,,
...,...,...,...,...
7209,2013-11-22,,,
7210,2014-01-31,,,
7211,2014-01-13,,,
7212,2014-03-08,,,


In [29]:
# Check columns with missing values
data[data.columns[data.isna().any()].tolist()].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7214 entries, 0 to 7213
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   c_offense_date      6055 non-null   object 
 1   c_arrest_date       1137 non-null   object 
 2   c_days_from_compas  7192 non-null   float64
 3   r_charge_degree     3471 non-null   object 
 4   r_days_from_arrest  2316 non-null   float64
 5   r_offense_date      3471 non-null   object 
 6   violent_recid       0 non-null      float64
 7   vr_charge_degree    819 non-null    object 
 8   vr_offense_date     819 non-null    object 
dtypes: float64(3), object(6)
memory usage: 507.4+ KB


In [30]:
# Remove colum with all null values 
data = data.drop(columns=["violent_recid"])

In [31]:
# Dates between crimes processing
data["vr_days_from_arrest"] = (pd.to_datetime(data['vr_offense_date']) - pd.to_datetime(data['c_arrest_date'])).dt.days
data["vr_days_from_offense"] = (pd.to_datetime(data['vr_offense_date']) - pd.to_datetime(data['c_offense_date'])).dt.days
data["r_days_from_offense"] = (pd.to_datetime(data['r_offense_date']) - pd.to_datetime(data['c_offense_date'])).dt.days

data = data.drop(columns=["c_offense_date", "c_arrest_date", "r_offense_date", "vr_offense_date"])


data.loc[data["vr_days_from_arrest"].isna(), "vr_days_from_arrest"] = -1
data["vr_days_from_arrest"] += 1
data.loc[data["vr_days_from_offense"].isna(), "vr_days_from_offense"] = -1
data["vr_days_from_offense"] += 1
data.loc[data["r_days_from_offense"].isna(), "r_days_from_offense"] = -1
data["r_days_from_offense"] += 1
data.loc[data["r_days_from_arrest"].isna(), "r_days_from_arrest"] = -1
data["r_days_from_arrest"] += 1

In [32]:
data[data.columns[data.isna().any()].tolist()].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7214 entries, 0 to 7213
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   c_days_from_compas  7192 non-null   float64
 1   r_charge_degree     3471 non-null   object 
 2   vr_charge_degree    819 non-null    object 
dtypes: float64(1), object(2)
memory usage: 169.2+ KB


In [33]:
# Remove days from compas
data = data.drop(columns=["c_days_from_compas"])

In [34]:
# Remove types of assesment
data = data.drop(columns=["type_of_assessment", "v_type_of_assessment"])

### Columns processing and encoding

In [35]:
print(data[["age_cat"]].value_counts())
age_encoding = {
    "25 - 45": 2, 
    "Greater than 45": 3, 
    "Less than 25": 1
}
data["age"] = data["age_cat"].replace(age_encoding)
data = data.drop(columns=["age_cat"])

age_cat        
25 - 45            4109
Greater than 45    1576
Less than 25       1529
Name: count, dtype: int64


  data["age"] = data["age_cat"].replace(age_encoding)


In [36]:
print(data["race"].value_counts())
data.loc[data["race"] != "African-American", "race"] = 2
data.loc[data["race"] == "African-American", "race"] = 1
print(data["race"].value_counts())

race
African-American    3696
Caucasian           2454
Hispanic             637
Other                377
Asian                 32
Native American       18
Name: count, dtype: int64
race
1    3696
2    3518
Name: count, dtype: int64


In [37]:
data["juv_fel_count"].value_counts()

juv_fel_count
0     6932
1      189
2       51
3       18
4       12
5        5
8        2
6        2
9        1
20       1
10       1
Name: count, dtype: int64

In [38]:
# Binarize juv_fel_count
data.loc[data["juv_fel_count"] > 0, "juv_fel_count"] = 1
data["juv_fel_count"].value_counts()

juv_fel_count
0    6932
1     282
Name: count, dtype: int64

In [39]:
data["juv_other_count"].value_counts()

juv_other_count
0     6691
1      368
2       98
3       33
4       15
5        4
7        2
17       1
9        1
6        1
Name: count, dtype: int64

In [40]:
# Binarize juv_other_count
data.loc[data["juv_other_count"] > 0, "juv_other_count"] = 1
data["juv_other_count"].value_counts()

juv_other_count
0    6691
1     523
Name: count, dtype: int64

In [41]:
data["juv_misd_count"].value_counts()

juv_misd_count
0     6799
1      291
2       72
3       29
4        9
5        5
6        4
8        3
12       1
13       1
Name: count, dtype: int64

In [42]:
# Binarize juv_other_count
data.loc[data["juv_misd_count"] > 0, "juv_misd_count"] = 1
data["juv_misd_count"].value_counts()

juv_misd_count
0    6799
1     415
Name: count, dtype: int64

In [43]:
data[["decile_score", "score_text"]].value_counts()
data = data.drop(columns=["score_text"])

In [44]:
data[["v_decile_score", "v_score_text"]].value_counts()
data = data.drop(columns=["v_score_text"])

In [46]:
# Encode charge type 
print(data[["c_charge_degree"]].value_counts())
c_charge_encoding = {
    "F": 2, 
    "M": 1
}
data["c_charge_degree"] = data["c_charge_degree"].replace(c_charge_encoding)
print(data[["c_charge_degree"]].value_counts())

c_charge_degree
2                  4666
1                  2548
Name: count, dtype: int64
c_charge_degree
2                  4666
1                  2548
Name: count, dtype: int64


In [47]:
# Encode r charge type
print(data[["r_charge_degree"]].value_counts())
r_charge_encoding = {
    '(F3)': 2, '(M1)': 1, '(F2)': 2, '(M2)': 1, '(MO3)': 1,
    '(F1)': 2, '(F6)': 2, '(F7)': 2, '(CO3)': 1, '(F5)': 2
}
data["r_charge_degree"] = data["r_charge_degree"].replace(r_charge_encoding)
data.loc[data["r_charge_degree"].isna(), "r_charge_degree"] = 0
print(data[["r_charge_degree"]].value_counts())

r_charge_degree
(M1)               1201
(M2)               1107
(F3)                892
(F2)                168
(F1)                 51
(MO3)                39
(F7)                  7
(F6)                  3
(CO3)                 2
(F5)                  1
Name: count, dtype: int64
r_charge_degree
0.0                3743
1.0                2349
2.0                1122
Name: count, dtype: int64


  data["r_charge_degree"] = data["r_charge_degree"].replace(r_charge_encoding)


In [48]:
# Encode vr charge type
print(data[["vr_charge_degree"]].value_counts())
data["vr_charge_degree"] = data["vr_charge_degree"].replace(r_charge_encoding)
data.loc[data["vr_charge_degree"].isna(), "vr_charge_degree"] = 0
print(data[["vr_charge_degree"]].value_counts())

vr_charge_degree
(M1)                344
(F3)                228
(F2)                162
(F1)                 38
(M2)                 19
(F7)                 18
(MO3)                 5
(F6)                  4
(F5)                  1
Name: count, dtype: int64
vr_charge_degree
0.0                 6395
2.0                  451
1.0                  368
Name: count, dtype: int64


  data["vr_charge_degree"] = data["vr_charge_degree"].replace(r_charge_encoding)


In [49]:
print(data[["sex"]].value_counts())
gender_encoding = {
    "Male": 1, 
    "Female": 2
}
data["sex"] = data["sex"].replace(gender_encoding)

sex   
Male      5819
Female    1395
Name: count, dtype: int64


  data["sex"] = data["sex"].replace(gender_encoding)


In [50]:
data.to_csv("../processed_data/COMPAS.csv", index=False)

### Measure initial fairness

In [4]:
import sys

sys.path.insert(0, "..")
from fair_metrics.group_fairness import disparity_ratio, social_benefit
from fair_metrics.individual_fairness import equalized_odds

In [31]:
data = pd.read_csv("../processed_data/COMPAS.csv")

In [32]:
data

Unnamed: 0,sex,age,race,juv_fel_count,decile_score,juv_misd_count,juv_other_count,priors_count,c_charge_degree,is_recid,...,is_violent_recid,vr_charge_degree,v_decile_score,two_year_recid,custody_days,r_jail_days,c_jail_days,vr_days_from_arrest,vr_days_from_offense,r_days_from_offense
0,1,3,2,0,1,0,0,0,2,0,...,0,0.0,1,0,8.0,0.0,1.0,0.0,0.0,0.0
1,1,2,1,0,3,0,0,0,2,1,...,1,2.0,1,1,11.0,0.0,11.0,0.0,161.0,161.0
2,1,1,1,0,4,0,1,4,2,1,...,0,0.0,3,1,1.0,1.0,2.0,0.0,0.0,65.0
3,1,1,1,0,8,1,0,1,2,0,...,0,0.0,6,0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,2,2,0,1,0,0,2,2,0,...,0,0.0,1,0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7209,1,1,1,0,7,0,0,0,2,0,...,0,0.0,5,0,3.0,0.0,2.0,0.0,0.0,0.0
7210,1,1,1,0,3,0,0,0,2,0,...,0,0.0,5,0,3.0,0.0,2.0,0.0,0.0,0.0
7211,1,3,2,0,1,0,0,0,2,0,...,0,0.0,1,0,2.0,0.0,2.0,0.0,0.0,0.0
7212,2,2,1,0,2,0,0,3,1,0,...,0,0.0,2,0,2.0,0.0,2.0,0.0,0.0,0.0


In [14]:
data.columns

Index(['sex', 'age', 'race', 'juv_fel_count', 'decile_score', 'juv_misd_count',
       'juv_other_count', 'priors_count', 'c_charge_degree', 'is_recid',
       'r_charge_degree', 'r_days_from_arrest', 'is_violent_recid',
       'vr_charge_degree', 'v_decile_score', 'two_year_recid', 'custody_days',
       'r_jail_days', 'c_jail_days', 'vr_days_from_arrest',
       'vr_days_from_offense', 'r_days_from_offense'],
      dtype='object')

Change "two_year_recid" column with opposite values as in most setups, 1 - desired outcome

In [7]:
print(data["two_year_recid"].value_counts())
data["two_year_recid"] += 1
data["two_year_recid"] %= 2
print(data["two_year_recid"].value_counts())

two_year_recid
0    3963
1    3251
Name: count, dtype: int64
two_year_recid
1    3963
0    3251
Name: count, dtype: int64


In [8]:
disparity_ratio(data, "two_year_recid", "race", 2, 1)

0.7880777164102811

Measurement of social benefit involve the ML prediction algorithm, in our case - Random Forest

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [10]:
train, test = train_test_split(data, test_size=0.15, stratify=data["two_year_recid"])
train.to_csv("../processed_data/COMPAS_train.csv")
test.to_csv("../processed_data/COMPAS_test.csv")

In [11]:
train_X, train_y = train.drop(columns=["two_year_recid"]), train["two_year_recid"]
test_X, test_y = test.drop(columns=["two_year_recid"]), test["two_year_recid"]

In [12]:
# Use default version of classifier from library
# Fix random_state for reproducibility
clf = RandomForestClassifier(random_state=42)
clf.fit(train_X, train_y)

In [13]:
social_benefit(test, "two_year_recid", clf.predict)

0.9824561403508771

In [15]:
data["decile_score"].value_counts()

decile_score
1     1440
2      941
4      769
3      747
5      681
6      641
7      592
8      512
9      508
10     383
Name: count, dtype: int64

In [16]:
equalized_odds(data, "two_year_recid", "race", "decile_score", lambda x: x < 5, 2, 1)

0.9135673049949062

In [None]:
equalized_odds(data, "two_year_recid", "race", "decile_score", lambda x: x < 5, 2, 1)

In [17]:
data["priors_count"].value_counts()

priors_count
0     2150
1     1397
2      840
3      568
4      401
5      334
6      242
7      210
8      187
9      149
10     110
11      98
13      77
12      73
14      56
15      51
16      39
17      36
19      30
18      26
20      23
21      22
22      21
23      16
24      11
25       9
26       7
27       7
28       7
29       5
33       3
30       2
38       2
31       2
36       1
37       1
35       1
Name: count, dtype: int64

In [18]:
equalized_odds(data, "two_year_recid", "race", "priors_count", lambda x: x < 2, 2, 1)

0.8757426557721325

In [20]:
data["c_charge_degree"].value_counts()

c_charge_degree
2    4666
1    2548
Name: count, dtype: int64

In [23]:
equalized_odds(data, "two_year_recid", "race", "c_charge_degree", lambda x: x == 2, 2, 1)

0.7991186741884158