In [1]:
import pandas as pd
import numpy as np

import utils

In [2]:
# Load the raw compas-scores-two-years table into the working frame `df`.
df = pd.read_csv(filepath_or_buffer="data/compas-scores-two-years/data.csv")

**TODO:**
1. In the missing-values statistics, also display features from the "Other" category.
2. How should variables that hold dates be preprocessed?
3. How should missing values in the recidivism features be handled? Do they indicate that there was no recidivism?


## Dataset: compas-scores-two-years


In [66]:
# Project helper: prints the frame's shape and a preview of its first rows.
from utils import display_dataframe_info
display_dataframe_info(df)

DataFrame shape: (7214, 53) (rows, columns)
--------------------------------------------------
First 5 rows of the DataFrame:


Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,...,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,start,end,event,two_year_recid
0,1,miguel hernandez,miguel,hernandez,2013-08-14,Male,1947-04-18,69,Greater than 45,Other,...,1,Low,2013-08-14,2014-07-07,2014-07-14,0,0,327,0,0
1,3,kevon dixon,kevon,dixon,2013-01-27,Male,1982-01-22,34,25 - 45,African-American,...,1,Low,2013-01-27,2013-01-26,2013-02-05,0,9,159,1,1
2,4,ed philo,ed,philo,2013-04-14,Male,1991-05-14,24,Less than 25,African-American,...,3,Low,2013-04-14,2013-06-16,2013-06-16,4,0,63,0,1
3,5,marcu brown,marcu,brown,2013-01-13,Male,1993-01-21,23,Less than 25,African-American,...,6,Medium,2013-01-13,,,1,0,1174,0,0
4,6,bouthy pierrelouis,bouthy,pierrelouis,2013-03-26,Male,1973-01-22,43,25 - 45,Other,...,1,Low,2013-03-26,,,2,0,1102,0,0


### Most impactful treatment features
TODO: make a list of them

### Handle missing values & irrelevant features

In [67]:
from utils import calculate_nan_percentage_of_grouped_features

# Per-feature missing-value counts and percentages on the raw frame.
# The YAML file presumably maps each feature to its category - TODO confirm.
features_spec_path = "data/compas-scores-two-years/features.yaml"
missing_stat = calculate_nan_percentage_of_grouped_features(df, features_spec_path)

In [68]:
# Show the missing-value summary of the raw frame under a plain-text heading.
heading_before = "Missing values before preprocessing:"
print(heading_before)
display(missing_stat)

Missing values before preprocessing:


Unnamed: 0,Category,Feature,Missing_Count,Missing_Percentage
36,Covariate,vr_offense_date,6395,88.647075
28,Covariate,r_offense_date,3743,51.885223
19,Covariate,c_offense_date,1159,16.065983
15,Covariate,days_b_screening_arrest,307,4.255614
21,Covariate,c_days_from_compas,22,0.304963
14,Covariate,priors_count,0,0.0
13,Covariate,juv_other_count,0,0.0
10,Covariate,juv_fel_count,0,0.0
6,Covariate,dob,0,0.0
12,Covariate,juv_misd_count,0,0.0


Remove data entries that are inconsistent according to the [ProPublica notebook](https://github.com/propublica/compas-analysis/blob/master/Compas%20Analysis.ipynb).

In [69]:
# Row-level consistency filter, following the ProPublica COMPAS analysis.
# Rows whose `days_b_screening_arrest` is NaN fail both range comparisons
# and are therefore removed as well.
consistent_rows = (
    (df['days_b_screening_arrest'] <= 30)
    & (df['days_b_screening_arrest'] >= -30)
    & (df['is_recid'] != -1)
    & (df['c_charge_degree'] != 'O')
    # pandas.read_csv parses the literal "N/A" as NaN by default, and
    # NaN != 'N/A' evaluates to True, so the string comparison alone can
    # never exclude such rows; the explicit notna() check does.
    & df['score_text'].notna()
    & (df['score_text'] != 'N/A')
    & (df['end'] > df['start'])
)

# .copy() makes the filtered frame independent of the original, so later
# in-place modifications cannot trigger SettingWithCopyWarning.
df = df[consistent_rows].copy()

Remove duplicated features

In [70]:
# "priors_count.1" duplicates "priors_count".
# Reassign instead of inplace=True: `df` is derived from a filtered slice, and
# in-place drops on such frames can emit SettingWithCopyWarning.
df = df.drop(columns="priors_count.1")

In [71]:
# "decile_score.1" duplicates "decile_score".
# Reassign instead of inplace=True: `df` is derived from a filtered slice, and
# in-place drops on such frames can emit SettingWithCopyWarning.
df = df.drop(columns="decile_score.1")

In [72]:
# "screening_date" duplicates "compas_screening_date".
# Reassign instead of inplace=True: `df` is derived from a filtered slice, and
# in-place drops on such frames can emit SettingWithCopyWarning.
df = df.drop(columns="screening_date")

Remove irrelevant features

In [73]:
# Features associated with violent recidivism, which this notebook treats as
# irrelevant.
violent_recid_columns = [
    "vr_offense_date",
    "violent_recid",
    "is_violent_recid",
    "vr_charge_degree",
    "vr_charge_desc",
    "v_type_of_assessment",
    "v_score_text",
    "v_decile_score",
]

# Reassign instead of inplace=True: `df` is derived from a filtered slice, and
# in-place drops on such frames can emit SettingWithCopyWarning.
df = df.drop(columns=violent_recid_columns)

In [74]:
# "vr_case_number" is another feature associated with violent recidivism.
# Reassign instead of inplace=True: `df` is derived from a filtered slice, and
# in-place drops on such frames can emit SettingWithCopyWarning.
df = df.drop(columns="vr_case_number")

In [75]:
# Identifier-like features with unique values that are not useful for prediction.
identifier_columns = ["r_case_number", "c_case_number", "id"]

# Reassign instead of inplace=True: `df` is derived from a filtered slice, and
# in-place drops on such frames can emit SettingWithCopyWarning.
df = df.drop(columns=identifier_columns)

In [76]:
# "type_of_assessment" is a constant feature with value "Risk of Recidivism".
# Reassign instead of inplace=True: `df` is derived from a filtered slice, and
# in-place drops on such frames can emit SettingWithCopyWarning.
df = df.drop(columns="type_of_assessment")

Drop features (columns) that have fewer than 20% non-NA values.

In [77]:
# Keep only columns in which at least 20% of the rows are non-NA.
# `thresh` is the minimum number of non-NA entries a column needs to survive.
# pandas documents it as an int, so the fractional row count is rounded up;
# this selects exactly the same columns as comparing against the float.
min_non_na_count = int(np.ceil(df.shape[0] * 0.2))

# Reassign instead of inplace=True: `df` is derived from a filtered slice, and
# in-place edits on such frames can emit SettingWithCopyWarning.
df = df.dropna(axis=1, thresh=min_non_na_count)

Remove features that we are not going to use for prediction.

In [80]:
# Features that will not be used for prediction.
unused_columns = ["start", "end", "event", "name", "last", "first"]

# Reassign instead of inplace=True: `df` is derived from a filtered slice, and
# in-place drops on such frames can emit SettingWithCopyWarning.
df = df.drop(columns=unused_columns)

In [82]:
# Recompute the missing-value summary on the cleaned frame so it can be
# compared with the table produced before preprocessing.
features_spec_path = "data/compas-scores-two-years/features.yaml"
missing_stat_after = calculate_nan_percentage_of_grouped_features(df, features_spec_path)

print("Missing values after preprocessing:")
display(missing_stat_after)

Missing values after preprocessing:


Unnamed: 0,Category,Feature,Missing_Count,Missing_Percentage
21,Covariate,r_offense_date,3182,51.622323
14,Covariate,c_offense_date,781,12.670344
2,Covariate,dob,0,0.0
11,Covariate,days_b_screening_arrest,0,0.0
10,Covariate,priors_count,0,0.0
9,Covariate,juv_other_count,0,0.0
8,Covariate,juv_misd_count,0,0.0
6,Covariate,juv_fel_count,0,0.0
15,Covariate,c_days_from_compas,0,0.0
0,Other,compas_screening_date,0,0.0


### Feature engineering

In [None]:
# TODO:
# age when current crime was committed
# how long in jail for current and recidivism crimes
# how long in custody


### Process categorical features

In [None]:
# TODO

### Process numerical variables

In [None]:
# TODO