# COMPAS data preprocessing - mk1
#### 17 May 2022

## Import Dependencies

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Let wide/long frames render fully (up to 250 columns and 250 rows).
pd.set_option("display.max_columns", 250)
pd.set_option("display.max_rows", 250)

### Versions

In [4]:
# Record the library versions this notebook was executed with.
print("*** Versions ***")
print(f"numpy:\t\t {np.__version__}")
print(f"pandas:\t\t {pd.__version__}")

*** Versions ***
numpy:		 1.21.6
pandas:		 1.4.1


## Custom Functions

#### Save to persistent storage

In [5]:
def custom_save(name, data, kind=1):
    '''
    name : string
        designated filename; may include a directory component
    data : data or pytorch model
        the data to save; when kind is not 1 it must provide a
        to_parquet(path) method (e.g. a pandas DataFrame)
    kind : int
        sentinel value - 1 if pytorch model, 0 otherwise

    custom_save stores the data passed into the function into a file with the provided name.
    An existing file is never overwritten: when the name is already taken, a
    "_v1", "_v2", ... suffix is placed before the extension until a free name
    is found. Each rename and the final save are reported with print().
    '''

    # Split on the real extension so names that do not end in ".pth"/".parquet"
    # are still versioned instead of failing.
    root, ext = os.path.splitext(name)

    # Only a trailing "_v<digits>" is treated as a version tag; a "_v" that is
    # part of the stem itself (e.g. "model_values") is left untouched.
    head, sep, tail = root.rpartition("_v")
    if sep and tail.isdigit():
        root = head

    version = 1
    # os.path.exists honours directory components, unlike a listing of the
    # current working directory.
    while os.path.exists(name):
        print(f"{name} already exists.", end=" ")
        name = f"{root}_v{version}{ext}"
        version += 1
        print(f"Changing file name to: {name}")

    if kind == 1:
        # Imported lazily: the notebook never imports torch at the top level,
        # and parquet-only use should not require it to be installed.
        import torch
        torch.save(data, name)
    else:
        data.to_parquet(name)
    print(f"{name} has been saved.")

## Import Data

In [6]:
# Raw COMPAS scores file, resolved relative to the working directory.
name = "compas-scores-two-years.csv"
# Read the whole CSV with pandas' default parsing options.
data = pd.read_csv(filepath_or_buffer=name)

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7214 entries, 0 to 7213
Data columns (total 53 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       7214 non-null   int64  
 1   name                     7214 non-null   object 
 2   first                    7214 non-null   object 
 3   last                     7214 non-null   object 
 4   compas_screening_date    7214 non-null   object 
 5   sex                      7214 non-null   object 
 6   dob                      7214 non-null   object 
 7   age                      7214 non-null   int64  
 8   age_cat                  7214 non-null   object 
 9   race                     7214 non-null   object 
 10  juv_fel_count            7214 non-null   int64  
 11  decile_score             7214 non-null   int64  
 12  juv_misd_count           7214 non-null   int64  
 13  juv_other_count          7214 non-null   int64  
 14  priors_count            

In [8]:
data.shape

(7214, 53)

In [9]:
data.head()

Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,juv_fel_count,decile_score,juv_misd_count,juv_other_count,priors_count,days_b_screening_arrest,c_jail_in,c_jail_out,c_case_number,c_offense_date,c_arrest_date,c_days_from_compas,c_charge_degree,c_charge_desc,is_recid,r_case_number,r_charge_degree,r_days_from_arrest,r_offense_date,r_charge_desc,r_jail_in,r_jail_out,violent_recid,is_violent_recid,vr_case_number,vr_charge_degree,vr_offense_date,vr_charge_desc,type_of_assessment,decile_score.1,score_text,screening_date,v_type_of_assessment,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,start,end,event,two_year_recid
0,1,miguel hernandez,miguel,hernandez,2013-08-14,Male,1947-04-18,69,Greater than 45,Other,0,1,0,0,0,-1.0,2013-08-13 06:03:42,2013-08-14 05:41:20,13011352CF10A,2013-08-13,,1.0,F,Aggravated Assault w/Firearm,0,,,,,,,,,0,,,,,Risk of Recidivism,1,Low,2013-08-14,Risk of Violence,1,Low,2013-08-14,2014-07-07,2014-07-14,0,0,327,0,0
1,3,kevon dixon,kevon,dixon,2013-01-27,Male,1982-01-22,34,25 - 45,African-American,0,3,0,0,0,-1.0,2013-01-26 03:45:27,2013-02-05 05:36:53,13001275CF10A,2013-01-26,,1.0,F,Felony Battery w/Prior Convict,1,13009779CF10A,(F3),,2013-07-05,Felony Battery (Dom Strang),,,,1,13009779CF10A,(F3),2013-07-05,Felony Battery (Dom Strang),Risk of Recidivism,3,Low,2013-01-27,Risk of Violence,1,Low,2013-01-27,2013-01-26,2013-02-05,0,9,159,1,1
2,4,ed philo,ed,philo,2013-04-14,Male,1991-05-14,24,Less than 25,African-American,0,4,0,1,4,-1.0,2013-04-13 04:58:34,2013-04-14 07:02:04,13005330CF10A,2013-04-13,,1.0,F,Possession of Cocaine,1,13011511MM10A,(M1),0.0,2013-06-16,Driving Under The Influence,2013-06-16,2013-06-16,,0,,,,,Risk of Recidivism,4,Low,2013-04-14,Risk of Violence,3,Low,2013-04-14,2013-06-16,2013-06-16,4,0,63,0,1
3,5,marcu brown,marcu,brown,2013-01-13,Male,1993-01-21,23,Less than 25,African-American,0,8,1,0,1,,,,13000570CF10A,2013-01-12,,1.0,F,Possession of Cannabis,0,,,,,,,,,0,,,,,Risk of Recidivism,8,High,2013-01-13,Risk of Violence,6,Medium,2013-01-13,,,1,0,1174,0,0
4,6,bouthy pierrelouis,bouthy,pierrelouis,2013-03-26,Male,1973-01-22,43,25 - 45,Other,0,1,0,0,2,,,,12014130CF10A,,2013-01-09,76.0,F,arrest case no charge,0,,,,,,,,,0,,,,,Risk of Recidivism,1,Low,2013-03-26,Risk of Violence,1,Low,2013-03-26,,,2,0,1102,0,0


#### Check for null data

In [10]:
data.isnull().sum()

id                            0
name                          0
first                         0
last                          0
compas_screening_date         0
sex                           0
dob                           0
age                           0
age_cat                       0
race                          0
juv_fel_count                 0
decile_score                  0
juv_misd_count                0
juv_other_count               0
priors_count                  0
days_b_screening_arrest     307
c_jail_in                   307
c_jail_out                  307
c_case_number                22
c_offense_date             1159
c_arrest_date              6077
c_days_from_compas           22
c_charge_degree               0
c_charge_desc                29
is_recid                      0
r_case_number              3743
r_charge_degree            3743
r_days_from_arrest         4898
r_offense_date             3743
r_charge_desc              3801
r_jail_in                  4898
r_jail_o

In [11]:
# Number of rows in the raw table; used below as the denominator for ratios.
size = len(data)

#### If the feature has more than 10% missing or null values, drop the column.

In [12]:
data.notna().sum()

id                         7214
name                       7214
first                      7214
last                       7214
compas_screening_date      7214
sex                        7214
dob                        7214
age                        7214
age_cat                    7214
race                       7214
juv_fel_count              7214
decile_score               7214
juv_misd_count             7214
juv_other_count            7214
priors_count               7214
days_b_screening_arrest    6907
c_jail_in                  6907
c_jail_out                 6907
c_case_number              7192
c_offense_date             6055
c_arrest_date              1137
c_days_from_compas         7192
c_charge_degree            7214
c_charge_desc              7185
is_recid                   7214
r_case_number              3471
r_charge_degree            3471
r_days_from_arrest         2316
r_offense_date             3471
r_charge_desc              3413
r_jail_in                  2316
r_jail_o

In [13]:
# Minimum number of non-null entries (90% of the rows) a column must have to be kept.
min_non_null = np.round(0.9 * size)
threshhold = int(min_non_null)
threshhold

6493

In [14]:
# Keep only the columns that have at least `threshhold` non-null values.
temp = data.dropna(axis="columns", thresh=threshhold)

In [15]:
temp.isnull().sum()

id                           0
name                         0
first                        0
last                         0
compas_screening_date        0
sex                          0
dob                          0
age                          0
age_cat                      0
race                         0
juv_fel_count                0
decile_score                 0
juv_misd_count               0
juv_other_count              0
priors_count                 0
days_b_screening_arrest    307
c_jail_in                  307
c_jail_out                 307
c_case_number               22
c_days_from_compas          22
c_charge_degree              0
c_charge_desc               29
is_recid                     0
is_violent_recid             0
type_of_assessment           0
decile_score.1               0
score_text                   0
screening_date               0
v_type_of_assessment         0
v_decile_score               0
v_score_text                 0
v_screening_date             0
in_custo

In [16]:
temp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7214 entries, 0 to 7213
Data columns (total 39 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       7214 non-null   int64  
 1   name                     7214 non-null   object 
 2   first                    7214 non-null   object 
 3   last                     7214 non-null   object 
 4   compas_screening_date    7214 non-null   object 
 5   sex                      7214 non-null   object 
 6   dob                      7214 non-null   object 
 7   age                      7214 non-null   int64  
 8   age_cat                  7214 non-null   object 
 9   race                     7214 non-null   object 
 10  juv_fel_count            7214 non-null   int64  
 11  decile_score             7214 non-null   int64  
 12  juv_misd_count           7214 non-null   int64  
 13  juv_other_count          7214 non-null   int64  
 14  priors_count            

#### Inspect again

In [17]:
temp.head()

Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,juv_fel_count,decile_score,juv_misd_count,juv_other_count,priors_count,days_b_screening_arrest,c_jail_in,c_jail_out,c_case_number,c_days_from_compas,c_charge_degree,c_charge_desc,is_recid,is_violent_recid,type_of_assessment,decile_score.1,score_text,screening_date,v_type_of_assessment,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,start,end,event,two_year_recid
0,1,miguel hernandez,miguel,hernandez,2013-08-14,Male,1947-04-18,69,Greater than 45,Other,0,1,0,0,0,-1.0,2013-08-13 06:03:42,2013-08-14 05:41:20,13011352CF10A,1.0,F,Aggravated Assault w/Firearm,0,0,Risk of Recidivism,1,Low,2013-08-14,Risk of Violence,1,Low,2013-08-14,2014-07-07,2014-07-14,0,0,327,0,0
1,3,kevon dixon,kevon,dixon,2013-01-27,Male,1982-01-22,34,25 - 45,African-American,0,3,0,0,0,-1.0,2013-01-26 03:45:27,2013-02-05 05:36:53,13001275CF10A,1.0,F,Felony Battery w/Prior Convict,1,1,Risk of Recidivism,3,Low,2013-01-27,Risk of Violence,1,Low,2013-01-27,2013-01-26,2013-02-05,0,9,159,1,1
2,4,ed philo,ed,philo,2013-04-14,Male,1991-05-14,24,Less than 25,African-American,0,4,0,1,4,-1.0,2013-04-13 04:58:34,2013-04-14 07:02:04,13005330CF10A,1.0,F,Possession of Cocaine,1,0,Risk of Recidivism,4,Low,2013-04-14,Risk of Violence,3,Low,2013-04-14,2013-06-16,2013-06-16,4,0,63,0,1
3,5,marcu brown,marcu,brown,2013-01-13,Male,1993-01-21,23,Less than 25,African-American,0,8,1,0,1,,,,13000570CF10A,1.0,F,Possession of Cannabis,0,0,Risk of Recidivism,8,High,2013-01-13,Risk of Violence,6,Medium,2013-01-13,,,1,0,1174,0,0
4,6,bouthy pierrelouis,bouthy,pierrelouis,2013-03-26,Male,1973-01-22,43,25 - 45,Other,0,1,0,0,2,,,,12014130CF10A,76.0,F,arrest case no charge,0,0,Risk of Recidivism,1,Low,2013-03-26,Risk of Violence,1,Low,2013-03-26,,,2,0,1102,0,0


#### Eliminate text features with too many distinct values (unique count above 10% of the rows) or too few (i.e., a single value).

In [18]:
# Names of the columns stored with the generic `object` dtype (text-like data).
obj_cols = temp.select_dtypes(include="object").columns
obj_cols

Index(['name', 'first', 'last', 'compas_screening_date', 'sex', 'dob',
       'age_cat', 'race', 'c_jail_in', 'c_jail_out', 'c_case_number',
       'c_charge_degree', 'c_charge_desc', 'type_of_assessment', 'score_text',
       'screening_date', 'v_type_of_assessment', 'v_score_text',
       'v_screening_date', 'in_custody', 'out_custody'],
      dtype='object')

In [19]:
# Largest allowed ratio of distinct values to rows for an object column.
# It has its own name (and is set once, outside the loop) so that the
# non-null count held in `threshhold` for the dropna step is not overwritten,
# which would break a re-run of that step.
max_unique_ratio = .1

for_elimination = []

for col in obj_cols:
    # Share of rows that carry a distinct value in this column.
    val = temp[col].nunique() / size
    if val > max_unique_ratio:
        print(f"{col:23s}: {val:0.2f}")
        for_elimination.append(col)

name                   : 0.99
first                  : 0.39
last                   : 0.55
dob                    : 0.76
c_jail_in              : 0.96
c_jail_out             : 0.95
c_case_number          : 1.00
in_custody             : 0.16
out_custody            : 0.16


In [20]:
# Remove the high-cardinality object columns collected in `for_elimination`.
temp = temp.drop(columns=for_elimination)

#### Inspect  data

In [21]:
temp.head()

Unnamed: 0,id,compas_screening_date,sex,age,age_cat,race,juv_fel_count,decile_score,juv_misd_count,juv_other_count,priors_count,days_b_screening_arrest,c_days_from_compas,c_charge_degree,c_charge_desc,is_recid,is_violent_recid,type_of_assessment,decile_score.1,score_text,screening_date,v_type_of_assessment,v_decile_score,v_score_text,v_screening_date,priors_count.1,start,end,event,two_year_recid
0,1,2013-08-14,Male,69,Greater than 45,Other,0,1,0,0,0,-1.0,1.0,F,Aggravated Assault w/Firearm,0,0,Risk of Recidivism,1,Low,2013-08-14,Risk of Violence,1,Low,2013-08-14,0,0,327,0,0
1,3,2013-01-27,Male,34,25 - 45,African-American,0,3,0,0,0,-1.0,1.0,F,Felony Battery w/Prior Convict,1,1,Risk of Recidivism,3,Low,2013-01-27,Risk of Violence,1,Low,2013-01-27,0,9,159,1,1
2,4,2013-04-14,Male,24,Less than 25,African-American,0,4,0,1,4,-1.0,1.0,F,Possession of Cocaine,1,0,Risk of Recidivism,4,Low,2013-04-14,Risk of Violence,3,Low,2013-04-14,4,0,63,0,1
3,5,2013-01-13,Male,23,Less than 25,African-American,0,8,1,0,1,,1.0,F,Possession of Cannabis,0,0,Risk of Recidivism,8,High,2013-01-13,Risk of Violence,6,Medium,2013-01-13,1,0,1174,0,0
4,6,2013-03-26,Male,43,25 - 45,Other,0,1,0,0,2,,76.0,F,arrest case no charge,0,0,Risk of Recidivism,1,Low,2013-03-26,Risk of Violence,1,Low,2013-03-26,2,0,1102,0,0


In [22]:
# One line per column: dtype, name and distinct-value count; columns with
# fewer than 11 distinct values also list those values.
templist = []
for col in temp.columns:
    series = temp[col]
    n_unique = series.nunique()
    line = f"{series.dtypes}\t{col:23s}: {n_unique}"
    if n_unique < 11:
        templist = list(series.unique())
        line += f"\t{templist}"
    print(line)

int64	id                     : 7214
object	compas_screening_date  : 690
object	sex                    : 2	['Male', 'Female']
int64	age                    : 65
object	age_cat                : 3	['Greater than 45', '25 - 45', 'Less than 25']
object	race                   : 6	['Other', 'African-American', 'Caucasian', 'Hispanic', 'Native American', 'Asian']
int64	juv_fel_count          : 11
int64	decile_score           : 10	[1, 3, 4, 8, 6, 10, 5, 9, 2, 7]
int64	juv_misd_count         : 10	[0, 1, 6, 12, 2, 4, 3, 8, 5, 13]
int64	juv_other_count        : 10	[0, 1, 2, 3, 4, 17, 9, 5, 6, 7]
int64	priors_count           : 37
float64	days_b_screening_arrest: 423
float64	c_days_from_compas     : 499
object	c_charge_degree        : 2	['F', 'M']
object	c_charge_desc          : 437
int64	is_recid               : 2	[0, 1]
int64	is_violent_recid       : 2	[0, 1]
object	type_of_assessment     : 1	['Risk of Recidivism']
int64	decile_score.1         : 10	[1, 3, 4, 8, 6, 10, 5, 9, 2, 7]
object	score_text 

In [23]:
# Element-wise equality check. Comparing the `.sum()` of two string columns
# only compares their concatenations, which can match even when individual
# entries differ.
temp["compas_screening_date"].equals(temp["screening_date"])

True

In [24]:
# Element-wise equality check. Comparing the `.sum()` of two string columns
# only compares their concatenations, which can match even when individual
# entries differ.
temp["v_screening_date"].equals(temp["screening_date"])

True

In [25]:
# Columns removed by hand after inspecting the summary above.
dead_features = [
    "id",                     # serial count of the sample
    "compas_screening_date",  # screening dates: not needed / duplicated
    "screening_date",
    "v_screening_date",
    "c_charge_desc",          # many non-mutually-exclusive categories
    "type_of_assessment",     # single value
    "v_type_of_assessment",   # single value
]

#### Features dropped:
1. id: serial count of sample
2. name: mostly unique values
3. first: mostly unique values
4. last: mostly unique values
5. dob: captured in age; redundant
6. c_jail_in
7. c_jail_out
8. c_case_number: unique per sample
9. in_custody
10. out_custody
11. compas_screening_date: unnecessary
12. screening_date: redundancy
13. v_screening_date: redundancy
14. c_charge_desc: abundance of categorical, non-mutually exclusive values
15. type_of_assessment: single value
16. v_type_of_assessment: single value

In [26]:
# New frame without the manually selected dead features.
temp2 = temp.drop(columns=dead_features)

In [27]:
temp2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7214 entries, 0 to 7213
Data columns (total 23 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   sex                      7214 non-null   object 
 1   age                      7214 non-null   int64  
 2   age_cat                  7214 non-null   object 
 3   race                     7214 non-null   object 
 4   juv_fel_count            7214 non-null   int64  
 5   decile_score             7214 non-null   int64  
 6   juv_misd_count           7214 non-null   int64  
 7   juv_other_count          7214 non-null   int64  
 8   priors_count             7214 non-null   int64  
 9   days_b_screening_arrest  6907 non-null   float64
 10  c_days_from_compas       7192 non-null   float64
 11  c_charge_degree          7214 non-null   object 
 12  is_recid                 7214 non-null   int64  
 13  is_violent_recid         7214 non-null   int64  
 14  decile_score.1          

In [28]:
temp2.isnull().sum()

sex                          0
age                          0
age_cat                      0
race                         0
juv_fel_count                0
decile_score                 0
juv_misd_count               0
juv_other_count              0
priors_count                 0
days_b_screening_arrest    307
c_days_from_compas          22
c_charge_degree              0
is_recid                     0
is_violent_recid             0
decile_score.1               0
score_text                   0
v_decile_score               0
v_score_text                 0
priors_count.1               0
start                        0
end                          0
event                        0
two_year_recid               0
dtype: int64

#### Drop remaining missing values for data integrity

In [29]:
# Discard every row that still contains at least one missing value.
temp3 = temp2.dropna(axis=0, how="any")

In [30]:
temp3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6907 entries, 0 to 7213
Data columns (total 23 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   sex                      6907 non-null   object 
 1   age                      6907 non-null   int64  
 2   age_cat                  6907 non-null   object 
 3   race                     6907 non-null   object 
 4   juv_fel_count            6907 non-null   int64  
 5   decile_score             6907 non-null   int64  
 6   juv_misd_count           6907 non-null   int64  
 7   juv_other_count          6907 non-null   int64  
 8   priors_count             6907 non-null   int64  
 9   days_b_screening_arrest  6907 non-null   float64
 10  c_days_from_compas       6907 non-null   float64
 11  c_charge_degree          6907 non-null   object 
 12  is_recid                 6907 non-null   int64  
 13  is_violent_recid         6907 non-null   int64  
 14  decile_score.1          

**Note:** It may seem simpler to have dropped every column with missing values at the outset, but dropping only the sparse columns first and then the remaining incomplete rows preserves as much of the data as possible.

In [31]:
# Print the columns that have at most two distinct values.
unique_counts = temp3.nunique()
for col in temp3.columns:
    if unique_counts[col] > 2:
        continue
    print(col)

sex
c_charge_degree
is_recid
is_violent_recid
event
two_year_recid


In [32]:
# One-hot encode every object column. The indicator dtype is pinned to uint8
# (the pandas 1.x default) so the 0/1 encoding and the saved schema do not
# change to booleans under newer pandas releases.
df = pd.get_dummies(temp3, dtype=np.uint8)

In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6907 entries, 0 to 7213
Data columns (total 36 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   age                      6907 non-null   int64  
 1   juv_fel_count            6907 non-null   int64  
 2   decile_score             6907 non-null   int64  
 3   juv_misd_count           6907 non-null   int64  
 4   juv_other_count          6907 non-null   int64  
 5   priors_count             6907 non-null   int64  
 6   days_b_screening_arrest  6907 non-null   float64
 7   c_days_from_compas       6907 non-null   float64
 8   is_recid                 6907 non-null   int64  
 9   is_violent_recid         6907 non-null   int64  
 10  decile_score.1           6907 non-null   int64  
 11  v_decile_score           6907 non-null   int64  
 12  priors_count.1           6907 non-null   int64  
 13  start                    6907 non-null   int64  
 14  end                     

In [34]:
# For the two binary features keep a single indicator column each:
# drop the complementary dummy and give the kept one the original feature name.
droplist = [
    "sex_Male",
    "c_charge_degree_M",
]
rename_dict = dict(
    sex_Female="sex",
    c_charge_degree_F="c_charge_degree",
)

**Note:**<br>
sex: 1 = Female, 0 = Male;<br>c_charge_degree: 1 = F (felony), 0 = M (misdemeanor)

In [35]:
# Remove the complementary indicator columns named in `droplist`.
df = df.drop(columns=droplist)

In [36]:
# Give the remaining indicator columns their plain feature names.
df = df.rename(columns=rename_dict)

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6907 entries, 0 to 7213
Data columns (total 34 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   age                      6907 non-null   int64  
 1   juv_fel_count            6907 non-null   int64  
 2   decile_score             6907 non-null   int64  
 3   juv_misd_count           6907 non-null   int64  
 4   juv_other_count          6907 non-null   int64  
 5   priors_count             6907 non-null   int64  
 6   days_b_screening_arrest  6907 non-null   float64
 7   c_days_from_compas       6907 non-null   float64
 8   is_recid                 6907 non-null   int64  
 9   is_violent_recid         6907 non-null   int64  
 10  decile_score.1           6907 non-null   int64  
 11  v_decile_score           6907 non-null   int64  
 12  priors_count.1           6907 non-null   int64  
 13  start                    6907 non-null   int64  
 14  end                     

####  Esthetic feature name changes

In [38]:
# Shorter, lowercase names for the age-category and race indicator columns.
# Keys must match the generated dummy names exactly; the "Other" race level
# yields "race_Other" (capital O), so that is the key used here.
rename_dict2 = {"age_cat_25 - 45" : "age_cat_25to45",
               "age_cat_Greater than 45" : "age_cat_over45",
               "age_cat_Less than 25" : "age_cat_under25",
               "race_African-American": "race_black",
               "race_Asian" : "race_asian",
               "race_Caucasian" : "race_white",
               "race_Hispanic" : "race_hispanic",
               "race_Native American": "race_native",
               "race_Other": "race_other"
               }

In [39]:
# Apply the cosmetic column renames.
df = df.rename(columns=rename_dict2)

In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6907 entries, 0 to 7213
Data columns (total 34 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   age                      6907 non-null   int64  
 1   juv_fel_count            6907 non-null   int64  
 2   decile_score             6907 non-null   int64  
 3   juv_misd_count           6907 non-null   int64  
 4   juv_other_count          6907 non-null   int64  
 5   priors_count             6907 non-null   int64  
 6   days_b_screening_arrest  6907 non-null   float64
 7   c_days_from_compas       6907 non-null   float64
 8   is_recid                 6907 non-null   int64  
 9   is_violent_recid         6907 non-null   int64  
 10  decile_score.1           6907 non-null   int64  
 11  v_decile_score           6907 non-null   int64  
 12  priors_count.1           6907 non-null   int64  
 13  start                    6907 non-null   int64  
 14  end                     

In [41]:
# Hold on to the two columns that will be moved to the end of the frame.
event, target = df["event"], df["two_year_recid"]

In [42]:
# Take both columns out so they can be re-appended as the last columns.
df = df.drop(columns=["event", "two_year_recid"])

In [43]:
# Re-attach `event` as the last column.
df = df.assign(event=event)

In [44]:
# Append the label (formerly `two_year_recid`) as the final column, named `target`.
df = df.assign(target=target)

In [45]:
# Persist the processed frame as parquet (kind=0 selects the non-pytorch path).
name = "compas_mk1.parquet"
custom_save(name, df, kind=0)

compas_mk1.parquet already exists. Changing file name to: compas_mk1_v1.parquet
compas_mk1_v1.parquet has been saved.
