### Read In the Data

In [42]:
import pandas as pd

# Load the COMPAS dataset
data = pd.read_csv('compas-scores-two-years.csv')

# Display first few rows
data.head()



Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,...,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,start,end,event,two_year_recid
0,1,miguel hernandez,miguel,hernandez,2013-08-14,Male,1947-04-18,69,Greater than 45,Other,...,1,Low,2013-08-14,2014-07-07,2014-07-14,0,0,327,0,0
1,3,kevon dixon,kevon,dixon,2013-01-27,Male,1982-01-22,34,25 - 45,African-American,...,1,Low,2013-01-27,2013-01-26,2013-02-05,0,9,159,1,1
2,4,ed philo,ed,philo,2013-04-14,Male,1991-05-14,24,Less than 25,African-American,...,3,Low,2013-04-14,2013-06-16,2013-06-16,4,0,63,0,1
3,5,marcu brown,marcu,brown,2013-01-13,Male,1993-01-21,23,Less than 25,African-American,...,6,Medium,2013-01-13,,,1,0,1174,0,0
4,6,bouthy pierrelouis,bouthy,pierrelouis,2013-03-26,Male,1973-01-22,43,25 - 45,Other,...,1,Low,2013-03-26,,,2,0,1102,0,0


In [108]:
# Check dataset structure
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7214 entries, 0 to 7213
Data columns (total 53 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       7214 non-null   int64  
 1   name                     7214 non-null   object 
 2   first                    7214 non-null   object 
 3   last                     7214 non-null   object 
 4   compas_screening_date    7214 non-null   object 
 5   sex                      7214 non-null   object 
 6   dob                      7214 non-null   object 
 7   age                      7214 non-null   int64  
 8   age_cat                  7214 non-null   object 
 9   race                     7214 non-null   object 
 10  juv_fel_count            7214 non-null   int64  
 11  decile_score             7214 non-null   int64  
 12  juv_misd_count           7214 non-null   int64  
 13  juv_other_count          7214 non-null   int64  
 14  priors_count            

### Preprocess the Data

In [146]:
data_processed = data
# Check for columns with a large percentage of missing values
missing_counts = data_processed.isna().sum()
print(missing_counts[missing_counts > 0])  # View columns with missing values

# Drop columns with a large proportion of NaN values (e.g., > 50% missing)
data_processed = data_processed.drop(columns=missing_counts[missing_counts > len(data_processed) * 0.5].index)


days_b_screening_arrest     307
c_jail_in                   307
c_jail_out                  307
c_case_number                22
c_offense_date             1159
c_arrest_date              6077
c_days_from_compas           22
c_charge_desc                29
r_case_number              3743
r_charge_degree            3743
r_days_from_arrest         4898
r_offense_date             3743
r_charge_desc              3801
r_jail_in                  4898
r_jail_out                 4898
violent_recid              7214
vr_case_number             6395
vr_charge_degree           6395
vr_offense_date            6395
vr_charge_desc             6395
in_custody                  236
out_custody                 236
dtype: int64


### Impute Missing values

In [147]:
# Impute numerical columns with the median value
data_processed.fillna(data_processed.median(), inplace=True)

# For categorical columns, you could use the mode (most frequent value)
categorical_columns = data_processed.select_dtypes(include=['object']).columns
for column in categorical_columns:
    data_processed[column].fillna(data_processed[column].mode()[0], inplace=True)

  data_processed.fillna(data_processed.median(), inplace=True)


In [135]:
data_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7214 entries, 0 to 7213
Data columns (total 40 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       7214 non-null   int64  
 1   name                     7214 non-null   object 
 2   first                    7214 non-null   object 
 3   last                     7214 non-null   object 
 4   compas_screening_date    7214 non-null   object 
 5   sex                      7214 non-null   object 
 6   dob                      7214 non-null   object 
 7   age                      7214 non-null   int64  
 8   age_cat                  7214 non-null   object 
 9   race                     7214 non-null   object 
 10  juv_fel_count            7214 non-null   int64  
 11  decile_score             7214 non-null   int64  
 12  juv_misd_count           7214 non-null   int64  
 13  juv_other_count          7214 non-null   int64  
 14  priors_count            

### Preprocess Data

#### Preprocess Date Info

In [148]:
# Convert date columns to datetime format
date_columns = ['compas_screening_date', 'c_jail_in', 'c_jail_out', 'v_screening_date', 'screening_date', 'in_custody', 'out_custody' ]  # Update with relevant date columns in your dataset
for col in date_columns:
    data_processed[col] = pd.to_datetime(data_processed[col], errors='coerce')  # Convert to datetime; invalid parsing will be set as NaT (missing)

# Extract year, month, and day as separate features
for col in date_columns:
    data_processed[f'{col}_year'] = data_processed[col].dt.year
    data_processed[f'{col}_month'] = data_processed[col].dt.month
    data_processed[f'{col}_day'] = data_processed[col].dt.day

# Calculate days since a reference date (e.g., 'compas_screening_date')
reference_date = pd.to_datetime('2013-01-01')  # Set an arbitrary reference date
data_processed['days_since_compas'] = (data_processed['compas_screening_date'] - reference_date).dt.days

data_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7214 entries, 0 to 7213
Data columns (total 62 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   id                           7214 non-null   int64         
 1   name                         7214 non-null   object        
 2   first                        7214 non-null   object        
 3   last                         7214 non-null   object        
 4   compas_screening_date        7214 non-null   datetime64[ns]
 5   sex                          7214 non-null   object        
 6   dob                          7214 non-null   object        
 7   age                          7214 non-null   int64         
 8   age_cat                      7214 non-null   object        
 9   race                         7214 non-null   object        
 10  juv_fel_count                7214 non-null   int64         
 11  decile_score                 7214 non-null 

In [149]:
print(data_processed.shape)

data_processed = data_processed.drop(columns=['id', 'name', 'first', 'last','age', 'priors_count', 
                           'compas_screening_date', 'dob',  
                          'c_case_number', 'c_offense_date',  
                          'c_jail_in', 'c_jail_out', 'v_screening_date',
                          'in_custody', 'out_custody',
                          'screening_date'])

print(data_processed.shape)



# Encode categorical features
data_processed = pd.get_dummies(data_processed, columns=['sex', 'race', 'c_charge_degree', 
                                                         'age_cat', 'c_charge_desc', 'score_text', 
                                                         'type_of_assessment', 'v_type_of_assessment', 'v_score_text'
                                                        ], drop_first=False)
                                                         

# Drop columns with names or non-numeric data that aren't needed for modeling
print(data_processed.shape)


# Separate features and target variable
y = data_processed['two_year_recid']
X = data_processed.drop('two_year_recid', axis=1)

X.info()


(7214, 62)
(7214, 46)
(7214, 495)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7214 entries, 0 to 7213
Columns: 494 entries, juv_fel_count to v_score_text_Medium
dtypes: float64(2), int64(34), uint8(458)
memory usage: 5.1 MB


In [150]:
data_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7214 entries, 0 to 7213
Columns: 495 entries, juv_fel_count to v_score_text_Medium
dtypes: float64(2), int64(35), uint8(458)
memory usage: 5.2 MB


### Train a Baseline Model

In [151]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a Logistic Regression model with a higher max_iter
model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)

# Make predictions and evaluate
y_pred = model.predict(X_test_scaled)
baseline_accuracy = accuracy_score(y_test, y_pred)
print(f"Baseline Accuracy: {baseline_accuracy:.2f}")


Baseline Accuracy: 0.99


In [116]:
# For Logistic Regression
print("Feature coefficients:", model.coef_)

Feature coefficients: [[-1.16709838e-02  1.42326287e-01  1.01114965e-02 -4.29835596e-02
  -1.86532073e-01  1.24703365e-01  5.10532564e+00 -1.07957187e-01
   1.42326287e-01 -2.31461527e-01  2.82866166e-01  8.74831940e-01
  -4.59063706e+00  1.75857726e+00  1.11223176e-01  2.70600964e-01
   6.61961003e-03  2.11036604e-01 -7.60002830e-03  2.67475739e-01
   3.98965213e-01 -3.22309397e-01 -3.51251879e-01  1.11223176e-01
   2.70600964e-01  6.61961003e-03  1.11223176e-01  2.70600964e-01
   6.61961003e-03 -7.20127322e-02  1.39443661e-01 -8.45322417e-03
   6.73358657e-02 -1.32884455e-02  1.94561534e-02  2.59255620e-01
   1.19976971e-01 -1.44808624e-02  8.88086552e-02  2.45682913e-01
   7.96606258e-02  1.69790300e-01 -4.02195721e-02 -2.85623366e-01
   3.94407692e-02  3.41956730e-02  0.00000000e+00 -2.43981308e-03
  -9.22866568e-02 -2.82910534e-02 -6.60883112e-05  9.46382216e-03
   1.63992658e-02  8.09475151e-02 -2.27135625e-02  1.81983086e-02
  -7.79952109e-02 -4.18711784e-02  1.60549012e-01  8.0

In [152]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))

[[810  13]
 [  8 612]]


In [153]:
from sklearn.metrics import precision_score, recall_score, f1_score

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

Precision: 0.98
Recall: 0.99
F1 Score: 0.98


In [154]:
X.columns[0:50]

Index(['juv_fel_count', 'decile_score', 'juv_misd_count', 'juv_other_count',
       'days_b_screening_arrest', 'c_days_from_compas', 'is_recid',
       'is_violent_recid', 'decile_score.1', 'v_decile_score',
       'priors_count.1', 'start', 'end', 'event', 'compas_screening_date_year',
       'compas_screening_date_month', 'compas_screening_date_day',
       'c_jail_in_year', 'c_jail_in_month', 'c_jail_in_day', 'c_jail_out_year',
       'c_jail_out_month', 'c_jail_out_day', 'v_screening_date_year',
       'v_screening_date_month', 'v_screening_date_day', 'screening_date_year',
       'screening_date_month', 'screening_date_day', 'in_custody_year',
       'in_custody_month', 'in_custody_day', 'out_custody_year',
       'out_custody_month', 'out_custody_day', 'days_since_compas',
       'sex_Female', 'sex_Male', 'race_African-American', 'race_Asian',
       'race_Caucasian', 'race_Hispanic', 'race_Native American', 'race_Other',
       'c_charge_degree_F', 'c_charge_degree_M', 'age_cat_

In [164]:
import pandas as pd
from sklearn.metrics import confusion_matrix

# Subset predictions by race (e.g., Other vs. Caucasian)
X_test_black = X_test[X_test['race_African-American'] == 1]
y_test_black = y_test[X_test['race_African-American'] == 1]
y_pred_black = model.predict(X_test_black)

X_test_white = X_test[X_test['race_Caucasian'] == 1]
y_test_white = y_test[X_test['race_Caucasian'] == 1]
y_pred_white = model.predict(X_test_white)

# Confusion matrix for each group
print("Confusion Matrix (Black):")
print(confusion_matrix(y_test_black, y_pred_black))

print("Confusion Matrix (White):")
print(confusion_matrix(y_test_white, y_pred_white))

Confusion Matrix (Black):
[[310  49]
 [ 47 325]]
Confusion Matrix (White):
[[280  40]
 [ 31 154]]




In [176]:
def calculate_rates(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    false_positive_rate = fp / (fp + tn)
    false_negative_rate = fn / (fn + tp)
    return false_positive_rate, false_negative_rate

In [179]:
# Calculate False Positive Rate (FPR) for Black and White defendants
fpr_black, fnr_black =  calculate_rates(y_test_black, y_pred_black)
fpr_white, fnr_white =  calculate_rates(y_test_white, y_pred_white)

print(f"False Positive Rate for Black defendants: {fpr_black:.2f} ({fpr_black * 100:.1f}%)")
print(f"False Positive Rate for White defendants: {fpr_white:.2f} ({fpr_white * 100:.1f}%)")

print(f"False Negative Rate for Black defendants: {fnr_black:.2f} ({fnr_black * 100:.1f}%)")
print(f"False Negative Rate for White defendants: {fnr_white:.2f} ({fnr_white * 100:.1f}%)")

False Positive Rate for Black defendants: 0.14 (13.6%)
False Positive Rate for White defendants: 0.12 (12.5%)
False Negative Rate for Black defendants: 0.13 (12.9%)
False Negative Rate for White defendants: 0.17 (16.8%)


### Mitigate Bias using Reweighting based on Race

In [183]:
from sklearn.utils.class_weight import compute_sample_weight
import numpy as np

# Base sample weights based on the target class balance
base_weights = compute_sample_weight(class_weight='balanced', y=y_train)

# Additional weights based on race
# Let's assume 'race_African-American' is 1 for Black defendants, 0 otherwise
race_factor = np.where(X_train['race_African-American'] == 1, 1.5, 1.0)  # Example: weight Black defendants more

# Final sample weights to balance both race and target class
sample_weights = base_weights * race_factor

# Train the model with these custom sample weights
model_weighted = LogisticRegression(max_iter=1000)
model_weighted.fit(X_train_scaled, y_train, sample_weight=sample_weights)

# Make predictions with the reweighted model
y_pred_weighted = model_weighted.predict(X_test_scaled)

# Evaluate the reweighted model's accuracy
weighted_accuracy = accuracy_score(y_test, y_pred_weighted)
print(f"Weighted Model Accuracy: {weighted_accuracy:.2f}")


Weighted Model Accuracy: 0.99


### Retest prediction by Race

In [184]:
# Evaluate performance for each race with the reweighted model
y_pred_weighted_black = model_weighted.predict(X_test_black)
y_pred_weighted_white = model_weighted.predict(X_test_white)

# Calculate false positive and false negative rates for each group
black_weighted_fp_rate, black_weighted_fn_rate = calculate_rates(y_test_black, y_pred_weighted_black)
white_weighted_fp_rate, white_weighted_fn_rate = calculate_rates(y_test_white, y_pred_weighted_white)

print(f"Weighted False Positive Rate (Black): {black_weighted_fp_rate:.2f} ({black_weighted_fp_rate * 100:.1f}%)")
print(f"Weighted False Positive Rate (White): {white_weighted_fp_rate:.2f} ({white_weighted_fp_rate * 100:.1f}%)")
print(f"Weighted False Negative Rate (Black): {black_weighted_fn_rate:.2f} ({black_weighted_fn_rate * 100:.1f}%)")
print(f"Weighted False Negative Rate (White): {white_weighted_fn_rate:.2f} ({white_weighted_fn_rate * 100:.1f}%)")

Weighted False Positive Rate (Black): 0.14 (13.6%)
Weighted False Positive Rate (White): 0.13 (12.8%)
Weighted False Negative Rate (Black): 0.13 (12.6%)
Weighted False Negative Rate (White): 0.16 (16.2%)


