In [26]:
import pandas as pd
import numpy as np

In [27]:
# Load the dataset
data = pd.read_csv('compas-scores-two-years.csv')



In [28]:
# Display the first few rows of the dataset
data.head()

Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,...,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,start,end,event,two_year_recid
0,1,miguel hernandez,miguel,hernandez,14-08-2013,Male,18-04-1947,69,Greater than 45,Other,...,1,Low,14-08-2013,07-07-2014,14-07-2014,0,0,327,0,0
1,3,kevon dixon,kevon,dixon,27-01-2013,Male,22-01-1982,34,25 - 45,African-American,...,1,Low,27-01-2013,26-01-2013,05-02-2013,0,9,159,1,1
2,4,ed philo,ed,philo,14-04-2013,Male,14-05-1991,24,Less than 25,African-American,...,3,Low,14-04-2013,16-06-2013,16-06-2013,4,0,63,0,1
3,5,marcu brown,marcu,brown,13-01-2013,Male,21-01-1993,23,Less than 25,African-American,...,6,Medium,13-01-2013,,,1,0,1174,0,0
4,6,bouthy pierrelouis,bouthy,pierrelouis,26-03-2013,Male,22-01-1973,43,25 - 45,Other,...,1,Low,26-03-2013,,,2,0,1102,0,0


In [29]:
# Select relevant columns for the analysis
columns_of_interest = [
    "sex", "age", "age_cat", "race", "priors_count", "two_year_recid", "v_decile_score"
]
cleaned_data = data[columns_of_interest].dropna()

# Display the cleaned dataset
cleaned_data.head()


Unnamed: 0,sex,age,age_cat,race,priors_count,two_year_recid,v_decile_score
0,Male,69,Greater than 45,Other,0,0,1
1,Male,34,25 - 45,African-American,0,1,1
2,Male,24,Less than 25,African-American,4,1,3
3,Male,23,Less than 25,African-American,1,0,6
4,Male,43,25 - 45,Other,2,0,1


In [30]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Prepare features (X) and target (y)
X = cleaned_data[["priors_count", "v_decile_score", "age"]]  # Input features
y = cleaned_data["two_year_recid"]  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a Random Forest Classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Test the model
y_pred = model.predict(X_test)

# Evaluate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)



Model Accuracy: 0.648960739030023


In [31]:
# Fairness Evaluation
from sklearn.metrics import precision_score, recall_score

# Compute metrics manually by group
groups = cleaned_data.loc[X_test.index, "race"].unique()  # Unique racial groups
group_metrics = {}

for group in groups:
    group_indices = (cleaned_data.loc[X_test.index, "race"] == group)
    group_y_true = y_test[group_indices]
    group_y_pred = y_pred[group_indices]
    
    group_metrics[group] = {
        "Precision": precision_score(group_y_true, group_y_pred, zero_division=0),
        "Recall": recall_score(group_y_true, group_y_pred, zero_division=0)
    }

# Convert the group_metrics dictionary to a pandas DataFrame
metrics_df = pd.DataFrame.from_dict(group_metrics, orient="index")

# Display the metrics as a table
metrics_df


Unnamed: 0,Precision,Recall
Caucasian,0.541284,0.427536
African-American,0.678373,0.632759
Other,0.475,0.452381
Hispanic,0.392157,0.344828
Native American,0.0,0.0
Asian,1.0,1.0


In [34]:
import shap

# Reinitialize the SHAP explainer
explainer = shap.TreeExplainer(model)


In [35]:
# Select a sample of the test set
X_sample = X_test.sample(10, random_state=42)

# Compute SHAP values
shap_values = explainer.shap_values(X_sample)


AssertionError: The shape of the shap_values matrix does not match the shape of the provided data matrix.

In [None]:

|