In [16]:
import numpy as np
import pandas as pd
import utils

In [17]:
# Columns read from both CSV extracts; each file contributes one extra
# score-label column on top of these.
cols = [
    "days_b_screening_arrest",
    "is_recid",
    "c_charge_degree",
    "c_jail_out",
    "c_jail_in",
    "age_cat",
    "race",
    "sex",
    "two_year_recid",
]
recid = pd.read_csv("./compas_recid.csv", usecols=[*cols, "score_text"])
violent_recid = pd.read_csv(
    "./compas_violent_recid.csv",
    usecols=[*cols, "v_score_text"],
)

In [18]:
def compas_cleaning(df, score_factor):
    """Filter a raw COMPAS frame and encode it as an all-numeric table.

    Steps, in order:

    1. Drop every row that has a missing value in any column.
    2. Keep rows whose ``days_b_screening_arrest`` lies in [-30, 30], whose
       ``is_recid`` is not -1 and whose ``c_charge_degree`` is not ``"O"``.
    3. Add ``length_of_stay``: the whole-day component of
       ``c_jail_out - c_jail_in``.
    4. Binarise ``score_factor``: 1 where the value equals ``"High"``,
       0 otherwise.
    5. Drop ``c_jail_out``, ``c_jail_in`` and ``days_b_screening_arrest``,
       then one-hot encode ``c_charge_degree``, ``age_cat``, ``race`` and
       ``sex`` (first level of each dropped).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns named above plus ``score_factor``.
        It is not modified.
    score_factor : str
        Name of the text score column to binarise.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the surviving rows (original index labels kept).
    """
    complete = df.dropna()
    keep = (
        complete["days_b_screening_arrest"].between(-30, 30)
        & (complete["is_recid"] != -1)
        & (complete["c_charge_degree"] != "O")
    )
    # Explicit copy: the column assignments below must act on an independent
    # frame, otherwise pandas flags them with SettingWithCopyWarning.
    new_df = complete.loc[keep].copy()

    # Parse dates from the surviving rows only, not from the unfiltered input.
    jail_in = pd.to_datetime(new_df["c_jail_in"])
    jail_out = pd.to_datetime(new_df["c_jail_out"])
    new_df["length_of_stay"] = (jail_out - jail_in).dt.days.astype(int)

    new_df[score_factor] = (new_df[score_factor] == "High").astype(int)
    new_df = new_df.drop(
        columns=["c_jail_out", "c_jail_in", "days_b_screening_arrest"]
    )

    # dtype=int keeps the indicator columns as 0/1 integers on every pandas
    # version (the get_dummies default became bool in pandas 2.0).
    return pd.get_dummies(
        new_df,
        columns=["c_charge_degree", "age_cat", "race", "sex"],
        drop_first=True,
        dtype=int,
    )

    

In [19]:
# Run the same cleaning pipeline on both frames; only the score column differs.
# NOTE: each raw frame is overwritten by its cleaned version, and the cleaned
# version no longer has the columns compas_cleaning filters on, so this cell
# cannot be re-run without reloading the CSVs first.
recid = compas_cleaning(df=recid, score_factor="score_text")
violent_recid = compas_cleaning(df=violent_recid, score_factor="v_score_text")

In [20]:
# Preview the first five cleaned rows.
recid.head(n=5)

Unnamed: 0,is_recid,score_text,two_year_recid,length_of_stay,c_charge_degree_M,age_cat_Greater than 45,age_cat_Less than 25,race_Asian,race_Caucasian,race_Hispanic,race_Native American,race_Other,sex_Male
0,0,0,0,0,0,1,0,0,0,0,0,1,1
1,1,0,1,10,0,0,0,0,0,0,0,0,1
2,1,0,1,1,0,0,1,0,0,0,0,0,1
5,0,0,0,1,1,0,0,0,0,0,0,1,1
6,1,0,1,6,0,0,0,0,1,0,0,0,1


In [21]:
# utils.spd of the binarised score against the race_Caucasian indicator on the
# full cleaned frame (presumably statistical parity difference; confirm in utils).
print(
    "SPD: ",
    utils.spd(recid["score_text"], recid["race_Caucasian"]),
)

SPD:  0.12030654752827094


In [26]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the cleaned rows; random_state pins the shuffle so the split
# is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    recid.drop(columns="score_text"),
    recid["score_text"],
    test_size=0.33,
    random_state=0,
)

# Later cells expect the label as a column of the feature frames. assign()
# builds new frames (label appended as the last column, aligned on index)
# instead of writing in place into the objects returned by train_test_split,
# which can raise SettingWithCopyWarning.
x_train = x_train.assign(score_text=y_train)
x_test = x_test.assign(score_text=y_test)

In [39]:
# Build the training frame used below by passing x_train through
# utils.data_remover_cat, targeting the "two_year_recid" column with mean
# imputation.
# NOTE(review): the meaning of the positional 40 is defined inside
# utils.data_remover_cat and is not visible here; confirm before changing it.
data = utils.data_remover_cat(
    x_train,
    "two_year_recid",
    40,
    impute="mean",
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [40]:
# Same utils.spd measurement as before, now on the frame returned by
# utils.data_remover_cat.
print(
    "SPD: ",
    utils.spd(data["score_text"], data["race_Caucasian"]),
)

SPD:  0.12455937474245786


In [41]:
from sklearn.linear_model import LogisticRegression

# Fit on the processed training frame, then predict the binarised score for the
# held-out rows. The label column is removed from both feature matrices.
model = LogisticRegression(random_state=0, max_iter=500)
model.fit(data.drop(columns="score_text"), data["score_text"])
preds = model.predict(x_test.drop(columns="score_text"))

In [42]:
# utils.spd of the model's predictions against the race_Caucasian indicator on
# the held-out rows; left as the cell's last expression so the value is displayed.
utils.spd(
    preds,
    x_test["race_Caucasian"],
)

0.023297215111915164