In [4]:
import numpy as np
import pandas as pd




#------------------------- Feature Engineering -------------------------#

compas_scores_two_year= pd.read_csv("compas_scores_two_years.csv",  lineterminator='\n')

# Select features from dataset
df= compas_scores_two_year[[ 'juv_fel_count', 'juv_misd_count', 'juv_other_count' ,'age', 'c_charge_degree','race', 'score_text', 'sex', 'priors_count', 'days_b_screening_arrest', 'decile_score', 'is_recid',  'c_jail_in', 'c_jail_out',  'v_decile_score','two_year_recid\r']]
# Process the data
df = df.loc[(df['days_b_screening_arrest'] <= 30) & (df['days_b_screening_arrest'] >= -30) & (df['is_recid'] != -1) & (df['c_charge_degree'] != 'O') & (df['score_text'] != 'N/A')]
#length of stay in jail 
df['length_of_stay'] = pd.to_datetime(df['c_jail_out']) - pd.to_datetime(df['c_jail_in'])
df['length_of_stay'] = df['length_of_stay'].astype('timedelta64[D]')
df['length_of_stay'] = df['length_of_stay'].astype(int)

#------------------------- Data Preprocessing -------------------------#
#split into caucasian and non-caucasian

df_CC = df.loc[df['race'] == 'Caucasian']
print('Caucasian', np.shape(df_CC))
df_AA = df.loc[df['race'] == 'African-American']
print('African-American', np.shape(df_AA))

df_NC = df.loc[df['race'] != 'Caucasian']

#equalizing odds 

def equilizing_odds(C):
    # #convert to probability
    # C = (1/len(F_true_score))*C
    # print(C)

    # False postive rates :Pr[ ˆY = 1/S = 1, Y = 0] − Pr[ ˆY = 0/S = 0, Y = 0]
    # False negative rates :Pr[ ˆY = 1/S = 1, Y = 1] − Pr[ ˆY = 0/S = 0, Y = 1]
    
    FNR = abs(C[0][0,1]/(C[0][0,1]+C[0][0,0]) - C[1][0,1]/(C[1][0,1]+C[1][0,0]))
    FPR = abs(C[0][1,0]/(C[0][1,0]+C[0][1,1]) - C[1][1,0]/(C[1][1,0]+C[1][1,1]))

    result = [FPR, FNR]

    return result



#------------------------- create factors  -------------------------#

           

def factoration (df_x):
    def length_factoration (df_x):
        df= df_x
        #length of stay in jail 
        df['length_of_stay'] = pd.to_datetime(df['c_jail_out']) - pd.to_datetime(df['c_jail_in'])
        df['length_of_stay'] = df['length_of_stay'].astype('timedelta64[D]')
        df['length_of_stay'] = df['length_of_stay'].astype(int)

        days = []
        weeks = []
        months = []
        years = []

        for length in df['length_of_stay']:
            if length<7:
                days.append(1)
                weeks.append(0)
                months.append(0)
                years.append(0)
            elif (length<30):
                days.append(0)
                weeks.append(1)
                months.append(0)
                years.append(0)
            elif length<365:
                days.append(0)
                weeks.append(0)
                months.append(1)
                years.append(0)
            else:
                days.append(0)
                weeks.append(0)
                months.append(0)
                years.append(1)

        return days, weeks, months, years

    def jail_factoration (df_x):
        jail_feature = []
        jail_feature_squared = []
        for jail in df['length_of_stay']:
            if(jail/365) > 1:
                jail_feature.append(1)
                jail_feature_squared.append((1)**2)
            else:
                jail_feature.append(jail/365)
                jail_feature_squared.append((jail/365)**2)
        return jail_feature, jail_feature_squared
  
    def age_factoration (df_x):
    
        df= df_x
        df_age = df['age'].astype(int)

        twenties_and_less = []
        thirties=[]
        fourties=[]
        fifties_and_more=[]

        for age in df_age:
            if age<30:
                twenties_and_less.append(1)
                thirties.append(0)
                fourties.append(0)
                fifties_and_more.append(0)
            elif age<40:
                twenties_and_less.append(0)
                thirties.append(1)
                fourties.append(0)
                fifties_and_more.append(0)
            elif age<50:
                twenties_and_less.append(0)
                thirties.append(0)
                fourties.append(1)
                fifties_and_more.append(0)
            else:
                twenties_and_less.append(0)
                thirties.append(0)
                fourties.append(0)
                fifties_and_more.append(1)

        return twenties_and_less, thirties, fourties, fifties_and_more
    
    def age_featrure (df_x):
        df= df_x
        df_age = df['age'].astype(int)
        mean_age = df_age.mean()
        age_feature_squared = []
        age_feature = []
        for age in df_age:
            if(age/mean_age) > 2:
                age_feature.append(2)
                age_feature_squared.append((2)**2)
            else:
                age_feature.append(age/mean_age)
                age_feature_squared.append((age/mean_age)**2)
        return age_feature, age_feature_squared

    def crime_factoration (df_x):
        df= df_x
        df_c_charge_degree = df[['c_charge_degree']] 
        crime_factor, u_charge_degree = pd.factorize(df_c_charge_degree['c_charge_degree'])
        return crime_factor

    def gender_factoration (df_x):
        df= df_x
        # Gender
        male = []
        female = []
        for gender in df['sex']:
            if gender == "Male":
                male.append(1)
                female.append(0)
            else:
                male.append(0)
                female.append(1)

        return male, female
    

    def priors_factoration (df_x):
        df= df_x
        
        # # Prior convictions
        juvinile_felonies  = df[['juv_fel_count']].astype(int)
        juvinile_misconduct  = df[['juv_misd_count']].astype(int)
        juvinile_other  = df[['juv_other_count']].astype(int)
        return juvinile_felonies, juvinile_misconduct, juvinile_other

    def prior_conviction_factoration (df_x):
        priors_count  = df['priors_count'].astype(int)
        no_prior_convictions = []
        one_prior =[]
        multiple_prior = []
        many_prior = []

        # Prior Convictions Feature
        for prior in priors_count:
            if prior==0:
                one_prior.append(0)
                multiple_prior.append(0)
                many_prior.append(0)
                no_prior_convictions.append(1)
            elif prior<2:
                one_prior.append(1)
                multiple_prior.append(0)
                many_prior.append(0)
                no_prior_convictions.append(0)
            elif prior<5:
                one_prior.append(0)
                multiple_prior.append(1)
                many_prior.append(0)
                no_prior_convictions.append(0)
            else:
                one_prior.append(0)
                multiple_prior.append(0)
                many_prior.append(1)
                no_prior_convictions.append(0)
        return no_prior_convictions, one_prior, multiple_prior, many_prior

    df= df_x
    #length factors 
    quick_stay, short_stay, medium_stay, long_stay = length_factoration(df)
    #age factors
    twenties_and_less, thirties, fourties, fifties_and_more = age_factoration(df)
    #charge factors
    crime_factor = crime_factoration(df)
    #gender factors
    male, non_male = gender_factoration(df)
    #priors factors
    juvinile_felonies, juvinile_misconduct, juvinile_other = priors_factoration(df)
    #prior convictions factors
    no_prior_convictions, one_prior, multiple_prior, many_prior = prior_conviction_factoration(df)
    #jail factors
    jail_feature, jail_feature_squared = jail_factoration(df)
    #age factors
    age_feature, age_feature_squared = age_featrure(df)

    X = np.column_stack((quick_stay, short_stay, medium_stay, long_stay, twenties_and_less, thirties, fourties, fifties_and_more, crime_factor, male, non_male, juvinile_felonies, juvinile_misconduct, juvinile_other, no_prior_convictions, one_prior, multiple_prior, many_prior, jail_feature, jail_feature_squared, age_feature, age_feature_squared))
    return X


data_CC = factoration(df_CC)
data_AA = factoration(df_NC)

# labels 
def labels(df_x):
    df= df_x
    f_score_text, u_score_text = pd.factorize(df['score_text'] != 'Low')
    two_year_recid = df[['two_year_recid\r']].astype(int)

    return f_score_text, two_year_recid

compas_score_CC, recid_CC = labels(df_CC)
compas_score_AA, recid_AA = labels(df_NC)


#------ split into train and test sets
from sklearn.model_selection import train_test_split
X_train_CC, X_test_CC, y_train_CC, y_test_CC = train_test_split(data_CC, compas_score_CC, test_size=0.2, random_state=0)
X_train_AA, X_test_AA, y_train_AA, y_test_AA = train_test_split(data_AA, compas_score_AA, test_size=0.2, random_state=0)

X_train_CC_recide, X_test_CC_recide, y_train_CC_recide, y_test_CC_recide = train_test_split(data_CC, recid_CC, test_size=0.2, random_state=0)
X_train_AA_recide, X_test_AA_recide, y_train_AA_recide, y_test_AA_recide = train_test_split(data_AA, recid_AA, test_size=0.2, random_state=0)

# loop to increa 

# --------- logistic regression --------------#

from sklearn.linear_model import LogisticRegression
#Logistic Regression model for CC
logreg = LogisticRegression()
logreg.fit(X_train_CC, np.ravel(y_train_CC))
y_pred_CC = logreg.predict(X_test_CC)

#Logistic Regression model for AA
#Optimal C = 1
logreg = LogisticRegression(penalty='l2', C=10)
logreg.fit(X_train_AA, np.ravel(y_train_AA))
y_pred_AA = logreg.predict(X_test_AA)


from sklearn.metrics import confusion_matrix
C = confusion_matrix(y_test_AA_recide, y_pred_AA ), confusion_matrix(y_test_CC_recide, y_pred_CC )

print('----------------------------------------------------')
print('------ Equlizing odds with Logistic Regression ------')
print('equlizing odds',equilizing_odds(C))
print('differnce in equlizing odds',equilizing_odds(C)[0]-equilizing_odds(C)[1])
# accuracy
from sklearn.metrics import accuracy_score
print('accuracy African AMericans', accuracy_score(y_test_AA, y_pred_AA))
print('accuracy Caucasians', accuracy_score(y_test_CC, y_pred_CC))


from sklearn.neighbors import KNeighborsClassifier
#KNN model for CC
knn = KNeighborsClassifier(n_neighbors = 45)
knn.fit(X_train_CC, np.ravel(y_train_CC))
y_pred_CC = knn.predict(X_test_CC)

#KNN model for AA
knn = KNeighborsClassifier(n_neighbors = 45)
knn.fit(X_train_AA, np.ravel(y_train_AA))
y_pred_AA = knn.predict(X_test_AA)

from sklearn.metrics import confusion_matrix
C = confusion_matrix(y_test_AA_recide, y_pred_AA ), confusion_matrix(y_test_CC_recide, y_pred_CC )

print('----------------------------------------------------')
print('------ Equlizing odds with KNN ------')
print('equlizing odds',equilizing_odds(C))
print('differnce in equlizing odds',equilizing_odds(C)[0]-equilizing_odds(C)[1])
# accuracy
from sklearn.metrics import accuracy_score
print('accuracy African AMericans', accuracy_score(y_test_AA, y_pred_AA))
print('accuracy Caucasians', accuracy_score(y_test_CC, y_pred_CC))



#------------------------- SVM model -------------------------#

from sklearn.svm import SVC
#SVM model for CC
svm = SVC(kernel = 'linear', C = 1)
svm.fit(X_train_CC, np.ravel(y_train_CC))
y_pred_CC = svm.predict(X_test_CC)

#SVM model for AA
svm = SVC(kernel = 'linear', C = 1)
svm.fit(X_train_AA, np.ravel(y_train_AA))
y_pred_AA = svm.predict(X_test_AA)

from sklearn.metrics import confusion_matrix
C = confusion_matrix(y_test_AA_recide, y_pred_AA ), confusion_matrix(y_test_CC_recide, y_pred_CC )

print('----------------------------------------------------')
print('------ Equlizing odds with SVM ------')
print('equlizing odds',equilizing_odds(C))
print('differnce in equlizing odds',equilizing_odds(C)[0]-equilizing_odds(C)[1])
# accuracy
from sklearn.metrics import accuracy_score
print('accuracy African AMericans', accuracy_score(y_test_AA, y_pred_AA))
print('accuracy Caucasians', accuracy_score(y_test_CC, y_pred_CC))


#-------------------------  MLP Model  -------------------------#

from sklearn.neural_network import MLPClassifier
#MLP model for CC
model = MLPClassifier(hidden_layer_sizes=(10), max_iter=1000)
model.fit(X_train_CC, np.ravel(y_train_CC))
y_pred_CC = model.predict(X_test_CC)

#MLP model for AA
model = MLPClassifier(hidden_layer_sizes=(10), max_iter=1000)
model.fit(X_train_AA, np.ravel(y_train_AA))
y_pred_AA = model.predict(X_test_AA)

from sklearn.metrics import confusion_matrix
C = confusion_matrix(y_test_AA_recide, y_pred_AA ), confusion_matrix(y_test_CC_recide, y_pred_CC )

print('----------------------------------------------------')
print('------ Equlizing odds with MLP ------')
print('equlizing odds',equilizing_odds(C))
print('differnce in equlizing odds',equilizing_odds(C)[0]-equilizing_odds(C)[1])
# accuracy
from sklearn.metrics import accuracy_score
print('accuracy African AMericans', accuracy_score(y_test_AA, y_pred_AA))
print('accuracy Caucasians', accuracy_score(y_test_CC, y_pred_CC))

Caucasian (2103, 17)
African-American (3175, 17)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['length_of_stay'] = pd.to_datetime(df['c_jail_out']) - pd.to_datetime(df['c_jail_in'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['length_of_stay'] = df['length_of_stay'].astype('timedelta64[D]')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['length_of_stay'] = df['length_of_stay'].as

----------------------------------------------------
------ Equlizing odds with Logistic Regression ------
equlizing odds [0.04387190232293442, 0.5234774403842855]
differnce in equlizing odds -0.47960553806135103
accuracy African AMericans 0.7346437346437347
accuracy Caucasians 0.7648456057007126
----------------------------------------------------
------ Equlizing odds with KNN ------
equlizing odds [0.03330056736854814, 0.5283410533539201]
differnce in equlizing odds -0.495040485985372
accuracy African AMericans 0.7223587223587223
accuracy Caucasians 0.7600950118764845
----------------------------------------------------
------ Equlizing odds with SVM ------
equlizing odds [0.029765292220670148, 0.5705867215645909]
differnce in equlizing odds -0.5408214293439207
accuracy African AMericans 0.7223587223587223
accuracy Caucasians 0.7648456057007126
----------------------------------------------------
------ Equlizing odds with MLP ------
equlizing odds [0.07805197716729612, 0.4921598902

In [22]:
print ('shape of data_CC', y_pred_CC.shape)
print ('shape of data_CC', recid_CC.shape)


shape of data_CC (421,)
shape of data_CC (2103, 1)
