In [133]:
import pandas as pd
import altair as alt
import numpy as np
alt.data_transformers.disable_max_rows()
#alt.data_transformers.enable('json')

DataTransformerRegistry.enable('default')

In [134]:
def subset(df,cols):
    """Takes a dataframe, columns to select from df as list (cols) and returns subsetted result.
    
    """
    
    return df[cols]

In [125]:
## Scores Data

df = pd.read_csv('https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv')

cols_to_drop= ['id', 'name', 'first', 'last', 'compas_screening_date', 
               'sex', 
               'dob',
               #'age', 
               'age_cat', 
               'race', 
               #'priors_count',
               #'decile_score',
               'juv_fel_count', 'juv_misd_count', 'juv_other_count',
                'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 'c_case_number',
                'c_offense_date', 'c_arrest_date', 'c_days_from_compas',
                'c_charge_degree', 'c_charge_desc', 'is_recid', 'r_case_number',
                'r_charge_degree', 'r_days_from_arrest', 'r_offense_date',
                'r_charge_desc', 'r_jail_in', 'r_jail_out', 'violent_recid',
                'is_violent_recid', 'vr_case_number', 'vr_charge_degree',
                'vr_offense_date', 'vr_charge_desc', 'type_of_assessment',
                'decile_score.1', 'score_text', 'screening_date',
                'v_type_of_assessment', 'v_decile_score', 'v_score_text',
                'v_screening_date', 'in_custody', 'out_custody', 'priors_count.1',
                'start', 'end', 'event', 
                #'two_year_recid'
              ]

## Data Filtering (matching ProPublica analysis)

In [147]:
df['race'].unique()

array(['Other', 'African-American', 'Caucasian', 'Hispanic',
       'Native American', 'Asian'], dtype=object)

In [162]:
df = (df       # Start Experiment
       # check to see what happens if we exclude all races except white and black
      .where(df['race'].isin(['African-American', 'Caucasian']),np.nan)
      .dropna(subset='race')
       # End Experiment
      )

In [167]:
dfc = (df
      .where(df['days_b_screening_arrest'] <=30,np.nan)
      .where(df['days_b_screening_arrest'] >= -30,np.nan)
      .where(df['is_recid'] != -1,np.nan)
      .where(df['c_charge_degree'] != "O",np.nan)
      .where(df['score_text'] != "N/A",np.nan)
    # Start Experiment
       # check to see what happens if we exclude all races except white and black
      .where(df['race'].isin(['African-American', 'Caucasian']),np.nan)
      .dropna(subset='race')
    # End Experiment
      .dropna(subset=['days_b_screening_arrest','is_recid','c_charge_degree','score_text'])
      .drop(columns=cols_to_drop)
      .rename(columns={'decile_score':'score',
                       'priors_count':'priors',
                       'two_year_recid':'recidivate'})
      .convert_dtypes() # Turn float columns to int
      .assign(recidivate= lambda x: x['recidivate'].astype(int)) # Needed for weird reason; sklearn doesn't know about Int64 yet
      .reset_index(drop=True)
     )

print(len(dfc))

assert len(dfc) == 6172, "Does not match ProPublica analysis, check!"

5278


AssertionError: Does not match ProPublica analysis, check!

In [168]:
dfc

Unnamed: 0,age,score,priors,recidivate
0,34,3,0,1
1,24,4,4,1
2,41,6,14,1
3,39,1,0,0
4,27,4,0,0
...,...,...,...,...
5273,30,2,0,1
5274,20,9,0,0
5275,23,7,0,0
5276,23,3,0,0


## Try KNN

In [169]:
# Imports from sklearn

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [241]:
X = subset(dfc, ['age','priors'])
y = subset(dfc, ['recidivate'])

In [242]:
X

Unnamed: 0,age,priors
0,34,0
1,24,4
2,41,14
3,39,0
4,27,0
...,...,...
5273,30,0
5274,20,0
5275,23,0
5276,23,0


In [243]:
y

Unnamed: 0,recidivate
0,1
1,1
2,1
3,0
4,0
...,...
5273,1
5274,0
5275,0
5276,0


## Train-test split

In [244]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=100,test_size=0.2)

## Feature Scaling

In [245]:
sc_X = StandardScaler()

X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

## Run KNN

In [343]:
# Define the model
classifier = KNeighborsClassifier(n_neighbors=11,p=2,metric='euclidean')

In [344]:
# Fit model
classifier.fit(X_train,y_train)

  return self._fit(X, y)


KNeighborsClassifier(metric='euclidean', n_neighbors=11)

## Predict results

In [345]:
y_pred = classifier.predict(X_test)

In [346]:
cm = confusion_matrix(y_test,y_pred)

In [347]:
cm

array([[383, 203],
       [161, 309]])

In [348]:
accuracy_score(y_test,y_pred)

0.6553030303030303

In [252]:
f1_score(y_test,y_pred)

0.6305015353121801

# Plot

In [99]:
alt.Chart(dfc).mark_circle().encode(alt.X('age'),alt.Y('priors'))