Import our tools

In [1]:
import pandas as pd

Load in the relevant data, parsing the date fields to a format pandas understands on the way.

In [2]:
# Columns to load from the CSV. The score columns 'v_decile_score',
# 'v_score_text', 'decile_score.1' and 'score_text' are deliberately left out.
feature_columns = [
    'sex',
    'age',
    'age_cat',
    'race',
    'juv_fel_count',
    'decile_score',
    'juv_misd_count',
    'juv_other_count',
    'priors_count',
    'days_b_screening_arrest',
    'c_jail_in',
    'c_jail_out',
    'c_offense_date',
    'c_arrest_date',
    'c_days_from_compas',
    'c_charge_degree',
    'c_charge_desc',
    'is_recid',
    'num_r_cases',
    'r_charge_degree',
    'r_days_from_arrest',
    'r_offense_date',
    'r_charge_desc',
    'r_jail_in',
    'r_jail_out',
    'is_violent_recid',
    'num_vr_cases',
    'vr_charge_degree',
    'vr_offense_date',
    'vr_charge_desc',
    'v_screening_date',
]

# Subset of the columns above that pandas should parse as dates while reading.
date_columns = [
    'c_jail_in',
    'c_jail_out',
    'c_offense_date',
    'c_arrest_date',
    'r_offense_date',
    'r_jail_in',
    'r_jail_out',
    'vr_offense_date',
    'v_screening_date',
]

df = pd.read_csv('compas-scores.csv', usecols=feature_columns, parse_dates=date_columns)

KeyError: 'v_screening_date'

Transform all string data to lowercase and strip leading and trailing whitespace

In [None]:
def _normalize_text(value):
    """Return ``value`` lower-cased and stripped if it is a string, otherwise unchanged."""
    return value.lower().strip() if isinstance(value, str) else value


# `DataFrame.applymap` is deprecated since pandas 2.1 (renamed `DataFrame.map`).
# Mapping every column with `Series.map` performs the same element-wise
# transformation and is available on old and new pandas releases alike.
df = df.apply(lambda column: column.map(_normalize_text))

Transform all string columns to groups of binary columns

In [None]:
# One-hot encode the frame; `get_dummies` is left on its defaults, so it picks
# the columns to encode itself.
df = pd.get_dummies(data=df)

Check if we have any more object columns (should be none)

In [None]:
# Tally how many columns there are of each dtype.
dtype_counts = df.dtypes.value_counts()
dtype_counts

Calculate jail stay lengths

In [None]:
# Jail-out minus jail-in gives a timedelta per row.
c_stay = df['c_jail_out'] - df['c_jail_in']
r_stay = df['r_jail_out'] - df['r_jail_in']

# Keep whole days only; rows with a missing date yield NaN, which is counted as 0 days.
df['c_days_in_jail'] = c_stay.dt.days.fillna(0).astype(int)
df['r_days_in_jail'] = r_stay.dt.days.fillna(0).astype(int)

Drop date columns

In [None]:
# Date columns to remove from the frame.
date_columns_to_drop = [
    'c_jail_in',
    'c_jail_out',
    'c_offense_date',
    'c_arrest_date',
    'r_offense_date',
    'r_jail_in',
    'r_jail_out',
    'vr_offense_date',
    'v_screening_date',
]
df = df.drop(columns=date_columns_to_drop)

Check dtypes again, should be no dates remaining

In [None]:
# Tally the column dtypes again after dropping the date columns.
dtype_counts = df.dtypes.value_counts()
dtype_counts

Still a few floats, see what that is all about

In [None]:
# Show only the float64 columns so they can be inspected.
float_columns = df.select_dtypes(include='float64')
float_columns

At least two look like only NaNs, check to be sure

In [None]:
# `value_counts()` silently drops NaN by default, so an all-NaN column would
# just print an empty Series. `dropna=False` makes the NaN count visible.
print(df['num_vr_cases'].value_counts(dropna=False))
print(df['num_r_cases'].value_counts(dropna=False))

Yeah, NaNs only. We can drop those.

In [None]:
# Remove the two case-count columns from the frame.
case_count_columns = ['num_r_cases', 'num_vr_cases']
df = df.drop(columns=case_count_columns)

Replace other NaNs with column means (imputation) and check the columns again

In [None]:
# Fill each remaining NaN with the mean of its own column.
column_means = df.mean()
df = df.fillna(column_means)

# Show the float64 columns again.
df.select_dtypes(include=['float64'])

Should be no more NaNs to go, so we can continue to learning.

Split to target and explanatory variables

In [None]:
predicted_variable = 'decile_score'

# Boolean mask that is True only for the target column.
is_target = df.columns == predicted_variable

X = df.loc[:, ~is_target]  # every column except the target
y = df.loc[:, is_target]   # single-column DataFrame holding the target
print(X.shape, y.shape)

Split to train and test sets

In [None]:
from sklearn import model_selection

# Hold out 10% of the rows for testing; the fixed random_state makes the split repeatable.
split_parts = model_selection.train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

Train a linear regression model and check the $R^2$ score. A score of 1 is the best possible; values can range down to $-\infty$. We do a 5-fold cross-validation to get a better sense of the model fit.

In [None]:
from sklearn import linear_model, pipeline, preprocessing

# `LinearRegression(normalize=True)` no longer works: the `normalize` argument
# was deprecated in scikit-learn 1.0 and removed in 1.2. Scaling the features
# in an explicit pipeline step keeps the intent and runs on both old and new
# scikit-learn releases.
model = pipeline.make_pipeline(
    preprocessing.StandardScaler(with_mean=False),
    linear_model.LinearRegression(),
)

# 5-fold cross-validation over the full X / y; for a regressor the default
# score reported per fold is R^2.
scores = model_selection.cross_val_score(model, X, y, cv=5)
scores

In [None]:
from sklearn import svm

model = svm.SVR()

# Flatten the single-column target frame into a 1-D array before scoring.
target_values = y.values.ravel()

# 5-fold cross-validation, run on two worker processes.
scores = model_selection.cross_val_score(
    model,
    X,
    target_values,
    cv=5,
    n_jobs=2,
)
scores