In [37]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.model_selection import cross_val_predict, StratifiedKFold, GridSearchCV
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the raw sentencing records and discard the 'Unnamed: 0' column
# (presumably an index saved by an earlier export -- confirm against the EDA file).
sentences = pd.read_csv('/users/nick/desktop/sentences.csv').drop('Unnamed: 0', axis=1)

In [None]:
# If you haven't looked at the EDA file -- the principal analysis is on 2008 - 2012 data, with sentences limited
# to 'new commitments,' looking only at the black and white inmate populations.
in_window = sentences.ADMITYR.between(2008, 2012)  # inclusive on both ends
is_new_commitment = sentences.admtype_ == 'New commitment'
five = sentences.loc[in_window & is_new_commitment].copy()
five.reset_index(drop=True, inplace=True)

# As said in the EDA file: most states have some quantity of missing data, but it is particularly glaring w/r/t
# Alabama's prison demographics. Research reveals that Alabama's 'Missing' population is a close match with
# its known population of black inmates.
# One boolean mask relabels every affected row at once; this replaces a per-row Python loop that
# did two scalar .loc lookups per record and produces the same result.
alabama_missing = (five.state_ == 'AL') & (five.race_ == 'Missing')
five.loc[alabama_missing, 'race_'] = 'Black'

# Keep only Black/White inmates with a known projected time served.
race = five.loc[five.race_.isin(['Black', 'White'])].copy()
race.dropna(subset=['proj_time_served'], how='any', inplace=True)

# Binary recodes: race_ is the prediction target (1 = Black, 0 = White);
# reltype_ becomes 1 for 'Conditional release' and 0 for anything else (including missing).
race['race_'] = (race['race_'] == 'Black').astype(int)
race['reltype_'] = (race['reltype_'] == 'Conditional release').astype(int)
race.reset_index(drop=True, inplace=True)

In [None]:
# Persist the cleaned Black/White subset. to_csv is called with its defaults, so the index is
# written as an unnamed first column -- that is the 'Unnamed: 0' column dropped after reloading.
race.to_csv('/users/nick/desktop/race.csv')

In [23]:
# Reload the saved subset and discard the index column that was written alongside it.
race = pd.read_csv('/users/nick/desktop/race.csv').drop('Unnamed: 0', axis=1)

When modeling on unbalanced data, it's important to 'stratify' -- to make sure that the actual distribution of classes (in this case, the black inmate population and the white inmate population) is maintained across the training and testing data. (If class proportion is different in training data than testing data, the model will suffer.) But I'd also like to make sure that states are properly represented in both training and testing data. Since some states issue far more sentences than others, dummy variables don't offer a perfect solution: there's still a chance that (e.g.) Delaware's data falls primarily in the testing set, and so its sentences are predicted by sentences in other states. And so rather than build one model for all states, I'm building one model for each state. 

For modeling, I use an "Extreme Gradient Boosting" (XGBoost) classifier -- like sklearn's gradient boosting machine (GBM), but better engineered (http://xgboost.readthedocs.io/en/latest/). Since I'm also tuning each model's hyperparameters, I use 'nested cross-validation' as protection against overfitting: an inner cross-validation loop selects the hyperparameters, and a separate outer loop produces the predictions that are scored. See: http://scikit-learn.org/stable/auto_examples/model_selection/plot_nested_cross_validation_iris.html

In [29]:
# Base estimator; the grid below is searched on top of these settings.
xgb = XGBClassifier(n_estimators=200)

# Two independent shuffled, stratified 4-fold splitters: seed 1 for the inner
# (hyper-parameter search) loop, seed 2 for the outer (prediction) loop.
inner_cv, outer_cv = (
    StratifiedKFold(n_splits=4, shuffle=True, random_state=seed)
    for seed in (1, 2)
)

# Hyper-parameter grid handed to GridSearchCV (3 x 3 = 9 candidates).
p_grid = dict(
    max_depth=[3, 4, 5],
    min_child_weight=[2, 5, 10],
)

def prediction_compiler(states):
    """Fit one nested-CV model per state and collect its out-of-fold predictions.

    For each state, the matching rows of the module-level ``race`` frame are used
    to predict ``race_`` from ``SEX``, ``off_detail``, ``age_admit``, ``reltype_``
    and ``proj_time_served``. Hyper-parameters from ``p_grid`` are selected by
    ``GridSearchCV`` on ``inner_cv``; the predictions that are returned come from
    ``cross_val_predict`` on ``outer_cv``, so every row is predicted by a model
    that did not see it during fitting.

    Parameters
    ----------
    states : iterable
        Values of ``race.state_`` to model; one model is tuned per value.

    Returns
    -------
    pandas.DataFrame
        One row per record of the selected states, with columns ``State``,
        ``true`` (actual ``race_``) and ``pred`` (out-of-fold prediction).
    """
    # The first three features are label-encoded; the last two are used as-is.
    categorical = ['SEX', 'off_detail', 'age_admit']
    passthrough = ['reltype_', 'proj_time_served']

    true = []
    pred = []
    sts = []

    for state in states:
        # Select the state's rows once and reuse them for both X and y.
        in_state = race.loc[race.state_ == state]

        # Explicit copy: the encoding below must not write back into `race`
        # or trigger chained-assignment warnings.
        X = in_state[categorical + passthrough].copy()

        # Encode column by column, assigning by label so each column is replaced
        # by a new integer array. Writing through `.iloc[:, :-2] = ...` sets values
        # into the existing columns on recent pandas versions, which can leave them
        # as object dtype -- a dtype XGBoost does not accept.
        for column in categorical:
            X[column] = LabelEncoder().fit_transform(X[column])

        y = in_state.race_.values

        # Inner loop: grid search; outer loop: out-of-fold predictions.
        gs = GridSearchCV(estimator=xgb, param_grid=p_grid, cv=inner_cv)
        y_pred = cross_val_predict(gs, X, y, cv=outer_cv)

        true.extend(y)
        pred.extend(y_pred)
        sts.extend([state] * len(y))

    df = pd.DataFrame({'State': sts,
                       'true': true,
                       'pred': pred})

    return df

In [30]:
# Tune and cross-validate one model for every state present in the data.
states = race.state_.unique()
model = prediction_compiler(states)

# pred + true is 0 when both are 0 and 2 when both are 1; any other total is a miss.
model['match'] = (model['pred'] + model['true']).isin([0, 2]).astype(int)

# Overall accuracy: share of records whose prediction matches the true label.
float(model['match'].sum()) / len(model)

0.6745759412660004

The score to beat -- the 'baseline' you'd get by predicting that every inmate is white, since white inmates are the larger class -- is 54.4%.

It's worth mentioning that this process (building and optimizing 40+ models, which took ~5 hours to run, as opposed to building a single model, which only takes minutes) did not significantly improve predictive performance. A single model had 65.6% accuracy, while modeling for each state but skipping optimization had 66.5% accuracy.

In [31]:
# Per-state summary: model accuracy vs. the majority-class baseline (both in percent).
by_state = model.groupby('State')

# Accuracy per state = correct predictions / records, as a percentage.
accuracy_pct = (by_state['match'].sum() / by_state['match'].count()) * 100
trends = pd.DataFrame(accuracy_pct)

# `true` is 1 for Black inmates, so its per-state mean is the Black share; the baseline is
# the share of whichever class is larger (always predicting the majority class).
black_share = by_state['true'].sum() / by_state['true'].count()
trends['baseline'] = black_share.where(black_share >= 0.5, 1 - black_share) * 100

# Improvement over baseline, floored at zero where the model does worse than baseline.
improvement = trends['match'] - trends['baseline']
trends['difference'] = improvement.mask(improvement < 0, 0)

# Turn 'State' back into a column, then discard the first row and renumber.
# NOTE(review): the first (alphabetically lowest) state is dropped without explanation -- confirm this is intended.
trends = trends.reset_index().iloc[1:, :].reset_index(drop=True)

In [32]:
# Display the per-state table: accuracy (match), majority-class baseline, and their floored difference.
trends

Unnamed: 0,State,match,baseline,difference
0,AL,61.763455,51.415094,10.348361
1,AZ,76.93232,76.711623,0.220696
2,CA,61.446365,53.512204,7.934161
3,CO,74.585799,73.988166,0.597633
4,DC,98.682064,98.682064,0.0
5,DE,57.450319,52.363871,5.086448
6,FL,65.885397,52.465869,13.419528
7,GA,66.082741,62.984344,3.098397
8,IA,73.570552,72.924335,0.646217
9,IL,67.881094,64.085398,3.795696


In [36]:
# Export the per-state summary; to_csv is called with its defaults, so the index is written as well.
trends.to_csv('/users/nick/desktop/trends.csv')