In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

The data for this project, which was compiled by the Bureau of Justice Statistics, is available here: http://www.icpsr.umich.edu/icpsrweb/NACJD/studies/36404?fundingAgency=United+States+Department+of+Justice.+Office+of+Justice+Programs.+Bureau+of+Justice+Statistics&dataFormat%5B0%5D=Delimited&q=&sortBy=5&paging.startRow=1

In [2]:
# Load the raw term records; the export is tab-delimited.
raw_records_path = '/users/nick/desktop/TermRecords.tsv'
sentences = pd.read_csv(raw_records_path, sep='\t')

  interactivity=interactivity, compiler=compiler, result=result)


Much of the data is label-encoded. I want columns with decoded values so that I can more easily
interpret what's going on. (This requires the codebook that accompanies the dataset.) The columns I create will have lowercase headers. 

In [3]:
# Labels for the general offense category (OFFGENERAL).
# Codes 1-5 are consecutive; 9 is mapped to 'Missing'.
offgeneral_map = dict(enumerate(
    ['Violent', 'Property', 'Drugs', 'Public order', 'Other'],
    start=1))
offgeneral_map[9] = 'Missing'

# Decoded labels go in a lowercase column; codes absent from the map become NaN.
sentences['off_general'] = sentences['OFFGENERAL'].map(offgeneral_map)

In [4]:
# Labels for the detailed offense category (OFFDETAIL).
# Codes 1-14 are consecutive, in the order listed; 99 is mapped to 'Missing'.
offdetail_map = dict(enumerate([
    'Murder',
    'Negligent manslaughter',
    'Rape/Sexual Assault',
    'Robbery',
    'Aggravated or simple assault',
    'Other violent offences',
    'Burglary',
    'Larceny',
    'Motor vehicle theft',
    'Fraud',
    'Other property offenses',
    'Drugs',
    'Public order',
    'Other',
], start=1))
offdetail_map[99] = 'Missing'

# Decoded labels go in a lowercase column; codes absent from the map become NaN.
sentences['off_detail'] = sentences['OFFDETAIL'].map(offdetail_map)

In [5]:
# Labels for the STATE code: two-letter postal abbreviations for the states
# and DC, plus a few aggregate / special jurisdictions.
# NOTE(review): the numeric codes look like FIPS state codes -- confirm
# against the codebook.
state_map = {
    1: 'AL', 2: 'AK', 4: 'AZ', 5: 'AR', 6: 'CA', 8: 'CO',
    9: 'CT',  # Connecticut's postal abbreviation is CT (was mistyped as 'CN')
    10: 'DE', 11: 'DC', 12: 'FL', 13: 'GA', 15: 'HI', 16: 'ID',
    17: 'IL', 18: 'IN', 19: 'IA', 20: 'KS', 21: 'KY', 22: 'LA',
    23: 'ME', 24: 'MD', 25: 'MA', 26: 'MI', 27: 'MN', 28: 'MS',
    29: 'MO', 30: 'MT', 31: 'NE', 32: 'NV', 33: 'NH', 34: 'NJ',
    35: 'NM', 36: 'NY', 37: 'NC', 38: 'ND', 39: 'OH', 40: 'OK',
    41: 'OR', 42: 'PA', 44: 'RI', 45: 'SC', 46: 'SD', 47: 'TN',
    48: 'TX', 49: 'UT', 50: 'VT', 51: 'VA',
    52: 'Shared Jurisdiction',
    53: 'WA', 54: 'WV', 55: 'WI', 56: 'WY',
    60: 'State Total',
    70: 'State & Federal Total',
    99: 'Federal BOP',
}

# Decoded labels go in a lowercase column; codes absent from the map become NaN.
sentences['state_'] = sentences['STATE'].map(state_map)

In [6]:
# Labels for the RACE code; 9 is mapped to 'Missing'.
race_map = dict(zip(
    (1, 2, 3, 4, 9),
    ('White', 'Black', 'Hispanic', 'Other', 'Missing')))

# Decoded labels go in a lowercase column; codes absent from the map become NaN.
sentences['race_'] = sentences['RACE'].map(race_map)

In [7]:
# Labels for the admission type code (ADMTYPE); 9 is mapped to 'Missing'.
admission_type_map = dict([
    (1, 'New commitment'),
    (2, 'Parole return/revocation'),
    (3, 'Other'),
    (9, 'Missing'),
])

# Decoded labels go in a lowercase column; codes absent from the map become NaN.
sentences['admtype_'] = sentences['ADMTYPE'].map(admission_type_map)

In [8]:
# AGERELEASE / AGEADMIT can hold a mix of numeric codes and text. Parse each
# column in one vectorised pass with errors='coerce'. The previous element-wise
# `errors='ignore'` + `isinstance(x, basestring)` approach fails on Python 3
# (`basestring` no longer exists) and relies on a deprecated pandas option.
for age_column in ('AGERELEASE', 'AGEADMIT'):
    raw_codes = sentences[age_column]
    parsed_codes = pd.to_numeric(raw_codes, errors='coerce')
    # Entries that are present but not numeric get code 9 (decoded as 'NA'
    # below); entries that were already NaN are left as NaN, as before.
    cleaned_codes = parsed_codes.mask(
        parsed_codes.isnull() & raw_codes.notnull(), 9)
    # Keep whole-number codes as integers when no NaN forces a float column.
    if cleaned_codes.notnull().all() and (cleaned_codes % 1 == 0).all():
        cleaned_codes = cleaned_codes.astype('int64')
    sentences[age_column] = cleaned_codes

# Labels for the age-bracket codes shared by both columns.
age_map = {1: '18-24',
           2: '25-34',
           3: '35-44',
           4: '45-54',
           5: '55+',
           9: 'NA'}

sentences['age_admit'] = sentences['AGEADMIT'].map(age_map)
sentences['age_release'] = sentences['AGERELEASE'].map(age_map)

In [9]:
# SENTLGTH can hold a mix of numeric codes and text. Parse it in one
# vectorised pass with errors='coerce'. The previous element-wise
# `errors='ignore'` + `isinstance(x, basestring)` approach fails on Python 3
# (`basestring` no longer exists) and relies on a deprecated pandas option.
raw_sentlgth = sentences['SENTLGTH']
parsed_sentlgth = pd.to_numeric(raw_sentlgth, errors='coerce')
# Entries that are present but not numeric get code 9 (decoded as 'Missing'
# below); entries that were already NaN are left as NaN, as before.
cleaned_sentlgth = parsed_sentlgth.mask(
    parsed_sentlgth.isnull() & raw_sentlgth.notnull(), 9)
# Keep whole-number codes as integers when no NaN forces a float column.
if cleaned_sentlgth.notnull().all() and (cleaned_sentlgth % 1 == 0).all():
    cleaned_sentlgth = cleaned_sentlgth.astype('int64')
sentences['SENTLGTH'] = cleaned_sentlgth

# Labels for the sentence-length codes (ranges are in years).
sentlgth_map = {0: '<1',
                1: '1-1.9',
                2: '2-4.9',
                3: '5-9.9',
                4: '10-24.9',
                5: '>=25',
                6: 'Life',
                9: 'Missing'}

sentences['sentlgth_'] = sentences['SENTLGTH'].map(sentlgth_map)

In [10]:
# Because all education values equal 9, I'm dropping education.
# errors='ignore' keeps the cell re-runnable: the previous in-place drop raised
# a KeyError the second time the cell was executed.
sentences = sentences.drop(columns='EDUCATION', errors='ignore')

# Remapping sex from 1: male 2: female to 1: male 0: female.
# A vectorised replace touches only the value 2 and leaves everything else as is.
sentences['SEX'] = sentences['SEX'].replace(2, 0)

In [11]:
# RELTYPE can hold a mix of numeric codes and text. Parse it in one
# vectorised pass with errors='coerce'. The previous element-wise
# `errors='ignore'` + `isinstance(x, basestring)` approach fails on Python 3
# (`basestring` no longer exists) and relies on a deprecated pandas option.
raw_reltype = sentences['RELTYPE']
parsed_reltype = pd.to_numeric(raw_reltype, errors='coerce')
# Entries that are present but not numeric get code 9 (decoded as 'Missing'
# below); entries that were already NaN are left as NaN, as before.
cleaned_reltype = parsed_reltype.mask(
    parsed_reltype.isnull() & raw_reltype.notnull(), 9)
# Keep whole-number codes as integers when no NaN forces a float column.
if cleaned_reltype.notnull().all() and (cleaned_reltype % 1 == 0).all():
    cleaned_reltype = cleaned_reltype.astype('int64')
sentences['RELTYPE'] = cleaned_reltype

# Labels for the release type codes.
release_type_map = {1: 'Conditional release',
                    2: 'Unconditional release',
                    3: 'Other',
                    9: 'Missing'}

sentences['reltype_'] = sentences['RELTYPE'].map(release_type_map)

In [23]:
# Turn blank (single-space) entries in the three year columns into NaN.
# The result is assigned back to the frame: `inplace=True` on a column selected
# from the frame is chained assignment, which pandas' Copy-on-Write mode turns
# into a silent no-op.
blank_year_columns = ['MAND_PRISREL_YEAR', 'PROJ_PRISREL_YEAR', 'PARELIG_YEAR']
sentences[blank_year_columns] = sentences[blank_year_columns].replace(' ', np.nan)

In [12]:
# Convert the three year columns to numbers where possible.
# `errors='ignore'` is deprecated in pandas, so parse with errors='coerce' and
# put the original entry back wherever parsing failed. That matches the old
# result: numeric where parseable, untouched otherwise.
for year_column in ('PROJ_PRISREL_YEAR', 'MAND_PRISREL_YEAR', 'PARELIG_YEAR'):
    raw_years = sentences[year_column]
    parsed_years = pd.to_numeric(raw_years, errors='coerce')
    unparsed_years = parsed_years.isnull() & raw_years.notnull()
    if unparsed_years.any():
        # Object dtype lets the leftover non-numeric entries sit beside numbers.
        parsed_years = parsed_years.astype(object).mask(unparsed_years, raw_years)
    sentences[year_column] = parsed_years

In [13]:
# Sentence lengths are recorded as ranges (e.g. 10-24.9 years). To make
# calculations on them (such as the length of an average prison sentence),
# each SENTLGTH code is replaced by a single ballpark figure imputed for
# its range.
sentlgth_approx = dict([
    (0, 0),        # code 0: sentence < 1 year
    (1, 1),        # code 1: sentence of 1-1.9 years
    (2, 4),        # code 2: sentence of 2-4.9 years
    (3, 7),        # code 3: sentence of 5-9.9 years
    (4, 17),       # code 4: sentence of 10-24.9 years
    (5, 30),       # code 5: sentence >= 25 years
    (6, 55),       # code 6: life sentence
    (9, np.nan),   # code 9: missing sentence length
])

sentences['sent_approx'] = sentences['SENTLGTH'].map(sentlgth_approx)

In [15]:
# I want to calculate a best guess for each prisoner's sentence length (or, if a sentence has been completed, the
# actual length of that sentence). If the prisoner has been released, this equals the release year minus the year
# admitted. If the prisoner is still incarcerated, it equals the first of the following that is available:
#   1. the projected release year minus the year admitted;
#   2. the mandatory release year minus the year admitted;
#   3. the parole eligibility year minus the year admitted (perhaps I should have switched the order, such that
#      the year of parole eligibility was checked before the mandatory release year);
#   4. the sentence length approximation calculated in the cell above.

def proj_time_served(dataframe):
    """Estimate the years served (or expected to be served) for one record.

    Parameters
    ----------
    dataframe : row-like mapping
        A single row (this function is used with ``DataFrame.apply(axis=1)``)
        providing 'ADMITYR', 'RELEASEYR', 'PROJ_PRISREL_YEAR',
        'MAND_PRISREL_YEAR', 'PARELIG_YEAR' and 'sent_approx'.

    Returns
    -------
    number or None
        * ``RELEASEYR - ADMITYR`` when that gap is in ``[0, 100)`` and
          RELEASEYR is not 9999.
        * Otherwise, when the gap is ``>= 100``, the first available of the
          projected, mandatory and parole-eligibility years minus ADMITYR.
          A gap above 100 yields a flat 60, unless the year is 9997.
        * ``sent_approx`` when none of those three years is available.
        * ``None`` when the record fits none of the cases above.

    A year is "unavailable" when it is NaN/None or not numeric. Previously
    only non-numeric values (which raise ``TypeError`` on subtraction) moved
    on to the next candidate, so a NaN silently produced ``None`` and the
    later fallbacks were never tried.
    """
    admit_year = dataframe['ADMITYR']
    served = dataframe['RELEASEYR'] - admit_year

    # Already released: use the actual time served.
    # NOTE(review): 9999 is presumably the "not yet released" code -- confirm
    # against the codebook.
    if 0 <= served < 100 and dataframe['RELEASEYR'] != 9999:
        return served

    # Negative or NaN gaps fit no case; as before, give no estimate.
    if not served >= 100:
        return None

    for year_column in ('PROJ_PRISREL_YEAR', 'MAND_PRISREL_YEAR', 'PARELIG_YEAR'):
        candidate_year = dataframe[year_column]
        try:
            gap = candidate_year - admit_year
        except TypeError:
            # Non-numeric entry (e.g. a blank string): try the next column.
            continue
        if pd.isnull(gap):
            # Missing entry stored as NaN/None: try the next column.
            continue
        if 0 <= gap < 100:
            return gap
        if gap > 100 and candidate_year != 9997:
            return 60
        # A numeric year that fits neither rule (negative gap, a gap of exactly
        # 100, or the special year 9997) ends the search with no estimate,
        # exactly as before.
        # NOTE(review): confirm what 9997 encodes and whether it should fall
        # through to the next column instead.
        return None

    # No usable release / eligibility year: use the range-based approximation.
    return dataframe['sent_approx']
    
# Evaluate the estimate once per row and store it as a new column.
sentences['proj_time_served'] = sentences.apply(proj_time_served, axis='columns')

In [28]:
# Save the decoded frame; the row index is written as the first CSV column.
sentences.to_csv(path_or_buf='/users/nick/desktop/sentences.csv', index=True)