### Prosecution severity
##### Data:
1. case_offense_v01.csv (merged in bonds_analysis_01.ipynb, but ideally can be merged anywhere?)
2. ACIS_offence_codes_FIXED.csv (from the 00_raw_data folder that Clarissa emailed)
 
##### Steps
1. Load Data 
    * Load the files listed above
 
2. Merge charged and convicted codes with code descriptions
    * Rename columns in the ACIS_offence_codes_FIXED df (offense_desc_final) with the charged prefix and merge with offenses. 
    * Rename columns in the ACIS_offence_codes_FIXED df (offense_desc_final) with the conv prefix and merge with previous output. 
 
3. Data Quality Fixes
    * Fix Offense class strings ("1.0" to "1" etc.)
    * Filter for gender values (M/F/U). Drop X ~500 cases. 
 
4.  Determining Delta
    * Based on offense class
        * "delta_min" is the difference between "conv_CL_Rank_min" and "CL_Rank", i.e. The minimum convicted offense
        * "delta_max" is the difference between "conv_CL_Rank_max" and "CL_rank", i.e. The maximum convicted offense
    * In cases where the charged offense code matches the convicted offense code, set delta_min and delta_max to zero

#### 1. Load Data (takes ~4 mins)

In [2]:
import pandas as pd
import numpy as np
import os

pd.set_option('display.max_columns', None) 
# I am blind without this
%config InlineBackend.figure_format = 'retina' 

### Load the required files
file_dir = "/workspaces/esg-controversy-tracker/nc_acis-main/20_intermediate_data"
offenses = pd.read_csv(os.path.join(file_dir, "case_offense_v01.csv"), low_memory=False, index_col=[0])
offense_desc_final = pd.read_csv(os.path.join(file_dir,'ACIS_offence_codes_FIXED.csv'))

# subset
offs = offenses[
    [
        "case_id",
        "date_of_birth",
        "key_county_num",
        "key_year",
        "court_type",
        "race",
        "sex",
        "process_served",
        "case_creation_date",
        "case_trial_date",
        "court_attorney_type",
        "bond_type",
        "bond_amount",
        "charged_offense_date",
        "charged_offense_code",
        "offense_class",
        "min_sentence",
        "convicted_offense_code",
        "disposition",
        "disposition_date",
    ]
]

offs.head(5)

### 2. Merge Charged and Convicted codes with Code Descriptions

In [None]:
# rename cols
charged_offense_desc = offense_desc_final.rename(
    columns={
        "Offense Description": "charg_Off_Desc",
        "CODE": "charg_CODE",
        "T": "charg_type",
        "NC General Statute": "charg_statute",
        "CL_min": "charg_CL_min",
        "CL_Rank_min": "charg_CL_Rank_min",
        "CL_max": "charg_CL_max",
        "CL_Rank_max": "charg_CL_Rank_max",
    },
)

# merge
offs_rank = pd.merge(
    left=offs,
    right=charged_offense_desc,
    left_on="charged_offense_code",
    right_on="charg_CODE",
    how="left",
    validate="m:m"
)

# Merge with offence code CONVICTED descriptions:
# rename cols
conv_offense_desc = offense_desc_final.rename(
    columns={
        "Offense Description": "conv_Off_Desc",
        "CODE": "conv_CODE",
        "T": "convc_type",
        "NC General Statute": "conv_statute",
        "CL_min": "conv_CL_min",
        "CL_Rank_min": "conv_CL_Rank_min",
        "CL_max": "conv_CL_max",
        "CL_Rank_max": "conv_CL_Rank_max",
    },
)

# merge
offs_rank = pd.merge(
    left=offs_rank,
    right=conv_offense_desc,
    left_on="convicted_offense_code",
    right_on="conv_CODE",
    how="left",
    validate="m:m"
)
offs_rank.head()

Unnamed: 0,case_id,date_of_birth,key_county_num,key_year,court_type,race,sex,process_served,case_creation_date,case_trial_date,court_attorney_type,bond_type,bond_amount,charged_offense_date,charged_offense_code,offense_class,min_sentence,convicted_offense_code,disposition,disposition_date,charge_count,Unnamed: 0_x,charg_CODE,charg_Off_Desc,charg_type,charg_statute,charg_CL_min,charg_CL_Rank_min,charg_CL_max,charg_CL_Rank_max,Unnamed: 0_y,conv_CODE,conv_Off_Desc,convc_type,conv_statute,conv_CL_min,conv_CL_Rank_min,conv_CL_max,conv_CL_Rank_max
0,1973006239,7f69fac81ed85a6a41996f079cf9adcbd1ea89098e756a...,0,73,CRS,W,M,W,2013-10-22,1974-05-09,R,SEC,10000.0,1973-05-20,3599,,,,SI,1973-08-07,1,786.0,3599.0,DANGEROUS DRUGS - FREE TEXT,,,??,,??,,,,,,,,,,
1,1973006239,7f69fac81ed85a6a41996f079cf9adcbd1ea89098e756a...,0,73,CRS,W,M,W,2013-10-22,1974-05-09,R,SEC,10000.0,1973-05-20,3599,,,,VD,2013-10-18,1,786.0,3599.0,DANGEROUS DRUGS - FREE TEXT,,,??,,??,,,,,,,,,,
2,1973007828,7f69fac81ed85a6a41996f079cf9adcbd1ea89098e756a...,0,73,CRS,W,M,W,2013-10-22,1974-05-09,R,SEC,1.0,1973-05-19,3599,,,,SI,1973-08-07,1,786.0,3599.0,DANGEROUS DRUGS - FREE TEXT,,,??,,??,,,,,,,,,,
3,1973007828,7f69fac81ed85a6a41996f079cf9adcbd1ea89098e756a...,0,73,CRS,W,M,W,2013-10-22,1974-05-09,R,SEC,1.0,1973-05-19,3599,,,,VD,2013-10-18,1,786.0,3599.0,DANGEROUS DRUGS - FREE TEXT,,,??,,??,,,,,,,,,,
4,1973013840,7f69fac81ed85a6a41996f079cf9adcbd1ea89098e756a...,0,73,CRS,W,M,W,2013-10-22,1974-05-09,R,SEC,3000.0,1973-10-13,3599,,,,SI,1974-01-22,1,786.0,3599.0,DANGEROUS DRUGS - FREE TEXT,,,??,,??,,,,,,,,,,


### 3. Data Quality Fixes

In [None]:
# lets fix the Offense Class column
offs_rank.loc[offs_rank["offense_class"] == "1.0", "offense_class"] = "1"
offs_rank.loc[offs_rank["offense_class"] == "2.0", "offense_class"] = "2"
offs_rank.loc[offs_rank["offense_class"] == "3.0", "offense_class"] = "3"
offs_rank.loc[offs_rank["offense_class"] == "i", "offense_class"] = "I"

offs_rank.loc[offs_rank['offense_class']==" ", 'offense_class']=np.nan 
offs_rank.loc[offs_rank['offense_class']=="??", 'offense_class']=np.nan


# Clean up sex:
offs_rank.sex.value_counts()
offs_rank = offs_rank[offs_rank["sex"].isin(["M", "F", "U"])] # get rid of the X category, only 500 offenses 
offs_rank.sample(5)

### 4. Determining delta (difference between convicted vs. charged crimes)

In [None]:
# create ranks for classes
class_ranks = {
    "0": 0,
    "3": 1,
    "2": 2,
    "1": 3,
    "A1": 4,
    "I": 5,
    "H": 6,
    "G": 7,
    "F": 8,
    "E": 9,
    "D": 10,
    "C": 11,
    "B2": 12,
    "B1": 13,
    "A": 14,
    "??": np.nan,
}
class_ranks = pd.DataFrame.from_dict(class_ranks, "index").reset_index()
class_ranks.rename(columns={0: "CL_Rank"}, inplace=True)

#merge
offs_rank = pd.merge(offs_rank, class_ranks, left_on="offense_class", right_on="index", how="left")  # now, 'CL_rank' is the rank of the charged offense class provided by the court

# NOW lets look at deltas at the offense level for a second
offs_rank["delta_min"] = (offs_rank["conv_CL_Rank_min"] - offs_rank["CL_Rank"])  
#
# DELTA = final - initial, Delta MIN is the one using the minimum ranking of the convicted offense (when it can belong in two or more classes
# This means delta_min will be probably a larger delta than delta max
#
offs_rank["delta_max"] = offs_rank["conv_CL_Rank_max"] - offs_rank["CL_Rank"]

# if codes are the same, fill NaN with 0
offs_rank.loc[
    offs_rank["charged_offense_code"] == offs_rank["convicted_offense_code"],
    "delta_min",
] = 0
offs_rank.loc[
    offs_rank["charged_offense_code"] == offs_rank["convicted_offense_code"],
    "delta_max",
] = 0
offs_rank.sample(5)


Unnamed: 0,case_id,date_of_birth,key_county_num,key_year,court_type,race,sex,process_served,case_creation_date,case_trial_date,court_attorney_type,bond_type,bond_amount,charged_offense_date,charged_offense_code,offense_class,min_sentence,convicted_offense_code,disposition,disposition_date,charge_count,Unnamed: 0_x,charg_CODE,charg_Off_Desc,charg_type,charg_statute,charg_CL_min,charg_CL_Rank_min,charg_CL_max,charg_CL_Rank_max,Unnamed: 0_y,conv_CODE,conv_Off_Desc,convc_type,conv_statute,conv_CL_min,conv_CL_Rank_min,conv_CL_max,conv_CL_Rank_max,index,CL_Rank,delta_min,delta_max
10872292,4802017056277,0f3ec40d5051af18ee0d72ab396bc7e3c81bd78311a122...,480,17,CR,W,M,M,2017-11-03,2018-07-16,A,SEC,6000.0,2017-11-03,3470,,,,VD,2018-07-16,1,1927.0,3470.0,POSS MARIJ >1/2 TO 1 1/2 OZ,M,90-95(D)(4),1,3.0,1,3.0,,,,,,,,,,,,,
18122856,7802017701227,f80905b557ae96e26f8cfd963ecc7650984b9097b8cbcd...,780,17,CR,W,F,C,2017-02-28,2017-04-26,R,,,2017-02-28,5450,,,4418.0,JU,2017-04-26,1,1300.0,5450.0,SPEEDING,T,20-141(J1),2,2.0,3,1.0,1068.0,4418.0,IMPROPER EQUIP - SPEEDOMETER,I,20-123.2,0.0,0.0,0.0,0.0,,,,
7321487,3302019722913,02299a7c3b1745ca30e42cd4b445f9b816aee7b7850781...,330,19,CR,W,M,C,2019-06-21,2021-05-25,R,,,2019-06-20,4725,,,,VD,2021-05-25,1,1475.0,4725.0,DWLR NOT IMPAIRED REV,T,20-28(A),3,1.0,3,1.0,,,,,,,,,,,,,
17177135,7502016052513,77e1240dd48cfa18d80845b024136a12cc2f95613bfc46...,750,16,CRS,W,F,W,2016-06-03,2017-09-11,A,SEC,10000.0,2016-05-18,2356,,,,WP,2016-06-15,1,247.0,2356.0,LARCENY AFTER BREAK/ENTER,F,14-72(B)(2),H,6.0,H,6.0,,,,,,,,,,,,,
10412621,4402020050435,e345ed6b8e98b0da17ad46e40aa7adc60397ad9659e8fe...,440,20,CR,W,M,W,2020-01-26,2020-08-10,P,SEC,500.0,2020-01-26,1368,,,,VD,2020-08-10,1,1685.0,1368.0,SIMPLE ASSAULT,M,14-33(A),2,2.0,1,3.0,,,,,,,,,,,,,


In [None]:
offs_rank['delta_min'].value_counts()

In [None]:
offs_rank['delta_max'].value_counts()