# Are black people more frequently entered into the system in some NC Counties?

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
## Read data from Case Offense Description CSV
data_chunks = pd.read_csv(
    "/Users/preetkhowaja/Desktop/qsides/10_cleaned_data/case_offense_desc.csv.zip",
    chunksize=100000,
    usecols=[
        "case_id",
        "disposition",
        'key_county_num',
        "def_zip_address",
        "race",
        "sex",
        "charged_offense_code",
        "prior_record_points",
        "Offense Description",
    ],
)

chunk_list = []

for chunk in data_chunks:
    chunk_list.append(chunk)

full_data = pd.concat(chunk_list)

In [12]:
## We need to find the defendant's county address
full_data.def_zip_address.isna().sum()

767666

In [13]:
full_data.head()

Unnamed: 0,case_id,key_county_num,def_zip_address,race,sex,charged_offense_code,disposition,prior_record_points,Offense Description
0,1973006239,0,27253,W,M,3599,SI,,DANGEROUS DRUGS - FREE TEXT
1,1973006239,0,27253,W,M,3599,VD,,DANGEROUS DRUGS - FREE TEXT
2,1973007828,0,27253,W,M,3599,SI,,DANGEROUS DRUGS - FREE TEXT
3,1973007828,0,27253,W,M,3599,VD,,DANGEROUS DRUGS - FREE TEXT
4,1973013840,0,27253,W,M,3599,SI,,DANGEROUS DRUGS - FREE TEXT


In [14]:
full_data.shape

(31154060, 9)

In [16]:
## Check if the case_id is unique
full_data.case_id.is_unique

False

In [17]:
## We drop duplicates missing values for defendent zip address and then drop duplicates in case id
complete_data = full_data.loc[~full_data.def_zip_address.isna()]
assert complete_data.shape[0] == (31154060 - 767666)
assert complete_data.def_zip_address.isna().sum() == 0

In [None]:
complete_data.drop_duplicates(subset='case_id', inplace=True)
complete_data.case_id.is_unique

In [20]:
## A quick look at the shape of our dataset
complete_data.shape

(13645126, 9)

### We have 13.6 million cases from North Carolina

In [22]:
# Read in zip address -> county csv
zip_to_county = pd.read_excel('/Users/preetkhowaja/Desktop/personal_qsides/ZIP_COUNTY_122021.xlsx')
zip_to_county.head()

Unnamed: 0,zip,county,usps_zip_pref_city,usps_zip_pref_state,res_ratio,bus_ratio,oth_ratio,tot_ratio
0,683,72125,SAN GERMAN,PR,0.953036,0.996656,0.980769,0.9563
1,683,72079,SAN GERMAN,PR,0.000949,0.0,0.0,0.000873
2,683,72023,SAN GERMAN,PR,0.000791,0.001115,0.0,0.0008
3,683,72097,SAN GERMAN,PR,0.000158,0.0,0.0,0.000145
4,683,72121,SAN GERMAN,PR,0.045066,0.00223,0.019231,0.041882


In [23]:
## Filter just to NC counties
zip_to_county = zip_to_county.loc[zip_to_county.usps_zip_pref_state == 'NC']

In [24]:
## Rename columns before merging
zip_to_county.rename(columns={'county': 'def_county_address', 'usps_zip_pref_city': 'def_city_name'}, inplace = True)
zip_to_county.head()

Unnamed: 0,zip,def_county_address,def_city_name,usps_zip_pref_state,res_ratio,bus_ratio,oth_ratio,tot_ratio
215,27292,37057,LEXINGTON,NC,0.997892,1.0,1.0,0.998146
216,27292,37151,LEXINGTON,NC,0.002108,0.0,0.0,0.001854
217,27529,37101,GARNER,NC,0.283565,0.184787,0.265421,0.274081
218,27529,37183,GARNER,NC,0.716435,0.815213,0.734579,0.725919
219,27532,37191,GOLDSBORO,NC,1.0,1.0,1.0,1.0


In [26]:
zip_to_county = zip_to_county[['zip', 'def_county_address']]
zip_to_county.sample()

Unnamed: 0,zip,def_county_address
32796,27896,37127


In [27]:
## reduce columns from original dataset
complete_data = complete_data[['def_zip_address', 'race', 'sex']]

In [68]:
zip_county_data = pd.merge(complete_data, zip_to_county, left_on='def_zip_address', right_on = 'zip', how='left', indicator=True)

In [69]:
## we don't have a complete match because some defendants are from out of state
zip_county_data._merge.value_counts(normalize=True)

both          0.951911
left_only     0.048089
right_only    0.000000
Name: _merge, dtype: float64

In [70]:
zip_county_data = zip_county_data.loc[zip_county_data._merge =='both']


In [71]:
zip_county_data = zip_county_data[['race', 'sex', 'def_zip_address', 'def_county_address']]
zip_county_data.head()

Unnamed: 0,race,sex,def_zip_address,def_county_address
3791,H,M,27253.0,37001.0
3792,H,M,27217.0,37033.0
3793,H,M,27217.0,37001.0
3794,H,M,27215.0,37001.0
3795,H,M,27215.0,37081.0


In [72]:
## Let's group the data into white and non-white/black because that's how it's recorded in NHGIS
zip_county_data.race.value_counts()

W    10323861
B     8134847
H     2062418
O      432478
I      332945
U      200098
A      107037
X         483
Name: race, dtype: int64

Defendant Race Code
     A - Asian
     B - Black
     H - Hispanic
     I - Indian
     O - Other
     U - Unknown
     W - White
     X - Non-person
     Space - Unknown

In [73]:
## Now that we have each offense's race by county, we want to group by county and get the proportions of race in each

## # Group the dataframe by county and race, and count the number of rows in each group
grouped = zip_county_data.groupby(['def_county_address', 'race']).size()

# Convert the grouped Series to a DataFrame and reset the index
grouped_df = grouped.to_frame(name='count').reset_index()

# Pivot the grouped DataFrame to have one row per county and columns for each race
pivoted_df = grouped_df.pivot(index='def_county_address', columns='race', values='count')

# Calculate the total number of people in each county
pivoted_df['total'] = pivoted_df.sum(axis=1)

# Limiting data to W and Non White
pivoted_df = pivoted_df[['B', 'W', 'H', 'total']]

# Calculate the percentage of each race in each county
for race in pivoted_df.columns[:-1]:
    pivoted_df[race + '_perc'] = pivoted_df[race] / pivoted_df['total'] * 100

# Print the resulting dataframe
#print(pivoted_df)


In [74]:
## Perfect because we have information for all 100 counties in NC
pivoted_df.shape

(100, 7)

In [75]:
pivoted_df = pivoted_df.reset_index()
pivoted_df.head()

race,def_county_address,B,W,H,total,B_perc,W_perc,H_perc
0,37001.0,81148.0,114318.0,35411.0,239406.0,33.895558,47.750683,14.791192
1,37003.0,26648.0,126218.0,13759.0,171351.0,15.551704,73.660498,8.029717
2,37005.0,297.0,9842.0,2433.0,13160.0,2.256839,74.787234,18.487842
3,37007.0,38169.0,20457.0,2030.0,61914.0,61.648416,33.040992,3.278741
4,37009.0,546.0,26057.0,2550.0,29831.0,1.830311,87.348731,8.548155


In [76]:
race_in_system = pivoted_df.rename(columns={'def_county_address': 'county_fips_code','W': 'white_count_sys', 'B': 'black_count_sys', 'H': 'hisp_count_sys',
                                            'total': 'total_count_sys' ,'W_perc': 'white_perc_sys', 'B_perc': 'black_perc_sys', 'H_perc': 'hisp_perc_sys'})             
race_in_system.head()

race,county_fips_code,black_count_sys,white_count_sys,hisp_count_sys,total_count_sys,black_perc_sys,white_perc_sys,hisp_perc_sys
0,37001.0,81148.0,114318.0,35411.0,239406.0,33.895558,47.750683,14.791192
1,37003.0,26648.0,126218.0,13759.0,171351.0,15.551704,73.660498,8.029717
2,37005.0,297.0,9842.0,2433.0,13160.0,2.256839,74.787234,18.487842
3,37007.0,38169.0,20457.0,2030.0,61914.0,61.648416,33.040992,3.278741
4,37009.0,546.0,26057.0,2550.0,29831.0,1.830311,87.348731,8.548155


## Population Race breakdown from IPUMS

I'm using 2013-17 data, because other datasets don't have all counties or information on hispanic/race overlaps 

In [92]:
pop_data = pd.read_csv('/Users/preetkhowaja/Desktop/nhgis0009_csv/nhgis0009_ds233_20175_county.csv', encoding='latin-1')
pop_data.head()

Unnamed: 0,GISJOIN,YEAR,STUSAB,REGIONA,DIVISIONA,STATE,STATEA,COUNTY,COUNTYA,COUSUBA,...,AHZAM012,AHZAM013,AHZAM014,AHZAM015,AHZAM016,AHZAM017,AHZAM018,AHZAM019,AHZAM020,AHZAM021
0,G0100010,2013-2017,AL,,,Alabama,1,Autauga County,1,,...,0,325,90,27,27,34,356,31,31,27
1,G0100030,2013-2017,AL,,,Alabama,1,Baldwin County,3,,...,0,646,200,150,16,27,505,228,210,79
2,G0100050,2013-2017,AL,,,Alabama,1,Barbour County,5,,...,0,152,39,12,19,21,160,9,9,21
3,G0100070,2013-2017,AL,,,Alabama,1,Bibb County,7,,...,0,52,35,21,21,21,16,37,21,37
4,G0100090,2013-2017,AL,,,Alabama,1,Blount County,9,,...,0,222,43,29,27,27,184,138,36,132


In [93]:
pop_data = pop_data.loc[pop_data.STATE == 'North Carolina']
pop_data.COUNTY.shape

(100,)

In [94]:
pop_data.columns

Index(['GISJOIN', 'YEAR', 'STUSAB', 'REGIONA', 'DIVISIONA', 'STATE', 'STATEA',
       'COUNTY', 'COUNTYA', 'COUSUBA', 'PLACEA', 'TRACTA', 'BLKGRPA',
       'CONCITA', 'AIANHHA', 'RES_ONLYA', 'TRUSTA', 'AIHHTLI', 'AITSCEA',
       'ANRCA', 'CBSAA', 'CSAA', 'METDIVA', 'NECTAA', 'CNECTAA', 'NECTADIVA',
       'UAA', 'CDCURRA', 'SLDUA', 'SLDLA', 'ZCTA5A', 'SUBMCDA', 'SDELMA',
       'SDSECA', 'SDUNIA', 'PCI', 'PUMAA', 'GEOID', 'BTTRA', 'BTBGA', 'NAME_E',
       'AHZAE001', 'AHZAE002', 'AHZAE003', 'AHZAE004', 'AHZAE005', 'AHZAE006',
       'AHZAE007', 'AHZAE008', 'AHZAE009', 'AHZAE010', 'AHZAE011', 'AHZAE012',
       'AHZAE013', 'AHZAE014', 'AHZAE015', 'AHZAE016', 'AHZAE017', 'AHZAE018',
       'AHZAE019', 'AHZAE020', 'AHZAE021', 'NAME_M', 'AHZAM001', 'AHZAM002',
       'AHZAM003', 'AHZAM004', 'AHZAM005', 'AHZAM006', 'AHZAM007', 'AHZAM008',
       'AHZAM009', 'AHZAM010', 'AHZAM011', 'AHZAM012', 'AHZAM013', 'AHZAM014',
       'AHZAM015', 'AHZAM016', 'AHZAM017', 'AHZAM018', 'AHZAM019', 'AHZAM

In [95]:
## the state fips code for NC is 37
pop_data.STATEA.value_counts()

37    100
Name: STATEA, dtype: int64

In [96]:
## Here we create a FIPS code column 
pop_data.COUNTYA = pop_data.COUNTYA.astype(str).str.zfill(3)
pop_data['county_fips_code'] = pop_data.STATEA.astype(str) + pop_data.COUNTYA.astype(str)

In [97]:
pop_data.county_fips_code.value_counts()

37001    1
37127    1
37147    1
37145    1
37143    1
        ..
37061    1
37059    1
37057    1
37055    1
37199    1
Name: county_fips_code, Length: 100, dtype: int64

In [98]:
pop_data.columns

Index(['GISJOIN', 'YEAR', 'STUSAB', 'REGIONA', 'DIVISIONA', 'STATE', 'STATEA',
       'COUNTY', 'COUNTYA', 'COUSUBA', 'PLACEA', 'TRACTA', 'BLKGRPA',
       'CONCITA', 'AIANHHA', 'RES_ONLYA', 'TRUSTA', 'AIHHTLI', 'AITSCEA',
       'ANRCA', 'CBSAA', 'CSAA', 'METDIVA', 'NECTAA', 'CNECTAA', 'NECTADIVA',
       'UAA', 'CDCURRA', 'SLDUA', 'SLDLA', 'ZCTA5A', 'SUBMCDA', 'SDELMA',
       'SDSECA', 'SDUNIA', 'PCI', 'PUMAA', 'GEOID', 'BTTRA', 'BTBGA', 'NAME_E',
       'AHZAE001', 'AHZAE002', 'AHZAE003', 'AHZAE004', 'AHZAE005', 'AHZAE006',
       'AHZAE007', 'AHZAE008', 'AHZAE009', 'AHZAE010', 'AHZAE011', 'AHZAE012',
       'AHZAE013', 'AHZAE014', 'AHZAE015', 'AHZAE016', 'AHZAE017', 'AHZAE018',
       'AHZAE019', 'AHZAE020', 'AHZAE021', 'NAME_M', 'AHZAM001', 'AHZAM002',
       'AHZAM003', 'AHZAM004', 'AHZAM005', 'AHZAM006', 'AHZAM007', 'AHZAM008',
       'AHZAM009', 'AHZAM010', 'AHZAM011', 'AHZAM012', 'AHZAM013', 'AHZAM014',
       'AHZAM015', 'AHZAM016', 'AHZAM017', 'AHZAM018', 'AHZAM019', 'AHZAM

In [99]:
## Let's filter this dataset to only what we need
pop_data = pop_data[['county_fips_code', 'COUNTY', 'AHZAE001', 'AHZAE002', 'AHZAE003', 'AHZAE004', 'AHZAE005', 'AHZAE006',
       'AHZAE007', 'AHZAE008', 'AHZAE009', 'AHZAE010', 'AHZAE011', 'AHZAE012',
       'AHZAE013', 'AHZAE014', 'AHZAE015', 'AHZAE016', 'AHZAE017', 'AHZAE018',
       'AHZAE019', 'AHZAE020', 'AHZAE021', 'NAME_M', 'AHZAM001', 'AHZAM002',
       'AHZAM003', 'AHZAM004', 'AHZAM005', 'AHZAM006', 'AHZAM007', 'AHZAM008',
       'AHZAM009', 'AHZAM010', 'AHZAM011', 'AHZAM012', 'AHZAM013', 'AHZAM014',
       'AHZAM015', 'AHZAM016', 'AHZAM017', 'AHZAM018', 'AHZAM019', 'AHZAM020',
       'AHZAM021',]]

In [101]:
pop_data.AHZAE001.value_counts()

157844    1
94125     1
176484    1
39240     1
13506     1
         ..
59350     1
41766     1
164118    1
35412     1
17605     1
Name: AHZAE001, Length: 100, dtype: int64

#### Now we rename and recode the races in this dataset. The encoding is as follows:
Table 1:     Hispanic or Latino Origin by Race
    Universe:    Total population
    Source code: B03002
    NHGIS code:  AHZA
        AHZAE001:    Total
        AHZAE002:    Not Hispanic or Latino
        AHZAE003:    Not Hispanic or Latino: White alone
        AHZAE004:    Not Hispanic or Latino: Black or African American alone
        AHZAE005:    Not Hispanic or Latino: American Indian and Alaska Native alone
        AHZAE006:    Not Hispanic or Latino: Asian alone
        AHZAE007:    Not Hispanic or Latino: Native Hawaiian and Other Pacific Islander alone
        AHZAE008:    Not Hispanic or Latino: Some other race alone
        AHZAE009:    Not Hispanic or Latino: Two or more races
        AHZAE010:    Not Hispanic or Latino: Two or more races: Two races including Some other race
        AHZAE011:    Not Hispanic or Latino: Two or more races: Two races excluding Some other race, and three or more races
        AHZAE012:    Hispanic or Latino
        AHZAE013:    Hispanic or Latino: White alone
        AHZAE014:    Hispanic or Latino: Black or African American alone
        AHZAE015:    Hispanic or Latino: American Indian and Alaska Native alone
        AHZAE016:    Hispanic or Latino: Asian alone
        AHZAE017:    Hispanic or Latino: Native Hawaiian and Other Pacific Islander alone
        AHZAE018:    Hispanic or Latino: Some other race alone
        AHZAE019:    Hispanic or Latino: Two or more races
        AHZAE020:    Hispanic or Latino: Two or more races: Two races including Some other race
        AHZAE021:    Hispanic or Latino: Two or more races: Two races excluding Some other race, and three or more races
 

Creating new columns for population based on these encodings:
- Black: Hispanic Black and Non-Hispanic Black
- White: Non-hispanic white
- Hispanic: Hispanic white, and other hispanic, excluding hispanic black

In [137]:
pop_data['total'] = pop_data.AHZAE001
pop_data['white_pop'] = pop_data.AHZAE003
pop_data['black_pop'] = (pop_data.AHZAE004 + pop_data.AHZAE014)
pop_data['hisp_pop'] = (
    pop_data.AHZAE013
    + pop_data.AHZAE015
    + pop_data.AHZAE016
    + pop_data.AHZAE017
    + pop_data.AHZAE018
    + pop_data.AHZAE019
    + pop_data.AHZAE020
    + pop_data.AHZAE021
)
pop_data['other_pop'] = pop_data['total'] - (pop_data['white_pop'] + pop_data['black_pop'] + pop_data['hisp_pop'])
population_race_counts = pop_data[['county_fips_code','COUNTY','total', 'black_pop', 'white_pop', 'hisp_pop', 'other_pop']]

In [None]:
# Calculate the percentage of each race in each county
for race in population_race_counts.columns[2:]:
   population_race_counts[race + '_perc'] = population_race_counts[race] / population_race_counts['total'] * 100

# Print the resulting dataframe
population_race_counts


# Concat both datasets


In [148]:
## Race in system
race_in_system.county_fips_code.dtype

dtype('float64')

In [150]:
population_race_counts.county_fips_code = population_race_counts.county_fips_code.astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  population_race_counts.county_fips_code = population_race_counts.county_fips_code.astype(float)


In [151]:
population_race_counts.head()

Unnamed: 0,county_fips_code,COUNTY,total,black_pop,white_pop,hisp_pop,other_pop,total_perc,black_pop_perc,white_pop_perc,hisp_pop_perc,other_pop_perc
1890,37001.0,Alamance County,157844,29972,102558,20236,5078,100.0,18.988368,64.974278,12.820253,3.2171
1891,37003.0,Alexander County,37159,2187,32335,1750,887,100.0,5.885519,87.01795,4.709492,2.387039
1892,37005.0,Alleghany County,10935,266,9549,1062,58,100.0,2.432556,87.325103,9.711934,0.530407
1893,37007.0,Anson County,25531,12417,11463,982,669,100.0,48.634993,44.898359,3.846304,2.620344
1894,37009.0,Ashe County,26833,210,24810,1545,268,100.0,0.782618,92.460776,5.757836,0.99877


In [152]:
full_data_table = pd.merge(race_in_system, population_race_counts, on='county_fips_code', how='outer')
full_data_table.head()

Unnamed: 0,county_fips_code,black_count_sys,white_count_sys,hisp_count_sys,total_count_sys,black_perc_sys,white_perc_sys,hisp_perc_sys,COUNTY,total,black_pop,white_pop,hisp_pop,other_pop,total_perc,black_pop_perc,white_pop_perc,hisp_pop_perc,other_pop_perc
0,37001.0,81148.0,114318.0,35411.0,239406.0,33.895558,47.750683,14.791192,Alamance County,157844,29972,102558,20236,5078,100.0,18.988368,64.974278,12.820253,3.2171
1,37003.0,26648.0,126218.0,13759.0,171351.0,15.551704,73.660498,8.029717,Alexander County,37159,2187,32335,1750,887,100.0,5.885519,87.01795,4.709492,2.387039
2,37005.0,297.0,9842.0,2433.0,13160.0,2.256839,74.787234,18.487842,Alleghany County,10935,266,9549,1062,58,100.0,2.432556,87.325103,9.711934,0.530407
3,37007.0,38169.0,20457.0,2030.0,61914.0,61.648416,33.040992,3.278741,Anson County,25531,12417,11463,982,669,100.0,48.634993,44.898359,3.846304,2.620344
4,37009.0,546.0,26057.0,2550.0,29831.0,1.830311,87.348731,8.548155,Ashe County,26833,210,24810,1545,268,100.0,0.782618,92.460776,5.757836,0.99877


In [154]:
full_data_table.to_csv('county_racial_representation_table.csv', index=False)