# EDA & Data Cleaning
---
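This notebook loads the state-level demographic, crime, and youth datasets, merges them into a single table keyed by state and year, and then cleans and explores the combined data.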

In [71]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

---
## Load Data

In [74]:
# Load the state demographic workbook; the table is taken from the sheet named 'Data'
demo = pd.read_excel(
    '../data/combined_state_demo_data_2016_2022.xlsx',
    sheet_name='Data',
)

# Report (rows, columns), then preview the first records
print(demo.shape)
demo.head()

(357, 18)


Unnamed: 0,state,year,total_pop,white_pop,black_pop,hispanic_pop,asian_pop,native_pop,islander_pop,multi_race_pop,median_income,poverty_rate,unemployment_rate,unemployed_15_weeks,labor_force_participation_rate,hs_grad_rate,bachelors_grad_rate,zhvi
0,AL,2022,4916000,0.645,0.252,0.048,0.014,0.002,,0.039,59910,13.6,2.5,1.2,57.0,88.8,28.8,217335.198947
1,AL,2021,4900800,0.644,0.255,0.047,0.013,0.004,,0.037,56930,15.9,3.4,2.0,56.6,87.9,27.4,193148.797902
2,AL,2020,0,0.0,0.0,0.0,0.0,0,0.0,0.0,54690,14.9,6.4,1.4,57.2,88.0,27.8,169855.574269
3,AL,2019,4767100,0.654,0.265,0.044,0.014,0.004,,0.019,56200,12.9,3.2,1.4,57.7,87.1,26.3,157202.180627
4,AL,2018,4752600,0.656,0.265,0.043,0.013,<.01,,0.019,49940,16.0,3.9,1.6,57.3,86.6,25.5,148927.509383


In [76]:
# Load the transformed crime table from CSV
crime = pd.read_csv(
    '../data/transformed_crime_data.csv',
)

# Report (rows, columns), then preview the first records
print(crime.shape)
crime.head()

(312, 7)


Unnamed: 0,State,Offender Age,Data Year,Crimes Against Society,Fraud and Other Financial Crimes,Property Crime,Violent Crime
0,AK,15-24,2021,164.5,38.5,424.5,651.0
1,AK,15-24,2022,156.5,37.5,422.0,648.0
2,AL,15-24,2016,99.0,21.0,79.5,117.0
3,AL,15-24,2017,165.0,13.5,58.5,95.5
4,AL,15-24,2018,33.0,15.5,37.0,52.5


In [78]:
# Load the youth indicators table from CSV (note: the file name contains a space)
youth = pd.read_csv(
    '../data/youth data.csv',
)

# Report (rows, columns), then preview the first records
print(youth.shape)
youth.head()

(306, 6)


Unnamed: 0,Location,State Abbreviation,TimeFrame,Youtt not in School,Youth in Foster_care,Youth living in poverty
0,Alabama,AL,2022,73000,,240000
1,Alabama,AL,2021,86000,,245000
2,Alabama,AL,2019,74000,800.0,228000
3,Alabama,AL,2018,86000,829.0,255000
4,Alabama,AL,2017,88000,813.0,265000


---
## Merge Data

In [81]:
# Merge demographic and crime data.
# how='left' keeps every demographic row; state/year pairs with no crime record
# get NaN in the crime columns.
# validate='many_to_one' makes pandas raise MergeError if `crime` ever contains
# more than one row for a (State, Data Year) pair -- for example an additional
# 'Offender Age' group -- instead of silently duplicating demographic rows.
combined = pd.merge(
    left=demo,
    right=crime,
    how='left',
    left_on=['state', 'year'],
    right_on=['State', 'Data Year'],
    validate='many_to_one',
)

# The row count should equal demo's row count; the column count grows by crime's columns
print(combined.shape)
combined.head()

(357, 25)


Unnamed: 0,state,year,total_pop,white_pop,black_pop,hispanic_pop,asian_pop,native_pop,islander_pop,multi_race_pop,...,hs_grad_rate,bachelors_grad_rate,zhvi,State,Offender Age,Data Year,Crimes Against Society,Fraud and Other Financial Crimes,Property Crime,Violent Crime
0,AL,2022,4916000,0.645,0.252,0.048,0.014,0.002,,0.039,...,88.8,28.8,217335.198947,AL,15-24,2022.0,4804.0,913.5,6430.5,7584.5
1,AL,2021,4900800,0.644,0.255,0.047,0.013,0.004,,0.037,...,87.9,27.4,193148.797902,AL,15-24,2021.0,3236.5,618.0,5552.0,5527.5
2,AL,2020,0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,88.0,27.8,169855.574269,AL,15-24,2020.0,327.0,45.0,325.0,419.5
3,AL,2019,4767100,0.654,0.265,0.044,0.014,0.004,,0.019,...,87.1,26.3,157202.180627,AL,15-24,2019.0,94.0,30.5,171.0,108.5
4,AL,2018,4752600,0.656,0.265,0.043,0.013,<.01,,0.019,...,86.6,25.5,148927.509383,AL,15-24,2018.0,33.0,15.5,37.0,52.5


In [83]:
# Merge youth data onto the demographic + crime frame.
# how='left' keeps every existing row; state/year pairs with no youth record
# get NaN in the youth columns.
# validate='many_to_one' makes pandas raise MergeError if `youth` ever contains
# more than one row for a (State Abbreviation, TimeFrame) pair, instead of
# silently duplicating rows of `combined`.
# NOTE: this cell reassigns `combined` from itself, so it is not idempotent --
# re-run the demographic/crime merge cell before re-running this one.
combined = pd.merge(
    left=combined,
    right=youth,
    how='left',
    left_on=['state', 'year'],
    right_on=['State Abbreviation', 'TimeFrame'],
    validate='many_to_one',
)

# The row count should be unchanged by the merge; the column count grows by youth's columns
print(combined.shape)
combined.head()

(357, 31)


Unnamed: 0,state,year,total_pop,white_pop,black_pop,hispanic_pop,asian_pop,native_pop,islander_pop,multi_race_pop,...,Crimes Against Society,Fraud and Other Financial Crimes,Property Crime,Violent Crime,Location,State Abbreviation,TimeFrame,Youtt not in School,Youth in Foster_care,Youth living in poverty
0,AL,2022,4916000,0.645,0.252,0.048,0.014,0.002,,0.039,...,4804.0,913.5,6430.5,7584.5,Alabama,AL,2022.0,73000.0,,240000.0
1,AL,2021,4900800,0.644,0.255,0.047,0.013,0.004,,0.037,...,3236.5,618.0,5552.0,5527.5,Alabama,AL,2021.0,86000.0,,245000.0
2,AL,2020,0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,327.0,45.0,325.0,419.5,,,,,,
3,AL,2019,4767100,0.654,0.265,0.044,0.014,0.004,,0.019,...,94.0,30.5,171.0,108.5,Alabama,AL,2019.0,74000.0,800.0,228000.0
4,AL,2018,4752600,0.656,0.265,0.043,0.013,<.01,,0.019,...,33.0,15.5,37.0,52.5,Alabama,AL,2018.0,86000.0,829.0,255000.0


In [87]:
# Persist the merged frame to CSV before any cleaning is applied.
# The DataFrame index is written as well (to_csv default), as an unnamed first column.
combined.to_csv(path_or_buf='../data/state_demo_crime_youth_data_combined.csv')

---
## Data Cleaning

### Drop Columns

In [93]:
# Inspect the dtype of every column in the merged frame.
# An `object` dtype on a numeric-looking column points to non-numeric entries
# (e.g. native_pop shows the string '<.01' in the preview above) -- TODO confirm per column.
combined.dtypes

state                                object
year                                  int64
total_pop                             int64
white_pop                           float64
black_pop                            object
hispanic_pop                        float64
asian_pop                           float64
native_pop                           object
islander_pop                         object
multi_race_pop                      float64
median_income                         int64
poverty_rate                        float64
unemployment_rate                   float64
unemployed_15_weeks                 float64
labor_force_participation_rate      float64
hs_grad_rate                        float64
bachelors_grad_rate                 float64
zhvi                                float64
State                                object
Offender Age                         object
Data Year                           float64
Crimes Against Society              float64
Fraud and Other Financial Crimes

### Remove Nulls

In [91]:
# Count missing values per column in the merged frame
combined.isna().sum()

state                                 0
year                                  0
total_pop                             0
white_pop                             0
black_pop                             1
hispanic_pop                          0
asian_pop                             0
native_pop                           20
islander_pop                        177
multi_race_pop                        0
median_income                         0
poverty_rate                          0
unemployment_rate                     0
unemployed_15_weeks                   0
labor_force_participation_rate        0
hs_grad_rate                          0
bachelors_grad_rate                   0
zhvi                                  0
State                                45
Offender Age                         45
Data Year                            45
Crimes Against Society               45
Fraud and Other Financial Crimes     45
Property Crime                       45
Violent Crime                        45


**TODO:** X null records removed out of Y total records. (Replace X and Y with the actual counts once the null-removal step has been run.)