## Clean National Zip-Code Data:
- Read the national ZIP code CSV
- Filter by county and state to keep only Jefferson County, KY
- Drop all columns except `zip`

In [None]:
import pandas as pd
import numpy as np

# Read the national ZIP code table
zip_to_county_df = pd.read_csv('../data/zip_code_database.csv')

# Build one boolean mask per condition: county name and state abbreviation
in_jefferson = zip_to_county_df['county'] == 'Jefferson County'
in_kentucky = zip_to_county_df['state'] == 'KY'

# Keep only the 'zip' column of the matching rows and renumber rows from 0
jefferson_zip_df = (
    zip_to_county_df
    .loc[in_jefferson & in_kentucky, ['zip']]
    .reset_index(drop=True)
)

# Persist the Jefferson County ZIP list, then show it as the cell output
jefferson_zip_df.to_csv('../data/jefferson_zip_df.csv', index=False)
jefferson_zip_df


## Clean Metro Crime Data:
- Read the crime CSV
- Standardize `zip_code` by removing surrounding spaces, hyphen extensions, and invalid values
- Cast `zip_code` to int and rename it to `zip`

In [None]:
import pandas as pd
import numpy as np

# Load crime data. 'zip_code' is read as text so pandas never infers a numeric
# or mixed dtype for it; the .str cleaning below only works on strings.
crime_df = pd.read_csv('../data/crime_data/2024.csv', dtype={'zip_code': str})

# Clean ZIP codes: trim surrounding whitespace, drop any hyphen extension
# (e.g. '40202-1234' -> '40202'), and blank out placeholder values.
crime_df['zip_code'] = (
    crime_df['zip_code']
    .str.strip()
    .str.split('-').str[0]
    .replace(['99999', 'nan', None], np.nan)
)

# Keep only rows whose ZIP is exactly five ASCII digits. A length-only check
# would let values such as 'ABCDE' through and leave them as <NA> after the
# integer cast. na=False drops rows with a missing ZIP.
# .copy() detaches the result from the original frame so the column
# assignment below does not trigger SettingWithCopyWarning.
has_valid_zip = crime_df['zip_code'].str.fullmatch(r'[0-9]{5}', na=False)
crime_df = crime_df[has_valid_zip].copy()

# Convert to nullable integer; every remaining value is a 5-digit string,
# so no coercion of bad values is needed here.
crime_df['zip_code'] = pd.to_numeric(crime_df['zip_code']).astype('Int64')
crime_df = crime_df.rename(columns={'zip_code': 'zip'})

# Check the result
crime_df


## Join Metro Crime Data and Zip Code Data to filter Non-Jefferson County entries

In [None]:
# Inner join on 'zip': keep only the crime records whose ZIP appears in the
# Jefferson County ZIP list. A left join from the ZIP table would also emit
# an all-NaN row for every Jefferson ZIP that has no crime records, which is
# not a real incident.
jefferson_crime_df = jefferson_zip_df.merge(crime_df, how='inner', on='zip')

# Check the result
jefferson_crime_df

## Clean Metro Crime Data: Pt.2
- Cast `date_reported` to datetime
- Cast `date_occurred` to datetime
- Filter for ONLY auto theft incidents
- Replace the `14 AUTO THEFT` value with `Auto Theft` for readability
- Extract only the dollar-value range from `offense_code_name` with a regex (e.g. `$1,000 < $10,000`), then rename the column to `value_range`
- Fill unknown theft values with `UNKNOWN VALUE`
- Drop superfluous columns (`block_address`, `city`, `badge_id`, `ObjectId`, `nibrs_code`, `nibrs_group_name`)

In [None]:
# Parse both timestamp columns into datetime values
for date_col in ('date_reported', 'date_occurred'):
    jefferson_crime_df[date_col] = pd.to_datetime(jefferson_crime_df[date_col])

# Select the auto theft rows; .copy() gives an independent frame to modify
is_auto_theft = jefferson_crime_df['offense_classification'].eq('14 AUTO THEFT')
auto_theft_df = jefferson_crime_df.loc[is_auto_theft].copy()

# Every remaining row holds exactly '14 AUTO THEFT'; swap in the readable label
auto_theft_df['offense_classification'] = (
    auto_theft_df['offense_classification'].replace('14 AUTO THEFT', 'Auto Theft')
)

# Pull the dollar range (e.g. '$1,000 < $10,000') out of the offense name.
# Rows where the pattern does not match are labelled 'UNKNOWN VALUE'.
value_pattern = r'(\$[\d,]+(?: < \$[\d,]+|,000,000(?: OR MORE)?))'
extracted_ranges = auto_theft_df['offense_code_name'].str.extract(value_pattern, expand=False)
auto_theft_df['offense_code_name'] = extracted_ranges.fillna('UNKNOWN VALUE')

# The column now holds value ranges, so name it accordingly
auto_theft_df = auto_theft_df.rename(columns={'offense_code_name': 'value_range'})

# Remove columns that are not needed in the exported data
columns_to_drop = ['block_address', 'city', 'badge_id', 'ObjectId', 'nibrs_code', 'nibrs_group_name']
cleaned_auto_theft_df = auto_theft_df.drop(columns=columns_to_drop)

# Write the cleaned auto theft data to CSV
cleaned_auto_theft_df.to_csv('../data/metro_auto_thefts.csv', index=False)

# Show the cleaned DataFrame as the cell output for inspection
cleaned_auto_theft_df