## Clean Metro Crime Data:
- Standardize `zip_code` by removing spaces, hyphen extensions, and invalid values
- Cast `zip_code` to a nullable integer (`Int64`) and rename it to `zip`
- Cast `date_reported` to datetime
- Cast `date_occurred` to datetime
- Filter for ONLY auto theft incidents
- Replace the `14 AUTO THEFT` data value with `Auto Theft` for readability
- Extract only the dollar-value range from `offense_code_name` with a regex (e.g. `$1,000 < $10,000`)
- Fill missing value ranges with `Unknown Value`
- Drop superfluous columns (`block_address`, `city`, `badge_id`, `ObjectId`, `nibrs_code`, `nibrs_group_name`)

In [156]:
import pandas as pd
import numpy as np

# Load crime data.
# 'zip_code' is read explicitly as text so the .str cleaning below always sees
# strings; leaving the dtype to inference can produce a numeric or mixed-type
# column, on which .str either fails or silently yields NaN.
crime_df = pd.read_csv('../data/crime_data.csv', dtype={'zip_code': str})

# Clean ZIP codes: trim surrounding whitespace and drop any hyphen extension
# (e.g. '40219-1234' -> '40219').
crime_df['zip_code'] = (
    crime_df['zip_code']
    .str.strip()
    .str.split('-').str[0]
)

# Keep only rows whose ZIP is exactly five ASCII digits and is not the '99999'
# placeholder. Missing values fail the match (na=False) and are dropped too.
# A plain length check would also let through values like 'ABCDE' or '+1234'.
valid_zip_mask = (
    crime_df['zip_code'].str.fullmatch(r'[0-9]{5}', na=False)
    & crime_df['zip_code'].ne('99999')
)
# .copy() makes the filtered frame independent, so the column assignment below
# does not raise SettingWithCopyWarning.
crime_df = crime_df[valid_zip_mask].copy()

# Convert to (nullable) integer and rename the column to 'zip'
crime_df['zip_code'] = pd.to_numeric(crime_df['zip_code'], errors='coerce').astype('Int64')
crime_df = crime_df.rename(columns={'zip_code': 'zip'})

# Parse both timestamp columns into pandas datetimes in a single step
crime_df = crime_df.assign(
    date_reported=pd.to_datetime(crime_df['date_reported']),
    date_occurred=pd.to_datetime(crime_df['date_occurred']),
)

# Build the auto theft subset in one chain:
#   1. keep only rows classified as '14 AUTO THEFT'
#   2. swap that coded label for the more readable 'Auto Theft'
#   3. reduce 'offense_code_name' to its dollar-value range, labelling rows
#      where no range is found as 'Unknown Value'
# .assign() works on a copy, so crime_df itself is left untouched.
auto_theft_df = (
    crime_df
    .loc[crime_df['offense_classification'] == '14 AUTO THEFT']
    .assign(
        offense_classification=lambda df: df['offense_classification'].str.replace(
            "14 AUTO THEFT", "Auto Theft", case=False
        ),
        offense_code_name=lambda df: (
            df['offense_code_name']
            # expand=False returns the single capture group as a Series
            .str.extract(
                r'(\$[\d,]+(?: < \$[\d,]+|,000,000(?: OR MORE)?))',
                expand=False,
            )
            .fillna('Unknown Value')
        ),
    )
)

# Columns that are not needed in the exported auto theft dataset
columns_to_drop = [
    'block_address',
    'city',
    'badge_id',
    'ObjectId',
    'nibrs_code',
    'nibrs_group_name',
]
cleaned_auto_theft_df = auto_theft_df.drop(labels=columns_to_drop, axis='columns')

# Write the cleaned data to CSV (without the row index)
cleaned_auto_theft_df.to_csv('../data/metro_auto_thefts.csv', index=False)

# Show the cleaned DataFrame as the cell output for a visual check
cleaned_auto_theft_df

Unnamed: 0,incident_number,date_reported,date_occurred,offense_classification,offense_code_name,was_offense_completed,lmpd_division,lmpd_beat,location_category,zip
0,LMPD24139673,2024-12-01 13:56:00+00:00,2024-11-03 14:00:00+00:00,Auto Theft,"$1,000 < $10,000",YES,7TH DIVISION,736,HIGHWAY/ ROAD/ ALLEY/ STREET/ SIDEWALK,40219
2,LMPD24139293,2024-11-30 05:15:00+00:00,2024-11-24 18:00:00+00:00,Auto Theft,"$500 < $1,000",NO,5TH DIVISION,521,OTHER RESIDENCE (APARTMENT/CONDO),40206
4,LMPD24139173,2024-11-29 20:59:00+00:00,2024-11-08 14:00:00+00:00,Auto Theft,"$10,000 < $1,000,000",YES,2ND DIVISION,211,RESIDENCE/HOME,40212
40,LMPD24138086,2024-11-26 20:22:00+00:00,2024-11-18 05:00:00+00:00,Auto Theft,"$10,000 < $1,000,000",YES,4TH DIVISION,435,RESIDENCE/HOME,40215
43,LMPD24137938,2024-11-26 13:53:00+00:00,2024-11-23 12:00:00+00:00,Auto Theft,"$10,000 < $1,000,000",YES,6TH DIVISION,612,RESIDENCE/HOME,40213
...,...,...,...,...,...,...,...,...,...,...
62218,LMPD24000235,2024-01-01 20:58:00+00:00,2024-01-01 07:00:00+00:00,Auto Theft,"$1,000 < $10,000",YES,LYNDON,LYND,HIGHWAY/ ROAD/ ALLEY/ STREET/ SIDEWALK,40222
62320,LMPD24028484,2024-01-02 03:00:00+00:00,2024-01-02 03:00:00+00:00,Auto Theft,"$10,000 < $1,000,000",YES,2ND DIVISION,225,PARKING/ DROP LOT/ GARAGE,40210
62371,LMPD24000443,2024-01-02 02:30:00+00:00,2024-01-02 02:30:00+00:00,Auto Theft,"$10,000 < $1,000,000",YES,7TH DIVISION,724,OTHER RESIDENCE (APARTMENT/CONDO),40229
62375,LMPD24000398,2024-01-02 02:20:00+00:00,2024-01-02 02:20:00+00:00,Auto Theft,"$10,000 < $1,000,000",YES,7TH DIVISION,724,PARKING/ DROP LOT/ GARAGE,40229


## Clean National Zip-Code Data:
- Read the national ZIP code CSV
- Filter by county and state to include only Jefferson County, KY
- Drop all columns except `zip` and `county`

In [None]:
# Read the nationwide ZIP code lookup table
zip_to_county_df = pd.read_csv('../data/zip_code_database.csv')

# Select the Jefferson County, KY rows, keep only the 'zip' and 'county'
# columns, and renumber the rows from 0
jefferson_zip_df = (
    zip_to_county_df
    .loc[
        lambda df: (df['county'] == 'Jefferson County') & (df['state'] == 'KY'),
        ['zip', 'county'],
    ]
    .reset_index(drop=True)
)

# Save the lookup table (without the row index) and display it
jefferson_zip_df.to_csv('../data/jefferson_zip_df.csv', index=False)
jefferson_zip_df


Unnamed: 0,zip,county
0,40018,Jefferson County
1,40023,Jefferson County
2,40025,Jefferson County
3,40027,Jefferson County
4,40041,Jefferson County
...,...,...
68,40295,Jefferson County
69,40296,Jefferson County
70,40297,Jefferson County
71,40298,Jefferson County
