In [1]:
import sys
# Extend the module search path with the shared helper directory so the
# project-local module imported below can be found. The entry is a relative
# path, so it depends on the directory the kernel was started from.
sys.path.append('../../../Common/Functions')
# NOTE(review): star import — none of the code visible in this notebook
# obviously uses a name from this module; confirm it is still needed.
from func_Load_Data_to_Frame import *

In [2]:
import os

import pandas as pd

# Location of the zipped JSON export of the LAPD crime data.
# The default is the original local copy; set the LAPD_CRIME_DATA_PATH
# environment variable to read the archive from somewhere else without
# editing the notebook.
DATA_PATH = os.environ.get(
    'LAPD_CRIME_DATA_PATH',
    '/Users/mike/Data/Public/LAPD_Crime_Data.json.zip',
)

# The archive is read directly; pandas decompresses the zip while loading.
df = pd.read_json(DATA_PATH, compression='zip')

# Evaluate the Schema Properties

## Published Schema:
https://data.lacity.org/Public-Safety/Crime-Data-from-2020-to-Present/2nrs-mtv8/about_data

## Generated Schema in the df

In [3]:
# Show schema
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line after the report.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 585000 entries, 0 to 584999
Data columns (total 28 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   dr_no           585000 non-null  int64  
 1   date_rptd       585000 non-null  object 
 2   date_occ        585000 non-null  object 
 3   time_occ        585000 non-null  int64  
 4   area            585000 non-null  int64  
 5   area_name       585000 non-null  object 
 6   rpt_dist_no     585000 non-null  int64  
 7   part_1_2        585000 non-null  int64  
 8   crm_cd          585000 non-null  int64  
 9   crm_cd_desc     585000 non-null  object 
 10  mocodes         491719 non-null  object 
 11  vict_age        585000 non-null  int64  
 12  vict_sex        496186 non-null  object 
 13  vict_descent    496179 non-null  object 
 14  premis_cd       584990 non-null  float64
 15  premis_desc     584585 non-null  object 
 16  weapon_used_cd  175921 non-null  float64
 17  weapon_des

## Now Align the Data Types using the Published Schema @
https://data.lacity.org/Public-Safety/Crime-Data-from-2020-to-Present/2nrs-mtv8/about_data

In [4]:
# Parse the two date fields into datetime values.
for date_field in ('date_rptd', 'date_occ'):
    df[date_field] = pd.to_datetime(df[date_field])

# Descriptive / label fields are stored as pandas categoricals.
categorical_columns = [
    'area_name', 'crm_cd_desc', 'mocodes', 'vict_sex', 'vict_descent',
    'premis_desc', 'weapon_desc', 'status', 'status_desc', 'location',
    'cross_street',
]
df = df.astype(dict.fromkeys(categorical_columns, 'category'))

# Code / identifier fields are cast to int64.
# Missing values are replaced with 0 before the cast, so in the resulting
# columns a 0 can mean either "no value recorded" or a literal code of 0.
integer_columns = [
    'dr_no', 'area', 'rpt_dist_no', 'part_1_2', 'crm_cd', 'vict_age',
    'premis_cd', 'weapon_used_cd', 'crm_cd_1', 'crm_cd_2', 'crm_cd_3',
    'crm_cd_4',
]
df = (
    df
    .fillna(dict.fromkeys(integer_columns, 0))
    .astype(dict.fromkeys(integer_columns, 'int64'))
)

# Coordinates are kept as float64.
df = df.astype({'lat': 'float64', 'lon': 'float64'})

# Verify changes
print(df.dtypes)

dr_no                      int64
date_rptd         datetime64[ns]
date_occ          datetime64[ns]
time_occ                   int64
area                       int64
area_name               category
rpt_dist_no                int64
part_1_2                   int64
crm_cd                     int64
crm_cd_desc             category
mocodes                 category
vict_age                   int64
vict_sex                category
vict_descent            category
premis_cd                  int64
premis_desc             category
weapon_used_cd             int64
weapon_desc             category
status                  category
status_desc             category
crm_cd_1                   int64
location                category
lat                      float64
lon                      float64
cross_street            category
crm_cd_2                   int64
crm_cd_3                   int64
crm_cd_4                   int64
dtype: object


# Normalize the df into Dims and Facts

In [5]:
def _distinct_rows(columns, key_renames=None):
    """Return the unique value combinations of `columns` found in `df`.

    The result has a fresh 0..n-1 index. When `key_renames` is given, it is
    applied as a column rename mapping (old name -> new name) afterwards.
    """
    table = df[columns].drop_duplicates().reset_index(drop=True)
    if key_renames:
        table = table.rename(columns=key_renames)
    return table


# Create dimension tables
# Each dimension is the set of distinct rows for its columns; the code column
# is renamed with an `fk_` prefix. The fact table below keeps the original,
# un-prefixed column names.
dim_area = _distinct_rows(['area', 'area_name'], {'area': 'fk_area'})

dim_crime = _distinct_rows(['crm_cd', 'crm_cd_desc'], {'crm_cd': 'fk_crm_cd'})

# NOTE(review): this table has no key column and the fact table below carries
# no victim columns, so the two cannot be joined as written.
dim_victim = _distinct_rows(['vict_sex', 'vict_descent', 'vict_age'])

dim_premise = _distinct_rows(
    ['premis_cd', 'premis_desc'],
    {'premis_cd': 'fk_premis_cd'},
)

dim_weapon = _distinct_rows(
    ['weapon_used_cd', 'weapon_desc'],
    {'weapon_used_cd': 'fk_weapon_used_cd'},
)

dim_status = _distinct_rows(['status', 'status_desc'], {'status': 'fk_status'})

dim_location = _distinct_rows(
    ['location', 'lat', 'lon', 'cross_street'],
    {'location': 'fk_location'},
)

# Create fact table
# One row per source record: the report number, the date/time fields and the
# code columns that the dimension tables above were built from.
fact_columns = [
    'dr_no', 'date_rptd', 'date_occ', 'time_occ',
    'area', 'crm_cd', 'premis_cd',
    'weapon_used_cd', 'status', 'location',
]
crime_facts = df[fact_columns].copy()

# Preview the first rows of the fact table, then of every dimension table.
# Headings are listed exactly as they are printed (including the leading
# blank line on all but the first two).
table_previews = [
    ("Fact Events Table:", crime_facts),
    ("Dimension Area Table:", dim_area),
    ("\nDimension Crime Table:", dim_crime),
    ("\nDimension Victim Table:", dim_victim),
    ("\nDimension Premise Table:", dim_premise),
    ("\nDimension Weapon Table:", dim_weapon),
    ("\nDimension Status Table:", dim_status),
    ("\nDimension Location Table:", dim_location),
]

for heading, table in table_previews:
    print(heading)
    print(table.head())

Fact Events Table:
       dr_no  date_rptd   date_occ  time_occ  area  crm_cd  premis_cd  \
0  220506019 2022-02-01 2020-02-01      1200     5     627        501   
1  220805315 2022-02-01 2020-01-01      1200     8     236        502   
2  221405638 2022-02-01 2020-09-01      1425    14     354        501   
3  222105108 2022-02-01 2021-11-29      1200    21     649        501   
4  221205693 2022-02-01 2021-12-21      1800    12     236        222   

   weapon_used_cd status                                 location  
0             400     AO  1400 W  227TH                        ST  
1             400     IC  3100 S  CANFIELD                     AV  
2               0     IC   500    VENICE                       WY  
3               0     IC  5500    VALERIE                      AV  
4             400     AO  7900 S  CENTRAL                      AV  
Dimension Area Table:
   fk_area    area_name
0        5       Harbor
1        8      West LA
2       14      Pacific
3       21      

# Pickle the Dims and Facts for Downstream processing

In [6]:
import os

# Output directory for the pickled tables; created on first run.
pickle_dir = '../.pickles'
os.makedirs(pickle_dir, exist_ok=True)

# Tables to persist, keyed by the file stem they are written under.
# Dimension tables come first, the fact table last.
tables_to_pickle = {
    'dim_area': dim_area,
    'dim_crime': dim_crime,
    'dim_victim': dim_victim,
    'dim_premise': dim_premise,
    'dim_weapon': dim_weapon,
    'dim_status': dim_status,
    'dim_location': dim_location,
    'crime_facts': crime_facts,
}

# Each table is written to '<pickle_dir>/<name>.pkl'.
for table_name, table in tables_to_pickle.items():
    table.to_pickle(f'{pickle_dir}/{table_name}.pkl')