In [20]:
import sys
sys.path.append('../../../Common/Functions')
from func_Load_Data_to_Frame import *

In [21]:
import pandas as pd

df = pd.read_json('/Users/mike/Data/Public/LAPD_Crime_Data.json.zip', compression='zip')

# Evaluate the Schmema Properties

## Published Schema:
https://data.lacity.org/Public-Safety/Crime-Data-from-2020-to-Present/2nrs-mtv8/about_data

## Generated Schema in the df

In [22]:
# Show schema
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 585000 entries, 0 to 584999
Data columns (total 28 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   dr_no           585000 non-null  int64  
 1   date_rptd       585000 non-null  object 
 2   date_occ        585000 non-null  object 
 3   time_occ        585000 non-null  int64  
 4   area            585000 non-null  int64  
 5   area_name       585000 non-null  object 
 6   rpt_dist_no     585000 non-null  int64  
 7   part_1_2        585000 non-null  int64  
 8   crm_cd          585000 non-null  int64  
 9   crm_cd_desc     585000 non-null  object 
 10  mocodes         491719 non-null  object 
 11  vict_age        585000 non-null  int64  
 12  vict_sex        496186 non-null  object 
 13  vict_descent    496179 non-null  object 
 14  premis_cd       584990 non-null  float64
 15  premis_desc     584585 non-null  object 
 16  weapon_used_cd  175921 non-null  float64
 17  weapon_des

## Now Align the Data Types using the Published Schema @
https://data.lacity.org/Public-Safety/Crime-Data-from-2020-to-Present/2nrs-mtv8/about_data

In [23]:
# Convert date columns to datetime
df['date_rptd'] = pd.to_datetime(df['date_rptd'])
df['date_occ'] = pd.to_datetime(df['date_occ'])

# Convert categorical columns to category dtype
categorical_columns = ['area_name', 'crm_cd_desc', 'mocodes', 'vict_sex', 'vict_descent', 'premis_desc', 'weapon_desc', 'status', 'status_desc', 'location', 'cross_street']
for col in categorical_columns:
    df[col] = df[col].astype('category')

# Convert integer columns to int64
integer_columns = ['dr_no', 'area', 'rpt_dist_no', 'part_1_2', 'crm_cd', 'vict_age', 'premis_cd', 'weapon_used_cd', 'crm_cd_1', 'crm_cd_2', 'crm_cd_3', 'crm_cd_4']
for col in integer_columns:
    df[col] = df[col].fillna(0).astype('int64')

# Convert latitude and longitude to float64
df['lat'] = df['lat'].astype('float64')
df['lon'] = df['lon'].astype('float64')

# Verify changes
print(df.dtypes)

dr_no                      int64
date_rptd         datetime64[ns]
date_occ          datetime64[ns]
time_occ                   int64
area                       int64
area_name               category
rpt_dist_no                int64
part_1_2                   int64
crm_cd                     int64
crm_cd_desc             category
mocodes                 category
vict_age                   int64
vict_sex                category
vict_descent            category
premis_cd                  int64
premis_desc             category
weapon_used_cd             int64
weapon_desc             category
status                  category
status_desc             category
crm_cd_1                   int64
location                category
lat                      float64
lon                      float64
cross_street            category
crm_cd_2                   int64
crm_cd_3                   int64
crm_cd_4                   int64
dtype: object


# Normalize the df into Dims and Facts

In [24]:
# Create dimension tables
dim_area = (
    df[['area', 'area_name']]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={'area': 'fk_area'})
)

dim_crime = (
    df[['crm_cd', 'crm_cd_desc']]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={'crm_cd': 'fk_crm_cd'})
)

dim_victim = (
    df[['vict_sex', 'vict_descent', 'vict_age']]
    .drop_duplicates()
    .reset_index(drop=True)
)

dim_premise = (
    df[['premis_cd', 'premis_desc']]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={'premis_cd': 'fk_premis_cd'})
)

dim_weapon = (
    df[['weapon_used_cd', 'weapon_desc']]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={'weapon_used_cd': 'fk_weapon_used_cd'})
)

dim_status = (
    df[['status', 'status_desc']]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={'status': 'fk_status'})
)

dim_location = (
    df[['lat', 'lon']]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Add additional columns based on Schema at https://geocode.maps.co/reverse
# This data will be added downstream in the ETL process
dim_location['geo_place_id'] = None
dim_location['geo_osm_type'] = None
dim_location['geo_osm_id'] = None
dim_location['geo_display_name'] = None
dim_location['geo_road'] = None
dim_location['geo_neighbourhood'] = None
dim_location['geo_suburb'] = None
dim_location['geo_city'] = None
dim_location['geo_state'] = None
dim_location['geo_ISO3166-2-lvl4'] = None
dim_location['geo_postcode'] = None
dim_location['geo_country'] = None
dim_location['geo_country_code'] = None
dim_location['geo_boundingbox'] = None

# Create fact table
crime_facts = df[
    [
        'dr_no', 'date_rptd', 'date_occ', 'time_occ', 
        'lat', 'lon', 'location', 'cross_street', 'area', 'area_name', 'rpt_dist_no', 'premis_cd', 'premis_desc',
        'part_1_2', 'crm_cd_1', 'crm_cd_2', 'crm_cd_3', 'crm_cd_4', 'mocodes', 
        'vict_age', 'vict_sex', 'vict_descent',
        'crm_cd', 'weapon_used_cd', 'status'
    ]
].copy()

# Display fact table
print("Fact Events Table:")
print(crime_facts.head())
# Display dimension tables
print("Dimension Area Table:")
print(dim_area.head())

print("\nDimension Crime Table:")
print(dim_crime.head())

print("\nDimension Victim Table:")
print(dim_victim.head())

print("\nDimension Premise Table:")
print(dim_premise.head())

print("\nDimension Weapon Table:")
print(dim_weapon.head())

print("\nDimension Status Table:")
print(dim_status.head())

print("\nDimension Location Table:")
print(dim_location.head())

Fact Events Table:
       dr_no  date_rptd   date_occ  time_occ      lat       lon  \
0  220506019 2022-02-01 2020-02-01      1200  33.8201 -118.3015   
1  220805315 2022-02-01 2020-01-01      1200  34.0326 -118.3941   
2  221405638 2022-02-01 2020-09-01      1425  33.9875 -118.4668   
3  222105108 2022-02-01 2021-11-29      1200  34.1707 -118.6565   
4  221205693 2022-02-01 2021-12-21      1800  33.9638 -118.2629   

                                  location cross_street  area    area_name  \
0  1400 W  227TH                        ST          NaN     5       Harbor   
1  3100 S  CANFIELD                     AV          NaN     8      West LA   
2   500    VENICE                       WY          NaN    14      Pacific   
3  5500    VALERIE                      AV          NaN    21      Topanga   
4  7900 S  CENTRAL                      AV          NaN    12  77th Street   

   ...  crm_cd_2  crm_cd_3 crm_cd_4                        mocodes  vict_age  \
0  ...         0         0   

# Downstreat Postproccess - Retrieve Geo Code into Location Dim

## Load the Location Dim from the Pickle

In [25]:
import pandas as pd
import os

# Load the dimension table from the pickle file
try:
	dim_location = pd.read_pickle('../.pickles/dim_location.pkl')
	# Display the loaded DataFrame
	print(dim_location.head())
except FileNotFoundError:
	print("Pickle file not found. Please ensure the file exists.")

        lat       lon geo_place_id geo_osm_type geo_osm_id  \
0   33.8201 -118.3015    281393593          way   13356241   
6   34.0326 -118.3941    284925955          way  165791832   
7   33.9875 -118.4668    281444680          way  168954633   
11  34.1707 -118.6565    285003597          way  402526611   
12  33.9638 -118.2629    281472844          way  165899885   

                                     geo_display_name               geo_road  \
0   Halldale Avenue, Los Angeles, California, 9050...        Halldale Avenue   
6   South Canfield Avenue, Castle Heights, Los Ang...  South Canfield Avenue   
7   Venice Way, Venice Canal Historic District, Ve...             Venice Way   
11  Burbank Boulevard, Los Angeles, Los Angeles Co...      Burbank Boulevard   
12  Stanford Avenue, Florence, Los Angeles, Los An...        Stanford Avenue   

                 geo_neighbourhood geo_suburb     geo_city   geo_state  \
0                             None       None  Los Angeles  California  

## Find the Locations that need to be Processed

In [26]:
tmp_locs_to_be_processed = dim_location[dim_location['geo_place_id'].isnull()][['lat', 'lon']].drop_duplicates()
print(f"Number of records in the df: {len(tmp_locs_to_be_processed)}")

Number of records in the df: 42528


## Get the API Credentials

In [27]:
import json

# Read the credentials from the JSON file
with open('../../../Credentials/geocode_api_key.json', 'r') as f:
    api_key_data = json.load(f)

geo_api_credentials = {
    'API_Key': api_key_data['API_Key'],
    'Reverse_URL': api_key_data['Reverse_URL'],
    'Max_Lookups': api_key_data['Max_Lookups'],
    'Rate_Limit_per_Sec': api_key_data['Rate_Limit_per_Sec']
}

print(geo_api_credentials)


{'API_Key': '679e7c466cc8c453023441mst72c8af', 'Reverse_URL': 'https://geocode.maps.co/reverse', 'Max_Lookups': 5000, 'Rate_Limit_per_Sec': 1}


## Use the GeoCode API to Fetch Location Details using the Reverse Lookup from the Lat/Lon
See https://geocode.maps.co/

In [28]:
import requests
from tqdm import tqdm
import time

def Fetch_Geo_Data(api_credentials, locations):
    base_url = api_credentials['Reverse_URL']
    api_key = api_credentials['API_Key']
    max_lookups = api_credentials['Max_Lookups']
    rate_limit_per_sec = api_credentials['Rate_Limit_per_Sec']
    
    geo_data = []
    
    for index, row in tqdm(locations.iterrows(), total=locations.shape[0], desc="Fetching Geo Data"):
        if len(geo_data) >= max_lookups:
            print("Reached the maximum number of lookups for the day.")
            break
        
        params = {
            'lat': row['lat'],
            'lon': row['lon'],
            'api_key': api_key
        }
        
        attempts = 0
        while attempts < 3:
            try:
                response = requests.get(base_url, params=params)
                url = f"{base_url}?lat={row['lat']}&lon={row['lon']}&api_key={api_key}"
                response = requests.get(url)
                if response.status_code == 200:
                    geo_data.append(response.json())
                    break
                else:
                    attempts += 1
                    time.sleep(rate_limit_per_sec)
            except requests.exceptions.RequestException as e:
                print(f"Request failed: {e}")
                attempts += 1
                time.sleep(rate_limit_per_sec)
        
        if attempts == 3:
            print(f"Failed to fetch data for lat: {row['lat']}, lon: {row['lon']} after 3 attempts.")
            geo_data.append(None)
    
    return geo_data

# Example usage
geo_data = Fetch_Geo_Data(geo_api_credentials, tmp_locs_to_be_processed)
print(geo_data[:5])
# Update the dim_location DataFrame with the fetched geo data
for i, data in enumerate(geo_data):
    if data:
        dim_location.at[tmp_locs_to_be_processed.index[i], 'geo_place_id'] = data.get('place_id')
        dim_location.at[tmp_locs_to_be_processed.index[i], 'geo_osm_type'] = data.get('osm_type')
        dim_location.at[tmp_locs_to_be_processed.index[i], 'geo_osm_id'] = data.get('osm_id')
        dim_location.at[tmp_locs_to_be_processed.index[i], 'geo_display_name'] = data.get('display_name')
        dim_location.at[tmp_locs_to_be_processed.index[i], 'geo_road'] = data.get('address', {}).get('road')
        dim_location.at[tmp_locs_to_be_processed.index[i], 'geo_neighbourhood'] = data.get('address', {}).get('neighbourhood')
        dim_location.at[tmp_locs_to_be_processed.index[i], 'geo_suburb'] = data.get('address', {}).get('suburb')
        dim_location.at[tmp_locs_to_be_processed.index[i], 'geo_city'] = data.get('address', {}).get('city')
        dim_location.at[tmp_locs_to_be_processed.index[i], 'geo_state'] = data.get('address', {}).get('state')
        dim_location.at[tmp_locs_to_be_processed.index[i], 'geo_ISO3166-2-lvl4'] = data.get('address', {}).get('ISO3166-2-lvl4')
        dim_location.at[tmp_locs_to_be_processed.index[i], 'geo_postcode'] = data.get('address', {}).get('postcode')
        dim_location.at[tmp_locs_to_be_processed.index[i], 'geo_country'] = data.get('address', {}).get('country')
        dim_location.at[tmp_locs_to_be_processed.index[i], 'geo_country_code'] = data.get('address', {}).get('country_code')
        dim_location.at[tmp_locs_to_be_processed.index[i], 'geo_boundingbox'] = data.get('boundingbox')

# Display updated dim_location DataFrame
print(dim_location.head())

Fetching Geo Data:   0%|          | 26/42528 [00:34<15:32:22,  1.32s/it]


KeyboardInterrupt: 

## Rewrite the Pickle

# Pickle the Dims and Facts for Downstream processing

In [19]:
import os

# Create the directory if it doesn't exist
os.makedirs('../.pickles', exist_ok=True)

# Pickle the dimension tables
dim_area.to_pickle('../.pickles/dim_area.pkl')
dim_crime.to_pickle('../.pickles/dim_crime.pkl')
dim_victim.to_pickle('../.pickles/dim_victim.pkl')
dim_premise.to_pickle('../.pickles/dim_premise.pkl')
dim_weapon.to_pickle('../.pickles/dim_weapon.pkl')
dim_status.to_pickle('../.pickles/dim_status.pkl')
dim_location.to_pickle('../.pickles/dim_location.pkl')

# Pickle the fact table
crime_facts.to_pickle('../.pickles/crime_facts.pkl')